dspy 0.22.1 → 0.23.0

This diff shows the changes between publicly released versions of the package as they appear in their respective public registries, and is provided for informational purposes only.
@@ -1,7 +1,9 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require 'ostruct'
3
4
  require 'sorbet-runtime'
4
5
  require_relative 'teleprompter'
6
+ require_relative '../events/subscriber_mixin'
5
7
 
6
8
  module DSPy
7
9
  module Teleprompt
@@ -11,6 +13,26 @@ module DSPy
11
13
  class GEPA < Teleprompter
12
14
  extend T::Sig
13
15
 
16
+ # Enum for mutation operation types
17
+ class MutationType < T::Enum
18
+ enums do
19
+ Rewrite = new
20
+ Expand = new
21
+ Simplify = new
22
+ Combine = new
23
+ Rephrase = new
24
+ end
25
+ end
26
+
27
+ # Enum for crossover operation types
28
+ class CrossoverType < T::Enum
29
+ enums do
30
+ Uniform = new
31
+ Blend = new
32
+ Structured = new
33
+ end
34
+ end
35
+
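# Illustrative sketch (not part of the diff) of how these enums behave, assuming
# sorbet-runtime's default T::Enum serialization (the downcased constant name):
#
#   MutationType::Rewrite.serialize          # => "rewrite"
#   MutationType.deserialize("expand")       # => MutationType::Expand
#   MutationType.try_deserialize("unknown")  # => nil
#   CrossoverType.values.map(&:serialize)    # => ["uniform", "blend", "structured"]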
14
36
  # Immutable execution trace record using Ruby's Data class
15
37
  # Captures execution events for GEPA's reflective analysis
16
38
  class ExecutionTrace < Data.define(
@@ -537,51 +559,2167 @@ module DSPy
537
559
  timestamps = traces.map(&:timestamp).sort
538
560
  (timestamps.last - timestamps.first).to_f
539
561
  end
540
- end
562
+
563
+ # LLM-based reflection methods for Phase 2
564
+
565
+ public
566
+
567
+ # Perform LLM-based reflection on execution traces using DSPy::Predict
568
+ sig { params(traces: T::Array[ExecutionTrace]).returns(ReflectionResult) }
569
+ def reflect_with_llm(traces)
570
+ return reflect_on_traces(traces) if traces.empty?
571
+
572
+ begin
573
+ # Use DSPy::Predict for analysis instead of raw prompts
574
+ prediction = analyze_traces_with_dspy(traces)
575
+ convert_prediction_to_reflection_result(prediction, traces)
576
+ rescue => e
577
+ # Fallback to rule-based analysis on LLM failure
578
+ fallback_result = reflect_on_traces(traces)
579
+ fallback_result.class.new(
580
+ trace_id: fallback_result.trace_id,
581
+ diagnosis: "LLM reflection failed (#{e.message}), using fallback analysis: #{fallback_result.diagnosis}",
582
+ improvements: fallback_result.improvements,
583
+ confidence: [fallback_result.confidence * 0.5, 0.5].min,
584
+ reasoning: "Fallback to rule-based analysis after LLM error: #{fallback_result.reasoning}",
585
+ suggested_mutations: fallback_result.suggested_mutations,
586
+ metadata: fallback_result.metadata.merge(
587
+ llm_error: e.message,
588
+ fallback_used: true
589
+ )
590
+ )
591
+ end
592
+ end
593
+
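# Minimal usage sketch (names are assumptions, not part of the release):
# `optimizer` is a configured GEPA instance and `traces` is an array of
# ExecutionTrace records collected during evaluation.
#
#   result = optimizer.reflect_with_llm(traces)
#   result.diagnosis            # summary of the observed execution patterns
#   result.suggested_mutations  # e.g. [:simplify, :rephrase]
#   # If the LLM call raises, the rule-based fallback is used and the
#   # confidence is capped at 0.5 (see the rescue branch above).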
594
+ # Generate structured reflection prompt for LLM (public API)
595
+ sig { params(traces: T::Array[ExecutionTrace]).returns(String) }
596
+ def generate_reflection_prompt(traces)
597
+ if traces.empty?
598
+ return <<~PROMPT
599
+ You are analyzing execution traces for a genetic algorithm-based prompt optimization system called GEPA.
600
+
601
+ **Task**: Analyze execution patterns and provide optimization recommendations.
602
+
603
+ **Context**: No execution traces available.
604
+
605
+ Please provide your analysis in the following JSON format:
606
+ {
607
+ "diagnosis": "Brief description of what you observed",
608
+ "improvements": ["List of actionable improvement suggestions"],
609
+ "confidence": 0.0,
610
+ "reasoning": "Your reasoning process",
611
+ "suggested_mutations": ["expand", "rewrite", "simplify", "combine", "rephrase"],
612
+ "insights": {
613
+ "pattern_detected": "no_data",
614
+ "optimization_opportunity": "data_collection"
615
+ }
616
+ }
617
+ PROMPT
618
+ end
619
+
620
+ summary = trace_summary_for_reflection(traces)
621
+ insights = extract_optimization_insights(traces)
622
+
623
+ <<~PROMPT
624
+ You are analyzing execution traces for a genetic algorithm-based prompt optimization system called GEPA.
625
+
626
+ **Task**: Analyze execution patterns and provide optimization recommendations for prompt evolution.
627
+
628
+ **Execution Summary**:
629
+ #{summary}
630
+
631
+ **Optimization Context**:
632
+ - This is part of a genetic algorithm for prompt optimization
633
+ - Available mutation types: rewrite, expand, simplify, combine, rephrase
634
+ - Goal is to improve prompt effectiveness through iterative evolution
635
+ - Focus on actionable insights that can guide mutation and crossover operations
636
+
637
+ **Key Optimization Insights**:
638
+ #{insights.map { |k, v| "- #{k}: #{v.is_a?(Hash) ? v.values.join(', ') : v}" }.join("\n")}
639
+
640
+ **Sample Traces**:
641
+ #{format_traces_for_prompt(traces.take(3))}
642
+
643
+ Please analyze these execution patterns and provide optimization recommendations in the following JSON format:
644
+ {
645
+ "diagnosis": "Brief description of execution patterns and issues identified",
646
+ "improvements": ["List of 2-4 specific, actionable improvement suggestions"],
647
+ "confidence": 0.85,
648
+ "reasoning": "Your detailed reasoning process for the analysis",
649
+ "suggested_mutations": ["List of 2-3 mutation types that would be most beneficial"],
650
+ "insights": {
651
+ "pattern_detected": "primary_pattern_identified",
652
+ "optimization_opportunity": "key_area_for_improvement"
653
+ }
654
+ }
655
+
656
+ Focus on practical recommendations that will improve prompt performance through genetic algorithm evolution.
657
+ PROMPT
658
+ end
659
+
660
+ # Parse LLM reflection response into ReflectionResult (public API)
661
+ sig { params(response_text: String, original_traces: T::Array[ExecutionTrace]).returns(ReflectionResult) }
662
+ def parse_llm_reflection(response_text, original_traces)
663
+ reflection_id = generate_reflection_id
664
+
665
+ begin
666
+ parsed = JSON.parse(response_text)
667
+
668
+ # Extract and validate components
669
+ diagnosis = parsed['diagnosis'] || 'LLM reflection analysis'
670
+ improvements = Array(parsed['improvements']).select { |i| i.is_a?(String) && !i.strip.empty? }
671
+ confidence = [parsed['confidence'].to_f, 1.0].min
672
+ reasoning = parsed['reasoning'] || 'LLM-based analysis of execution traces'
673
+
674
+ # Validate and sanitize mutation suggestions
675
+ raw_mutations = Array(parsed['suggested_mutations'])
676
+ valid_mutations = raw_mutations.filter_map do |mut|
677
+ mutation_symbol = mut.to_s.downcase.to_sym
678
+ if [:rewrite, :expand, :simplify, :combine, :rephrase].include?(mutation_symbol)
679
+ mutation_symbol
680
+ end
681
+ end.uniq
682
+
683
+ # Ensure we have at least one valid mutation suggestion
684
+ valid_mutations = [:rewrite] if valid_mutations.empty?
685
+
686
+ ReflectionResult.new(
687
+ trace_id: reflection_id,
688
+ diagnosis: diagnosis,
689
+ improvements: improvements,
690
+ confidence: confidence,
691
+ reasoning: reasoning,
692
+ suggested_mutations: valid_mutations,
693
+ metadata: {
694
+ reflection_model: @config.reflection_lm,
695
+ analysis_timestamp: Time.now,
696
+ trace_count: original_traces.size,
697
+ token_usage: estimate_token_usage(response_text),
698
+ llm_based: true,
699
+ insights: parsed['insights'] || {}
700
+ }
701
+ )
702
+
703
+ rescue JSON::ParserError => e
704
+ # Handle malformed JSON response
705
+ ReflectionResult.new(
706
+ trace_id: reflection_id,
707
+ diagnosis: "LLM reflection JSON parsing error: #{e.message}",
708
+ improvements: ['Review prompt structure and LLM response format'],
709
+ confidence: 0.3,
710
+ reasoning: "Failed to parse LLM reflection response as valid JSON",
711
+ suggested_mutations: [:rewrite],
712
+ metadata: {
713
+ reflection_model: @config.reflection_lm,
714
+ analysis_timestamp: Time.now,
715
+ trace_count: original_traces.size,
716
+ token_usage: 0,
717
+ parsing_error: e.message,
718
+ raw_response: response_text.length > 500 ? "#{response_text[0..500]}..." : response_text
719
+ }
720
+ )
721
+ end
722
+ end
723
+
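# Worked example of the parsing path above (`optimizer` and `traces` are assumed):
#
#   raw = '{"diagnosis":"High token usage","improvements":["Shorten the prompt"],' \
#         '"confidence":0.8,"reasoning":"...","suggested_mutations":["simplify","invalid"]}'
#   result = optimizer.parse_llm_reflection(raw, traces)
#   result.confidence           # => 0.8 (clamped to at most 1.0)
#   result.suggested_mutations  # => [:simplify]  ("invalid" is filtered out)
#   # Malformed JSON falls through to the rescue branch and yields a
#   # low-confidence (0.3) result that suggests [:rewrite].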
724
+ # Create comprehensive trace summary for reflection (public API)
725
+ sig { params(traces: T::Array[ExecutionTrace]).returns(String) }
726
+ def trace_summary_for_reflection(traces)
727
+ return "No execution traces available" if traces.empty?
728
+
729
+ llm_traces = traces.select(&:llm_trace?)
730
+ module_traces = traces.select(&:module_trace?)
731
+
732
+ total_tokens = llm_traces.sum(&:token_usage)
733
+ unique_models = llm_traces.map(&:model_name).compact.uniq
734
+ timespan = calculate_timespan(traces)
735
+
736
+ avg_response_length = if llm_traces.any?
737
+ total_length = llm_traces.sum { |t| t.response_text&.length || 0 }
738
+ total_length / llm_traces.size
739
+ else
740
+ 0
741
+ end
742
+
743
+ <<~SUMMARY
744
+ Total traces: #{traces.size}
745
+ LLM interactions: #{llm_traces.size}
746
+ Module calls: #{module_traces.size}
747
+ Total tokens: #{total_tokens}
748
+ Models used: #{unique_models.join(', ')}
749
+ Average response length: #{avg_response_length} characters
750
+ Execution timespan: #{timespan.round(2)} seconds
751
+ SUMMARY
752
+ end
753
+
754
+ # Extract optimization insights from trace analysis (public API)
755
+ sig { params(traces: T::Array[ExecutionTrace]).returns(T::Hash[Symbol, T.untyped]) }
756
+ def extract_optimization_insights(traces)
757
+ llm_traces = traces.select(&:llm_trace?)
758
+
759
+ insights = {
760
+ token_efficiency: analyze_token_efficiency(llm_traces),
761
+ response_quality: analyze_response_quality(llm_traces),
762
+ model_consistency: analyze_model_consistency(llm_traces)
763
+ }
764
+
765
+ insights
766
+ end
767
+
768
+ # Reflection with optimization context (public API)
769
+ sig { params(traces: T::Array[ExecutionTrace], context: T::Hash[Symbol, T.untyped]).returns(ReflectionResult) }
770
+ def reflection_with_context(traces, context)
771
+ base_result = reflect_with_llm(traces)
772
+
773
+ # Incorporate context into reasoning
774
+ context_reasoning = "Generation #{context[:generation] || 'unknown'} analysis. "
775
+ context_reasoning += "Population size: #{context[:population_size] || 'unknown'}. "
776
+
777
+ if context[:current_best_score]
778
+ context_reasoning += "Current best score: #{context[:current_best_score]}. "
779
+ end
780
+
781
+ # Adjust mutation suggestions based on history
782
+ adjusted_mutations = adjust_mutations_for_history(
783
+ base_result.suggested_mutations,
784
+ context[:mutation_history] || [],
785
+ context[:recent_performance_trend]
786
+ )
787
+
788
+ ReflectionResult.new(
789
+ trace_id: base_result.trace_id,
790
+ diagnosis: base_result.diagnosis,
791
+ improvements: base_result.improvements,
792
+ confidence: base_result.confidence,
793
+ reasoning: context_reasoning + base_result.reasoning,
794
+ suggested_mutations: adjusted_mutations,
795
+ metadata: base_result.metadata.merge(optimization_context: context)
796
+ )
797
+ end
798
+
799
+ # LLM-based reflection methods for Phase 2
800
+
801
+ public
802
+
803
+ # Perform LLM-based reflection on execution traces using DSPy::Predict
804
+ sig { params(traces: T::Array[ExecutionTrace]).returns(ReflectionResult) }
805
+ def reflect_with_llm(traces)
806
+ return reflect_on_traces(traces) if traces.empty?
807
+
808
+ begin
809
+ # Use DSPy::Predict for analysis instead of raw prompts
810
+ prediction = analyze_traces_with_dspy(traces)
811
+ convert_prediction_to_reflection_result(prediction, traces)
812
+ rescue => e
813
+ # Fallback to rule-based analysis on LLM failure
814
+ fallback_result = reflect_on_traces(traces)
815
+ fallback_result.class.new(
816
+ trace_id: fallback_result.trace_id,
817
+ diagnosis: "LLM reflection failed (#{e.message}), using fallback analysis: #{fallback_result.diagnosis}",
818
+ improvements: fallback_result.improvements,
819
+ confidence: [fallback_result.confidence * 0.5, 0.5].min,
820
+ reasoning: "Fallback to rule-based analysis after LLM error: #{fallback_result.reasoning}",
821
+ suggested_mutations: fallback_result.suggested_mutations,
822
+ metadata: fallback_result.metadata.merge(
823
+ llm_error: e.message,
824
+ fallback_used: true
825
+ )
826
+ )
827
+ end
828
+ end
829
+
830
+ # Generate structured reflection prompt for LLM (public API)
831
+ sig { params(traces: T::Array[ExecutionTrace]).returns(String) }
832
+ def generate_reflection_prompt(traces)
833
+ if traces.empty?
834
+ return <<~PROMPT
835
+ You are analyzing execution traces for a genetic algorithm-based prompt optimization system called GEPA.
836
+
837
+ **Task**: Analyze execution patterns and provide optimization recommendations.
838
+
839
+ **Context**: No execution traces available.
840
+
841
+ Please provide your analysis in the following JSON format:
842
+ {
843
+ "diagnosis": "Brief description of what you observed",
844
+ "improvements": ["List of actionable improvement suggestions"],
845
+ "confidence": 0.0,
846
+ "reasoning": "Your reasoning process",
847
+ "suggested_mutations": ["expand", "rewrite", "simplify", "combine", "rephrase"],
848
+ "insights": {
849
+ "pattern_detected": "no_data",
850
+ "optimization_opportunity": "data_collection"
851
+ }
852
+ }
853
+ PROMPT
854
+ end
855
+
856
+ summary = trace_summary_for_reflection(traces)
857
+ insights = extract_optimization_insights(traces)
858
+
859
+ <<~PROMPT
860
+ You are analyzing execution traces for a genetic algorithm-based prompt optimization system called GEPA.
861
+
862
+ **Task**: Analyze execution patterns and provide optimization recommendations for prompt evolution.
863
+
864
+ **Execution Summary**:
865
+ #{summary}
866
+
867
+ **Optimization Context**:
868
+ - This is part of a genetic algorithm for prompt optimization
869
+ - Available mutation types: rewrite, expand, simplify, combine, rephrase
870
+ - Goal is to improve prompt effectiveness through iterative evolution
871
+ - Focus on actionable insights that can guide mutation and crossover operations
872
+
873
+ **Key Optimization Insights**:
874
+ #{insights.map { |k, v| "- #{k}: #{v.is_a?(Hash) ? v.values.join(', ') : v}" }.join("\n")}
875
+
876
+ **Sample Traces**:
877
+ #{format_traces_for_prompt(traces.take(3))}
878
+
879
+ Please analyze these execution patterns and provide optimization recommendations in the following JSON format:
880
+ {
881
+ "diagnosis": "Brief description of execution patterns and issues identified",
882
+ "improvements": ["List of 2-4 specific, actionable improvement suggestions"],
883
+ "confidence": 0.85,
884
+ "reasoning": "Your detailed reasoning process for the analysis",
885
+ "suggested_mutations": ["List of 2-3 mutation types that would be most beneficial"],
886
+ "insights": {
887
+ "pattern_detected": "primary_pattern_identified",
888
+ "optimization_opportunity": "key_area_for_improvement"
889
+ }
890
+ }
891
+
892
+ Focus on practical recommendations that will improve prompt performance through genetic algorithm evolution.
893
+ PROMPT
894
+ end
895
+
896
+ # Parse LLM reflection response into ReflectionResult (public API)
897
+ sig { params(response_text: String, original_traces: T::Array[ExecutionTrace]).returns(ReflectionResult) }
898
+ def parse_llm_reflection(response_text, original_traces)
899
+ reflection_id = generate_reflection_id
900
+
901
+ begin
902
+ parsed = JSON.parse(response_text)
903
+
904
+ # Extract and validate components
905
+ diagnosis = parsed['diagnosis'] || 'LLM reflection analysis'
906
+ improvements = Array(parsed['improvements']).select { |i| i.is_a?(String) && !i.strip.empty? }
907
+ confidence = [parsed['confidence'].to_f, 1.0].min
908
+ reasoning = parsed['reasoning'] || 'LLM-based analysis of execution traces'
909
+
910
+ # Validate and sanitize mutation suggestions
911
+ raw_mutations = Array(parsed['suggested_mutations'])
912
+ valid_mutations = raw_mutations.filter_map do |mut|
913
+ mutation_symbol = mut.to_s.downcase.to_sym
914
+ if [:rewrite, :expand, :simplify, :combine, :rephrase].include?(mutation_symbol)
915
+ mutation_symbol
916
+ end
917
+ end.uniq
918
+
919
+ # Ensure we have at least one valid mutation suggestion
920
+ valid_mutations = [:rewrite] if valid_mutations.empty?
921
+
922
+ ReflectionResult.new(
923
+ trace_id: reflection_id,
924
+ diagnosis: diagnosis,
925
+ improvements: improvements,
926
+ confidence: confidence,
927
+ reasoning: reasoning,
928
+ suggested_mutations: valid_mutations,
929
+ metadata: {
930
+ reflection_model: @config.reflection_lm,
931
+ analysis_timestamp: Time.now,
932
+ trace_count: original_traces.size,
933
+ token_usage: estimate_token_usage(response_text),
934
+ llm_based: true,
935
+ insights: parsed['insights'] || {}
936
+ }
937
+ )
938
+
939
+ rescue JSON::ParserError => e
940
+ # Handle malformed JSON response
941
+ ReflectionResult.new(
942
+ trace_id: reflection_id,
943
+ diagnosis: "LLM reflection JSON parsing error: #{e.message}",
944
+ improvements: ['Review prompt structure and LLM response format'],
945
+ confidence: 0.3,
946
+ reasoning: "Failed to parse LLM reflection response as valid JSON",
947
+ suggested_mutations: [:rewrite],
948
+ metadata: {
949
+ reflection_model: @config.reflection_lm,
950
+ analysis_timestamp: Time.now,
951
+ trace_count: original_traces.size,
952
+ token_usage: 0,
953
+ parsing_error: e.message,
954
+ raw_response: response_text.length > 500 ? "#{response_text[0..500]}..." : response_text
955
+ }
956
+ )
957
+ end
958
+ end
959
+
960
+ # Create comprehensive trace summary for reflection (public API)
961
+ sig { params(traces: T::Array[ExecutionTrace]).returns(String) }
962
+ def trace_summary_for_reflection(traces)
963
+ return "No execution traces available" if traces.empty?
964
+
965
+ llm_traces = traces.select(&:llm_trace?)
966
+ module_traces = traces.select(&:module_trace?)
967
+
968
+ total_tokens = llm_traces.sum(&:token_usage)
969
+ unique_models = llm_traces.map(&:model_name).compact.uniq
970
+ timespan = calculate_timespan(traces)
971
+
972
+ avg_response_length = if llm_traces.any?
973
+ total_length = llm_traces.sum { |t| t.response_text&.length || 0 }
974
+ total_length / llm_traces.size
975
+ else
976
+ 0
977
+ end
978
+
979
+ <<~SUMMARY
980
+ Total traces: #{traces.size}
981
+ LLM interactions: #{llm_traces.size}
982
+ Module calls: #{module_traces.size}
983
+ Total tokens: #{total_tokens}
984
+ Models used: #{unique_models.join(', ')}
985
+ Average response length: #{avg_response_length} characters
986
+ Execution timespan: #{timespan.round(2)} seconds
987
+ SUMMARY
988
+ end
989
+
990
+ # Extract optimization insights from trace analysis (public API)
991
+ sig { params(traces: T::Array[ExecutionTrace]).returns(T::Hash[Symbol, T.untyped]) }
992
+ def extract_optimization_insights(traces)
993
+ llm_traces = traces.select(&:llm_trace?)
994
+
995
+ insights = {
996
+ token_efficiency: analyze_token_efficiency(llm_traces),
997
+ response_quality: analyze_response_quality(llm_traces),
998
+ model_consistency: analyze_model_consistency(llm_traces)
999
+ }
1000
+
1001
+ insights
1002
+ end
1003
+
1004
+ # Reflection with optimization context (public API)
1005
+ sig { params(traces: T::Array[ExecutionTrace], context: T::Hash[Symbol, T.untyped]).returns(ReflectionResult) }
1006
+ def reflection_with_context(traces, context)
1007
+ base_result = reflect_with_llm(traces)
1008
+
1009
+ # Incorporate context into reasoning
1010
+ context_reasoning = "Generation #{context[:generation] || 'unknown'} analysis. "
1011
+ context_reasoning += "Population size: #{context[:population_size] || 'unknown'}. "
1012
+
1013
+ if context[:current_best_score]
1014
+ context_reasoning += "Current best score: #{context[:current_best_score]}. "
1015
+ end
1016
+
1017
+ # Adjust mutation suggestions based on history
1018
+ adjusted_mutations = adjust_mutations_for_history(
1019
+ base_result.suggested_mutations,
1020
+ context[:mutation_history] || [],
1021
+ context[:recent_performance_trend]
1022
+ )
1023
+
1024
+ ReflectionResult.new(
1025
+ trace_id: base_result.trace_id,
1026
+ diagnosis: base_result.diagnosis,
1027
+ improvements: base_result.improvements,
1028
+ confidence: base_result.confidence,
1029
+ reasoning: context_reasoning + base_result.reasoning,
1030
+ suggested_mutations: adjusted_mutations,
1031
+ metadata: base_result.metadata.merge(optimization_context: context)
1032
+ )
1033
+ end
1034
+
1035
+ public
1036
+
1037
+ # Create signature for trace reflection analysis (public API)
1038
+ sig { returns(T.class_of(DSPy::Signature)) }
1039
+ def create_trace_reflection_signature
1040
+ @trace_reflection_signature ||= Class.new(DSPy::Signature) do
1041
+ description "Analyze execution traces from GEPA optimization system and provide actionable optimization insights"
1042
+
1043
+ input do
1044
+ const :execution_summary, String, description: "Summary of execution traces and performance patterns"
1045
+ const :optimization_context, String, description: "Context about the genetic algorithm optimization goals"
1046
+ const :key_insights, String, description: "Key insights extracted from trace analysis"
1047
+ const :sample_traces, String, description: "Representative execution trace samples"
1048
+ end
1049
+
1050
+ output do
1051
+ const :diagnosis, String, description: "Brief description of execution patterns and issues identified"
1052
+ const :improvements, T::Array[String], description: "List of 2-4 specific actionable improvement suggestions"
1053
+ const :confidence, Float, description: "Confidence level in analysis (0.0 to 1.0)"
1054
+ const :reasoning, String, description: "Detailed reasoning process for the analysis"
1055
+ const :suggested_mutations, T::Array[String], description: "List of 2-3 most beneficial mutation types from: rewrite, expand, simplify, combine, rephrase"
1056
+ const :pattern_detected, String, description: "Primary pattern identified in execution traces"
1057
+ const :optimization_opportunity, String, description: "Key area identified for performance improvement"
1058
+ end
1059
+ end
1060
+ end
541
1061
 
542
- # Configuration for GEPA optimization
543
- class GEPAConfig < Config
544
- extend T::Sig
1062
+ # Perform LLM analysis using DSPy::Predict (public API)
1063
+ sig { params(traces: T::Array[ExecutionTrace]).returns(DSPy::Prediction) }
1064
+ def analyze_traces_with_dspy(traces)
1065
+ predictor = DSPy::Predict.new(create_trace_reflection_signature)
1066
+
1067
+ # Prepare input data
1068
+ summary = trace_summary_for_reflection(traces)
1069
+ insights = extract_optimization_insights(traces)
1070
+ insights_text = insights.map { |k, v| "- #{k}: #{v.is_a?(Hash) ? v.values.join(', ') : v}" }.join("\n")
1071
+
1072
+ # Get LLM analysis
1073
+ predictor.call(
1074
+ execution_summary: summary,
1075
+ optimization_context: "GEPA genetic algorithm for prompt optimization. Available mutations: rewrite, expand, simplify, combine, rephrase. Goal: improve prompt effectiveness through iterative evolution.",
1076
+ key_insights: insights_text,
1077
+ sample_traces: format_traces_for_prompt(traces.take(3))
1078
+ )
1079
+ end
545
1080
 
1081
+ # Convert DSPy prediction to ReflectionResult (public API)
1082
+ sig { params(prediction: DSPy::Prediction, original_traces: T::Array[ExecutionTrace]).returns(ReflectionResult) }
1083
+ def convert_prediction_to_reflection_result(prediction, original_traces)
1084
+ reflection_id = generate_reflection_id
1085
+
1086
+ # Extract and validate prediction results
1087
+ diagnosis = prediction.diagnosis || 'DSPy reflection analysis'
1088
+ improvements = Array(prediction.improvements).select { |i| i.is_a?(String) && !i.strip.empty? }
1089
+ confidence = [[prediction.confidence&.to_f || 0.0, 1.0].min, 0.0].max
1090
+ reasoning = prediction.reasoning || 'DSPy-based analysis of execution traces'
1091
+
1092
+ # Validate mutation suggestions
1093
+ valid_mutations = Array(prediction.suggested_mutations).filter_map do |mut|
1094
+ mutation_symbol = mut.to_s.downcase.to_sym
1095
+ if [:rewrite, :expand, :simplify, :combine, :rephrase].include?(mutation_symbol)
1096
+ mutation_symbol
1097
+ end
1098
+ end.uniq
1099
+
1100
+ # Ensure we have at least one valid mutation suggestion
1101
+ valid_mutations = [:rewrite] if valid_mutations.empty?
1102
+
1103
+ ReflectionResult.new(
1104
+ trace_id: reflection_id,
1105
+ diagnosis: diagnosis,
1106
+ improvements: improvements,
1107
+ confidence: confidence,
1108
+ reasoning: reasoning,
1109
+ suggested_mutations: valid_mutations,
1110
+ metadata: {
1111
+ reflection_model: @config.reflection_lm,
1112
+ analysis_timestamp: Time.now,
1113
+ trace_count: original_traces.size,
1114
+ token_usage: estimate_token_usage(prediction.to_s),
1115
+ llm_based: true,
1116
+ dspy_prediction: true,
1117
+ insights: {
1118
+ pattern_detected: prediction.pattern_detected || "unknown_pattern",
1119
+ optimization_opportunity: prediction.optimization_opportunity || "general_optimization"
1120
+ }
1121
+ }
1122
+ )
1123
+ end
1124
+
1125
+ private
1126
+
1127
+ # Generate unique reflection ID
546
1128
  sig { returns(String) }
547
- attr_accessor :reflection_lm
548
-
549
- sig { returns(Integer) }
550
- attr_accessor :num_generations
551
-
552
- sig { returns(Integer) }
553
- attr_accessor :population_size
1129
+ def generate_reflection_id
1130
+ "reflection-#{SecureRandom.hex(4)}"
1131
+ end
554
1132
 
555
- sig { returns(Float) }
556
- attr_accessor :mutation_rate
1133
+ # Generate diagnosis text
1134
+ sig { params(patterns: T::Hash[Symbol, T.untyped]).returns(String) }
1135
+ def generate_diagnosis(patterns)
1136
+ if patterns[:total_tokens] > 400
1137
+ 'High token usage indicates potential inefficiency in prompt design'
1138
+ elsif patterns[:llm_traces_count] == 0
1139
+ 'No LLM interactions found - execution may not be working as expected'
1140
+ elsif patterns[:avg_response_length] < 10
1141
+ 'Responses are unusually brief which may indicate prompt clarity issues'
1142
+ else
1143
+ 'Execution patterns appear normal with room for optimization'
1144
+ end
1145
+ end
557
1146
 
558
- sig { returns(T::Boolean) }
559
- attr_accessor :use_pareto_selection
1147
+ # Generate reasoning text
1148
+ sig { params(patterns: T::Hash[Symbol, T.untyped], traces: T::Array[ExecutionTrace]).returns(String) }
1149
+ def generate_reasoning(patterns, traces)
1150
+ reasoning_parts = []
1151
+
1152
+ reasoning_parts << "Analyzed #{traces.size} execution traces"
1153
+ reasoning_parts << "#{patterns[:llm_traces_count]} LLM interactions"
1154
+ reasoning_parts << "#{patterns[:module_traces_count]} module operations"
1155
+ reasoning_parts << "Total token usage: #{patterns[:total_tokens]}"
1156
+
1157
+ reasoning_parts.join('. ') + '.'
1158
+ end
560
1159
 
561
- sig { void }
562
- def initialize
563
- super
564
- @reflection_lm = 'gpt-4o'
565
- @num_generations = 10
566
- @population_size = 8
567
- @mutation_rate = 0.7
568
- @use_pareto_selection = true
1160
+ # Calculate confidence based on patterns
1161
+ sig { params(patterns: T::Hash[Symbol, T.untyped]).returns(Float) }
1162
+ def calculate_confidence(patterns)
1163
+ base_confidence = 0.7
1164
+
1165
+ # More traces = higher confidence
1166
+ trace_bonus = [patterns[:llm_traces_count] + patterns[:module_traces_count], 10].min * 0.02
1167
+
1168
+ # Reasonable token usage = higher confidence
1169
+ token_penalty = patterns[:total_tokens] > 1000 ? -0.1 : 0.0
1170
+
1171
+ [(base_confidence + trace_bonus + token_penalty), 1.0].min
569
1172
  end
570
1173
 
571
- sig { returns(T::Hash[Symbol, T.untyped]) }
572
- def to_h
573
- super.merge({
574
- reflection_lm: @reflection_lm,
575
- num_generations: @num_generations,
576
- population_size: @population_size,
577
- mutation_rate: @mutation_rate,
578
- use_pareto_selection: @use_pareto_selection
579
- })
1174
+ # Calculate average response length from LLM traces
1175
+ sig { params(llm_traces: T::Array[ExecutionTrace]).returns(Integer) }
1176
+ def calculate_avg_response_length(llm_traces)
1177
+ return 0 if llm_traces.empty?
1178
+
1179
+ total_length = llm_traces.sum do |trace|
1180
+ response = trace.response_text
1181
+ response ? response.length : 0
1182
+ end
1183
+
1184
+ total_length / llm_traces.size
580
1185
  end
581
- end
582
1186
 
583
- sig { returns(GEPAConfig) }
584
- attr_reader :config
1187
+ # Calculate timespan of traces
1188
+ sig { params(traces: T::Array[ExecutionTrace]).returns(Float) }
1189
+ def calculate_timespan(traces)
1190
+ return 0.0 if traces.size < 2
1191
+
1192
+ timestamps = traces.map(&:timestamp).sort
1193
+ (timestamps.last - timestamps.first).to_f
1194
+ end
1195
+
1196
+
1197
+ # Format traces for inclusion in prompt
1198
+ sig { params(traces: T::Array[ExecutionTrace]).returns(String) }
1199
+ def format_traces_for_prompt(traces)
1200
+ traces.map.with_index do |trace, idx|
1201
+ prompt_preview = truncate_text(trace.prompt_text || 'N/A', 100)
1202
+ response_preview = truncate_text(trace.response_text || 'N/A', 100)
1203
+ "#{idx + 1}. [#{trace.event_name}] #{prompt_preview} → #{response_preview}"
1204
+ end.join("\n")
1205
+ end
1206
+
1207
+ # Estimate token usage from response
1208
+ sig { params(text: String).returns(Integer) }
1209
+ def estimate_token_usage(text)
1210
+ # Rough estimation: ~4 characters per token
1211
+ (text.length / 4.0).ceil
1212
+ end
1213
+
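# Quick sanity check of the heuristic above (roughly 4 characters per token):
#
#   estimate_token_usage("x" * 100)  # => 25   ((100 / 4.0).ceil)
#   estimate_token_usage("hi")       # => 1    ((2 / 4.0).ceil)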
1214
+ # Analyze token efficiency patterns
1215
+ sig { params(llm_traces: T::Array[ExecutionTrace]).returns(T::Hash[Symbol, T.untyped]) }
1216
+ def analyze_token_efficiency(llm_traces)
1217
+ return { status: 'no_data', suggestions: [] } if llm_traces.empty?
1218
+
1219
+ total_tokens = llm_traces.sum(&:token_usage)
1220
+ avg_tokens = total_tokens.to_f / llm_traces.size
1221
+
1222
+ if avg_tokens > 400
1223
+ {
1224
+ status: 'poor',
1225
+ average_tokens: avg_tokens,
1226
+ suggestions: ['Consider reducing prompt length', 'Optimize instruction clarity']
1227
+ }
1228
+ elsif avg_tokens > 200
1229
+ {
1230
+ status: 'moderate',
1231
+ average_tokens: avg_tokens,
1232
+ suggestions: ['Monitor token usage trends', 'Consider prompt optimization']
1233
+ }
1234
+ else
1235
+ {
1236
+ status: 'good',
1237
+ average_tokens: avg_tokens,
1238
+ suggestions: ['Token usage appears efficient']
1239
+ }
1240
+ end
1241
+ end
1242
+
1243
+ # Analyze response quality patterns
1244
+ sig { params(llm_traces: T::Array[ExecutionTrace]).returns(T::Hash[Symbol, T.untyped]) }
1245
+ def analyze_response_quality(llm_traces)
1246
+ return { consistency: 'no_data', recommendations: [] } if llm_traces.empty?
1247
+
1248
+ response_lengths = llm_traces.map { |t| t.response_text&.length || 0 }
1249
+ length_variance = calculate_variance(response_lengths)
1250
+
1251
+ if length_variance > 1000
1252
+ {
1253
+ consistency: 'inconsistent',
1254
+ variance: length_variance,
1255
+ recommendations: [
1256
+ 'Add response format guidelines',
1257
+ 'Consider structured output templates'
1258
+ ]
1259
+ }
1260
+ else
1261
+ {
1262
+ consistency: 'consistent',
1263
+ variance: length_variance,
1264
+ recommendations: ['Response quality appears consistent']
1265
+ }
1266
+ end
1267
+ end
1268
+
1269
+ # Analyze model consistency
1270
+ sig { params(llm_traces: T::Array[ExecutionTrace]).returns(T::Hash[Symbol, T.untyped]) }
1271
+ def analyze_model_consistency(llm_traces)
1272
+ models = llm_traces.map(&:model_name).compact.uniq
1273
+
1274
+ {
1275
+ unique_models: models.size,
1276
+ models_used: models,
1277
+ recommendation: models.size > 1 ? 'Consider using single model for consistency' : 'Model usage is consistent'
1278
+ }
1279
+ end
1280
+
1281
+ # Adjust mutations based on history to avoid repetition
1282
+ sig { params(suggested: T::Array[Symbol], history: T::Array[Symbol], trend: T.nilable(String)).returns(T::Array[Symbol]) }
1283
+ def adjust_mutations_for_history(suggested, history, trend)
1284
+ # Count recent usage of each mutation type
1285
+ recent_usage = history.last(5).tally
1286
+
1287
+ # Filter out overused mutations
1288
+ adjusted = suggested.reject do |mutation|
1289
+ recent_usage[mutation] && recent_usage[mutation] >= 2
1290
+ end
1291
+
1292
+ # If trend is declining, prefer different strategies
1293
+ if trend == 'declining'
1294
+ adjusted = adjusted.reject { |m| m == :expand } # Avoid expansion if performance declining
1295
+ adjusted += [:simplify, :rephrase] unless adjusted.include?(:simplify) || adjusted.include?(:rephrase)
1296
+ end
1297
+
1298
+ # Ensure we always have at least one suggestion
1299
+ adjusted.empty? ? [:rewrite] : adjusted.uniq
1300
+ end
1301
+
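# Worked example of the history adjustment above:
#
#   adjust_mutations_for_history(
#     [:expand, :rewrite],
#     [:expand, :expand, :simplify, :rewrite, :combine],  # last five mutations
#     nil
#   )
#   # => [:rewrite]   (:expand was used twice recently, so it is dropped)
#
#   adjust_mutations_for_history([:expand], [], 'declining')
#   # => [:simplify, :rephrase]   (expansion is avoided while performance declines)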
1302
+ # Calculate variance for array of numbers
1303
+ sig { params(values: T::Array[Integer]).returns(Float) }
1304
+ def calculate_variance(values)
1305
+ return 0.0 if values.size < 2
1306
+
1307
+ mean = values.sum.to_f / values.size
1308
+ sum_squared_diff = values.sum { |v| (v - mean) ** 2 }
1309
+ sum_squared_diff / values.size
1310
+ end
1311
+
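# Worked example (population variance, not sample variance):
#
#   calculate_variance([10, 20, 30])
#   # mean = 20.0; ((10 - 20)**2 + 0 + (30 - 20)**2) / 3 => ~66.67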
1312
+ # Truncate text to specified length with ellipsis
1313
+ sig { params(text: String, length: Integer).returns(String) }
1314
+ def truncate_text(text, length)
1315
+ return text if text.length <= length
1316
+ "#{text[0...length]}..."
1317
+ end
1318
+ end
1319
+
1320
+ # GeneticEngine orchestrates the genetic algorithm for prompt evolution
1321
+ # Manages population, selection, and evolution across generations
1322
+ class GeneticEngine
1323
+ extend T::Sig
1324
+
1325
+ sig { returns(GEPAConfig) }
1326
+ attr_reader :config
1327
+
1328
+ sig { returns(T.proc.params(arg0: T.untyped, arg1: T.untyped).returns(T.untyped)) }
1329
+ attr_reader :metric
1330
+
1331
+ sig { returns(T::Array[T.untyped]) }
1332
+ attr_reader :population
1333
+
1334
+ sig { returns(Integer) }
1335
+ attr_reader :generation
1336
+
1337
+ sig { params(config: GEPAConfig, metric: T.proc.params(arg0: T.untyped, arg1: T.untyped).returns(T.untyped)).void }
1338
+ def initialize(config:, metric:)
1339
+ @config = config
1340
+ @metric = metric
1341
+ @population = T.let([], T::Array[T.untyped])
1342
+ @generation = 0
1343
+ @fitness_scores = T.let([], T::Array[Float])
1344
+ end
1345
+
1346
+ # Initialize population with diverse instruction variants
1347
+ sig { params(program: T.untyped).void }
1348
+ def initialize_population(program)
1349
+ @population = []
1350
+
1351
+ # Start with original program
1352
+ @population << program
1353
+
1354
+ # Generate instruction variants to fill population
1355
+ original_instruction = program.signature_class.description
1356
+ variants = generate_instruction_variants(original_instruction)
1357
+
1358
+ # Create program copies with different instructions
1359
+ variants.take(@config.population_size - 1).each do |variant|
1360
+ variant_program = create_program_with_instruction(program, variant)
1361
+ @population << variant_program
1362
+ end
1363
+
1364
+ # If we need more candidates, duplicate and mutate
1365
+ while @population.size < @config.population_size
1366
+ base_program = @population.sample
1367
+ mutated = create_program_with_instruction(base_program,
1368
+ generate_instruction_variants(base_program.signature_class.description).first)
1369
+ @population << mutated
1370
+ end
1371
+
1372
+ @generation = 0
1373
+ end
1374
+
1375
+ # Evaluate all population members on the training set
1376
+ sig { params(trainset: T::Array[T.untyped]).returns(T::Array[Float]) }
1377
+ def evaluate_population(trainset)
1378
+ @fitness_scores = @population.map do |candidate|
1379
+ scores = trainset.map do |example|
1380
+ prediction = candidate.call(**example.input_values)
1381
+ @metric.call(example, prediction).to_f
1382
+ rescue => e
1383
+ # Handle evaluation errors gracefully
1384
+ 0.0
1385
+ end
1386
+
1387
+ scores.sum / scores.size
1388
+ end
1389
+
1390
+ @fitness_scores
1391
+ end
1392
+
1393
+ # Evolve to next generation using selection and mutation
1394
+ sig { params(trainset: T::Array[T.untyped]).void }
1395
+ def evolve_generation(trainset)
1396
+ current_scores = evaluate_population(trainset)
1397
+
1398
+ # Simple selection: keep top 50% and mutate them
1399
+ sorted_indices = (0...@population.size).sort_by { |i| -current_scores[i] }
1400
+ survivors = sorted_indices.take(@config.population_size / 2)
1401
+
1402
+ new_population = []
1403
+
1404
+ # Keep best performers
1405
+ survivors.each { |i| new_population << @population[i] }
1406
+
1407
+ # Fill rest with mutations of survivors
1408
+ while new_population.size < @config.population_size
1409
+ parent_index = survivors.sample
1410
+ parent = @population[parent_index]
1411
+
1412
+ # Generate mutation
1413
+ variants = generate_instruction_variants(parent.signature_class.description)
1414
+ mutated = create_program_with_instruction(parent, variants.first || parent.signature_class.description)
1415
+ new_population << mutated
1416
+ end
1417
+
1418
+ @population = new_population
1419
+ @generation += 1
1420
+ end
1421
+
1422
+ # Run complete evolution process
1423
+ sig { params(program: T.untyped, trainset: T::Array[T.untyped]).returns(T::Hash[Symbol, T.untyped]) }
1424
+ def run_evolution(program, trainset)
1425
+ initialize_population(program)
1426
+
1427
+ history = []
1428
+
1429
+ # Initial evaluation
1430
+ initial_scores = evaluate_population(trainset)
1431
+ history << {
1432
+ generation: 0,
1433
+ best_fitness: initial_scores.max,
1434
+ avg_fitness: initial_scores.sum / initial_scores.size,
1435
+ diversity: population_diversity
1436
+ }
1437
+
1438
+ # Evolution loop
1439
+ @config.num_generations.times do
1440
+ evolve_generation(trainset)
1441
+ scores = evaluate_population(trainset)
1442
+
1443
+ history << {
1444
+ generation: @generation,
1445
+ best_fitness: scores.max,
1446
+ avg_fitness: scores.sum / scores.size,
1447
+ diversity: population_diversity
1448
+ }
1449
+ end
1450
+
1451
+ {
1452
+ best_candidate: get_best_candidate,
1453
+ best_fitness: @fitness_scores.max,
1454
+ generation_history: history,
1455
+ final_population: @population.dup
1456
+ }
1457
+ end
1458
+
1459
+ # Get the best performing candidate from current population
1460
+ sig { returns(T.untyped) }
1461
+ def get_best_candidate
1462
+ return @population.first if @fitness_scores.empty?
1463
+
1464
+ best_index = @fitness_scores.each_with_index.max_by { |score, _| score }[1]
1465
+ @population[best_index]
1466
+ end
1467
+
1468
+ # Measure diversity of instructions in current population
1469
+ sig { returns(Float) }
1470
+ def population_diversity
1471
+ return 0.0 if @population.empty?
1472
+
1473
+ instructions = @population.map(&:signature_class).map(&:description)
1474
+ unique_instructions = instructions.uniq.size
1475
+
1476
+ unique_instructions.to_f / @population.size.to_f
1477
+ end
1478
+
1479
+ private
1480
+
1481
+ # Generate instruction variants (similar to simple optimization)
1482
+ sig { params(original_instruction: String).returns(T::Array[String]) }
1483
+ def generate_instruction_variants(original_instruction)
1484
+ variants = []
1485
+
1486
+ # Add "step by step" variant
1487
+ unless original_instruction.include?("step")
1488
+ variants << "#{original_instruction} Think step by step."
1489
+ end
1490
+
1491
+ # Add "detailed" variant
1492
+ unless original_instruction.include?("detail")
1493
+ variants << "#{original_instruction} Provide detailed reasoning."
1494
+ end
1495
+
1496
+ # Add "careful" variant
1497
+ unless original_instruction.include?("careful")
1498
+ variants << "Be careful and accurate. #{original_instruction}"
1499
+ end
1500
+
1501
+ # Add "examples" variant
1502
+ unless original_instruction.include?("example")
1503
+ variants << "#{original_instruction} Use examples in your response."
1504
+ end
1505
+
1506
+ # Add "precise" variant
1507
+ unless original_instruction.include?("precise")
1508
+ variants << "Be precise and specific. #{original_instruction}"
1509
+ end
1510
+
1511
+ variants.shuffle.take(5) # Return up to 5 variants, shuffled
1512
+ end
1513
+
1514
+ # Create program copy with modified instruction using DSPy.rb dynamic capabilities
1515
+ sig { params(original_program: T.untyped, new_instruction: String).returns(T.untyped) }
1516
+ def create_program_with_instruction(original_program, new_instruction)
1517
+ case original_program
1518
+ when DSPy::Predict
1519
+ # DSPy::Predict has built-in support for instruction modification
1520
+ original_program.with_instruction(new_instruction)
1521
+ when DSPy::Module
1522
+ # For custom DSPy::Module classes, create new instance with updated predictors
1523
+ create_modified_module(original_program, new_instruction)
1524
+ else
1525
+ # For other types (like test doubles), check available methods
1526
+ if original_program.respond_to?(:with_instruction)
1527
+ original_program.with_instruction(new_instruction)
1528
+ elsif original_program.respond_to?(:signature_class)
1529
+ # Create new DSPy::Predict with the same signature but new instruction
1530
+ signature_class = original_program.signature_class
1531
+ DSPy::Predict.new(signature_class).with_instruction(new_instruction)
1532
+ else
1533
+ # Fallback: return original if we can't modify
1534
+ original_program
1535
+ end
1536
+ end
1537
+ rescue => e
1538
+ # Return original program on error
1539
+ original_program
1540
+ end
1541
+
1542
+ # Create modified version of custom DSPy::Module (for GeneticEngine)
1543
+ sig { params(original_module: DSPy::Module, new_instruction: String).returns(DSPy::Module) }
1544
+ def create_modified_module(original_module, new_instruction)
1545
+ begin
1546
+ # Create a new instance of the same class
1547
+ new_module = original_module.class.new
1548
+
1549
+ # Try to find and update any internal predictors
1550
+ original_module.instance_variables.each do |var_name|
1551
+ var_value = original_module.instance_variable_get(var_name)
1552
+
1553
+ if var_value.is_a?(DSPy::Predict)
1554
+ # Update the instruction for internal predictors
1555
+ modified_predictor = var_value.with_instruction(new_instruction)
1556
+ new_module.instance_variable_set(var_name, modified_predictor)
1557
+ else
1558
+ # Copy other instance variables as-is
1559
+ new_module.instance_variable_set(var_name, var_value)
1560
+ end
1561
+ end
1562
+
1563
+ new_module
1564
+ rescue => e
1565
+ # Fallback to original module
1566
+ original_module
1567
+ end
1568
+ end
1569
+ end
1570
+
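# Illustrative end-to-end sketch (not part of the release). `config` is assumed to be
# a GEPAConfig with num_generations/population_size/mutation_rate set (its definition
# is not shown in this hunk), `program` a DSPy::Predict, and `trainset` DSPy examples.
#
#   metric = ->(example, prediction) do
#     example.expected_values[:answer] == prediction.answer ? 1.0 : 0.0
#   end
#   engine = DSPy::Teleprompt::GEPA::GeneticEngine.new(config: config, metric: metric)
#   result = engine.run_evolution(program, trainset)
#   result[:best_candidate]      # program variant with the highest average metric score
#   result[:generation_history]  # per-generation best/avg fitness and population diversity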
1571
+ # FitnessScore represents multi-dimensional evaluation results
1572
+ class FitnessScore < T::Struct
1573
+ extend T::Sig
1574
+
1575
+ const :primary_score, Float
1576
+ const :secondary_scores, T::Hash[Symbol, Float]
1577
+ const :overall_score, Float
1578
+ const :metadata, T::Hash[Symbol, T.untyped]
1579
+
1580
+ sig do
1581
+ params(
1582
+ primary_score: Float,
1583
+ secondary_scores: T::Hash[Symbol, Float],
1584
+ overall_score: Float,
1585
+ metadata: T.nilable(T::Hash[Symbol, T.untyped])
1586
+ ).void
1587
+ end
1588
+ def initialize(primary_score:, secondary_scores:, overall_score:, metadata: nil)
1589
+ # Validate score ranges
1590
+ [primary_score, overall_score].each do |score|
1591
+ if score < 0.0 || score > 1.0
1592
+ raise ArgumentError, "Score must be between 0.0 and 1.0, got #{score}"
1593
+ end
1594
+ end
1595
+
1596
+ secondary_scores.each do |name, score|
1597
+ if score < 0.0 || score > 1.0
1598
+ raise ArgumentError, "Secondary score #{name} must be between 0.0 and 1.0, got #{score}"
1599
+ end
1600
+ end
1601
+
1602
+ super(
1603
+ primary_score: primary_score,
1604
+ secondary_scores: secondary_scores.freeze,
1605
+ overall_score: overall_score,
1606
+ metadata: (metadata || {}).freeze
1607
+ )
1608
+ end
1609
+
1610
+ # Check if this score is dominated by another (for Pareto analysis)
1611
+ sig { params(other: FitnessScore).returns(T::Boolean) }
1612
+ def dominated_by?(other)
1613
+ return false if overall_score > other.overall_score
1614
+ return true if overall_score < other.overall_score
1615
+
1616
+ # If overall scores are equal, check secondary metrics
1617
+ secondary_scores.all? do |metric, score|
1618
+ other_score = other.secondary_scores[metric] || 0.0
1619
+ score <= other_score
1620
+ end
1621
+ end
1622
+
1623
+ # Get combined score for specific objectives
1624
+ sig { params(objectives: T::Array[Symbol]).returns(Float) }
1625
+ def score_for_objectives(objectives)
1626
+ relevant_scores = objectives.map { |obj| secondary_scores[obj] || 0.0 }
1627
+ return primary_score if relevant_scores.empty?
1628
+
1629
+ (primary_score + relevant_scores.sum) / (objectives.size + 1)
1630
+ end
1631
+ end
1632
+
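# Worked example of the struct above:
#
#   a = FitnessScore.new(primary_score: 0.8, secondary_scores: { token_efficiency: 0.6 }, overall_score: 0.74)
#   b = FitnessScore.new(primary_score: 0.7, secondary_scores: { token_efficiency: 0.5 }, overall_score: 0.64)
#   b.dominated_by?(a)                           # => true  (strictly lower overall score)
#   a.score_for_objectives([:token_efficiency])  # => (0.8 + 0.6) / 2 => 0.7
#   # Passing a score outside 0.0..1.0 raises ArgumentError.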
1633
+ # FitnessEvaluator provides multi-dimensional evaluation of prompt candidates
1634
+ class FitnessEvaluator
1635
+ extend T::Sig
1636
+
1637
+ sig { returns(T.proc.params(arg0: T.untyped, arg1: T.untyped).returns(T.untyped)) }
1638
+ attr_reader :primary_metric
1639
+
1640
+ sig { returns(GEPAConfig) }
1641
+ attr_reader :config
1642
+
1643
+ sig { returns(T::Hash[Symbol, T.proc.params(arg0: T.untyped, arg1: T.untyped).returns(T.untyped)]) }
1644
+ attr_reader :secondary_metrics
1645
+
1646
+ sig do
1647
+ params(
1648
+ primary_metric: T.proc.params(arg0: T.untyped, arg1: T.untyped).returns(T.untyped),
1649
+ config: GEPAConfig,
1650
+ secondary_metrics: T.nilable(T::Hash[Symbol, T.proc.params(arg0: T.untyped, arg1: T.untyped).returns(T.untyped)])
1651
+ ).void
1652
+ end
1653
+ def initialize(primary_metric:, config:, secondary_metrics: nil)
1654
+ @primary_metric = primary_metric
1655
+ @config = config
1656
+ @secondary_metrics = secondary_metrics || default_secondary_metrics
1657
+ @trace_collector = TraceCollector.new
1658
+ end
1659
+
1660
+ # Evaluate a single candidate program
1661
+ sig { params(program: T.untyped, trainset: T::Array[T.untyped]).returns(FitnessScore) }
1662
+ def evaluate_candidate(program, trainset)
1663
+ start_time = Time.now
1664
+ predictions = []
1665
+ traces = []
1666
+
1667
+ # Collect primary metric scores and execution data
1668
+ primary_scores = trainset.map do |example|
1669
+ prediction_start = Time.now
1670
+ prediction = program.call(**example.input_values)
1671
+ prediction_time = Time.now - prediction_start
1672
+
1673
+ predictions << {
1674
+ prediction: prediction,
1675
+ latency: prediction_time,
1676
+ example: example
1677
+ }
1678
+
1679
+ @primary_metric.call(example, prediction).to_f
1680
+ rescue => e
1681
+ # Handle prediction errors
1682
+ predictions << {
1683
+ prediction: nil,
1684
+ latency: 0.0,
1685
+ example: example,
1686
+ error: e.message
1687
+ }
1688
+ 0.0
1689
+ end
1690
+
1691
+ primary_score = primary_scores.sum / primary_scores.size
1692
+
1693
+ # Calculate secondary metrics
1694
+ secondary_scores = {}
1695
+
1696
+ # Token efficiency (mock data for now - will be replaced with real trace collection)
1697
+ mock_traces = predictions.map.with_index do |pred, i|
1698
+ OpenStruct.new(token_usage: 50 + rand(100))
1699
+ end
1700
+ secondary_scores[:token_efficiency] = calculate_token_efficiency(mock_traces, predictions.size)
1701
+
1702
+ # Response consistency
1703
+ response_texts = predictions.map { |p| p[:prediction]&.answer&.to_s || '' }
1704
+ secondary_scores[:consistency] = calculate_consistency(response_texts)
1705
+
1706
+ # Latency performance
1707
+ latencies = predictions.map { |p| p[:latency] }
1708
+ secondary_scores[:latency] = calculate_latency_score(latencies)
1709
+
1710
+ # Calculate weighted overall score
1711
+ overall_score = calculate_overall_score(primary_score, secondary_scores)
1712
+
1713
+ FitnessScore.new(
1714
+ primary_score: primary_score,
1715
+ secondary_scores: secondary_scores,
1716
+ overall_score: overall_score,
1717
+ metadata: {
1718
+ evaluation_time: Time.now - start_time,
1719
+ examples_count: trainset.size,
1720
+ errors_count: predictions.count { |p| p[:error] }
1721
+ }
1722
+ )
1723
+ end
1724
+
1725
+ # Evaluate multiple candidates in batch
1726
+ sig { params(programs: T::Array[T.untyped], trainset: T::Array[T.untyped]).returns(T::Array[FitnessScore]) }
1727
+ def batch_evaluate(programs, trainset)
1728
+ programs.map { |program| evaluate_candidate(program, trainset) }
1729
+ end
1730
+
1731
+ # Compare two fitness scores (positive if first is better)
1732
+ sig { params(score1: FitnessScore, score2: FitnessScore).returns(Float) }
1733
+ def compare_candidates(score1, score2)
1734
+ score1.overall_score - score2.overall_score
1735
+ end
1736
+
1737
+ # Rank candidates by fitness (returns indices sorted by fitness, best first)
1738
+ sig { params(scores: T::Array[FitnessScore]).returns(T::Array[Integer]) }
1739
+ def rank_candidates(scores)
1740
+ scores.each_with_index.sort_by { |score, _| -score.overall_score }.map(&:last)
1741
+ end
1742
+
1743
+ private
1744
+
1745
+ # Default secondary metrics for fitness evaluation
1746
+ sig { returns(T::Hash[Symbol, T.proc.params(arg0: T.untyped, arg1: T.untyped).returns(T.untyped)]) }
1747
+ def default_secondary_metrics
1748
+ {
1749
+ token_efficiency: proc { |traces, count| calculate_token_efficiency(traces, count) },
1750
+ consistency: proc { |responses| calculate_consistency(responses) },
1751
+ latency: proc { |latencies| calculate_latency_score(latencies) }
1752
+ }
1753
+ end
1754
+
1755
+ # Calculate token usage efficiency (lower usage = higher score)
1756
+ sig { params(traces: T::Array[T.untyped], example_count: Integer).returns(Float) }
1757
+ def calculate_token_efficiency(traces, example_count)
1758
+ return 1.0 if traces.empty? || example_count == 0
1759
+
1760
+ total_tokens = traces.sum(&:token_usage)
1761
+ avg_tokens_per_example = total_tokens.to_f / example_count
1762
+
1763
+ # Efficiency decreases as token usage increases
1764
+ # Assume 100 tokens per example is baseline (score 0.5)
1765
+ baseline_tokens = 100.0
1766
+ efficiency = baseline_tokens / (baseline_tokens + avg_tokens_per_example)
1767
+
1768
+ [efficiency, 1.0].min
1769
+ end
1770
+
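# Worked numbers for the efficiency curve above (baseline 100 tokens/example => 0.5):
#
#   avg of  50 tokens/example => 100 / 150 => ~0.67
#   avg of 100 tokens/example => 100 / 200 =>  0.5
#   avg of 300 tokens/example => 100 / 400 =>  0.25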
1771
+ # Calculate consistency of responses (similar structure = higher score)
1772
+ sig { params(responses: T::Array[String]).returns(Float) }
1773
+ def calculate_consistency(responses)
1774
+ return 1.0 if responses.empty? || responses.size == 1
1775
+
1776
+ # Simple consistency measure: average word overlap between responses
1777
+ word_sets = responses.map { |response| response.downcase.split.to_set }
1778
+
1779
+ total_similarity = 0.0
1780
+ comparisons = 0
1781
+
1782
+ word_sets.each_with_index do |set1, i|
1783
+ word_sets[(i+1)..-1].each do |set2|
1784
+ intersection = set1 & set2
1785
+ union = set1 | set2
1786
+
1787
+ similarity = union.empty? ? 0.0 : intersection.size.to_f / union.size
1788
+ total_similarity += similarity
1789
+ comparisons += 1
1790
+ end
1791
+ end
1792
+
1793
+ comparisons == 0 ? 1.0 : total_similarity / comparisons
1794
+ end
1795
+
1796
+ # Calculate latency performance score (faster = higher score)
1797
+ sig { params(latencies: T::Array[Float]).returns(Float) }
1798
+ def calculate_latency_score(latencies)
1799
+ return 1.0 if latencies.empty?
1800
+
1801
+ avg_latency = latencies.sum / latencies.size
1802
+
1803
+ # Penalize high latencies (assume 2 seconds is baseline for 0.5 score)
1804
+ baseline_latency = 2.0
1805
+ latency_score = baseline_latency / (baseline_latency + avg_latency)
1806
+
1807
+ [latency_score, 1.0].min
1808
+ end
1809
+
1810
+ # Calculate weighted overall score combining primary and secondary metrics
1811
+ sig { params(primary_score: Float, secondary_scores: T::Hash[Symbol, Float]).returns(Float) }
1812
+ def calculate_overall_score(primary_score, secondary_scores)
1813
+ # Weight primary metric at 70%, secondary metrics at 30%
1814
+ primary_weight = 0.7
1815
+ secondary_weight = 0.3
1816
+
1817
+ return primary_score if secondary_scores.empty?
1818
+
1819
+ avg_secondary = secondary_scores.values.sum / secondary_scores.size
1820
+ overall = (primary_score * primary_weight) + (avg_secondary * secondary_weight)
1821
+
1822
+ [overall, 1.0].min
1823
+ end
1824
+ end
1825
+
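# Worked example of the 70/30 weighting above:
#
#   primary_score    = 0.8
#   secondary_scores = { token_efficiency: 0.5, consistency: 0.7 }  # average 0.6
#   overall          = 0.7 * 0.8 + 0.3 * 0.6  # => 0.74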
1826
+ # InstructionProposer: Analyzes execution traces and generates improved instructions using LLM reflection
1827
+ class InstructionProposer
1828
+ extend T::Sig
1829
+
1830
+ sig { params(config: GEPAConfig).void }
1831
+ def initialize(config:)
1832
+ @config = config
1833
+ end
1834
+
1835
+ # Generate improved instruction based on execution traces and failures
1836
+ sig { params(original_instruction: String, execution_traces: T::Array[ExecutionTrace], failed_examples: T::Array[T.untyped]).returns(String) }
1837
+ def propose_instruction(original_instruction:, execution_traces:, failed_examples:)
1838
+ if execution_traces.empty? && failed_examples.empty?
1839
+ # No traces or failures to analyze, return original
1840
+ return original_instruction
1841
+ end
1842
+
1843
+ # Use LLM-based reflection to generate improved instruction
1844
+ reflect_and_propose(
1845
+ original_instruction: original_instruction,
1846
+ execution_traces: execution_traces,
1847
+ failed_examples: failed_examples
1848
+ )
1849
+ rescue => e
1850
+ # Fallback to original instruction on error
1851
+ original_instruction
1852
+ end
1853
+
1854
+ private
1855
+
1856
+ sig { returns(GEPAConfig) }
1857
+ attr_reader :config
1858
+
1859
+ # Use LLM reflection to propose improved instruction
1860
+ sig { params(original_instruction: String, execution_traces: T::Array[ExecutionTrace], failed_examples: T::Array[T.untyped]).returns(String) }
1861
+ def reflect_and_propose(original_instruction:, execution_traces:, failed_examples:)
1862
+ # Create signature for instruction improvement
1863
+ improvement_signature = create_instruction_improvement_signature
1864
+
1865
+ # Create predictor for instruction proposal
1866
+ proposer = DSPy::Predict.new(improvement_signature)
1867
+
1868
+ # Analyze traces and failures
1869
+ trace_analysis = analyze_execution_traces(execution_traces)
1870
+ failure_analysis = analyze_failed_examples(failed_examples)
1871
+
1872
+ # Generate improved instruction
1873
+ result = proposer.call(
1874
+ original_instruction: original_instruction,
1875
+ trace_analysis: trace_analysis,
1876
+ failure_analysis: failure_analysis,
1877
+ improvement_context: "GEPA prompt optimization for better performance"
1878
+ )
1879
+
1880
+ result.improved_instruction || original_instruction
1881
+ rescue => e
1882
+ # Return original instruction if LLM call fails
1883
+ original_instruction
1884
+ end
1885
+
1886
+ # Create signature for instruction improvement
1887
+ sig { returns(T.class_of(DSPy::Signature)) }
1888
+ def create_instruction_improvement_signature
1889
+ Class.new(DSPy::Signature) do
1890
+ description "Analyze execution traces and propose improved instructions for better AI system performance"
1891
+
1892
+ input do
1893
+ const :original_instruction, String, description: "The current instruction/prompt being used"
1894
+ const :trace_analysis, String, description: "Analysis of execution traces showing patterns and issues"
1895
+ const :failure_analysis, String, description: "Analysis of failed examples and their patterns"
1896
+ const :improvement_context, String, description: "Context about what kind of improvement is needed"
1897
+ end
1898
+
1899
+ output do
1900
+ const :improved_instruction, String, description: "Improved instruction that addresses identified issues"
1901
+ const :reasoning, String, description: "Explanation of why this improvement should work better"
1902
+ const :confidence, Float, description: "Confidence in the improvement (0.0-1.0)"
1903
+ end
1904
+ end
1905
+ end
1906
+
1907
+ # Analyze execution traces to identify patterns
1908
+ sig { params(traces: T::Array[ExecutionTrace]).returns(String) }
1909
+ def analyze_execution_traces(traces)
1910
+ return "No execution traces available" if traces.empty?
1911
+
1912
+ llm_traces = traces.select(&:llm_trace?)
1913
+ module_traces = traces.select(&:module_trace?)
1914
+
1915
+ analysis = []
1916
+ analysis << "Execution Trace Analysis:"
1917
+ analysis << "- Total traces: #{traces.size}"
1918
+ analysis << "- LLM interactions: #{llm_traces.size}"
1919
+ analysis << "- Module calls: #{module_traces.size}"
1920
+
1921
+ if llm_traces.any?
1922
+ token_usage = llm_traces.sum(&:token_usage)
1923
+ avg_response_length = llm_traces.map { |t| t.attributes['response']&.to_s&.length || 0 }.sum / llm_traces.size
1924
+
1925
+ analysis << "- Total tokens used: #{token_usage}"
1926
+ analysis << "- Average response length: #{avg_response_length} characters"
1927
+
1928
+ # Identify models used
1929
+ models = llm_traces.map { |t| t.attributes['gen_ai.request.model'] }.compact.uniq
1930
+ analysis << "- Models used: #{models.join(', ')}" if models.any?
1931
+ end
1932
+
1933
+ # Analyze timing patterns
1934
+ if traces.size > 1
1935
+ timespan = traces.max_by(&:timestamp).timestamp - traces.min_by(&:timestamp).timestamp
1936
+ analysis << "- Execution timespan: #{timespan.round(2)} seconds"
1937
+ end
1938
+
1939
+ analysis.join("\n")
1940
+ end
1941
+
1942
+ # Analyze failed examples to identify failure patterns
1943
+ sig { params(failed_examples: T::Array[T.untyped]).returns(String) }
1944
+ def analyze_failed_examples(failed_examples)
1945
+ return "No failed examples to analyze" if failed_examples.empty?
1946
+
1947
+ analysis = []
1948
+ analysis << "Failure Pattern Analysis:"
1949
+ analysis << "- Failed examples count: #{failed_examples.size}"
1950
+
1951
+ # Group failures by type if possible
1952
+ if failed_examples.first.respond_to?(:input)
1953
+ input_patterns = failed_examples.map { |ex| ex.input.keys }.flatten.uniq
1954
+ analysis << "- Input fields involved: #{input_patterns.join(', ')}"
1955
+ end
1956
+
1957
+ # Sample some failure cases for context
1958
+ sample_size = [failed_examples.size, 3].min
1959
+ analysis << "- Sample failures:"
1960
+ failed_examples.take(sample_size).each_with_index do |example, idx|
1961
+ if example.respond_to?(:input) && example.respond_to?(:expected_values)
1962
+ input_summary = example.input.values.first.to_s[0..50] + "..."
1963
+ expected = example.expected_values.values.first.to_s[0..30] + "..."
1964
+ analysis << " #{idx + 1}. Input: #{input_summary} | Expected: #{expected}"
1965
+ end
1966
+ end
1967
+
1968
+ analysis.join("\n")
1969
+ end
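# [Editor's note: illustrative sketch, not part of the package diff.]
# analyze_failed_examples only duck-types against #input and #expected_values,
# so a minimal Struct is enough to see the report it builds; FailedExample and
# the sample data are invented for the example.
FailedExample = Struct.new(:input, :expected_values, keyword_init: true)
failures = [
  FailedExample.new(input: { question: "2 + 2?" }, expected_values: { answer: "4" }),
  FailedExample.new(input: { question: "Capital of France?" }, expected_values: { answer: "Paris" })
]
proposer = DSPy::Teleprompt::GEPA::InstructionProposer.new(config: DSPy::Teleprompt::GEPA::GEPAConfig.new)
puts proposer.send(:analyze_failed_examples, failures)
# Failure Pattern Analysis:
# - Failed examples count: 2
# - Input fields involved: question
# - Sample failures:
#  1. Input: 2 + 2?... | Expected: 4...
#  2. Input: Capital of France?... | Expected: Paris...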
1970
+ end
1971
+
1972
+ # MutationEngine: Handles LLM-based prompt transformations for genetic evolution
1973
+ class MutationEngine
1974
+ extend T::Sig
1975
+
1976
+ sig { returns(GEPAConfig) }
1977
+ attr_reader :config
1978
+
1979
+ sig { returns(InstructionProposer) }
1980
+ attr_reader :instruction_proposer
1981
+
1982
+ sig { params(config: GEPAConfig).void }
1983
+ def initialize(config:)
1984
+ @config = config
1985
+ @instruction_proposer = InstructionProposer.new(config: config)
1986
+ end
1987
+
1988
+ # Mutate a single program with LLM-based instruction proposal
1989
+ sig { params(program: T.untyped, execution_traces: T::Array[ExecutionTrace], failed_examples: T::Array[T.untyped]).returns(T.untyped) }
1990
+ def mutate_program(program, execution_traces: [], failed_examples: [])
1991
+ return program if rand > @config.mutation_rate
1992
+
1993
+ begin
1994
+ original_instruction = extract_instruction(program)
1995
+
1996
+ # Use LLM-based instruction proposal instead of hardcoded mutations
1997
+ improved_instruction = @instruction_proposer.propose_instruction(
1998
+ original_instruction: original_instruction,
1999
+ execution_traces: execution_traces,
2000
+ failed_examples: failed_examples
2001
+ )
2002
+
2003
+ create_mutated_program(program, improved_instruction)
2004
+ rescue => e
2005
+ emit_event('mutation_error', {
2006
+ error: e.message,
2007
+ program_type: program.class.name
2008
+ })
2009
+ # Return original program on mutation failure
2010
+ program
2011
+ end
2012
+ end
2013
+
2014
+ # Batch mutation of multiple programs with shared execution context
2015
+ sig { params(programs: T::Array[T.untyped], execution_traces: T::Array[ExecutionTrace], failed_examples: T::Array[T.untyped]).returns(T::Array[T.untyped]) }
2016
+ def batch_mutate(programs, execution_traces: [], failed_examples: [])
2017
+ return [] if programs.empty?
2018
+
2019
+ programs.map { |program| mutate_program(program, execution_traces: execution_traces, failed_examples: failed_examples) }
2020
+ end
2021
+
2022
+ # Emit events for logging and monitoring
2023
+ sig { params(event_name: String, data: T::Hash[Symbol, T.untyped]).void }
2024
+ def emit_event(event_name, data = {})
2025
+ # For now, just a placeholder - could integrate with DSPy event system
2026
+ # In full implementation, this would emit events for monitoring
2027
+ end
2028
+
2029
+ private
2030
+
2031
+ # Extract instruction text from program
2032
+ sig { params(program: T.untyped).returns(String) }
2033
+ def extract_instruction(program)
2034
+ if program.signature_class&.description
2035
+ program.signature_class.description
2036
+ else
2037
+ "Analyze the input and complete the task accurately"
2038
+ end
2039
+ end
2040
+
2041
+ # Apply specific mutation type to instruction
2042
+ sig { params(instruction: String, mutation_type: MutationType).returns(String) }
2043
+ def apply_mutation(instruction, mutation_type)
2044
+ case mutation_type
2045
+ when MutationType::Rewrite
2046
+ apply_rewrite_mutation(instruction)
2047
+ when MutationType::Expand
2048
+ apply_expand_mutation(instruction)
2049
+ when MutationType::Simplify
2050
+ apply_simplify_mutation(instruction)
2051
+ when MutationType::Combine
2052
+ apply_combine_mutation(instruction)
2053
+ when MutationType::Rephrase
2054
+ apply_rephrase_mutation(instruction)
2055
+ else
2056
+ instruction
2057
+ end
2058
+ end
2059
+
2060
+ # Rewrite the instruction with different phrasing
2061
+ sig { params(instruction: String).returns(String) }
2062
+ def apply_rewrite_mutation(instruction)
2063
+ # Simple rewrite patterns for now - in full implementation would use LLM
2064
+ patterns = [
2065
+ -> (inst) { "Carefully #{inst.downcase}" },
2066
+ -> (inst) { "Please #{inst.downcase}" },
2067
+ -> (inst) { "#{inst} with precision" }
2068
+ ]
2069
+
2070
+ patterns.sample.call(instruction)
2071
+ end
2072
+
2073
+ # Expand instruction with additional context
2074
+ sig { params(instruction: String).returns(String) }
2075
+ def apply_expand_mutation(instruction)
2076
+ expansions = [
2077
+ "Think step by step.",
2078
+ "Provide detailed reasoning.",
2079
+ "Consider all aspects carefully.",
2080
+ "Explain your thought process."
2081
+ ]
2082
+
2083
+ "#{instruction} #{expansions.sample}"
2084
+ end
2085
+
2086
+ # Simplify instruction by removing complex terms
2087
+ sig { params(instruction: String).returns(String) }
2088
+ def apply_simplify_mutation(instruction)
2089
+ # Remove common complexity words
2090
+ simplified = instruction.gsub(/\b(carefully|detailed|comprehensive|thorough)\b/i, '')
2091
+ .gsub(/\s+/, ' ')
2092
+ .strip
2093
+
2094
+ simplified.empty? ? instruction : simplified
2095
+ end
2096
+
2097
+ # Combine instruction with complementary strategies
2098
+ sig { params(instruction: String).returns(String) }
2099
+ def apply_combine_mutation(instruction)
2100
+ strategies = [
2101
+ "Break down the problem systematically.",
2102
+ "Use logical reasoning.",
2103
+ "Apply domain knowledge.",
2104
+ "Consider edge cases."
2105
+ ]
2106
+
2107
+ "#{instruction} #{strategies.sample}"
2108
+ end
2109
+
2110
+ # Rephrase instruction with synonyms
2111
+ sig { params(instruction: String).returns(String) }
2112
+ def apply_rephrase_mutation(instruction)
2113
+ # Simple synonym replacement - in full implementation would use LLM
2114
+ synonyms = {
2115
+ 'solve' => 'resolve',
2116
+ 'answer' => 'respond to',
2117
+ 'analyze' => 'examine',
2118
+ 'calculate' => 'compute',
2119
+ 'determine' => 'identify'
2120
+ }
2121
+
2122
+ result = instruction.dup
2123
+ synonyms.each do |original, replacement|
2124
+ result.gsub!(/\b#{original}\b/i, replacement) if rand < 0.3
2125
+ end
2126
+
2127
+ result
2128
+ end
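# [Editor's note: illustrative sketch, not part of the package diff.] The
# mutation helpers above sample their phrasing, so the Expand result below is
# one possibility; the Simplify result is deterministic.
engine = DSPy::Teleprompt::GEPA::MutationEngine.new(config: DSPy::Teleprompt::GEPA::GEPAConfig.new)
engine.send(:apply_mutation, "Solve the equation", DSPy::Teleprompt::GEPA::MutationType::Expand)
# => e.g. "Solve the equation Think step by step."
engine.send(:apply_mutation, "Carefully provide a detailed answer", DSPy::Teleprompt::GEPA::MutationType::Simplify)
# => "provide a answer" (complexity words stripped, whitespace collapsed)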
2129
+
2130
+ # Create new program with mutated instruction
2131
+ sig { params(original_program: T.untyped, new_instruction: String).returns(T.untyped) }
2132
+ def create_mutated_program(original_program, new_instruction)
2133
+ case original_program
2134
+ when DSPy::Predict
2135
+ # DSPy::Predict has built-in support for instruction modification
2136
+ original_program.with_instruction(new_instruction)
2137
+ when DSPy::Module
2138
+ # For custom DSPy::Module classes, we need to create a new instance
2139
+ # and update any internal predictors that have instruction-based signatures
2140
+ create_mutated_module(original_program, new_instruction)
2141
+ else
2142
+ # For other types (like test doubles), check if they respond to with_instruction
2143
+ if original_program.respond_to?(:with_instruction)
2144
+ original_program.with_instruction(new_instruction)
2145
+ elsif original_program.respond_to?(:signature_class)
2146
+ # Try to create a new DSPy::Predict with the same signature but new instruction
2147
+ signature_class = original_program.signature_class
2148
+ DSPy::Predict.new(signature_class).with_instruction(new_instruction)
2149
+ else
2150
+ # Fallback: return original if we can't mutate
2151
+ emit_event('mutation_fallback', {
2152
+ program_type: original_program.class.name,
2153
+ reason: 'No mutation method available'
2154
+ })
2155
+ original_program
2156
+ end
2157
+ end
2158
+ rescue => e
2159
+ emit_event('mutation_error', {
2160
+ error: e.message,
2161
+ program_type: original_program.class.name,
2162
+ backtrace: e.backtrace&.first(3)
2163
+ })
2164
+ # Return original program on error
2165
+ original_program
2166
+ end
2167
+
2168
+ # Create mutated version of custom DSPy::Module
2169
+ sig { params(original_module: DSPy::Module, new_instruction: String).returns(DSPy::Module) }
2170
+ def create_mutated_module(original_module, new_instruction)
2171
+ # For custom modules, we need to create a new instance
2172
+ # This is a simplified approach - in practice, modules might need
2173
+ # more sophisticated copying of their internal state
2174
+ begin
2175
+ # Create a new instance of the same class
2176
+ new_module = original_module.class.new
2177
+
2178
+ # Try to find and update any internal predictors
2179
+ original_module.instance_variables.each do |var_name|
2180
+ var_value = original_module.instance_variable_get(var_name)
2181
+
2182
+ if var_value.is_a?(DSPy::Predict)
2183
+ # Update the instruction for internal predictors
2184
+ mutated_predictor = var_value.with_instruction(new_instruction)
2185
+ new_module.instance_variable_set(var_name, mutated_predictor)
2186
+ else
2187
+ # Copy other instance variables as-is
2188
+ new_module.instance_variable_set(var_name, var_value)
2189
+ end
2190
+ end
2191
+
2192
+ new_module
2193
+ rescue => e
2194
+ emit_event('module_mutation_error', {
2195
+ error: e.message,
2196
+ module_class: original_module.class.name
2197
+ })
2198
+ # Fallback to original module
2199
+ original_module
2200
+ end
2201
+ end
2202
+
2203
+ # Select mutation type based on context and configuration
2204
+ sig { params(instruction: T.nilable(String)).returns(MutationType) }
2205
+ def select_mutation_type(instruction = nil)
2206
+ # Adaptive selection based on instruction characteristics
2207
+ if instruction && instruction.length < 20
2208
+ # Short instructions benefit from expansion
2209
+ [MutationType::Expand, MutationType::Combine].sample
2210
+ elsif instruction && instruction.length > 100
2211
+ # Long instructions benefit from simplification
2212
+ [MutationType::Simplify, MutationType::Rephrase].sample
2213
+ else
2214
+ # Balanced selection from all types
2215
+ @config.mutation_types.sample
2216
+ end
2217
+ end
2218
+
2219
+ # Calculate diversity of mutations applied
2220
+ sig { params(mutations: T::Array[MutationType]).returns(Float) }
2221
+ def mutation_diversity(mutations)
2222
+ return 0.0 if mutations.empty?
2223
+
2224
+ unique_types = mutations.uniq.size
2225
+ total_types = @config.mutation_types.size
2226
+
2227
+ unique_types.to_f / total_types
2228
+ end
2229
+ end
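# [Editor's note: illustrative sketch, not part of the package diff.] Public
# entry points of the engine just defined. Whether a given program is mutated
# at all is stochastic (governed by config.mutation_rate), and the LLM-backed
# instruction proposal needs a configured LM; MySignature is a placeholder.
engine = DSPy::Teleprompt::GEPA::MutationEngine.new(config: DSPy::Teleprompt::GEPA::GEPAConfig.new)
predictor = DSPy::Predict.new(MySignature)
mutated = engine.mutate_program(predictor)              # may return predictor unchanged
offspring = engine.batch_mutate([predictor, predictor]) # one (possibly mutated) program per input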
2230
+
2231
+ # CrossoverEngine: Handles genetic recombination of prompts for diversity
2232
+ class CrossoverEngine
2233
+ extend T::Sig
2234
+
2235
+ # Struct for instruction components
2236
+ class InstructionComponents < T::Struct
2237
+ prop :action, String
2238
+ prop :modifiers, String
2239
+ end
2240
+
2241
+ sig { returns(GEPAConfig) }
2242
+ attr_reader :config
2243
+
2244
+ sig { params(config: GEPAConfig).void }
2245
+ def initialize(config:)
2246
+ @config = config
2247
+ end
2248
+
2249
+ # Perform crossover between two parent programs
2250
+ sig { params(parent_a: T.untyped, parent_b: T.untyped).returns(T::Array[T.untyped]) }
2251
+ def crossover_programs(parent_a, parent_b)
2252
+ return [parent_a, parent_b] if rand > @config.crossover_rate
2253
+
2254
+ begin
2255
+ instruction_a = extract_instruction(parent_a)
2256
+ instruction_b = extract_instruction(parent_b)
2257
+
2258
+ crossover_type = select_crossover_type(instruction_a, instruction_b)
2259
+ offspring_instructions = apply_crossover(instruction_a, instruction_b, crossover_type)
2260
+
2261
+ offspring = [
2262
+ create_crossover_program(parent_a, offspring_instructions[0]),
2263
+ create_crossover_program(parent_b, offspring_instructions[1])
2264
+ ]
2265
+
2266
+ offspring
2267
+ rescue => e
2268
+ # Return original parents on crossover failure
2269
+ [parent_a, parent_b]
2270
+ end
2271
+ end
2272
+
2273
+ # Batch crossover for entire population
2274
+ sig { params(population: T::Array[T.untyped]).returns(T::Array[T.untyped]) }
2275
+ def batch_crossover(population)
2276
+ return [] if population.empty?
2277
+ return [population.first] if population.size == 1
2278
+
2279
+ offspring = []
2280
+
2281
+ # Pair up population for crossover
2282
+ population.each_slice(2) do |pair|
2283
+ if pair.size == 2
2284
+ crossed = crossover_programs(pair[0], pair[1])
2285
+ offspring.concat(crossed)
2286
+ else
2287
+ offspring << pair[0] # Unpaired individual passes through
2288
+ end
2289
+ end
2290
+
2291
+ offspring
2292
+ end
2293
+
2294
+ private
2295
+
2296
+ # Extract instruction text from program
2297
+ sig { params(program: T.untyped).returns(String) }
2298
+ def extract_instruction(program)
2299
+ if program.signature_class&.description
2300
+ program.signature_class.description
2301
+ else
2302
+ "Analyze the input and complete the task accurately"
2303
+ end
2304
+ end
2305
+
2306
+ # Apply specific crossover type to two instructions
2307
+ sig { params(instruction_a: String, instruction_b: String, crossover_type: CrossoverType).returns(T::Array[String]) }
2308
+ def apply_crossover(instruction_a, instruction_b, crossover_type)
2309
+ case crossover_type
2310
+ when CrossoverType::Uniform
2311
+ uniform_crossover(instruction_a, instruction_b)
2312
+ when CrossoverType::Blend
2313
+ blend_crossover(instruction_a, instruction_b)
2314
+ when CrossoverType::Structured
2315
+ structured_crossover(instruction_a, instruction_b)
2316
+ else
2317
+ [instruction_a, instruction_b]
2318
+ end
2319
+ end
2320
+
2321
+ # Uniform crossover: Exchange elements randomly at word level
2322
+ sig { params(instruction_a: String, instruction_b: String).returns(T::Array[String]) }
2323
+ def uniform_crossover(instruction_a, instruction_b)
2324
+ return [instruction_a, instruction_b] if instruction_a == instruction_b
2325
+
2326
+ words_a = instruction_a.split
2327
+ words_b = instruction_b.split
2328
+
2329
+ # Create offspring by randomly selecting words from parents
2330
+ offspring_a_words = []
2331
+ offspring_b_words = []
2332
+
2333
+ max_length = [words_a.size, words_b.size].max
2334
+
2335
+ max_length.times do |i|
2336
+ word_a = words_a[i]
2337
+ word_b = words_b[i]
2338
+
2339
+ if rand < 0.5
2340
+ offspring_a_words << (word_a || word_b)
2341
+ offspring_b_words << (word_b || word_a)
2342
+ else
2343
+ offspring_a_words << (word_b || word_a)
2344
+ offspring_b_words << (word_a || word_b)
2345
+ end
2346
+ end
2347
+
2348
+ [
2349
+ offspring_a_words.compact.join(' '),
2350
+ offspring_b_words.compact.join(' ')
2351
+ ]
2352
+ end
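# [Editor's note: illustrative sketch, not part of the package diff.]
# uniform_crossover swaps words position by position with probability 0.5, so
# the offspring pair below is one possible outcome rather than a fixed result.
crossover = DSPy::Teleprompt::GEPA::CrossoverEngine.new(config: DSPy::Teleprompt::GEPA::GEPAConfig.new)
crossover.send(:uniform_crossover, "Solve the equation carefully", "Examine the problem thoroughly")
# => e.g. ["Solve the problem carefully", "Examine the equation thoroughly"]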
2353
+
2354
+ # Blend crossover: Semantically combine instructions
2355
+ sig { params(instruction_a: String, instruction_b: String).returns(T::Array[String]) }
2356
+ def blend_crossover(instruction_a, instruction_b)
2357
+ # Simple blending patterns - in full implementation would use LLM
2358
+ patterns = [
2359
+ -> (a, b) { "#{a} and #{b}" },
2360
+ -> (a, b) { "#{a}, specifically #{b}" },
2361
+ -> (a, b) { "#{b} while #{a.downcase}" },
2362
+ -> (a, b) { "Combine #{a.downcase} with #{b.downcase}" }
2363
+ ]
2364
+
2365
+ pattern = patterns.sample
2366
+
2367
+ [
2368
+ pattern.call(instruction_a, instruction_b),
2369
+ pattern.call(instruction_b, instruction_a)
2370
+ ]
2371
+ end
2372
+
2373
+ # Structured crossover: Maintain grammatical and logical structure
2374
+ sig { params(instruction_a: String, instruction_b: String).returns(T::Array[String]) }
2375
+ def structured_crossover(instruction_a, instruction_b)
2376
+ # Extract structural components
2377
+ components_a = extract_components(instruction_a)
2378
+ components_b = extract_components(instruction_b)
2379
+
2380
+ # Cross structural components
2381
+ offspring_a = combine_components(components_a.action, components_b.modifiers)
2382
+ offspring_b = combine_components(components_b.action, components_a.modifiers)
2383
+
2384
+ [offspring_a, offspring_b]
2385
+ end
2386
+
2387
+ # Extract structural components from instruction
2388
+ sig { params(instruction: String).returns(InstructionComponents) }
2389
+ def extract_components(instruction)
2390
+ words = instruction.split
2391
+
2392
+ # Simple heuristic: first verb-like word is action, rest are modifiers
2393
+ action_idx = words.find_index { |word| verb_like?(word) } || 0
2394
+
2395
+ InstructionComponents.new(
2396
+ action: words[action_idx] || words.first || "complete",
2397
+ modifiers: (words - [words[action_idx]]).join(' ')
2398
+ )
2399
+ end
2400
+
2401
+ # Combine action and modifiers into coherent instruction
2402
+ sig { params(action: String, modifiers: String).returns(String) }
2403
+ def combine_components(action, modifiers)
2404
+ if modifiers.empty?
2405
+ "#{action.capitalize} the task"
2406
+ else
2407
+ "#{action.capitalize} #{modifiers}"
2408
+ end
2409
+ end
2410
+
2411
+ # Simple heuristic to identify verb-like words
2412
+ sig { params(word: String).returns(T::Boolean) }
2413
+ def verb_like?(word)
2414
+ verb_patterns = %w[solve answer calculate determine analyze compute resolve examine]
2415
+ verb_patterns.any? { |pattern| word.downcase.include?(pattern) }
2416
+ end
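# [Editor's note: illustrative sketch, not part of the package diff.] Unlike
# the uniform and blend variants, the structured path is deterministic: the
# first verb-like word becomes the action, the remaining words become the
# modifiers, and the modifiers are swapped between parents.
crossover = DSPy::Teleprompt::GEPA::CrossoverEngine.new(config: DSPy::Teleprompt::GEPA::GEPAConfig.new)
crossover.send(:structured_crossover, "Solve the equation carefully", "Examine all edge cases")
# => ["Solve all edge cases", "Examine the equation carefully"]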
2417
+
2418
+ # Create new program with crossover instruction
2419
+ sig { params(original_program: T.untyped, new_instruction: String).returns(T.untyped) }
2420
+ def create_crossover_program(original_program, new_instruction)
2421
+ # For now, return the original program as we don't modify instruction in place
2422
+ # In full implementation, would create new program instance with modified instruction
2423
+ original_program
2424
+ end
2425
+
2426
+ # Select crossover type based on instruction characteristics
2427
+ sig { params(instruction_a: T.nilable(String), instruction_b: T.nilable(String)).returns(CrossoverType) }
2428
+ def select_crossover_type(instruction_a = nil, instruction_b = nil)
2429
+ # Adaptive selection based on instruction characteristics
2430
+ if instruction_a && instruction_b
2431
+ combined_length = instruction_a.length + instruction_b.length
2432
+
2433
+ if combined_length < 40
2434
+ # Short instructions benefit from blending
2435
+ [CrossoverType::Blend, CrossoverType::Uniform].sample
2436
+ elsif combined_length > 200
2437
+ # Long instructions benefit from structured crossover
2438
+ [CrossoverType::Structured, CrossoverType::Uniform].sample
2439
+ else
2440
+ # Balanced selection
2441
+ @config.crossover_types.sample
2442
+ end
2443
+ else
2444
+ @config.crossover_types.sample
2445
+ end
2446
+ end
2447
+
2448
+ # Calculate diversity of crossover operations
2449
+ sig { params(crossovers: T::Array[CrossoverType]).returns(Float) }
2450
+ def crossover_diversity(crossovers)
2451
+ return 0.0 if crossovers.empty?
2452
+
2453
+ unique_types = crossovers.uniq.size
2454
+ total_types = @config.crossover_types.size
2455
+
2456
+ unique_types.to_f / total_types
2457
+ end
2458
+ end
2459
+
2460
+ # ParetoSelector: Multi-objective optimization using Pareto frontier analysis
2461
+ class ParetoSelector
2462
+ extend T::Sig
2463
+
2464
+ sig { returns(FitnessEvaluator) }
2465
+ attr_reader :evaluator
2466
+
2467
+ sig { returns(GEPAConfig) }
2468
+ attr_reader :config
2469
+
2470
+ sig { params(evaluator: FitnessEvaluator, config: GEPAConfig).void }
2471
+ def initialize(evaluator:, config:)
2472
+ @evaluator = evaluator
2473
+ @config = config
2474
+ end
2475
+
2476
+ # Select parents for breeding using Pareto-based selection
2477
+ sig { params(population_with_scores: T::Array[T::Array[T.untyped]], count: Integer).returns(T::Array[T.untyped]) }
2478
+ def select_parents(population_with_scores, count:)
2479
+ return [] if population_with_scores.empty?
2480
+ return population_with_scores.map(&:first) if count >= population_with_scores.size
2481
+
2482
+ # Combine tournament and Pareto-based selection for parent selection
2483
+ selected = []
2484
+
2485
+ count.times do
2486
+ parent = tournament_selection(population_with_scores)
2487
+ selected << parent
2488
+ end
2489
+
2490
+ selected
2491
+ end
2492
+
2493
+ # Select survivors for next generation balancing elite and diversity
2494
+ sig { params(population_with_scores: T::Array[T::Array[T.untyped]], count: Integer).returns(T::Array[T.untyped]) }
2495
+ def select_survivors(population_with_scores, count:)
2496
+ return [] if population_with_scores.empty?
2497
+ return population_with_scores.map(&:first) if count >= population_with_scores.size
2498
+
2499
+ scores = population_with_scores.map(&:last)
2500
+
2501
+ # Find Pareto frontier first
2502
+ pareto_frontier = find_pareto_frontier(scores)
2503
+ frontier_indices = scores.each_index.select { |i| pareto_frontier.include?(scores[i]) }
2504
+ frontier_programs = frontier_indices.map { |i| population_with_scores[i].first }
2505
+
2506
+ if frontier_programs.size >= count
2507
+ # Use diversity selection within frontier
2508
+ frontier_with_scores = frontier_indices.map { |i| population_with_scores[i] }
2509
+ return diversity_selection(frontier_with_scores, count: count)
2510
+ else
2511
+ # Include all frontier + fill remaining with elite selection
2512
+ remaining_count = count - frontier_programs.size
2513
+ remaining_population = population_with_scores.reject.with_index { |_, i| frontier_indices.include?(i) }
2514
+
2515
+ additional = elite_selection(remaining_population, count: remaining_count)
2516
+ frontier_programs + additional
2517
+ end
2518
+ end
2519
+
2520
+ private
2521
+
2522
+ # Find Pareto frontier (non-dominated solutions)
2523
+ sig { params(fitness_scores: T::Array[FitnessScore]).returns(T::Array[FitnessScore]) }
2524
+ def find_pareto_frontier(fitness_scores)
2525
+ return [] if fitness_scores.empty?
2526
+ return fitness_scores if fitness_scores.size == 1
2527
+
2528
+ frontier = []
2529
+
2530
+ fitness_scores.each do |candidate|
2531
+ # Check if candidate is dominated by any other solution
2532
+ is_dominated = fitness_scores.any? do |other|
2533
+ other != candidate && candidate.dominated_by?(other)
2534
+ end
2535
+
2536
+ frontier << candidate unless is_dominated
2537
+ end
2538
+
2539
+ frontier
2540
+ end
2541
+
2542
+ # Calculate crowding distance for diversity preservation
2543
+ sig { params(fitness_scores: T::Array[FitnessScore]).returns(T::Hash[FitnessScore, Float]) }
2544
+ def calculate_crowding_distance(fitness_scores)
2545
+ distances = {}
2546
+
2547
+ # Initialize distances for all solutions
2548
+ fitness_scores.each { |score| distances[score] = 0.0 }
2549
+
2550
+ return distances if fitness_scores.size <= 2
2551
+
2552
+ # Calculate crowding distance for each objective
2553
+ objectives = [:primary_score, :overall_score]
2554
+ secondary_objectives = fitness_scores.first.secondary_scores.keys
2555
+ all_objectives = objectives + secondary_objectives
2556
+
2557
+ all_objectives.each do |objective|
2558
+ # Sort by current objective
2559
+ sorted_scores = fitness_scores.sort_by do |score|
2560
+ case objective
2561
+ when :primary_score
2562
+ score.primary_score
2563
+ when :overall_score
2564
+ score.overall_score
2565
+ else
2566
+ score.secondary_scores[objective] || 0.0
2567
+ end
2568
+ end
2569
+
2570
+ # Set boundary solutions to high distance
2571
+ distances[sorted_scores.first] = Float::INFINITY if sorted_scores.size > 0
2572
+ distances[sorted_scores.last] = Float::INFINITY if sorted_scores.size > 1
2573
+
2574
+ next if sorted_scores.size <= 2
2575
+
2576
+ # Calculate range for normalization
2577
+ min_val = get_objective_value(sorted_scores.first, objective)
2578
+ max_val = get_objective_value(sorted_scores.last, objective)
2579
+ range = max_val - min_val
2580
+
2581
+ next if range <= 0
2582
+
2583
+ # Calculate crowding distance for intermediate solutions
2584
+ (1...(sorted_scores.size - 1)).each do |i|
2585
+ prev_val = get_objective_value(sorted_scores[i - 1], objective)
2586
+ next_val = get_objective_value(sorted_scores[i + 1], objective)
2587
+
2588
+ distances[sorted_scores[i]] += (next_val - prev_val) / range
2589
+ end
2590
+ end
2591
+
2592
+ distances
2593
+ end
2594
+
2595
+ # Get objective value from fitness score
2596
+ sig { params(score: FitnessScore, objective: Symbol).returns(Float) }
2597
+ def get_objective_value(score, objective)
2598
+ case objective
2599
+ when :primary_score
2600
+ score.primary_score
2601
+ when :overall_score
2602
+ score.overall_score
2603
+ else
2604
+ score.secondary_scores[objective] || 0.0
2605
+ end
2606
+ end
2607
+
2608
+ # Tournament selection with Pareto preference
2609
+ sig { params(population_with_scores: T::Array[T::Array[T.untyped]]).returns(T.untyped) }
2610
+ def tournament_selection(population_with_scores)
2611
+ return population_with_scores.first.first if population_with_scores.size == 1
2612
+
2613
+ tournament_size = [3, population_with_scores.size].min
2614
+ tournament = population_with_scores.sample(tournament_size)
2615
+
2616
+ # Select best from tournament based on Pareto dominance and crowding
2617
+ best_program, best_score = tournament.first
2618
+
2619
+ tournament[1..].each do |program, score|
2620
+ if score.dominated_by?(best_score)
2621
+ # Current best dominates this candidate, keep current
2622
+ next
2623
+ elsif best_score.dominated_by?(score)
2624
+ # This candidate dominates current best, replace
2625
+ best_program, best_score = program, score
2626
+ else
2627
+ # Non-dominated comparison, use overall score as tiebreaker
2628
+ if score.overall_score > best_score.overall_score
2629
+ best_program, best_score = program, score
2630
+ end
2631
+ end
2632
+ end
2633
+
2634
+ best_program
2635
+ end
2636
+
2637
+ # Diversity-based selection using crowding distance
2638
+ sig { params(population_with_scores: T::Array[T::Array[T.untyped]], count: Integer).returns(T::Array[T.untyped]) }
2639
+ def diversity_selection(population_with_scores, count:)
2640
+ return population_with_scores.map(&:first) if count >= population_with_scores.size
2641
+
2642
+ scores = population_with_scores.map(&:last)
2643
+ distances = calculate_crowding_distance(scores)
2644
+
2645
+ # Sort by crowding distance (descending - prefer more diverse)
2646
+ sorted_pairs = population_with_scores.sort_by { |_, score| -distances[score] }
2647
+
2648
+ sorted_pairs.take(count).map(&:first)
2649
+ end
2650
+
2651
+ # Elite selection based on overall fitness
2652
+ sig { params(population_with_scores: T::Array[T::Array[T.untyped]], count: Integer).returns(T::Array[T.untyped]) }
2653
+ def elite_selection(population_with_scores, count:)
2654
+ return population_with_scores.map(&:first) if count >= population_with_scores.size
2655
+
2656
+ # Sort by overall score (descending - best first)
2657
+ sorted_pairs = population_with_scores.sort_by { |_, score| -score.overall_score }
2658
+
2659
+ sorted_pairs.take(count).map(&:first)
2660
+ end
2661
+ end
2662
+
2663
+ # Configuration for GEPA optimization
2664
+ class GEPAConfig < Config
2665
+ extend T::Sig
2666
+
2667
+ sig { returns(String) }
2668
+ attr_accessor :reflection_lm
2669
+
2670
+ sig { returns(Integer) }
2671
+ attr_accessor :num_generations
2672
+
2673
+ sig { returns(Integer) }
2674
+ attr_accessor :population_size
2675
+
2676
+ sig { returns(Float) }
2677
+ attr_accessor :mutation_rate
2678
+
2679
+ sig { returns(T::Boolean) }
2680
+ attr_accessor :use_pareto_selection
2681
+
2682
+ sig { returns(T::Boolean) }
2683
+ attr_accessor :simple_mode
2684
+ sig { returns(T::Array[MutationType]) }
2685
+ attr_accessor :mutation_types
2686
+ sig { returns(Float) }
2687
+ attr_accessor :crossover_rate
2688
+ sig { returns(T::Array[CrossoverType]) }
2689
+ attr_accessor :crossover_types
2690
+
2691
+ sig { void }
2692
+ def initialize
2693
+ super
2694
+ @reflection_lm = 'gpt-4o'
2695
+ @num_generations = 10
2696
+ @population_size = 8
2697
+ @mutation_rate = 0.7
2698
+ @use_pareto_selection = true
2699
+ @simple_mode = false
2700
+ @mutation_types = [MutationType::Rewrite, MutationType::Expand, MutationType::Simplify, MutationType::Combine, MutationType::Rephrase]
2701
+ @crossover_rate = 0.6
2702
+ @crossover_types = [CrossoverType::Uniform, CrossoverType::Blend, CrossoverType::Structured]
2703
+ end
2704
+
2705
+ sig { returns(T::Hash[Symbol, T.untyped]) }
2706
+ def to_h
2707
+ super.merge({
2708
+ reflection_lm: @reflection_lm,
2709
+ num_generations: @num_generations,
2710
+ population_size: @population_size,
2711
+ mutation_rate: @mutation_rate,
2712
+ use_pareto_selection: @use_pareto_selection,
2713
+ simple_mode: @simple_mode,
2714
+ mutation_types: @mutation_types,
2715
+ crossover_rate: @crossover_rate,
2716
+ crossover_types: @crossover_types
2717
+ })
2718
+ end
2719
+ end
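# [Editor's note: illustrative sketch, not part of the package diff.] Tuning
# the optimizer through the accessors above; unset fields keep the defaults
# assigned in #initialize.
config = DSPy::Teleprompt::GEPA::GEPAConfig.new
config.num_generations = 5
config.population_size = 4
config.mutation_rate = 0.5
config.simple_mode = true
config.mutation_types = [
  DSPy::Teleprompt::GEPA::MutationType::Expand,
  DSPy::Teleprompt::GEPA::MutationType::Simplify
]
config.to_h[:population_size] # => 4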
2720
+
2721
+ sig { returns(GEPAConfig) }
2722
+ attr_reader :config
585
2723
 
586
2724
  sig do
587
2725
  params(
@@ -611,27 +2749,632 @@ module DSPy
611
2749
  num_generations: @config.num_generations,
612
2750
  population_size: @config.population_size
613
2751
  }) do
614
- # For Phase 1, return a basic optimization result
615
- # Future phases will implement the full genetic algorithm
2752
+ # Simple optimization for Phase 1.5 - basic instruction optimization
2753
+ if @config.simple_mode
2754
+ perform_simple_optimization(program, trainset, valset)
2755
+ else
2756
+ # Phase 2 - Full GEPA genetic algorithm implementation
2757
+ perform_gepa_optimization(program, trainset, valset)
2758
+ end
2759
+ end
2760
+ end
2761
+
2762
+ private
2763
+
2764
+ # Simple optimization implementation for testing
2765
+ sig do
2766
+ params(
2767
+ program: T.untyped,
2768
+ trainset: T::Array[T.untyped],
2769
+ valset: T.nilable(T::Array[T.untyped])
2770
+ ).returns(OptimizationResult)
2771
+ end
2772
+ def perform_simple_optimization(program, trainset, valset)
2773
+ return basic_result(program) unless program.respond_to?(:signature_class)
2774
+
2775
+ original_description = program.signature_class.description
2776
+ best_program = program
2777
+ best_score = simple_evaluate_program(program, trainset)
2778
+
2779
+ # Try different instruction variations
2780
+ instruction_variants = generate_instruction_variants(original_description)
2781
+
2782
+ instruction_variants.each_with_index do |variant, index|
2783
+ emit_event('instruction_variant_test', {
2784
+ variant: variant,
2785
+ iteration: index + 1,
2786
+ total_variants: instruction_variants.size
2787
+ })
2788
+
2789
+ # Create modified program
2790
+ modified_program = create_program_with_instruction(program, variant)
2791
+ score = simple_evaluate_program(modified_program, trainset)
2792
+
2793
+ if score > best_score
2794
+ emit_event('improvement_found', {
2795
+ new_score: score,
2796
+ previous_score: best_score,
2797
+ instruction: variant
2798
+ })
2799
+
2800
+ best_program = modified_program
2801
+ best_score = score
2802
+ end
2803
+ end
2804
+
2805
+ OptimizationResult.new(
2806
+ optimized_program: best_program,
2807
+ scores: { accuracy: best_score },
2808
+ history: {
2809
+ original_score: simple_evaluate_program(program, trainset),
2810
+ variants_tested: instruction_variants.size,
2811
+ best_instruction: best_program.signature_class.description
2812
+ },
2813
+ best_score_name: 'accuracy',
2814
+ best_score_value: best_score,
2815
+ metadata: {
2816
+ optimizer: 'GEPA',
2817
+ mode: 'Simple Optimization',
2818
+ reflection_lm: @config.reflection_lm
2819
+ }
2820
+ )
2821
+ end
2822
+
2823
+ # Generate variations of the instruction
2824
+ sig { params(original_instruction: String).returns(T::Array[String]) }
2825
+ def generate_instruction_variants(original_instruction)
2826
+ variants = []
2827
+
2828
+ # Add "step by step" variant
2829
+ unless original_instruction.include?("step")
2830
+ variants << "#{original_instruction} Think step by step."
2831
+ end
2832
+
2833
+ # Add "detailed" variant
2834
+ unless original_instruction.include?("detail")
2835
+ variants << "#{original_instruction} Provide detailed reasoning."
2836
+ end
2837
+
2838
+ # Add "careful" variant
2839
+ unless original_instruction.include?("careful")
2840
+ variants << "Be careful and accurate. #{original_instruction}"
2841
+ end
2842
+
2843
+ variants.take(3) # Limit to 3 variants for simple mode
2844
+ end
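# [Editor's note: worked example, not part of the package diff.] For an
# instruction containing none of "step", "detail", or "careful", the method
# above deterministically yields all three variants:
#
#   generate_instruction_variants("Classify the sentiment of the text")
#   # => ["Classify the sentiment of the text Think step by step.",
#   #     "Classify the sentiment of the text Provide detailed reasoning.",
#   #     "Be careful and accurate. Classify the sentiment of the text"]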
2845
+
2846
+ # Create a new program instance with modified instruction using DSPy.rb dynamic capabilities
2847
+ sig { params(original_program: T.untyped, new_instruction: String).returns(T.untyped) }
2848
+ def create_program_with_instruction(original_program, new_instruction)
2849
+ case original_program
2850
+ when DSPy::Predict
2851
+ # DSPy::Predict has built-in support for instruction modification
2852
+ original_program.with_instruction(new_instruction)
2853
+ when DSPy::Module
2854
+ # For custom DSPy::Module classes, create new instance with updated predictors
2855
+ create_modified_module_instance(original_program, new_instruction)
2856
+ else
2857
+ # For other types (like test doubles), check available methods
2858
+ if original_program.respond_to?(:with_instruction)
2859
+ original_program.with_instruction(new_instruction)
2860
+ elsif original_program.respond_to?(:signature_class)
2861
+ # Create new DSPy::Predict with the same signature but new instruction
2862
+ signature_class = original_program.signature_class
2863
+ DSPy::Predict.new(signature_class).with_instruction(new_instruction)
2864
+ else
2865
+ # Fallback: return original if we can't modify
2866
+ emit_event('program_modification_fallback', {
2867
+ program_type: original_program.class.name,
2868
+ reason: 'No modification method available'
2869
+ })
2870
+ original_program
2871
+ end
2872
+ end
2873
+ rescue => e
2874
+ emit_event('program_modification_error', {
2875
+ error: e.message,
2876
+ program_type: original_program.class.name
2877
+ })
2878
+ # Return original program on error
2879
+ original_program
2880
+ end
2881
+
2882
+ # Create modified version of custom DSPy::Module instance (for main GEPA class)
2883
+ sig { params(original_module: DSPy::Module, new_instruction: String).returns(DSPy::Module) }
2884
+ def create_modified_module_instance(original_module, new_instruction)
2885
+ begin
2886
+ # Create a new instance of the same class
2887
+ new_module = original_module.class.new
2888
+
2889
+ # Try to find and update any internal predictors
2890
+ original_module.instance_variables.each do |var_name|
2891
+ var_value = original_module.instance_variable_get(var_name)
2892
+
2893
+ if var_value.is_a?(DSPy::Predict)
2894
+ # Update the instruction for internal predictors
2895
+ modified_predictor = var_value.with_instruction(new_instruction)
2896
+ new_module.instance_variable_set(var_name, modified_predictor)
2897
+ else
2898
+ # Copy other instance variables as-is
2899
+ new_module.instance_variable_set(var_name, var_value)
2900
+ end
2901
+ end
2902
+
2903
+ new_module
2904
+ rescue => e
2905
+ emit_event('module_modification_error', {
2906
+ error: e.message,
2907
+ module_class: original_module.class.name
2908
+ })
2909
+ # Fallback to original module
2910
+ original_module
2911
+ end
2912
+ end
2913
+
2914
+ # Simple evaluation for testing (different from base class evaluate_program)
2915
+ sig { params(program: T.untyped, trainset: T::Array[T.untyped]).returns(Float) }
2916
+ def simple_evaluate_program(program, trainset)
2917
+ return 0.0 if @metric.nil? || trainset.empty?
2918
+
2919
+ scores = trainset.map do |example|
2920
+ prediction = program.call(**example.input_values)
2921
+ @metric.call(example, prediction).to_f
2922
+ rescue => e
2923
+ emit_event('evaluation_error', { error: e.message, example_id: example.object_id.to_s })
2924
+ 0.0
2925
+ end
2926
+
2927
+ scores.sum / scores.size
2928
+ end
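# [Editor's note: illustrative sketch, not part of the package diff.]
# simple_evaluate_program only requires @metric to respond to #call(example,
# prediction) with something numeric; a lambda like this suffices. The
# :answer field is an assumption about the signature being optimized.
exact_match = lambda do |example, prediction|
  example.expected_values[:answer] == prediction.answer ? 1.0 : 0.0
end
# Returning a float (rather than true/false) matters because the method calls
# #to_f on the metric's result before averaging.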
2929
+
2930
+ # Return basic result when simple optimization isn't applicable
2931
+ sig { params(program: T.untyped).returns(OptimizationResult) }
2932
+ def basic_result(program)
2933
+ OptimizationResult.new(
2934
+ optimized_program: program,
2935
+ scores: { gepa_score: 0.0 },
2936
+ history: { phase: 'Phase 1 - Basic Structure' },
2937
+ best_score_name: 'gepa_score',
2938
+ best_score_value: 0.0,
2939
+ metadata: {
2940
+ optimizer: 'GEPA',
2941
+ implementation_status: 'Phase 1 - Infrastructure Complete'
2942
+ }
2943
+ )
2944
+ end
2945
+
2946
+ # Complete GEPA genetic algorithm optimization
2947
+ sig do
2948
+ params(
2949
+ program: T.untyped,
2950
+ trainset: T::Array[T.untyped],
2951
+ valset: T.nilable(T::Array[T.untyped])
2952
+ ).returns(OptimizationResult)
2953
+ end
2954
+ def perform_gepa_optimization(program, trainset, valset)
2955
+ # Initialize all GEPA components
2956
+ fitness_evaluator = create_fitness_evaluator
2957
+ genetic_engine = create_genetic_engine(fitness_evaluator)
2958
+ reflection_engine = create_reflection_engine
2959
+ mutation_engine = create_mutation_engine
2960
+ crossover_engine = create_crossover_engine
2961
+ pareto_selector = create_pareto_selector(fitness_evaluator)
2962
+
2963
+ # Initialize trace collection for reflection
2964
+ trace_collector = TraceCollector.new
2965
+ optimization_run_id = "gepa-run-#{SecureRandom.hex(4)}"
2966
+
2967
+ emit_event('gepa_optimization_start', {
2968
+ optimization_run_id: optimization_run_id,
2969
+ num_generations: @config.num_generations,
2970
+ population_size: @config.population_size,
2971
+ mutation_rate: @config.mutation_rate,
2972
+ crossover_rate: @config.crossover_rate
2973
+ })
2974
+
2975
+ begin
2976
+ # Run the complete genetic algorithm evolution
2977
+ evolution_result = genetic_engine.run_evolution(program, trainset)
2978
+
2979
+ # Collect traces for reflection analysis
2980
+ execution_traces = trace_collector.traces_for_run(optimization_run_id)
2981
+
2982
+ # Generate reflection insights on the optimization process
2983
+ reflection_result = reflection_engine.reflect_with_llm(execution_traces)
2984
+
2985
+ # Evaluate final candidate on validation set if provided
2986
+ final_validation_score = if valset && !valset.empty?
2987
+ validation_fitness = fitness_evaluator.evaluate_candidate(evolution_result[:best_candidate], valset)
2988
+ validation_fitness.overall_score
2989
+ else
2990
+ evolution_result[:best_fitness].overall_score
2991
+ end
2992
+
2993
+ emit_event('gepa_optimization_complete', {
2994
+ optimization_run_id: optimization_run_id,
2995
+ best_fitness: evolution_result[:best_fitness].overall_score,
2996
+ final_generation: evolution_result[:generation_count],
2997
+ validation_score: final_validation_score,
2998
+ reflection_confidence: reflection_result.confidence
2999
+ })
3000
+
3001
+ # Create comprehensive optimization result
3002
+ OptimizationResult.new(
3003
+ optimized_program: evolution_result[:best_candidate],
3004
+ scores: {
3005
+ fitness_score: evolution_result[:best_fitness].overall_score,
3006
+ validation_score: final_validation_score,
3007
+ primary_score: evolution_result[:best_fitness].primary_score,
3008
+ **evolution_result[:best_fitness].secondary_scores
3009
+ },
3010
+ history: {
3011
+ num_generations: evolution_result[:generation_count],
3012
+ population_size: @config.population_size,
3013
+ generation_history: evolution_result[:generation_history],
3014
+ final_population: evolution_result[:final_population],
3015
+ phase: 'Phase 2 - Complete GEPA',
3016
+ mutation_rate: @config.mutation_rate,
3017
+ crossover_rate: @config.crossover_rate,
3018
+ selection_strategy: @config.use_pareto_selection ? 'pareto' : 'tournament'
3019
+ },
3020
+ best_score_name: 'fitness_score',
3021
+ best_score_value: evolution_result[:best_fitness].overall_score,
3022
+ metadata: {
3023
+ optimizer: 'GEPA',
3024
+ reflection_lm: @config.reflection_lm,
3025
+ implementation_status: 'Phase 2 - Complete Implementation',
3026
+ optimization_run_id: optimization_run_id,
3027
+ reflection_insights: {
3028
+ diagnosis: reflection_result.diagnosis,
3029
+ improvements: reflection_result.improvements,
3030
+ confidence: reflection_result.confidence,
3031
+ suggested_mutations: reflection_result.suggested_mutations
3032
+ },
3033
+ trace_analysis: {
3034
+ total_traces: execution_traces.size,
3035
+ llm_traces: execution_traces.count(&:llm_trace?),
3036
+ module_traces: execution_traces.count(&:module_trace?),
3037
+ execution_timespan: calculate_execution_timespan(execution_traces)
3038
+ },
3039
+ component_versions: {
3040
+ genetic_engine: 'v2.0',
3041
+ fitness_evaluator: 'v2.0',
3042
+ reflection_engine: 'v2.0',
3043
+ mutation_engine: 'v2.0',
3044
+ crossover_engine: 'v2.0',
3045
+ pareto_selector: 'v2.0'
3046
+ }
3047
+ }
3048
+ )
3049
+
3050
+ rescue => e
3051
+ emit_event('gepa_optimization_error', {
3052
+ optimization_run_id: optimization_run_id,
3053
+ error: e.message,
3054
+ backtrace: e.backtrace&.take(5)
3055
+ })
3056
+
3057
+ # Return fallback result on optimization failure
3058
+ fallback_fitness = fitness_evaluator.evaluate_candidate(program, trainset)
616
3059
 
617
3060
  OptimizationResult.new(
618
3061
  optimized_program: program,
619
- scores: { gepa_score: 0.0 },
620
- history: {
621
- num_generations: @config.num_generations,
3062
+ scores: {
3063
+ fitness_score: fallback_fitness.overall_score,
3064
+ primary_score: fallback_fitness.primary_score,
3065
+ **fallback_fitness.secondary_scores
3066
+ },
3067
+ history: {
3068
+ num_generations: 0,
622
3069
  population_size: @config.population_size,
623
- phase: 'Phase 1 - Basic Structure'
3070
+ phase: 'Phase 2 - Error Recovery',
3071
+ error: e.message
624
3072
  },
625
- best_score_name: 'gepa_score',
626
- best_score_value: 0.0,
3073
+ best_score_name: 'fitness_score',
3074
+ best_score_value: fallback_fitness.overall_score,
627
3075
  metadata: {
628
3076
  optimizer: 'GEPA',
629
3077
  reflection_lm: @config.reflection_lm,
630
- implementation_status: 'Phase 1 - Infrastructure Complete'
3078
+ implementation_status: 'Phase 2 - Error Recovery',
3079
+ optimization_run_id: optimization_run_id,
3080
+ error_details: {
3081
+ message: e.message,
3082
+ class: e.class.name,
3083
+ recovery_strategy: 'fallback_to_original'
3084
+ }
631
3085
  }
632
3086
  )
633
3087
  end
634
3088
  end
3089
+
3090
+ # Create and configure fitness evaluator
3091
+ sig { returns(FitnessEvaluator) }
3092
+ def create_fitness_evaluator
3093
+ FitnessEvaluator.new(primary_metric: @metric, config: @config)
3094
+ end
3095
+
3096
+ # Create and configure genetic engine
3097
+ sig { params(fitness_evaluator: FitnessEvaluator).returns(GeneticEngine) }
3098
+ def create_genetic_engine(fitness_evaluator)
3099
+ GeneticEngine.new(config: @config, metric: @metric)
3100
+ end
3101
+
3102
+ # Create and configure reflection engine
3103
+ sig { returns(ReflectionEngine) }
3104
+ def create_reflection_engine
3105
+ ReflectionEngine.new(@config)
3106
+ end
3107
+
3108
+ # Create and configure mutation engine
3109
+ sig { returns(MutationEngine) }
3110
+ def create_mutation_engine
3111
+ MutationEngine.new(config: @config)
3112
+ end
3113
+
3114
+ # Create and configure crossover engine
3115
+ sig { returns(CrossoverEngine) }
3116
+ def create_crossover_engine
3117
+ CrossoverEngine.new(config: @config)
3118
+ end
3119
+
3120
+ # Create and configure pareto selector
3121
+ sig { params(fitness_evaluator: FitnessEvaluator).returns(ParetoSelector) }
3122
+ def create_pareto_selector(fitness_evaluator)
3123
+ ParetoSelector.new(evaluator: fitness_evaluator, config: @config)
3124
+ end
3125
+
3126
+ # Calculate execution timespan from traces
3127
+ sig { params(traces: T::Array[ExecutionTrace]).returns(Float) }
3128
+ def calculate_execution_timespan(traces)
3129
+ return 0.0 if traces.size < 2
3130
+
3131
+ timestamps = traces.map(&:timestamp).sort
3132
+ (timestamps.last - timestamps.first).to_f
3133
+ end
3134
+ end
3135
+
3136
+ # GEPA Feedback Metric Protocol
3137
+ # Defines interface for providing scores with optional textual feedback
3138
+ module GEPAFeedbackMetric
3139
+ extend T::Sig
3140
+ extend T::Helpers
3141
+
3142
+ interface!
3143
+
3144
+ # Evaluates prediction and provides score with optional feedback
3145
+ sig do
3146
+ abstract
3147
+ .params(
3148
+ example: DSPy::Example,
3149
+ prediction: DSPy::Prediction,
3150
+ trace: T.nilable(T::Array[GEPA::ExecutionTrace])
3151
+ )
3152
+ .returns(ScoreWithFeedback)
3153
+ end
3154
+ def call(example, prediction, trace = nil); end
3155
+ end
3156
+
3157
+ # Extended prediction result with score and feedback
3158
+ class ScoreWithFeedback < T::Struct
3159
+ extend T::Sig
3160
+
3161
+ const :score, Float
3162
+ const :feedback, T.nilable(String)
3163
+ const :prediction, DSPy::Prediction
3164
+
3165
+ sig { params(score: Float, prediction: DSPy::Prediction, feedback: T.nilable(String)).void }
3166
+ def initialize(score:, prediction:, feedback: nil)
3167
+ super
3168
+ end
3169
+ end
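# [Editor's note: illustrative sketch, not part of the package diff.] A
# minimal metric implementing the GEPAFeedbackMetric interface above. The
# :answer accessors are assumptions about the task's signature; the override
# sig mirrors the abstract one.
class ExactMatchWithFeedback
  extend T::Sig
  include DSPy::Teleprompt::GEPAFeedbackMetric

  sig do
    override.params(
      example: DSPy::Example,
      prediction: DSPy::Prediction,
      trace: T.nilable(T::Array[DSPy::Teleprompt::GEPA::ExecutionTrace])
    ).returns(DSPy::Teleprompt::ScoreWithFeedback)
  end
  def call(example, prediction, trace = nil)
    correct = example.expected_values[:answer] == prediction.answer
    DSPy::Teleprompt::ScoreWithFeedback.new(
      score: correct ? 1.0 : 0.0,
      prediction: prediction,
      feedback: correct ? nil : "Expected #{example.expected_values[:answer]}"
    )
  end
end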
3170
+
3171
+ # Module Evaluator - Evaluates DSPy modules with metrics and feedback
3172
+ class ModuleEvaluator
3173
+ extend T::Sig
3174
+
3175
+ sig do
3176
+ params(
3177
+ student: T.untyped, # DSPy::Module or similar callable
3178
+ metric: T.untyped,
3179
+ feedback_map: T::Hash[String, String],
3180
+ custom_instruction_proposer: T.nilable(T.untyped)
3181
+ ).void
3182
+ end
3183
+ def initialize(student:, metric:, feedback_map: {}, custom_instruction_proposer: nil)
3184
+ @student = student
3185
+ @metric = metric
3186
+ @feedback_map = feedback_map
3187
+ @custom_instruction_proposer = custom_instruction_proposer
3188
+ @trace_collector = GEPA::TraceCollector.new
3189
+ end
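# [Editor's note: illustrative sketch, not part of the package diff.] Wiring
# up an evaluator; MySignature, the metric lambda, and trainset are
# placeholders, and running a batch requires a configured LM.
evaluator = DSPy::Teleprompt::ModuleEvaluator.new(
  student: DSPy::Predict.new(MySignature),
  metric: ->(example, prediction) { example.expected_values[:answer] == prediction.answer ? 1.0 : 0.0 }
)
scores = evaluator.evaluate_batch(trainset, "Answer concisely.", capture_traces: false)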
3190
+
3191
+ # Build program with candidate instruction
3192
+ sig { params(candidate_instruction: String).returns(T.untyped) }
3193
+ def build_program(candidate_instruction)
3194
+ # For DSPy::Module compatibility, we'll need to create a new instance
3195
+ # with modified signature description
3196
+ if @student.respond_to?(:signature_class) && @student.signature_class.respond_to?(:description=)
3197
+ modified_student = @student.class.new
3198
+ modified_student.signature_class.description = candidate_instruction
3199
+ modified_student
3200
+ else
3201
+ # Fallback: return student as-is for non-standard modules
3202
+ @student
3203
+ end
3204
+ end
3205
+
3206
+ # Evaluate program on batch with trace capture
3207
+ sig do
3208
+ params(
3209
+ batch: T::Array[DSPy::Example],
3210
+ candidate_instruction: String,
3211
+ capture_traces: T::Boolean
3212
+ )
3213
+ .returns(T::Array[T.any(Float, ScoreWithFeedback)])
3214
+ end
3215
+ def evaluate_batch(batch, candidate_instruction, capture_traces: true)
3216
+ program = build_program(candidate_instruction)
3217
+ results = []
3218
+
3219
+ batch.each do |example|
3220
+ begin
3221
+ # Execute program on example
3222
+ prediction = if program.respond_to?(:call)
3223
+ program.call(**example.input_values)
3224
+ elsif program.respond_to?(:forward)
3225
+ program.forward(**example.input_values)
3226
+ else
3227
+ raise "Program must respond to :call or :forward"
3228
+ end
3229
+
3230
+ # Get collected traces (if trace collection is enabled)
3231
+ # Note: TraceCollector automatically collects via event subscriptions
3232
+ traces = capture_traces ? @trace_collector.traces : []
3233
+
3234
+ # Evaluate with metric
3235
+ # Try with traces first (for GEPAFeedbackMetric), fallback to standard metric
3236
+ begin
3237
+ # Check if metric can accept 3 parameters (example, prediction, traces)
3238
+ if @metric.respond_to?(:arity) && (@metric.arity == 3 || @metric.arity < 0)
3239
+ score_result = @metric.call(example, prediction, traces)
3240
+ else
3241
+ score_result = @metric.call(example, prediction)
3242
+ end
3243
+ rescue ArgumentError => arg_error
3244
+ # If 3-arg call fails, try 2-arg call
3245
+ if arg_error.message.include?('wrong number of arguments')
3246
+ score_result = @metric.call(example, prediction)
3247
+ else
3248
+ raise arg_error
3249
+ end
3250
+ end
3251
+
3252
+ # Ensure we always have a ScoreWithFeedback object
3253
+ if score_result.is_a?(ScoreWithFeedback)
3254
+ results << score_result
3255
+ else
3256
+ # Wrap plain float scores in ScoreWithFeedback
3257
+ results << ScoreWithFeedback.new(
3258
+ score: score_result.to_f,
3259
+ prediction: prediction,
3260
+ feedback: nil
3261
+ )
3262
+ end
3263
+
3264
+ rescue => e
3265
+ DSPy.logger.error("Evaluation error: #{e.message}")
3266
+ # Return zero score on failure
3267
+ results << 0.0
3268
+ end
3269
+ end
3270
+
3271
+ results
3272
+ end
3273
+
3274
+ # Create reflective dataset from failed predictions
3275
+ sig do
3276
+ params(
3277
+ examples: T::Array[DSPy::Example],
3278
+ predictions: T::Array[DSPy::Prediction],
3279
+ scores: T::Array[T.any(Float, ScoreWithFeedback)],
3280
+ threshold: Float
3281
+ )
3282
+ .returns(T::Array[T::Hash[String, T.untyped]])
3283
+ end
3284
+ def make_reflective_dataset(examples, predictions, scores, threshold: 0.5)
3285
+ reflective_data = []
3286
+
3287
+ examples.zip(predictions, scores).each do |example, prediction, score|
3288
+ # Extract score value
3289
+ score_value = score.is_a?(ScoreWithFeedback) ? score.score : score
3290
+
3291
+ # Include failed predictions (below threshold)
3292
+ next if score_value >= threshold
3293
+
3294
+ # Extract feedback if available
3295
+ feedback = if score.is_a?(ScoreWithFeedback) && score.feedback
3296
+ score.feedback
3297
+ else
3298
+ "Low performance (score: #{score_value.round(2)})"
3299
+ end
3300
+
3301
+ reflective_data << {
3302
+ 'input' => example.input_values,
3303
+ 'expected' => example.expected_values,
3304
+ 'prediction' => extract_prediction_values(prediction),
3305
+ 'score' => score_value,
3306
+ 'feedback' => feedback
3307
+ }
3308
+ end
3309
+
3310
+ reflective_data
3311
+ end
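# [Editor's note: illustrative sketch, not part of the package diff.] Each
# entry of the reflective dataset is a plain Hash; a failing example scored
# 0.0 with no metric feedback comes back roughly as:
#
#   {
#     'input'      => { question: "Capital of France?" },
#     'expected'   => { answer: "Paris" },
#     'prediction' => { "answer" => "Lyon" },
#     'score'      => 0.0,
#     'feedback'   => "Low performance (score: 0.0)"
#   }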
3312
+
3313
+ # Propose new instruction texts based on reflective dataset
3314
+ sig do
3315
+ params(
3316
+ current_instruction: String,
3317
+ reflective_dataset: T::Array[T::Hash[String, T.untyped]],
3318
+ components_to_update: T::Array[String]
3319
+ )
3320
+ .returns(T::Array[String])
3321
+ end
3322
+ def propose_new_texts(current_instruction, reflective_dataset, components_to_update = ['instruction'])
3323
+ if @custom_instruction_proposer
3324
+ # Use custom proposer if provided
3325
+ proposed = @custom_instruction_proposer.call(current_instruction, reflective_dataset)
3326
+ [proposed].compact
3327
+ else
3328
+ # Use built-in proposal logic
3329
+ analyze_failures_and_propose(current_instruction, reflective_dataset)
3330
+ end
3331
+ end
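# [Editor's note: worked example, not part of the package diff.] With no
# custom proposer and three failing entries, one of which mentions "unclear",
# the built-in logic (analyze_failures_and_propose, defined below) yields
# exactly two proposals:
#
#   evaluator.propose_new_texts(
#     "Answer the question",
#     [{ 'feedback' => 'unclear response' }, { 'feedback' => 'wrong' }, { 'feedback' => 'wrong' }]
#   )
#   # => ["Answer the question Please provide step-by-step reasoning.",
#   #     "Answer the question Be specific and clear in your response."]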
3332
+
3333
+ private
3334
+
3335
+ # Extract prediction values for reflective analysis
3336
+ sig { params(prediction: DSPy::Prediction).returns(T::Hash[String, T.untyped]) }
3337
+ def extract_prediction_values(prediction)
3338
+ # DSPy::Prediction implements to_h which returns the underlying struct's data
3339
+ prediction.to_h.transform_keys(&:to_s)
3340
+ end
3341
+
3342
+ # Analyze failures and propose improvements
3343
+ sig do
3344
+ params(
3345
+ current_instruction: String,
3346
+ reflective_dataset: T::Array[T::Hash[String, T.untyped]]
3347
+ )
3348
+ .returns(T::Array[String])
3349
+ end
3350
+ def analyze_failures_and_propose(current_instruction, reflective_dataset)
3351
+ return [current_instruction] if reflective_dataset.empty?
3352
+
3353
+ # Extract common failure patterns
3354
+ feedback_texts = reflective_dataset.map { |data| data['feedback'] }.compact
3355
+
3356
+ # Simple heuristic-based proposals
3357
+ proposals = []
3358
+
3359
+ # If many failures, suggest more detailed instruction
3360
+ if reflective_dataset.size >= 3
3361
+ proposals << "#{current_instruction} Please provide step-by-step reasoning."
3362
+ end
3363
+
3364
+ # If feedback mentions specific issues, address them
3365
+ if feedback_texts.any? { |fb| fb.include?('unclear') || fb.include?('ambiguous') }
3366
+ proposals << "#{current_instruction} Be specific and clear in your response."
3367
+ end
3368
+
3369
+ if feedback_texts.any? { |fb| fb.include?('incomplete') || fb.include?('missing') }
3370
+ proposals << "#{current_instruction} Ensure your answer is complete and addresses all aspects."
3371
+ end
3372
+
3373
+ # Always include at least one proposal
3374
+ proposals << "#{current_instruction.strip}. Think carefully before responding." if proposals.empty?
3375
+
3376
+ proposals.uniq.take(3) # Return up to 3 proposals
3377
+ end
635
3378
  end
636
3379
  end
637
3380
  end