dspy 0.24.2 → 0.25.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +3 -1
- data/lib/dspy/chain_of_thought.rb +4 -2
- data/lib/dspy/context.rb +42 -11
- data/lib/dspy/lm/adapters/openai/schema_converter.rb +63 -3
- data/lib/dspy/lm/retry_handler.rb +7 -3
- data/lib/dspy/lm.rb +16 -13
- data/lib/dspy/observability/async_span_processor.rb +274 -0
- data/lib/dspy/observability.rb +20 -11
- data/lib/dspy/predict.rb +2 -1
- data/lib/dspy/teleprompt/gepa.rb +329 -772
- data/lib/dspy/utils/serialization.rb +35 -0
- data/lib/dspy/version.rb +1 -1
- data/lib/dspy.rb +30 -25
- metadata +6 -4
data/lib/dspy/teleprompt/gepa.rb
CHANGED
@@ -24,7 +24,7 @@ module DSPy
|
|
24
24
|
end
|
25
25
|
end
|
26
26
|
|
27
|
-
# Enum for crossover operation types
|
27
|
+
# Enum for crossover operation types
|
28
28
|
class CrossoverType < T::Enum
|
29
29
|
enums do
|
30
30
|
Uniform = new
|
@@ -59,6 +59,7 @@ module DSPy
|
|
59
59
|
metadata: T.nilable(MetadataHash)
|
60
60
|
).void
|
61
61
|
end
|
62
|
+
|
62
63
|
def initialize(trace_id:, event_name:, timestamp:, span_id: nil, attributes: {}, metadata: nil)
|
63
64
|
# Freeze nested structures for true immutability
|
64
65
|
frozen_attributes = attributes.freeze
|
@@ -236,7 +237,7 @@ module DSPy
|
|
236
237
|
def summary
|
237
238
|
confidence_pct = (confidence * 100).round
|
238
239
|
mutation_list = suggested_mutations.map(&:to_s).join(', ')
|
239
|
-
|
240
|
+
|
240
241
|
"#{diagnosis.split('.').first}. " \
|
241
242
|
"Confidence: #{confidence_pct}%. " \
|
242
243
|
"#{improvements.size} improvements suggested. " \
|
@@ -289,7 +290,7 @@ module DSPy
|
|
289
290
|
def collect_trace(event_name, event_data)
|
290
291
|
@traces_mutex.synchronize do
|
291
292
|
trace_id = event_data['trace_id'] || event_data[:trace_id] || generate_trace_id
|
292
|
-
|
293
|
+
|
293
294
|
# Avoid duplicates
|
294
295
|
return if @traces.any? { |t| t.trace_id == trace_id }
|
295
296
|
|
@@ -350,7 +351,7 @@ module DSPy
|
|
350
351
|
collect_trace(name, attrs)
|
351
352
|
end
|
352
353
|
|
353
|
-
# Subscribe to module events
|
354
|
+
# Subscribe to module events
|
354
355
|
self.class.add_subscription('*.reasoning_complete') do |name, attrs|
|
355
356
|
collect_trace(name, attrs)
|
356
357
|
end
|
@@ -394,7 +395,7 @@ module DSPy
|
|
394
395
|
reasoning: 'Cannot provide reflection without execution traces',
|
395
396
|
suggested_mutations: [],
|
396
397
|
metadata: {
|
397
|
-
reflection_model: @config.reflection_lm,
|
398
|
+
reflection_model: @config.reflection_lm&.model,
|
398
399
|
analysis_timestamp: Time.now,
|
399
400
|
trace_count: 0
|
400
401
|
}
|
@@ -404,7 +405,7 @@ module DSPy
|
|
404
405
|
patterns = analyze_execution_patterns(traces)
|
405
406
|
improvements = generate_improvement_suggestions(patterns)
|
406
407
|
mutations = suggest_mutations(patterns)
|
407
|
-
|
408
|
+
|
408
409
|
# For Phase 1, we generate a simple rule-based analysis
|
409
410
|
# Future phases will use LLM-based reflection
|
410
411
|
diagnosis = generate_diagnosis(patterns)
|
@@ -419,7 +420,7 @@ module DSPy
|
|
419
420
|
reasoning: reasoning,
|
420
421
|
suggested_mutations: mutations,
|
421
422
|
metadata: {
|
422
|
-
reflection_model: @config.reflection_lm,
|
423
|
+
reflection_model: @config.reflection_lm&.model,
|
423
424
|
analysis_timestamp: Time.now,
|
424
425
|
trace_count: traces.size,
|
425
426
|
token_usage: 0 # Phase 1 doesn't use actual LLM reflection
|
@@ -485,326 +486,17 @@ module DSPy
|
|
485
486
|
mutations << :combine if llm_count > 2
|
486
487
|
mutations << :rewrite if llm_count == 1
|
487
488
|
mutations << :rephrase if mutations.empty?
|
488
|
-
|
489
|
-
mutations.uniq
|
490
|
-
end
|
491
|
-
|
492
|
-
private
|
493
|
-
|
494
|
-
# Generate unique reflection ID
|
495
|
-
sig { returns(String) }
|
496
|
-
def generate_reflection_id
|
497
|
-
"reflection-#{SecureRandom.hex(4)}"
|
498
|
-
end
|
499
|
-
|
500
|
-
# Generate diagnosis text
|
501
|
-
sig { params(patterns: T::Hash[Symbol, T.untyped]).returns(String) }
|
502
|
-
def generate_diagnosis(patterns)
|
503
|
-
if patterns[:total_tokens] > 400
|
504
|
-
'High token usage indicates potential inefficiency in prompt design'
|
505
|
-
elsif patterns[:llm_traces_count] == 0
|
506
|
-
'No LLM interactions found - execution may not be working as expected'
|
507
|
-
elsif patterns[:avg_response_length] < 10
|
508
|
-
'Responses are unusually brief which may indicate prompt clarity issues'
|
509
|
-
else
|
510
|
-
'Execution patterns appear normal with room for optimization'
|
511
|
-
end
|
512
|
-
end
|
513
|
-
|
514
|
-
# Generate reasoning text
|
515
|
-
sig { params(patterns: T::Hash[Symbol, T.untyped], traces: T::Array[ExecutionTrace]).returns(String) }
|
516
|
-
def generate_reasoning(patterns, traces)
|
517
|
-
reasoning_parts = []
|
518
|
-
|
519
|
-
reasoning_parts << "Analyzed #{traces.size} execution traces"
|
520
|
-
reasoning_parts << "#{patterns[:llm_traces_count]} LLM interactions"
|
521
|
-
reasoning_parts << "#{patterns[:module_traces_count]} module operations"
|
522
|
-
reasoning_parts << "Total token usage: #{patterns[:total_tokens]}"
|
523
|
-
|
524
|
-
reasoning_parts.join('. ') + '.'
|
525
|
-
end
|
526
489
|
|
527
|
-
|
528
|
-
sig { params(patterns: T::Hash[Symbol, T.untyped]).returns(Float) }
|
529
|
-
def calculate_confidence(patterns)
|
530
|
-
base_confidence = 0.7
|
531
|
-
|
532
|
-
# More traces = higher confidence
|
533
|
-
trace_bonus = [patterns[:llm_traces_count] + patterns[:module_traces_count], 10].min * 0.02
|
534
|
-
|
535
|
-
# Reasonable token usage = higher confidence
|
536
|
-
token_penalty = patterns[:total_tokens] > 1000 ? -0.1 : 0.0
|
537
|
-
|
538
|
-
[(base_confidence + trace_bonus + token_penalty), 1.0].min
|
539
|
-
end
|
540
|
-
|
541
|
-
# Calculate average response length from LLM traces
|
542
|
-
sig { params(llm_traces: T::Array[ExecutionTrace]).returns(Integer) }
|
543
|
-
def calculate_avg_response_length(llm_traces)
|
544
|
-
return 0 if llm_traces.empty?
|
545
|
-
|
546
|
-
total_length = llm_traces.sum do |trace|
|
547
|
-
response = trace.response_text
|
548
|
-
response ? response.length : 0
|
549
|
-
end
|
550
|
-
|
551
|
-
total_length / llm_traces.size
|
490
|
+
mutations.uniq
|
552
491
|
end
|
553
492
|
|
554
|
-
# Calculate timespan of traces
|
555
|
-
sig { params(traces: T::Array[ExecutionTrace]).returns(Float) }
|
556
|
-
def calculate_timespan(traces)
|
557
|
-
return 0.0 if traces.size < 2
|
558
|
-
|
559
|
-
timestamps = traces.map(&:timestamp).sort
|
560
|
-
(timestamps.last - timestamps.first).to_f
|
561
|
-
end
|
562
|
-
|
563
|
-
# LLM-based reflection methods for Phase 2
|
564
|
-
|
565
493
|
public
|
566
|
-
|
567
|
-
# Perform LLM-based reflection on execution traces using DSPy::Predict
|
568
|
-
sig { params(traces: T::Array[ExecutionTrace]).returns(ReflectionResult) }
|
569
|
-
def reflect_with_llm(traces)
|
570
|
-
return reflect_on_traces(traces) if traces.empty?
|
571
|
-
|
572
|
-
begin
|
573
|
-
# Use DSPy::Predict for analysis instead of raw prompts
|
574
|
-
prediction = analyze_traces_with_dspy(traces)
|
575
|
-
convert_prediction_to_reflection_result(prediction, traces)
|
576
|
-
rescue => e
|
577
|
-
# Fallback to rule-based analysis on LLM failure
|
578
|
-
fallback_result = reflect_on_traces(traces)
|
579
|
-
fallback_result.class.new(
|
580
|
-
trace_id: fallback_result.trace_id,
|
581
|
-
diagnosis: "LLM reflection failed (#{e.message}), using fallback analysis: #{fallback_result.diagnosis}",
|
582
|
-
improvements: fallback_result.improvements,
|
583
|
-
confidence: [fallback_result.confidence * 0.5, 0.5].min,
|
584
|
-
reasoning: "Fallback to rule-based analysis after LLM error: #{fallback_result.reasoning}",
|
585
|
-
suggested_mutations: fallback_result.suggested_mutations,
|
586
|
-
metadata: fallback_result.metadata.merge(
|
587
|
-
llm_error: e.message,
|
588
|
-
fallback_used: true
|
589
|
-
)
|
590
|
-
)
|
591
|
-
end
|
592
|
-
end
|
593
|
-
|
594
|
-
# Generate structured reflection prompt for LLM (public API)
|
595
|
-
sig { params(traces: T::Array[ExecutionTrace]).returns(String) }
|
596
|
-
def generate_reflection_prompt(traces)
|
597
|
-
if traces.empty?
|
598
|
-
return <<~PROMPT
|
599
|
-
You are analyzing execution traces for a genetic algorithm-based prompt optimization system called GEPA.
|
600
|
-
|
601
|
-
**Task**: Analyze execution patterns and provide optimization recommendations.
|
602
|
-
|
603
|
-
**Context**: No execution traces available.
|
604
|
-
|
605
|
-
Please provide your analysis in the following JSON format:
|
606
|
-
{
|
607
|
-
"diagnosis": "Brief description of what you observed",
|
608
|
-
"improvements": ["List of actionable improvement suggestions"],
|
609
|
-
"confidence": 0.0,
|
610
|
-
"reasoning": "Your reasoning process",
|
611
|
-
"suggested_mutations": ["expand", "rewrite", "simplify", "combine", "rephrase"],
|
612
|
-
"insights": {
|
613
|
-
"pattern_detected": "no_data",
|
614
|
-
"optimization_opportunity": "data_collection"
|
615
|
-
}
|
616
|
-
}
|
617
|
-
PROMPT
|
618
|
-
end
|
619
|
-
|
620
|
-
summary = trace_summary_for_reflection(traces)
|
621
|
-
insights = extract_optimization_insights(traces)
|
622
|
-
|
623
|
-
<<~PROMPT
|
624
|
-
You are analyzing execution traces for a genetic algorithm-based prompt optimization system called GEPA.
|
625
|
-
|
626
|
-
**Task**: Analyze execution patterns and provide optimization recommendations for prompt evolution.
|
627
|
-
|
628
|
-
**Execution Summary**:
|
629
|
-
#{summary}
|
630
|
-
|
631
|
-
**Optimization Context**:
|
632
|
-
- This is part of a genetic algorithm for prompt optimization
|
633
|
-
- Available mutation types: rewrite, expand, simplify, combine, rephrase
|
634
|
-
- Goal is to improve prompt effectiveness through iterative evolution
|
635
|
-
- Focus on actionable insights that can guide mutation and crossover operations
|
636
|
-
|
637
|
-
**Key Optimization Insights**:
|
638
|
-
#{insights.map { |k, v| "- #{k}: #{v.is_a?(Hash) ? v.values.join(', ') : v}" }.join("\n")}
|
639
|
-
|
640
|
-
**Sample Traces**:
|
641
|
-
#{format_traces_for_prompt(traces.take(3))}
|
642
|
-
|
643
|
-
Please analyze these execution patterns and provide optimization recommendations in the following JSON format:
|
644
|
-
{
|
645
|
-
"diagnosis": "Brief description of execution patterns and issues identified",
|
646
|
-
"improvements": ["List of 2-4 specific, actionable improvement suggestions"],
|
647
|
-
"confidence": 0.85,
|
648
|
-
"reasoning": "Your detailed reasoning process for the analysis",
|
649
|
-
"suggested_mutations": ["List of 2-3 mutation types that would be most beneficial"],
|
650
|
-
"insights": {
|
651
|
-
"pattern_detected": "primary_pattern_identified",
|
652
|
-
"optimization_opportunity": "key_area_for_improvement"
|
653
|
-
}
|
654
|
-
}
|
655
|
-
|
656
|
-
Focus on practical recommendations that will improve prompt performance through genetic algorithm evolution.
|
657
|
-
PROMPT
|
658
|
-
end
|
659
|
-
|
660
|
-
# Parse LLM reflection response into ReflectionResult (public API)
|
661
|
-
sig { params(response_text: String, original_traces: T::Array[ExecutionTrace]).returns(ReflectionResult) }
|
662
|
-
def parse_llm_reflection(response_text, original_traces)
|
663
|
-
reflection_id = generate_reflection_id
|
664
|
-
|
665
|
-
begin
|
666
|
-
parsed = JSON.parse(response_text)
|
667
|
-
|
668
|
-
# Extract and validate components
|
669
|
-
diagnosis = parsed['diagnosis'] || 'LLM reflection analysis'
|
670
|
-
improvements = Array(parsed['improvements']).select { |i| i.is_a?(String) && !i.strip.empty? }
|
671
|
-
confidence = [parsed['confidence'].to_f, 1.0].min
|
672
|
-
reasoning = parsed['reasoning'] || 'LLM-based analysis of execution traces'
|
673
|
-
|
674
|
-
# Validate and sanitize mutation suggestions
|
675
|
-
raw_mutations = Array(parsed['suggested_mutations'])
|
676
|
-
valid_mutations = raw_mutations.filter_map do |mut|
|
677
|
-
mutation_symbol = mut.to_s.downcase.to_sym
|
678
|
-
if [:rewrite, :expand, :simplify, :combine, :rephrase].include?(mutation_symbol)
|
679
|
-
mutation_symbol
|
680
|
-
end
|
681
|
-
end.uniq
|
682
|
-
|
683
|
-
# Ensure we have at least one valid mutation suggestion
|
684
|
-
valid_mutations = [:rewrite] if valid_mutations.empty?
|
685
|
-
|
686
|
-
ReflectionResult.new(
|
687
|
-
trace_id: reflection_id,
|
688
|
-
diagnosis: diagnosis,
|
689
|
-
improvements: improvements,
|
690
|
-
confidence: confidence,
|
691
|
-
reasoning: reasoning,
|
692
|
-
suggested_mutations: valid_mutations,
|
693
|
-
metadata: {
|
694
|
-
reflection_model: @config.reflection_lm,
|
695
|
-
analysis_timestamp: Time.now,
|
696
|
-
trace_count: original_traces.size,
|
697
|
-
token_usage: estimate_token_usage(response_text),
|
698
|
-
llm_based: true,
|
699
|
-
insights: parsed['insights'] || {}
|
700
|
-
}
|
701
|
-
)
|
702
|
-
|
703
|
-
rescue JSON::ParserError => e
|
704
|
-
# Handle malformed JSON response
|
705
|
-
ReflectionResult.new(
|
706
|
-
trace_id: reflection_id,
|
707
|
-
diagnosis: "LLM reflection JSON parsing error: #{e.message}",
|
708
|
-
improvements: ['Review prompt structure and LLM response format'],
|
709
|
-
confidence: 0.3,
|
710
|
-
reasoning: "Failed to parse LLM reflection response as valid JSON",
|
711
|
-
suggested_mutations: [:rewrite],
|
712
|
-
metadata: {
|
713
|
-
reflection_model: @config.reflection_lm,
|
714
|
-
analysis_timestamp: Time.now,
|
715
|
-
trace_count: original_traces.size,
|
716
|
-
token_usage: 0,
|
717
|
-
parsing_error: e.message,
|
718
|
-
raw_response: response_text.length > 500 ? "#{response_text[0..500]}..." : response_text
|
719
|
-
}
|
720
|
-
)
|
721
|
-
end
|
722
|
-
end
|
723
|
-
|
724
|
-
# Create comprehensive trace summary for reflection (public API)
|
725
|
-
sig { params(traces: T::Array[ExecutionTrace]).returns(String) }
|
726
|
-
def trace_summary_for_reflection(traces)
|
727
|
-
return "No execution traces available" if traces.empty?
|
728
|
-
|
729
|
-
llm_traces = traces.select(&:llm_trace?)
|
730
|
-
module_traces = traces.select(&:module_trace?)
|
731
|
-
|
732
|
-
total_tokens = llm_traces.sum(&:token_usage)
|
733
|
-
unique_models = llm_traces.map(&:model_name).compact.uniq
|
734
|
-
timespan = calculate_timespan(traces)
|
735
|
-
|
736
|
-
avg_response_length = if llm_traces.any?
|
737
|
-
total_length = llm_traces.sum { |t| t.response_text&.length || 0 }
|
738
|
-
total_length / llm_traces.size
|
739
|
-
else
|
740
|
-
0
|
741
|
-
end
|
742
|
-
|
743
|
-
<<~SUMMARY
|
744
|
-
Total traces: #{traces.size}
|
745
|
-
LLM interactions: #{llm_traces.size}
|
746
|
-
Module calls: #{module_traces.size}
|
747
|
-
Total tokens: #{total_tokens}
|
748
|
-
Models used: #{unique_models.join(', ')}
|
749
|
-
Average response length: #{avg_response_length} characters
|
750
|
-
Execution timespan: #{timespan.round(2)} seconds
|
751
|
-
SUMMARY
|
752
|
-
end
|
753
|
-
|
754
|
-
# Extract optimization insights from trace analysis (public API)
|
755
|
-
sig { params(traces: T::Array[ExecutionTrace]).returns(T::Hash[Symbol, T.untyped]) }
|
756
|
-
def extract_optimization_insights(traces)
|
757
|
-
llm_traces = traces.select(&:llm_trace?)
|
758
|
-
|
759
|
-
insights = {
|
760
|
-
token_efficiency: analyze_token_efficiency(llm_traces),
|
761
|
-
response_quality: analyze_response_quality(llm_traces),
|
762
|
-
model_consistency: analyze_model_consistency(llm_traces)
|
763
|
-
}
|
764
|
-
|
765
|
-
insights
|
766
|
-
end
|
767
|
-
|
768
|
-
# Reflection with optimization context (public API)
|
769
|
-
sig { params(traces: T::Array[ExecutionTrace], context: T::Hash[Symbol, T.untyped]).returns(ReflectionResult) }
|
770
|
-
def reflection_with_context(traces, context)
|
771
|
-
base_result = reflect_with_llm(traces)
|
772
|
-
|
773
|
-
# Incorporate context into reasoning
|
774
|
-
context_reasoning = "Generation #{context[:generation] || 'unknown'} analysis. "
|
775
|
-
context_reasoning += "Population size: #{context[:population_size] || 'unknown'}. "
|
776
|
-
|
777
|
-
if context[:current_best_score]
|
778
|
-
context_reasoning += "Current best score: #{context[:current_best_score]}. "
|
779
|
-
end
|
780
|
-
|
781
|
-
# Adjust mutation suggestions based on history
|
782
|
-
adjusted_mutations = adjust_mutations_for_history(
|
783
|
-
base_result.suggested_mutations,
|
784
|
-
context[:mutation_history] || [],
|
785
|
-
context[:recent_performance_trend]
|
786
|
-
)
|
787
|
-
|
788
|
-
ReflectionResult.new(
|
789
|
-
trace_id: base_result.trace_id,
|
790
|
-
diagnosis: base_result.diagnosis,
|
791
|
-
improvements: base_result.improvements,
|
792
|
-
confidence: base_result.confidence,
|
793
|
-
reasoning: context_reasoning + base_result.reasoning,
|
794
|
-
suggested_mutations: adjusted_mutations,
|
795
|
-
metadata: base_result.metadata.merge(optimization_context: context)
|
796
|
-
)
|
797
|
-
end
|
798
|
-
|
799
|
-
# LLM-based reflection methods for Phase 2
|
800
|
-
|
801
|
-
public
|
802
|
-
|
494
|
+
|
803
495
|
# Perform LLM-based reflection on execution traces using DSPy::Predict
|
804
496
|
sig { params(traces: T::Array[ExecutionTrace]).returns(ReflectionResult) }
|
805
497
|
def reflect_with_llm(traces)
|
806
498
|
return reflect_on_traces(traces) if traces.empty?
|
807
|
-
|
499
|
+
|
808
500
|
begin
|
809
501
|
# Use DSPy::Predict for analysis instead of raw prompts
|
810
502
|
prediction = analyze_traces_with_dspy(traces)
|
@@ -826,7 +518,7 @@ module DSPy
|
|
826
518
|
)
|
827
519
|
end
|
828
520
|
end
|
829
|
-
|
521
|
+
|
830
522
|
# Generate structured reflection prompt for LLM (public API)
|
831
523
|
sig { params(traces: T::Array[ExecutionTrace]).returns(String) }
|
832
524
|
def generate_reflection_prompt(traces)
|
@@ -852,10 +544,10 @@ module DSPy
|
|
852
544
|
}
|
853
545
|
PROMPT
|
854
546
|
end
|
855
|
-
|
547
|
+
|
856
548
|
summary = trace_summary_for_reflection(traces)
|
857
549
|
insights = extract_optimization_insights(traces)
|
858
|
-
|
550
|
+
|
859
551
|
<<~PROMPT
|
860
552
|
You are analyzing execution traces for a genetic algorithm-based prompt optimization system called GEPA.
|
861
553
|
|
@@ -892,21 +584,21 @@ module DSPy
|
|
892
584
|
Focus on practical recommendations that will improve prompt performance through genetic algorithm evolution.
|
893
585
|
PROMPT
|
894
586
|
end
|
895
|
-
|
587
|
+
|
896
588
|
# Parse LLM reflection response into ReflectionResult (public API)
|
897
589
|
sig { params(response_text: String, original_traces: T::Array[ExecutionTrace]).returns(ReflectionResult) }
|
898
590
|
def parse_llm_reflection(response_text, original_traces)
|
899
591
|
reflection_id = generate_reflection_id
|
900
|
-
|
592
|
+
|
901
593
|
begin
|
902
594
|
parsed = JSON.parse(response_text)
|
903
|
-
|
595
|
+
|
904
596
|
# Extract and validate components
|
905
597
|
diagnosis = parsed['diagnosis'] || 'LLM reflection analysis'
|
906
598
|
improvements = Array(parsed['improvements']).select { |i| i.is_a?(String) && !i.strip.empty? }
|
907
599
|
confidence = [parsed['confidence'].to_f, 1.0].min
|
908
600
|
reasoning = parsed['reasoning'] || 'LLM-based analysis of execution traces'
|
909
|
-
|
601
|
+
|
910
602
|
# Validate and sanitize mutation suggestions
|
911
603
|
raw_mutations = Array(parsed['suggested_mutations'])
|
912
604
|
valid_mutations = raw_mutations.filter_map do |mut|
|
@@ -915,10 +607,10 @@ module DSPy
|
|
915
607
|
mutation_symbol
|
916
608
|
end
|
917
609
|
end.uniq
|
918
|
-
|
610
|
+
|
919
611
|
# Ensure we have at least one valid mutation suggestion
|
920
612
|
valid_mutations = [:rewrite] if valid_mutations.empty?
|
921
|
-
|
613
|
+
|
922
614
|
ReflectionResult.new(
|
923
615
|
trace_id: reflection_id,
|
924
616
|
diagnosis: diagnosis,
|
@@ -927,7 +619,7 @@ module DSPy
|
|
927
619
|
reasoning: reasoning,
|
928
620
|
suggested_mutations: valid_mutations,
|
929
621
|
metadata: {
|
930
|
-
reflection_model: @config.reflection_lm,
|
622
|
+
reflection_model: @config.reflection_lm&.model,
|
931
623
|
analysis_timestamp: Time.now,
|
932
624
|
trace_count: original_traces.size,
|
933
625
|
token_usage: estimate_token_usage(response_text),
|
@@ -935,7 +627,7 @@ module DSPy
|
|
935
627
|
insights: parsed['insights'] || {}
|
936
628
|
}
|
937
629
|
)
|
938
|
-
|
630
|
+
|
939
631
|
rescue JSON::ParserError => e
|
940
632
|
# Handle malformed JSON response
|
941
633
|
ReflectionResult.new(
|
@@ -946,7 +638,7 @@ module DSPy
|
|
946
638
|
reasoning: "Failed to parse LLM reflection response as valid JSON",
|
947
639
|
suggested_mutations: [:rewrite],
|
948
640
|
metadata: {
|
949
|
-
reflection_model: @config.reflection_lm,
|
641
|
+
reflection_model: @config.reflection_lm&.model,
|
950
642
|
analysis_timestamp: Time.now,
|
951
643
|
trace_count: original_traces.size,
|
952
644
|
token_usage: 0,
|
@@ -956,26 +648,26 @@ module DSPy
|
|
956
648
|
)
|
957
649
|
end
|
958
650
|
end
|
959
|
-
|
651
|
+
|
960
652
|
# Create comprehensive trace summary for reflection (public API)
|
961
653
|
sig { params(traces: T::Array[ExecutionTrace]).returns(String) }
|
962
654
|
def trace_summary_for_reflection(traces)
|
963
655
|
return "No execution traces available" if traces.empty?
|
964
|
-
|
656
|
+
|
965
657
|
llm_traces = traces.select(&:llm_trace?)
|
966
658
|
module_traces = traces.select(&:module_trace?)
|
967
|
-
|
659
|
+
|
968
660
|
total_tokens = llm_traces.sum(&:token_usage)
|
969
661
|
unique_models = llm_traces.map(&:model_name).compact.uniq
|
970
662
|
timespan = calculate_timespan(traces)
|
971
|
-
|
663
|
+
|
972
664
|
avg_response_length = if llm_traces.any?
|
973
665
|
total_length = llm_traces.sum { |t| t.response_text&.length || 0 }
|
974
666
|
total_length / llm_traces.size
|
975
667
|
else
|
976
668
|
0
|
977
669
|
end
|
978
|
-
|
670
|
+
|
979
671
|
<<~SUMMARY
|
980
672
|
Total traces: #{traces.size}
|
981
673
|
LLM interactions: #{llm_traces.size}
|
@@ -986,41 +678,41 @@ module DSPy
|
|
986
678
|
Execution timespan: #{timespan.round(2)} seconds
|
987
679
|
SUMMARY
|
988
680
|
end
|
989
|
-
|
681
|
+
|
990
682
|
# Extract optimization insights from trace analysis (public API)
|
991
683
|
sig { params(traces: T::Array[ExecutionTrace]).returns(T::Hash[Symbol, T.untyped]) }
|
992
684
|
def extract_optimization_insights(traces)
|
993
685
|
llm_traces = traces.select(&:llm_trace?)
|
994
|
-
|
686
|
+
|
995
687
|
insights = {
|
996
688
|
token_efficiency: analyze_token_efficiency(llm_traces),
|
997
689
|
response_quality: analyze_response_quality(llm_traces),
|
998
690
|
model_consistency: analyze_model_consistency(llm_traces)
|
999
691
|
}
|
1000
|
-
|
692
|
+
|
1001
693
|
insights
|
1002
694
|
end
|
1003
|
-
|
695
|
+
|
1004
696
|
# Reflection with optimization context (public API)
|
1005
697
|
sig { params(traces: T::Array[ExecutionTrace], context: T::Hash[Symbol, T.untyped]).returns(ReflectionResult) }
|
1006
698
|
def reflection_with_context(traces, context)
|
1007
699
|
base_result = reflect_with_llm(traces)
|
1008
|
-
|
700
|
+
|
1009
701
|
# Incorporate context into reasoning
|
1010
702
|
context_reasoning = "Generation #{context[:generation] || 'unknown'} analysis. "
|
1011
703
|
context_reasoning += "Population size: #{context[:population_size] || 'unknown'}. "
|
1012
|
-
|
704
|
+
|
1013
705
|
if context[:current_best_score]
|
1014
706
|
context_reasoning += "Current best score: #{context[:current_best_score]}. "
|
1015
707
|
end
|
1016
|
-
|
708
|
+
|
1017
709
|
# Adjust mutation suggestions based on history
|
1018
710
|
adjusted_mutations = adjust_mutations_for_history(
|
1019
711
|
base_result.suggested_mutations,
|
1020
712
|
context[:mutation_history] || [],
|
1021
713
|
context[:recent_performance_trend]
|
1022
714
|
)
|
1023
|
-
|
715
|
+
|
1024
716
|
ReflectionResult.new(
|
1025
717
|
trace_id: base_result.trace_id,
|
1026
718
|
diagnosis: base_result.diagnosis,
|
@@ -1031,22 +723,22 @@ module DSPy
|
|
1031
723
|
metadata: base_result.metadata.merge(optimization_context: context)
|
1032
724
|
)
|
1033
725
|
end
|
1034
|
-
|
726
|
+
|
1035
727
|
public
|
1036
|
-
|
728
|
+
|
1037
729
|
# Create signature for trace reflection analysis (public API)
|
1038
730
|
sig { returns(T.class_of(DSPy::Signature)) }
|
1039
731
|
def create_trace_reflection_signature
|
1040
732
|
@trace_reflection_signature ||= Class.new(DSPy::Signature) do
|
1041
733
|
description "Analyze execution traces from GEPA optimization system and provide actionable optimization insights"
|
1042
|
-
|
734
|
+
|
1043
735
|
input do
|
1044
736
|
const :execution_summary, String, description: "Summary of execution traces and performance patterns"
|
1045
737
|
const :optimization_context, String, description: "Context about the genetic algorithm optimization goals"
|
1046
|
-
const :key_insights, String, description: "Key insights extracted from trace analysis"
|
738
|
+
const :key_insights, String, description: "Key insights extracted from trace analysis"
|
1047
739
|
const :sample_traces, String, description: "Representative execution trace samples"
|
1048
740
|
end
|
1049
|
-
|
741
|
+
|
1050
742
|
output do
|
1051
743
|
const :diagnosis, String, description: "Brief description of execution patterns and issues identified"
|
1052
744
|
const :improvements, T::Array[String], description: "List of 2-4 specific actionable improvement suggestions"
|
@@ -1060,35 +752,40 @@ module DSPy
|
|
1060
752
|
end
|
1061
753
|
|
1062
754
|
# Perform LLM analysis using DSPy::Predict (public API)
|
1063
|
-
sig { params(traces: T::Array[ExecutionTrace]).returns(
|
755
|
+
sig { params(traces: T::Array[ExecutionTrace]).returns(T.untyped) }
|
1064
756
|
def analyze_traces_with_dspy(traces)
|
757
|
+
raise ArgumentError, "reflection_lm must be configured on GEPAConfig for LLM-based reflection" unless @config.reflection_lm
|
758
|
+
|
1065
759
|
predictor = DSPy::Predict.new(create_trace_reflection_signature)
|
1066
|
-
|
760
|
+
|
761
|
+
# Configure predictor to use reflection-specific LM
|
762
|
+
predictor.config.lm = @config.reflection_lm
|
763
|
+
|
1067
764
|
# Prepare input data
|
1068
765
|
summary = trace_summary_for_reflection(traces)
|
1069
766
|
insights = extract_optimization_insights(traces)
|
1070
767
|
insights_text = insights.map { |k, v| "- #{k}: #{v.is_a?(Hash) ? v.values.join(', ') : v}" }.join("\n")
|
1071
|
-
|
768
|
+
|
1072
769
|
# Get LLM analysis
|
1073
|
-
predictor.call(
|
770
|
+
T.unsafe(predictor.call(
|
1074
771
|
execution_summary: summary,
|
1075
772
|
optimization_context: "GEPA genetic algorithm for prompt optimization. Available mutations: rewrite, expand, simplify, combine, rephrase. Goal: improve prompt effectiveness through iterative evolution.",
|
1076
773
|
key_insights: insights_text,
|
1077
774
|
sample_traces: format_traces_for_prompt(traces.take(3))
|
1078
|
-
)
|
775
|
+
))
|
1079
776
|
end
|
1080
777
|
|
1081
778
|
# Convert DSPy prediction to ReflectionResult (public API)
|
1082
|
-
sig { params(prediction:
|
779
|
+
sig { params(prediction: T.untyped, original_traces: T::Array[ExecutionTrace]).returns(ReflectionResult) }
|
1083
780
|
def convert_prediction_to_reflection_result(prediction, original_traces)
|
1084
781
|
reflection_id = generate_reflection_id
|
1085
|
-
|
782
|
+
|
1086
783
|
# Extract and validate prediction results
|
1087
784
|
diagnosis = prediction.diagnosis || 'DSPy reflection analysis'
|
1088
785
|
improvements = Array(prediction.improvements).select { |i| i.is_a?(String) && !i.strip.empty? }
|
1089
786
|
confidence = [[prediction.confidence&.to_f || 0.0, 1.0].min, 0.0].max
|
1090
787
|
reasoning = prediction.reasoning || 'DSPy-based analysis of execution traces'
|
1091
|
-
|
788
|
+
|
1092
789
|
# Validate mutation suggestions
|
1093
790
|
valid_mutations = Array(prediction.suggested_mutations).filter_map do |mut|
|
1094
791
|
mutation_symbol = mut.to_s.downcase.to_sym
|
@@ -1096,10 +793,10 @@ module DSPy
|
|
1096
793
|
mutation_symbol
|
1097
794
|
end
|
1098
795
|
end.uniq
|
1099
|
-
|
796
|
+
|
1100
797
|
# Ensure we have at least one valid mutation suggestion
|
1101
798
|
valid_mutations = [:rewrite] if valid_mutations.empty?
|
1102
|
-
|
799
|
+
|
1103
800
|
ReflectionResult.new(
|
1104
801
|
trace_id: reflection_id,
|
1105
802
|
diagnosis: diagnosis,
|
@@ -1108,7 +805,7 @@ module DSPy
|
|
1108
805
|
reasoning: reasoning,
|
1109
806
|
suggested_mutations: valid_mutations,
|
1110
807
|
metadata: {
|
1111
|
-
reflection_model: @config.reflection_lm,
|
808
|
+
reflection_model: @config.reflection_lm&.model,
|
1112
809
|
analysis_timestamp: Time.now,
|
1113
810
|
trace_count: original_traces.size,
|
1114
811
|
token_usage: estimate_token_usage(prediction.to_s),
|
@@ -1121,9 +818,9 @@ module DSPy
|
|
1121
818
|
}
|
1122
819
|
)
|
1123
820
|
end
|
1124
|
-
|
821
|
+
|
1125
822
|
private
|
1126
|
-
|
823
|
+
|
1127
824
|
# Generate unique reflection ID
|
1128
825
|
sig { returns(String) }
|
1129
826
|
def generate_reflection_id
|
@@ -1148,12 +845,12 @@ module DSPy
|
|
1148
845
|
sig { params(patterns: T::Hash[Symbol, T.untyped], traces: T::Array[ExecutionTrace]).returns(String) }
|
1149
846
|
def generate_reasoning(patterns, traces)
|
1150
847
|
reasoning_parts = []
|
1151
|
-
|
848
|
+
|
1152
849
|
reasoning_parts << "Analyzed #{traces.size} execution traces"
|
1153
850
|
reasoning_parts << "#{patterns[:llm_traces_count]} LLM interactions"
|
1154
851
|
reasoning_parts << "#{patterns[:module_traces_count]} module operations"
|
1155
852
|
reasoning_parts << "Total token usage: #{patterns[:total_tokens]}"
|
1156
|
-
|
853
|
+
|
1157
854
|
reasoning_parts.join('. ') + '.'
|
1158
855
|
end
|
1159
856
|
|
@@ -1161,13 +858,13 @@ module DSPy
|
|
1161
858
|
sig { params(patterns: T::Hash[Symbol, T.untyped]).returns(Float) }
|
1162
859
|
def calculate_confidence(patterns)
|
1163
860
|
base_confidence = 0.7
|
1164
|
-
|
861
|
+
|
1165
862
|
# More traces = higher confidence
|
1166
863
|
trace_bonus = [patterns[:llm_traces_count] + patterns[:module_traces_count], 10].min * 0.02
|
1167
|
-
|
864
|
+
|
1168
865
|
# Reasonable token usage = higher confidence
|
1169
866
|
token_penalty = patterns[:total_tokens] > 1000 ? -0.1 : 0.0
|
1170
|
-
|
867
|
+
|
1171
868
|
[(base_confidence + trace_bonus + token_penalty), 1.0].min
|
1172
869
|
end
|
1173
870
|
|
@@ -1175,12 +872,12 @@ module DSPy
|
|
1175
872
|
sig { params(llm_traces: T::Array[ExecutionTrace]).returns(Integer) }
|
1176
873
|
def calculate_avg_response_length(llm_traces)
|
1177
874
|
return 0 if llm_traces.empty?
|
1178
|
-
|
875
|
+
|
1179
876
|
total_length = llm_traces.sum do |trace|
|
1180
877
|
response = trace.response_text
|
1181
878
|
response ? response.length : 0
|
1182
879
|
end
|
1183
|
-
|
880
|
+
|
1184
881
|
total_length / llm_traces.size
|
1185
882
|
end
|
1186
883
|
|
@@ -1188,11 +885,11 @@ module DSPy
|
|
1188
885
|
sig { params(traces: T::Array[ExecutionTrace]).returns(Float) }
|
1189
886
|
def calculate_timespan(traces)
|
1190
887
|
return 0.0 if traces.size < 2
|
1191
|
-
|
888
|
+
|
1192
889
|
timestamps = traces.map(&:timestamp).sort
|
1193
890
|
(timestamps.last - timestamps.first).to_f
|
1194
891
|
end
|
1195
|
-
|
892
|
+
|
1196
893
|
|
1197
894
|
# Format traces for inclusion in prompt
|
1198
895
|
sig { params(traces: T::Array[ExecutionTrace]).returns(String) }
|
@@ -1203,22 +900,22 @@ module DSPy
|
|
1203
900
|
"#{idx + 1}. [#{trace.event_name}] #{prompt_preview} → #{response_preview}"
|
1204
901
|
end.join("\n")
|
1205
902
|
end
|
1206
|
-
|
903
|
+
|
1207
904
|
# Estimate token usage from response
|
1208
905
|
sig { params(text: String).returns(Integer) }
|
1209
906
|
def estimate_token_usage(text)
|
1210
907
|
# Rough estimation: ~4 characters per token
|
1211
908
|
(text.length / 4.0).ceil
|
1212
909
|
end
|
1213
|
-
|
910
|
+
|
1214
911
|
# Analyze token efficiency patterns
|
1215
912
|
sig { params(llm_traces: T::Array[ExecutionTrace]).returns(T::Hash[Symbol, T.untyped]) }
|
1216
913
|
def analyze_token_efficiency(llm_traces)
|
1217
914
|
return { status: 'no_data', suggestions: [] } if llm_traces.empty?
|
1218
|
-
|
915
|
+
|
1219
916
|
total_tokens = llm_traces.sum(&:token_usage)
|
1220
917
|
avg_tokens = total_tokens.to_f / llm_traces.size
|
1221
|
-
|
918
|
+
|
1222
919
|
if avg_tokens > 400
|
1223
920
|
{
|
1224
921
|
status: 'poor',
|
@@ -1239,15 +936,15 @@ module DSPy
|
|
1239
936
|
}
|
1240
937
|
end
|
1241
938
|
end
|
1242
|
-
|
939
|
+
|
1243
940
|
# Analyze response quality patterns
|
1244
941
|
sig { params(llm_traces: T::Array[ExecutionTrace]).returns(T::Hash[Symbol, T.untyped]) }
|
1245
942
|
def analyze_response_quality(llm_traces)
|
1246
943
|
return { consistency: 'no_data', recommendations: [] } if llm_traces.empty?
|
1247
|
-
|
944
|
+
|
1248
945
|
response_lengths = llm_traces.map { |t| t.response_text&.length || 0 }
|
1249
946
|
length_variance = calculate_variance(response_lengths)
|
1250
|
-
|
947
|
+
|
1251
948
|
if length_variance > 1000
|
1252
949
|
{
|
1253
950
|
consistency: 'inconsistent',
|
@@ -1265,50 +962,50 @@ module DSPy
|
|
1265
962
|
}
|
1266
963
|
end
|
1267
964
|
end
|
1268
|
-
|
965
|
+
|
1269
966
|
# Analyze model consistency
|
1270
967
|
sig { params(llm_traces: T::Array[ExecutionTrace]).returns(T::Hash[Symbol, T.untyped]) }
|
1271
968
|
def analyze_model_consistency(llm_traces)
|
1272
969
|
models = llm_traces.map(&:model_name).compact.uniq
|
1273
|
-
|
970
|
+
|
1274
971
|
{
|
1275
972
|
unique_models: models.size,
|
1276
973
|
models_used: models,
|
1277
974
|
recommendation: models.size > 1 ? 'Consider using single model for consistency' : 'Model usage is consistent'
|
1278
975
|
}
|
1279
976
|
end
|
1280
|
-
|
977
|
+
|
1281
978
|
# Adjust mutations based on history to avoid repetition
|
1282
979
|
sig { params(suggested: T::Array[Symbol], history: T::Array[Symbol], trend: T.nilable(String)).returns(T::Array[Symbol]) }
|
1283
980
|
def adjust_mutations_for_history(suggested, history, trend)
|
1284
981
|
# Count recent usage of each mutation type
|
1285
982
|
recent_usage = history.last(5).tally
|
1286
|
-
|
983
|
+
|
1287
984
|
# Filter out overused mutations
|
1288
985
|
adjusted = suggested.reject do |mutation|
|
1289
986
|
recent_usage[mutation] && recent_usage[mutation] >= 2
|
1290
987
|
end
|
1291
|
-
|
988
|
+
|
1292
989
|
# If trend is declining, prefer different strategies
|
1293
990
|
if trend == 'declining'
|
1294
991
|
adjusted = adjusted.reject { |m| m == :expand } # Avoid expansion if performance declining
|
1295
992
|
adjusted += [:simplify, :rephrase] unless adjusted.include?(:simplify) || adjusted.include?(:rephrase)
|
1296
993
|
end
|
1297
|
-
|
994
|
+
|
1298
995
|
# Ensure we always have at least one suggestion
|
1299
996
|
adjusted.empty? ? [:rewrite] : adjusted.uniq
|
1300
997
|
end
|
1301
|
-
|
998
|
+
|
1302
999
|
# Calculate variance for array of numbers
|
1303
1000
|
sig { params(values: T::Array[Integer]).returns(Float) }
|
1304
1001
|
def calculate_variance(values)
|
1305
1002
|
return 0.0 if values.size < 2
|
1306
|
-
|
1003
|
+
|
1307
1004
|
mean = values.sum.to_f / values.size
|
1308
1005
|
sum_squared_diff = values.sum { |v| (v - mean) ** 2 }
|
1309
1006
|
sum_squared_diff / values.size
|
1310
1007
|
end
|
1311
|
-
|
1008
|
+
|
1312
1009
|
# Truncate text to specified length with ellipsis
|
1313
1010
|
sig { params(text: String, length: Integer).returns(String) }
|
1314
1011
|
def truncate_text(text, length)
|
@@ -1325,8 +1022,8 @@ module DSPy
|
|
1325
1022
|
sig { returns(GEPAConfig) }
|
1326
1023
|
attr_reader :config
|
1327
1024
|
|
1328
|
-
sig { returns(
|
1329
|
-
attr_reader :
|
1025
|
+
sig { returns(FitnessEvaluator) }
|
1026
|
+
attr_reader :fitness_evaluator
|
1330
1027
|
|
1331
1028
|
sig { returns(T::Array[T.untyped]) }
|
1332
1029
|
attr_reader :population
|
@@ -1334,59 +1031,69 @@ module DSPy
|
|
1334
1031
|
sig { returns(Integer) }
|
1335
1032
|
attr_reader :generation
|
1336
1033
|
|
1337
|
-
sig { params(config: GEPAConfig,
|
1338
|
-
def initialize(config:,
|
1034
|
+
sig { params(config: GEPAConfig, fitness_evaluator: FitnessEvaluator).void }
|
1035
|
+
def initialize(config:, fitness_evaluator:)
|
1339
1036
|
@config = config
|
1340
|
-
@
|
1037
|
+
@fitness_evaluator = fitness_evaluator
|
1341
1038
|
@population = T.let([], T::Array[T.untyped])
|
1342
1039
|
@generation = 0
|
1343
|
-
@fitness_scores = T.let([], T::Array[
|
1040
|
+
@fitness_scores = T.let([], T::Array[FitnessScore])
|
1344
1041
|
end
|
1345
1042
|
|
1346
1043
|
# Initialize population with diverse instruction variants
|
1347
1044
|
sig { params(program: T.untyped).void }
|
1348
1045
|
def initialize_population(program)
|
1349
1046
|
@population = []
|
1350
|
-
|
1047
|
+
|
1351
1048
|
# Start with original program
|
1352
1049
|
@population << program
|
1353
|
-
|
1354
|
-
# Generate instruction variants to fill population
|
1355
|
-
|
1356
|
-
|
1357
|
-
|
1050
|
+
|
1051
|
+
# Generate instruction variants to fill population if program has signature_class
|
1052
|
+
if program.respond_to?(:signature_class) && program.signature_class.respond_to?(:description)
|
1053
|
+
original_instruction = program.signature_class.description
|
1054
|
+
if original_instruction && !original_instruction.empty?
|
1055
|
+
variants = generate_instruction_variants(original_instruction)
|
1056
|
+
else
|
1057
|
+
variants = []
|
1058
|
+
end
|
1059
|
+
else
|
1060
|
+
variants = []
|
1061
|
+
end
|
1062
|
+
|
1358
1063
|
# Create program copies with different instructions
|
1359
1064
|
variants.take(@config.population_size - 1).each do |variant|
|
1360
1065
|
variant_program = create_program_with_instruction(program, variant)
|
1361
1066
|
@population << variant_program
|
1362
1067
|
end
|
1363
|
-
|
1068
|
+
|
1364
1069
|
# If we need more candidates, duplicate and mutate
|
1365
1070
|
while @population.size < @config.population_size
|
1366
1071
|
base_program = @population.sample
|
1367
|
-
|
1368
|
-
generate_instruction_variants(base_program.signature_class.description)
|
1369
|
-
|
1072
|
+
if base_program.respond_to?(:signature_class) && base_program.signature_class.respond_to?(:description)
|
1073
|
+
instruction_variants = generate_instruction_variants(base_program.signature_class.description)
|
1074
|
+
if instruction_variants.any?
|
1075
|
+
mutated = create_program_with_instruction(base_program, instruction_variants.first)
|
1076
|
+
@population << mutated
|
1077
|
+
else
|
1078
|
+
# If no variants available, just duplicate the base program
|
1079
|
+
@population << base_program
|
1080
|
+
end
|
1081
|
+
else
|
1082
|
+
# If no signature_class available, just duplicate the base program
|
1083
|
+
@population << base_program
|
1084
|
+
end
|
1370
1085
|
end
|
1371
|
-
|
1086
|
+
|
1372
1087
|
@generation = 0
|
1373
1088
|
end
|
1374
1089
|
|
1375
1090
|
# Evaluate all population members on the training set
|
1376
|
-
sig { params(trainset: T::Array[T.untyped]).returns(T::Array[
|
1091
|
+
sig { params(trainset: T::Array[T.untyped]).returns(T::Array[FitnessScore]) }
|
1377
1092
|
def evaluate_population(trainset)
|
1378
1093
|
@fitness_scores = @population.map do |candidate|
|
1379
|
-
|
1380
|
-
prediction = candidate.call(**example.input_values)
|
1381
|
-
@metric.call(example, prediction).to_f
|
1382
|
-
rescue => e
|
1383
|
-
# Handle evaluation errors gracefully
|
1384
|
-
0.0
|
1385
|
-
end
|
1386
|
-
|
1387
|
-
scores.sum / scores.size
|
1094
|
+
@fitness_evaluator.evaluate_candidate(candidate, trainset)
|
1388
1095
|
end
|
1389
|
-
|
1096
|
+
|
1390
1097
|
@fitness_scores
|
1391
1098
|
end
|
1392
1099
|
|
@@ -1394,27 +1101,32 @@ module DSPy
|
|
1394
1101
|
sig { params(trainset: T::Array[T.untyped]).void }
|
1395
1102
|
def evolve_generation(trainset)
|
1396
1103
|
current_scores = evaluate_population(trainset)
|
1397
|
-
|
1104
|
+
|
1398
1105
|
# Simple selection: keep top 50% and mutate them
|
1399
|
-
sorted_indices = (0...@population.size).sort_by { |i| -current_scores[i] }
|
1400
|
-
survivors = sorted_indices.take(@config.population_size / 2)
|
1401
|
-
|
1106
|
+
sorted_indices = (0...@population.size).sort_by { |i| -current_scores[i].overall_score }
|
1107
|
+
survivors = sorted_indices.take([@config.population_size / 2, 1].max)
|
1108
|
+
|
1402
1109
|
new_population = []
|
1403
|
-
|
1110
|
+
|
1404
1111
|
# Keep best performers
|
1405
1112
|
survivors.each { |i| new_population << @population[i] }
|
1406
|
-
|
1113
|
+
|
1407
1114
|
# Fill rest with mutations of survivors
|
1408
1115
|
while new_population.size < @config.population_size
|
1409
1116
|
parent_index = survivors.sample
|
1410
1117
|
parent = @population[parent_index]
|
1411
|
-
|
1412
|
-
# Generate mutation
|
1413
|
-
|
1414
|
-
|
1415
|
-
|
1118
|
+
|
1119
|
+
# Generate mutation if parent has signature_class
|
1120
|
+
if parent.respond_to?(:signature_class) && parent.signature_class.respond_to?(:description)
|
1121
|
+
variants = generate_instruction_variants(parent.signature_class.description)
|
1122
|
+
mutated = create_program_with_instruction(parent, variants.first || parent.signature_class.description)
|
1123
|
+
new_population << mutated
|
1124
|
+
else
|
1125
|
+
# If no signature_class, just duplicate the parent
|
1126
|
+
new_population << parent
|
1127
|
+
end
|
1416
1128
|
end
|
1417
|
-
|
1129
|
+
|
1418
1130
|
@population = new_population
|
1419
1131
|
@generation += 1
|
1420
1132
|
end
|
@@ -1423,35 +1135,46 @@ module DSPy
|
|
1423
1135
|
sig { params(program: T.untyped, trainset: T::Array[T.untyped]).returns(T::Hash[Symbol, T.untyped]) }
|
1424
1136
|
def run_evolution(program, trainset)
|
1425
1137
|
initialize_population(program)
|
1426
|
-
|
1138
|
+
|
1427
1139
|
history = []
|
1428
|
-
|
1140
|
+
|
1429
1141
|
# Initial evaluation
|
1430
1142
|
initial_scores = evaluate_population(trainset)
|
1143
|
+
best_initial = initial_scores.max_by(&:overall_score)
|
1144
|
+
avg_initial = initial_scores.map(&:overall_score).sum / initial_scores.size
|
1431
1145
|
history << {
|
1432
1146
|
generation: 0,
|
1433
|
-
best_fitness:
|
1434
|
-
avg_fitness:
|
1147
|
+
best_fitness: best_initial.overall_score,
|
1148
|
+
avg_fitness: avg_initial,
|
1435
1149
|
diversity: population_diversity
|
1436
1150
|
}
|
1437
|
-
|
1151
|
+
|
1438
1152
|
# Evolution loop
|
1439
1153
|
@config.num_generations.times do
|
1440
1154
|
evolve_generation(trainset)
|
1441
1155
|
scores = evaluate_population(trainset)
|
1442
|
-
|
1156
|
+
best_score = scores.max_by(&:overall_score)
|
1157
|
+
avg_score = scores.map(&:overall_score).sum / scores.size
|
1158
|
+
|
1443
1159
|
history << {
|
1444
1160
|
generation: @generation,
|
1445
|
-
best_fitness:
|
1446
|
-
avg_fitness:
|
1161
|
+
best_fitness: best_score.overall_score,
|
1162
|
+
avg_fitness: avg_score,
|
1447
1163
|
diversity: population_diversity
|
1448
1164
|
}
|
1449
1165
|
end
|
1450
|
-
|
1166
|
+
|
1167
|
+
best_fitness_score = @fitness_scores.max_by(&:overall_score)
|
1451
1168
|
{
|
1452
1169
|
best_candidate: get_best_candidate,
|
1453
|
-
best_fitness:
|
1170
|
+
best_fitness: best_fitness_score || FitnessScore.new(
|
1171
|
+
primary_score: 0.0,
|
1172
|
+
secondary_scores: {},
|
1173
|
+
overall_score: 0.0,
|
1174
|
+
metadata: {}
|
1175
|
+
),
|
1454
1176
|
generation_history: history,
|
1177
|
+
generation_count: @generation,
|
1455
1178
|
final_population: @population.dup
|
1456
1179
|
}
|
1457
1180
|
end
|
@@ -1460,8 +1183,8 @@ module DSPy
|
|
1460
1183
|
sig { returns(T.untyped) }
|
1461
1184
|
def get_best_candidate
|
1462
1185
|
return @population.first if @fitness_scores.empty?
|
1463
|
-
|
1464
|
-
best_index = @fitness_scores.each_with_index.max_by { |score, _| score }[1]
|
1186
|
+
|
1187
|
+
best_index = @fitness_scores.each_with_index.max_by { |score, _| score.overall_score }[1]
|
1465
1188
|
@population[best_index]
|
1466
1189
|
end
|
1467
1190
|
|
@@ -1469,11 +1192,20 @@ module DSPy
|
|
1469
1192
|
sig { returns(Float) }
|
1470
1193
|
def population_diversity
|
1471
1194
|
return 0.0 if @population.empty?
|
1472
|
-
|
1473
|
-
|
1195
|
+
|
1196
|
+
# Only calculate diversity for programs that have signature_class
|
1197
|
+
instructions = @population.filter_map do |program|
|
1198
|
+
if program.respond_to?(:signature_class) && program.signature_class.respond_to?(:description)
|
1199
|
+
program.signature_class.description
|
1200
|
+
else
|
1201
|
+
nil
|
1202
|
+
end
|
1203
|
+
end
|
1204
|
+
|
1205
|
+
return 0.0 if instructions.empty?
|
1206
|
+
|
1474
1207
|
unique_instructions = instructions.uniq.size
|
1475
|
-
|
1476
|
-
unique_instructions.to_f / @population.size.to_f
|
1208
|
+
unique_instructions.to_f / instructions.size.to_f
|
1477
1209
|
end
|
1478
1210
|
|
1479
1211
|
private
|
@@ -1482,32 +1214,32 @@ module DSPy
|
|
1482
1214
|
sig { params(original_instruction: String).returns(T::Array[String]) }
|
1483
1215
|
def generate_instruction_variants(original_instruction)
|
1484
1216
|
variants = []
|
1485
|
-
|
1217
|
+
|
1486
1218
|
# Add "step by step" variant
|
1487
1219
|
unless original_instruction.include?("step")
|
1488
1220
|
variants << "#{original_instruction} Think step by step."
|
1489
1221
|
end
|
1490
|
-
|
1222
|
+
|
1491
1223
|
# Add "detailed" variant
|
1492
1224
|
unless original_instruction.include?("detail")
|
1493
1225
|
variants << "#{original_instruction} Provide detailed reasoning."
|
1494
1226
|
end
|
1495
|
-
|
1227
|
+
|
1496
1228
|
# Add "careful" variant
|
1497
1229
|
unless original_instruction.include?("careful")
|
1498
1230
|
variants << "Be careful and accurate. #{original_instruction}"
|
1499
1231
|
end
|
1500
|
-
|
1232
|
+
|
1501
1233
|
# Add "examples" variant
|
1502
1234
|
unless original_instruction.include?("example")
|
1503
1235
|
variants << "#{original_instruction} Use examples in your response."
|
1504
1236
|
end
|
1505
|
-
|
1237
|
+
|
1506
1238
|
# Add "precise" variant
|
1507
1239
|
unless original_instruction.include?("precise")
|
1508
1240
|
variants << "Be precise and specific. #{original_instruction}"
|
1509
1241
|
end
|
1510
|
-
|
1242
|
+
|
1511
1243
|
variants.shuffle.take(5) # Return up to 5 variants, shuffled
|
1512
1244
|
end
|
1513
1245
|
|
@@ -1545,11 +1277,11 @@ module DSPy
|
|
1545
1277
|
begin
|
1546
1278
|
# Create a new instance of the same class
|
1547
1279
|
new_module = original_module.class.new
|
1548
|
-
|
1280
|
+
|
1549
1281
|
# Try to find and update any internal predictors
|
1550
1282
|
original_module.instance_variables.each do |var_name|
|
1551
1283
|
var_value = original_module.instance_variable_get(var_name)
|
1552
|
-
|
1284
|
+
|
1553
1285
|
if var_value.is_a?(DSPy::Predict)
|
1554
1286
|
# Update the instruction for internal predictors
|
1555
1287
|
modified_predictor = var_value.with_instruction(new_instruction)
|
@@ -1559,7 +1291,7 @@ module DSPy
|
|
1559
1291
|
new_module.instance_variable_set(var_name, var_value)
|
1560
1292
|
end
|
1561
1293
|
end
|
1562
|
-
|
1294
|
+
|
1563
1295
|
new_module
|
1564
1296
|
rescue => e
|
1565
1297
|
# Fallback to original module
|
@@ -1571,6 +1303,7 @@ module DSPy
|
|
1571
1303
|
# FitnessScore represents multi-dimensional evaluation results
|
1572
1304
|
class FitnessScore < T::Struct
|
1573
1305
|
extend T::Sig
|
1306
|
+
include Comparable
|
1574
1307
|
|
1575
1308
|
const :primary_score, Float
|
1576
1309
|
const :secondary_scores, T::Hash[Symbol, Float]
|
@@ -1607,6 +1340,13 @@ module DSPy
|
|
1607
1340
|
)
|
1608
1341
|
end
|
1609
1342
|
|
1343
|
+
# Comparison method for Comparable module
|
1344
|
+
sig { params(other: FitnessScore).returns(T.nilable(Integer)) }
|
1345
|
+
def <=>(other)
|
1346
|
+
return nil unless other.is_a?(FitnessScore)
|
1347
|
+
overall_score <=> other.overall_score
|
1348
|
+
end
|
1349
|
+
|
1610
1350
|
# Check if this score is dominated by another (for Pareto analysis)
|
1611
1351
|
sig { params(other: FitnessScore).returns(T::Boolean) }
|
1612
1352
|
def dominated_by?(other)
|
@@ -1692,7 +1432,7 @@ module DSPy
|
|
1692
1432
|
|
1693
1433
|
# Calculate secondary metrics
|
1694
1434
|
secondary_scores = {}
|
1695
|
-
|
1435
|
+
|
1696
1436
|
# Token efficiency (mock data for now - will be replaced with real trace collection)
|
1697
1437
|
mock_traces = predictions.map.with_index do |pred, i|
|
1698
1438
|
OpenStruct.new(token_usage: 50 + rand(100))
|
@@ -1784,7 +1524,7 @@ module DSPy
|
|
1784
1524
|
|
1785
1525
|
# Simple consistency measure: average word overlap between responses
|
1786
1526
|
word_sets = responses.map { |response| response.downcase.split.to_set }
|
1787
|
-
|
1527
|
+
|
1788
1528
|
total_similarity = 0.0
|
1789
1529
|
comparisons = 0
|
1790
1530
|
|
@@ -1792,7 +1532,7 @@ module DSPy
|
|
1792
1532
|
word_sets[(i+1)..-1].each do |set2|
|
1793
1533
|
intersection = set1 & set2
|
1794
1534
|
union = set1 | set2
|
1795
|
-
|
1535
|
+
|
1796
1536
|
similarity = union.empty? ? 0.0 : intersection.size.to_f / union.size
|
1797
1537
|
total_similarity += similarity
|
1798
1538
|
comparisons += 1
|
@@ -1808,7 +1548,7 @@ module DSPy
|
|
1808
1548
|
return 1.0 if latencies.empty?
|
1809
1549
|
|
1810
1550
|
avg_latency = latencies.sum / latencies.size
|
1811
|
-
|
1551
|
+
|
1812
1552
|
# Penalize high latencies (assume 2 seconds is baseline for 0.5 score)
|
1813
1553
|
baseline_latency = 2.0
|
1814
1554
|
latency_score = baseline_latency / (baseline_latency + avg_latency)
|
@@ -1930,10 +1670,10 @@ module DSPy
|
|
1930
1670
|
if llm_traces.any?
|
1931
1671
|
token_usage = llm_traces.sum(&:token_usage)
|
1932
1672
|
avg_response_length = llm_traces.map { |t| t.attributes['response']&.to_s&.length || 0 }.sum / llm_traces.size
|
1933
|
-
|
1673
|
+
|
1934
1674
|
analysis << "- Total tokens used: #{token_usage}"
|
1935
1675
|
analysis << "- Average response length: #{avg_response_length} characters"
|
1936
|
-
|
1676
|
+
|
1937
1677
|
# Identify models used
|
1938
1678
|
models = llm_traces.map { |t| t.attributes['gen_ai.request.model'] }.compact.uniq
|
1939
1679
|
analysis << "- Models used: #{models.join(', ')}" if models.any?
|
@@ -2001,14 +1741,14 @@ module DSPy
|
|
2001
1741
|
|
2002
1742
|
begin
|
2003
1743
|
original_instruction = extract_instruction(program)
|
2004
|
-
|
1744
|
+
|
2005
1745
|
# Use LLM-based instruction proposal instead of hardcoded mutations
|
2006
1746
|
improved_instruction = @instruction_proposer.propose_instruction(
|
2007
1747
|
original_instruction: original_instruction,
|
2008
1748
|
execution_traces: execution_traces,
|
2009
1749
|
failed_examples: failed_examples
|
2010
1750
|
)
|
2011
|
-
|
1751
|
+
|
2012
1752
|
create_mutated_program(program, improved_instruction)
|
2013
1753
|
rescue => e
|
2014
1754
|
emit_event('mutation_error', {
|
@@ -2024,7 +1764,7 @@ module DSPy
|
|
2024
1764
|
sig { params(programs: T::Array[T.untyped], execution_traces: T::Array[ExecutionTrace], failed_examples: T::Array[T.untyped]).returns(T::Array[T.untyped]) }
|
2025
1765
|
def batch_mutate(programs, execution_traces: [], failed_examples: [])
|
2026
1766
|
return [] if programs.empty?
|
2027
|
-
|
1767
|
+
|
2028
1768
|
programs.map { |program| mutate_program(program, execution_traces: execution_traces, failed_examples: failed_examples) }
|
2029
1769
|
end
|
2030
1770
|
|
@@ -2075,7 +1815,7 @@ module DSPy
|
|
2075
1815
|
-> (inst) { "Please #{inst.downcase}" },
|
2076
1816
|
-> (inst) { "#{inst} with precision" }
|
2077
1817
|
]
|
2078
|
-
|
1818
|
+
|
2079
1819
|
patterns.sample.call(instruction)
|
2080
1820
|
end
|
2081
1821
|
|
@@ -2088,7 +1828,7 @@ module DSPy
|
|
2088
1828
|
"Consider all aspects carefully.",
|
2089
1829
|
"Explain your thought process."
|
2090
1830
|
]
|
2091
|
-
|
1831
|
+
|
2092
1832
|
"#{instruction} #{expansions.sample}"
|
2093
1833
|
end
|
2094
1834
|
|
@@ -2099,7 +1839,7 @@ module DSPy
|
|
2099
1839
|
simplified = instruction.gsub(/\b(carefully|detailed|comprehensive|thorough)\b/i, '')
|
2100
1840
|
.gsub(/\s+/, ' ')
|
2101
1841
|
.strip
|
2102
|
-
|
1842
|
+
|
2103
1843
|
simplified.empty? ? instruction : simplified
|
2104
1844
|
end
|
2105
1845
|
|
@@ -2112,12 +1852,12 @@ module DSPy
|
|
2112
1852
|
"Apply domain knowledge.",
|
2113
1853
|
"Consider edge cases."
|
2114
1854
|
]
|
2115
|
-
|
1855
|
+
|
2116
1856
|
"#{instruction} #{strategies.sample}"
|
2117
1857
|
end
|
2118
1858
|
|
2119
1859
|
# Rephrase instruction with synonyms
|
2120
|
-
sig { params(instruction: String).returns(String) }
|
1860
|
+
sig { params(instruction: String).returns(String) }
|
2121
1861
|
def apply_rephrase_mutation(instruction)
|
2122
1862
|
# Simple synonym replacement - in full implementation would use LLM
|
2123
1863
|
synonyms = {
|
@@ -2127,12 +1867,12 @@ module DSPy
|
|
2127
1867
|
'calculate' => 'compute',
|
2128
1868
|
'determine' => 'identify'
|
2129
1869
|
}
|
2130
|
-
|
1870
|
+
|
2131
1871
|
result = instruction.dup
|
2132
1872
|
synonyms.each do |original, replacement|
|
2133
1873
|
result.gsub!(/\b#{original}\b/i, replacement) if rand < 0.3
|
2134
1874
|
end
|
2135
|
-
|
1875
|
+
|
2136
1876
|
result
|
2137
1877
|
end
|
2138
1878
|
|
@@ -2183,11 +1923,11 @@ module DSPy
|
|
2183
1923
|
begin
|
2184
1924
|
# Create a new instance of the same class
|
2185
1925
|
new_module = original_module.class.new
|
2186
|
-
|
1926
|
+
|
2187
1927
|
# Try to find and update any internal predictors
|
2188
1928
|
original_module.instance_variables.each do |var_name|
|
2189
1929
|
var_value = original_module.instance_variable_get(var_name)
|
2190
|
-
|
1930
|
+
|
2191
1931
|
if var_value.is_a?(DSPy::Predict)
|
2192
1932
|
# Update the instruction for internal predictors
|
2193
1933
|
mutated_predictor = var_value.with_instruction(new_instruction)
|
@@ -2197,7 +1937,7 @@ module DSPy
|
|
2197
1937
|
new_module.instance_variable_set(var_name, var_value)
|
2198
1938
|
end
|
2199
1939
|
end
|
2200
|
-
|
1940
|
+
|
2201
1941
|
new_module
|
2202
1942
|
rescue => e
|
2203
1943
|
emit_event('module_mutation_error', {
|
@@ -2229,10 +1969,10 @@ module DSPy
|
|
2229
1969
|
sig { params(mutations: T::Array[MutationType]).returns(Float) }
|
2230
1970
|
def mutation_diversity(mutations)
|
2231
1971
|
return 0.0 if mutations.empty?
|
2232
|
-
|
1972
|
+
|
2233
1973
|
unique_types = mutations.uniq.size
|
2234
1974
|
total_types = @config.mutation_types.size
|
2235
|
-
|
1975
|
+
|
2236
1976
|
unique_types.to_f / total_types
|
2237
1977
|
end
|
2238
1978
|
end
|
@@ -2263,15 +2003,15 @@ module DSPy
|
|
2263
2003
|
begin
|
2264
2004
|
instruction_a = extract_instruction(parent_a)
|
2265
2005
|
instruction_b = extract_instruction(parent_b)
|
2266
|
-
|
2006
|
+
|
2267
2007
|
crossover_type = select_crossover_type(instruction_a, instruction_b)
|
2268
2008
|
offspring_instructions = apply_crossover(instruction_a, instruction_b, crossover_type)
|
2269
|
-
|
2009
|
+
|
2270
2010
|
offspring = [
|
2271
2011
|
create_crossover_program(parent_a, offspring_instructions[0]),
|
2272
2012
|
create_crossover_program(parent_b, offspring_instructions[1])
|
2273
2013
|
]
|
2274
|
-
|
2014
|
+
|
2275
2015
|
offspring
|
2276
2016
|
rescue => e
|
2277
2017
|
# Return original parents on crossover failure
|
@@ -2284,9 +2024,9 @@ module DSPy
|
|
2284
2024
|
def batch_crossover(population)
|
2285
2025
|
return [] if population.empty?
|
2286
2026
|
return [population.first] if population.size == 1
|
2287
|
-
|
2027
|
+
|
2288
2028
|
offspring = []
|
2289
|
-
|
2029
|
+
|
2290
2030
|
# Pair up population for crossover
|
2291
2031
|
population.each_slice(2) do |pair|
|
2292
2032
|
if pair.size == 2
|
@@ -2296,7 +2036,7 @@ module DSPy
|
|
2296
2036
|
offspring << pair[0] # Unpaired individual passes through
|
2297
2037
|
end
|
2298
2038
|
end
|
2299
|
-
|
2039
|
+
|
2300
2040
|
offspring
|
2301
2041
|
end
|
2302
2042
|
|
@@ -2331,20 +2071,20 @@ module DSPy
|
|
2331
2071
|
sig { params(instruction_a: String, instruction_b: String).returns(T::Array[String]) }
|
2332
2072
|
def uniform_crossover(instruction_a, instruction_b)
|
2333
2073
|
return [instruction_a, instruction_b] if instruction_a == instruction_b
|
2334
|
-
|
2074
|
+
|
2335
2075
|
words_a = instruction_a.split
|
2336
2076
|
words_b = instruction_b.split
|
2337
|
-
|
2077
|
+
|
2338
2078
|
# Create offspring by randomly selecting words from parents
|
2339
2079
|
offspring_a_words = []
|
2340
2080
|
offspring_b_words = []
|
2341
|
-
|
2081
|
+
|
2342
2082
|
max_length = [words_a.size, words_b.size].max
|
2343
|
-
|
2083
|
+
|
2344
2084
|
max_length.times do |i|
|
2345
2085
|
word_a = words_a[i]
|
2346
2086
|
word_b = words_b[i]
|
2347
|
-
|
2087
|
+
|
2348
2088
|
if rand < 0.5
|
2349
2089
|
offspring_a_words << (word_a || word_b)
|
2350
2090
|
offspring_b_words << (word_b || word_a)
|
@@ -2353,7 +2093,7 @@ module DSPy
|
|
2353
2093
|
offspring_b_words << (word_a || word_b)
|
2354
2094
|
end
|
2355
2095
|
end
|
2356
|
-
|
2096
|
+
|
2357
2097
|
[
|
2358
2098
|
offspring_a_words.compact.join(' '),
|
2359
2099
|
offspring_b_words.compact.join(' ')
|
@@ -2370,9 +2110,9 @@ module DSPy
|
|
2370
2110
|
-> (a, b) { "#{b} while #{a.downcase}" },
|
2371
2111
|
-> (a, b) { "Combine #{a.downcase} with #{b.downcase}" }
|
2372
2112
|
]
|
2373
|
-
|
2113
|
+
|
2374
2114
|
pattern = patterns.sample
|
2375
|
-
|
2115
|
+
|
2376
2116
|
[
|
2377
2117
|
pattern.call(instruction_a, instruction_b),
|
2378
2118
|
pattern.call(instruction_b, instruction_a)
|
@@ -2385,11 +2125,11 @@ module DSPy
|
|
2385
2125
|
# Extract structural components
|
2386
2126
|
components_a = extract_components(instruction_a)
|
2387
2127
|
components_b = extract_components(instruction_b)
|
2388
|
-
|
2128
|
+
|
2389
2129
|
# Cross structural components
|
2390
2130
|
offspring_a = combine_components(components_a.action, components_b.modifiers)
|
2391
2131
|
offspring_b = combine_components(components_b.action, components_a.modifiers)
|
2392
|
-
|
2132
|
+
|
2393
2133
|
[offspring_a, offspring_b]
|
2394
2134
|
end
|
2395
2135
|
|
@@ -2397,10 +2137,10 @@ module DSPy
|
|
2397
2137
|
sig { params(instruction: String).returns(InstructionComponents) }
|
2398
2138
|
def extract_components(instruction)
|
2399
2139
|
words = instruction.split
|
2400
|
-
|
2140
|
+
|
2401
2141
|
# Simple heuristic: first verb-like word is action, rest are modifiers
|
2402
2142
|
action_idx = words.find_index { |word| verb_like?(word) } || 0
|
2403
|
-
|
2143
|
+
|
2404
2144
|
InstructionComponents.new(
|
2405
2145
|
action: words[action_idx] || words.first || "complete",
|
2406
2146
|
modifiers: (words - [words[action_idx]]).join(' ')
|
@@ -2438,7 +2178,7 @@ module DSPy
|
|
2438
2178
|
# Adaptive selection based on instruction characteristics
|
2439
2179
|
if instruction_a && instruction_b
|
2440
2180
|
combined_length = instruction_a.length + instruction_b.length
|
2441
|
-
|
2181
|
+
|
2442
2182
|
if combined_length < 40
|
2443
2183
|
# Short instructions benefit from blending
|
2444
2184
|
[CrossoverType::Blend, CrossoverType::Uniform].sample
|
@@ -2458,10 +2198,10 @@ module DSPy
|
|
2458
2198
|
sig { params(crossovers: T::Array[CrossoverType]).returns(Float) }
|
2459
2199
|
def crossover_diversity(crossovers)
|
2460
2200
|
return 0.0 if crossovers.empty?
|
2461
|
-
|
2201
|
+
|
2462
2202
|
unique_types = crossovers.uniq.size
|
2463
2203
|
total_types = @config.crossover_types.size
|
2464
|
-
|
2204
|
+
|
2465
2205
|
unique_types.to_f / total_types
|
2466
2206
|
end
|
2467
2207
|
end
|
@@ -2487,15 +2227,15 @@ module DSPy
|
|
2487
2227
|
def select_parents(population_with_scores, count:)
|
2488
2228
|
return [] if population_with_scores.empty?
|
2489
2229
|
return population_with_scores.map(&:first) if count >= population_with_scores.size
|
2490
|
-
|
2230
|
+
|
2491
2231
|
# Combine tournament and Pareto-based selection for parent selection
|
2492
2232
|
selected = []
|
2493
|
-
|
2233
|
+
|
2494
2234
|
count.times do
|
2495
2235
|
parent = tournament_selection(population_with_scores)
|
2496
2236
|
selected << parent
|
2497
2237
|
end
|
2498
|
-
|
2238
|
+
|
2499
2239
|
selected
|
2500
2240
|
end
|
2501
2241
|
|
@@ -2504,14 +2244,14 @@ module DSPy
|
|
2504
2244
|
def select_survivors(population_with_scores, count:)
|
2505
2245
|
return [] if population_with_scores.empty?
|
2506
2246
|
return population_with_scores.map(&:first) if count >= population_with_scores.size
|
2507
|
-
|
2247
|
+
|
2508
2248
|
scores = population_with_scores.map(&:last)
|
2509
|
-
|
2249
|
+
|
2510
2250
|
# Find Pareto frontier first
|
2511
2251
|
pareto_frontier = find_pareto_frontier(scores)
|
2512
2252
|
frontier_indices = scores.each_index.select { |i| pareto_frontier.include?(scores[i]) }
|
2513
2253
|
frontier_programs = frontier_indices.map { |i| population_with_scores[i].first }
|
2514
|
-
|
2254
|
+
|
2515
2255
|
if frontier_programs.size >= count
|
2516
2256
|
# Use diversity selection within frontier
|
2517
2257
|
frontier_with_scores = frontier_indices.map { |i| population_with_scores[i] }
|
@@ -2520,7 +2260,7 @@ module DSPy
|
|
2520
2260
|
# Include all frontier + fill remaining with elite selection
|
2521
2261
|
remaining_count = count - frontier_programs.size
|
2522
2262
|
remaining_population = population_with_scores.reject.with_index { |_, i| frontier_indices.include?(i) }
|
2523
|
-
|
2263
|
+
|
2524
2264
|
additional = elite_selection(remaining_population, count: remaining_count)
|
2525
2265
|
frontier_programs + additional
|
2526
2266
|
end
|
@@ -2533,18 +2273,18 @@ module DSPy
|
|
2533
2273
|
def find_pareto_frontier(fitness_scores)
|
2534
2274
|
return [] if fitness_scores.empty?
|
2535
2275
|
return fitness_scores if fitness_scores.size == 1
|
2536
|
-
|
2276
|
+
|
2537
2277
|
frontier = []
|
2538
|
-
|
2278
|
+
|
2539
2279
|
fitness_scores.each do |candidate|
|
2540
2280
|
# Check if candidate is dominated by any other solution
|
2541
2281
|
is_dominated = fitness_scores.any? do |other|
|
2542
2282
|
other != candidate && candidate.dominated_by?(other)
|
2543
2283
|
end
|
2544
|
-
|
2284
|
+
|
2545
2285
|
frontier << candidate unless is_dominated
|
2546
2286
|
end
|
2547
|
-
|
2287
|
+
|
2548
2288
|
frontier
|
2549
2289
|
end
|
2550
2290
|
|
@@ -2552,17 +2292,17 @@ module DSPy
|
|
2552
2292
|
sig { params(fitness_scores: T::Array[FitnessScore]).returns(T::Hash[FitnessScore, Float]) }
|
2553
2293
|
def calculate_crowding_distance(fitness_scores)
|
2554
2294
|
distances = {}
|
2555
|
-
|
2295
|
+
|
2556
2296
|
# Initialize distances for all solutions
|
2557
2297
|
fitness_scores.each { |score| distances[score] = 0.0 }
|
2558
|
-
|
2298
|
+
|
2559
2299
|
return distances if fitness_scores.size <= 2
|
2560
|
-
|
2300
|
+
|
2561
2301
|
# Calculate crowding distance for each objective
|
2562
2302
|
objectives = [:primary_score, :overall_score]
|
2563
2303
|
secondary_objectives = fitness_scores.first.secondary_scores.keys
|
2564
2304
|
all_objectives = objectives + secondary_objectives
|
2565
|
-
|
2305
|
+
|
2566
2306
|
all_objectives.each do |objective|
|
2567
2307
|
# Sort by current objective
|
2568
2308
|
sorted_scores = fitness_scores.sort_by do |score|
|
@@ -2575,29 +2315,29 @@ module DSPy
|
|
2575
2315
|
score.secondary_scores[objective] || 0.0
|
2576
2316
|
end
|
2577
2317
|
end
|
2578
|
-
|
2318
|
+
|
2579
2319
|
# Set boundary solutions to high distance
|
2580
2320
|
distances[sorted_scores.first] = Float::INFINITY if sorted_scores.size > 0
|
2581
2321
|
distances[sorted_scores.last] = Float::INFINITY if sorted_scores.size > 1
|
2582
|
-
|
2322
|
+
|
2583
2323
|
next if sorted_scores.size <= 2
|
2584
|
-
|
2324
|
+
|
2585
2325
|
# Calculate range for normalization
|
2586
2326
|
min_val = get_objective_value(sorted_scores.first, objective)
|
2587
2327
|
max_val = get_objective_value(sorted_scores.last, objective)
|
2588
2328
|
range = max_val - min_val
|
2589
|
-
|
2329
|
+
|
2590
2330
|
next if range <= 0
|
2591
|
-
|
2331
|
+
|
2592
2332
|
# Calculate crowding distance for intermediate solutions
|
2593
2333
|
(1...(sorted_scores.size - 1)).each do |i|
|
2594
2334
|
prev_val = get_objective_value(sorted_scores[i - 1], objective)
|
2595
2335
|
next_val = get_objective_value(sorted_scores[i + 1], objective)
|
2596
|
-
|
2336
|
+
|
2597
2337
|
distances[sorted_scores[i]] += (next_val - prev_val) / range
|
2598
2338
|
end
|
2599
2339
|
end
|
2600
|
-
|
2340
|
+
|
2601
2341
|
distances
|
2602
2342
|
end
|
2603
2343
|
|
@@ -2618,13 +2358,13 @@ module DSPy
|
|
2618
2358
|
sig { params(population_with_scores: T::Array[T::Array[T.untyped]]).returns(T.untyped) }
|
2619
2359
|
def tournament_selection(population_with_scores)
|
2620
2360
|
return population_with_scores.first.first if population_with_scores.size == 1
|
2621
|
-
|
2361
|
+
|
2622
2362
|
tournament_size = [3, population_with_scores.size].min
|
2623
2363
|
tournament = population_with_scores.sample(tournament_size)
|
2624
|
-
|
2364
|
+
|
2625
2365
|
# Select best from tournament based on Pareto dominance and crowding
|
2626
2366
|
best_program, best_score = tournament.first
|
2627
|
-
|
2367
|
+
|
2628
2368
|
tournament[1..].each do |program, score|
|
2629
2369
|
if score.dominated_by?(best_score)
|
2630
2370
|
# Current best dominates this candidate, keep current
|
@@ -2639,7 +2379,7 @@ module DSPy
|
|
2639
2379
|
end
|
2640
2380
|
end
|
2641
2381
|
end
|
2642
|
-
|
2382
|
+
|
2643
2383
|
best_program
|
2644
2384
|
end
|
2645
2385
|
|
@@ -2647,13 +2387,13 @@ module DSPy
|
|
2647
2387
|
sig { params(population_with_scores: T::Array[T::Array[T.untyped]], count: Integer).returns(T::Array[T.untyped]) }
|
2648
2388
|
def diversity_selection(population_with_scores, count:)
|
2649
2389
|
return population_with_scores.map(&:first) if count >= population_with_scores.size
|
2650
|
-
|
2390
|
+
|
2651
2391
|
scores = population_with_scores.map(&:last)
|
2652
2392
|
distances = calculate_crowding_distance(scores)
|
2653
|
-
|
2393
|
+
|
2654
2394
|
# Sort by crowding distance (descending - prefer more diverse)
|
2655
2395
|
sorted_pairs = population_with_scores.sort_by { |_, score| -distances[score] }
|
2656
|
-
|
2396
|
+
|
2657
2397
|
sorted_pairs.take(count).map(&:first)
|
2658
2398
|
end
|
2659
2399
|
|
@@ -2661,10 +2401,10 @@ module DSPy
|
|
2661
2401
|
sig { params(population_with_scores: T::Array[T::Array[T.untyped]], count: Integer).returns(T::Array[T.untyped]) }
|
2662
2402
|
def elite_selection(population_with_scores, count:)
|
2663
2403
|
return population_with_scores.map(&:first) if count >= population_with_scores.size
|
2664
|
-
|
2404
|
+
|
2665
2405
|
# Sort by overall score (descending - best first)
|
2666
2406
|
sorted_pairs = population_with_scores.sort_by { |_, score| -score.overall_score }
|
2667
|
-
|
2407
|
+
|
2668
2408
|
sorted_pairs.take(count).map(&:first)
|
2669
2409
|
end
|
2670
2410
|
end
|
@@ -2673,7 +2413,7 @@ module DSPy
|
|
2673
2413
|
class GEPAConfig < Config
|
2674
2414
|
extend T::Sig
|
2675
2415
|
|
2676
|
-
sig { returns(
|
2416
|
+
sig { returns(DSPy::LM) }
|
2677
2417
|
attr_accessor :reflection_lm
|
2678
2418
|
|
2679
2419
|
sig { returns(Integer) }
|
@@ -2688,8 +2428,6 @@ module DSPy
|
|
2688
2428
|
sig { returns(T::Boolean) }
|
2689
2429
|
attr_accessor :use_pareto_selection
|
2690
2430
|
|
2691
|
-
sig { returns(T::Boolean) }
|
2692
|
-
attr_accessor :simple_mode
|
2693
2431
|
sig { returns(T::Array[MutationType]) }
|
2694
2432
|
attr_accessor :mutation_types
|
2695
2433
|
sig { returns(Float) }
|
@@ -2700,12 +2438,12 @@ module DSPy
|
|
2700
2438
|
sig { void }
|
2701
2439
|
def initialize
|
2702
2440
|
super
|
2703
|
-
|
2441
|
+
# reflection_lm must be explicitly set by user - no default provided
|
2442
|
+
@reflection_lm = nil
|
2704
2443
|
@num_generations = 10
|
2705
2444
|
@population_size = 8
|
2706
2445
|
@mutation_rate = 0.7
|
2707
2446
|
@use_pareto_selection = true
|
2708
|
-
@simple_mode = false
|
2709
2447
|
@mutation_types = [MutationType::Rewrite, MutationType::Expand, MutationType::Simplify, MutationType::Combine, MutationType::Rephrase]
|
2710
2448
|
@crossover_rate = 0.6
|
2711
2449
|
@crossover_types = [CrossoverType::Uniform, CrossoverType::Blend, CrossoverType::Structured]
|
@@ -2714,12 +2452,11 @@ module DSPy
|
|
2714
2452
|
sig { returns(T::Hash[Symbol, T.untyped]) }
|
2715
2453
|
def to_h
|
2716
2454
|
super.merge({
|
2717
|
-
reflection_lm: @reflection_lm,
|
2455
|
+
reflection_lm: @reflection_lm&.model, # Serialize the model name for hash representation
|
2718
2456
|
num_generations: @num_generations,
|
2719
2457
|
population_size: @population_size,
|
2720
2458
|
mutation_rate: @mutation_rate,
|
2721
2459
|
use_pareto_selection: @use_pareto_selection,
|
2722
|
-
simple_mode: @simple_mode,
|
2723
2460
|
mutation_types: @mutation_types,
|
2724
2461
|
crossover_rate: @crossover_rate,
|
2725
2462
|
crossover_types: @crossover_types
|
@@ -2738,6 +2475,12 @@ module DSPy
|
|
2738
2475
|
end
|
2739
2476
|
def initialize(metric: nil, config: nil)
|
2740
2477
|
@config = config || GEPAConfig.new
|
2478
|
+
|
2479
|
+
# Validate that reflection_lm is configured
|
2480
|
+
unless @config.reflection_lm
|
2481
|
+
raise ArgumentError, "reflection_lm must be configured for GEPA optimization. Set config.reflection_lm to a DSPy::LM instance."
|
2482
|
+
end
|
2483
|
+
|
2741
2484
|
super(metric: metric, config: @config)
|
2742
2485
|
end
|
2743
2486
|
|
@@ -2749,6 +2492,7 @@ module DSPy
|
|
2749
2492
|
valset: T.nilable(T::Array[T.untyped])
|
2750
2493
|
).returns(OptimizationResult)
|
2751
2494
|
end
|
2495
|
+
|
2752
2496
|
def compile(program, trainset:, valset: nil)
|
2753
2497
|
validate_inputs(program, trainset, valset)
|
2754
2498
|
|
@@ -2758,200 +2502,13 @@ module DSPy
|
|
2758
2502
|
num_generations: @config.num_generations,
|
2759
2503
|
population_size: @config.population_size
|
2760
2504
|
}) do
|
2761
|
-
#
|
2762
|
-
|
2763
|
-
perform_simple_optimization(program, trainset, valset)
|
2764
|
-
else
|
2765
|
-
# Phase 2 - Full GEPA genetic algorithm implementation
|
2766
|
-
perform_gepa_optimization(program, trainset, valset)
|
2767
|
-
end
|
2505
|
+
# Always perform full GEPA genetic algorithm optimization
|
2506
|
+
perform_gepa_optimization(program, trainset, valset)
|
2768
2507
|
end
|
2769
2508
|
end
|
2770
2509
|
|
2771
2510
|
private
|
2772
2511
|
|
2773
|
-
# Simple optimization implementation for testing
|
2774
|
-
sig do
|
2775
|
-
params(
|
2776
|
-
program: T.untyped,
|
2777
|
-
trainset: T::Array[T.untyped],
|
2778
|
-
valset: T.nilable(T::Array[T.untyped])
|
2779
|
-
).returns(OptimizationResult)
|
2780
|
-
end
|
2781
|
-
def perform_simple_optimization(program, trainset, valset)
|
2782
|
-
return basic_result(program) unless program.respond_to?(:signature_class)
|
2783
|
-
|
2784
|
-
original_description = program.signature_class.description
|
2785
|
-
best_program = program
|
2786
|
-
best_score = simple_evaluate_program(program, trainset)
|
2787
|
-
|
2788
|
-
# Try different instruction variations
|
2789
|
-
instruction_variants = generate_instruction_variants(original_description)
|
2790
|
-
|
2791
|
-
instruction_variants.each_with_index do |variant, index|
|
2792
|
-
emit_event('instruction_variant_test', {
|
2793
|
-
variant: variant,
|
2794
|
-
iteration: index + 1,
|
2795
|
-
total_variants: instruction_variants.size
|
2796
|
-
})
|
2797
|
-
|
2798
|
-
# Create modified program
|
2799
|
-
modified_program = create_program_with_instruction(program, variant)
|
2800
|
-
score = simple_evaluate_program(modified_program, trainset)
|
2801
|
-
|
2802
|
-
if score > best_score
|
2803
|
-
best_program = modified_program
|
2804
|
-
best_score = score
|
2805
|
-
|
2806
|
-
emit_event('improvement_found', {
|
2807
|
-
new_score: score,
|
2808
|
-
previous_score: best_score,
|
2809
|
-
instruction: variant
|
2810
|
-
})
|
2811
|
-
end
|
2812
|
-
end
|
2813
|
-
|
2814
|
-
OptimizationResult.new(
|
2815
|
-
optimized_program: best_program,
|
2816
|
-
scores: { accuracy: best_score },
|
2817
|
-
history: {
|
2818
|
-
original_score: simple_evaluate_program(program, trainset),
|
2819
|
-
variants_tested: instruction_variants.size,
|
2820
|
-
best_instruction: best_program.signature_class.description
|
2821
|
-
},
|
2822
|
-
best_score_name: 'accuracy',
|
2823
|
-
best_score_value: best_score,
|
2824
|
-
metadata: {
|
2825
|
-
optimizer: 'GEPA',
|
2826
|
-
mode: 'Simple Optimization',
|
2827
|
-
reflection_lm: @config.reflection_lm
|
2828
|
-
}
|
2829
|
-
)
|
2830
|
-
end
|
2831
|
-
|
2832
|
-
# Generate variations of the instruction
|
2833
|
-
sig { params(original_instruction: String).returns(T::Array[String]) }
|
2834
|
-
def generate_instruction_variants(original_instruction)
|
2835
|
-
variants = []
|
2836
|
-
|
2837
|
-
# Add "step by step" variant
|
2838
|
-
unless original_instruction.include?("step")
|
2839
|
-
variants << "#{original_instruction} Think step by step."
|
2840
|
-
end
|
2841
|
-
|
2842
|
-
# Add "detailed" variant
|
2843
|
-
unless original_instruction.include?("detail")
|
2844
|
-
variants << "#{original_instruction} Provide detailed reasoning."
|
2845
|
-
end
|
2846
|
-
|
2847
|
-
# Add "careful" variant
|
2848
|
-
unless original_instruction.include?("careful")
|
2849
|
-
variants << "Be careful and accurate. #{original_instruction}"
|
2850
|
-
end
|
2851
|
-
|
2852
|
-
variants.take(3) # Limit to 3 variants for simple mode
|
2853
|
-
end
|
2854
|
-
|
2855
|
-
# Create a new program instance with modified instruction using DSPy.rb dynamic capabilities
|
2856
|
-
sig { params(original_program: T.untyped, new_instruction: String).returns(T.untyped) }
|
2857
|
-
def create_program_with_instruction(original_program, new_instruction)
|
2858
|
-
case original_program
|
2859
|
-
when DSPy::Predict
|
2860
|
-
# DSPy::Predict has built-in support for instruction modification
|
2861
|
-
original_program.with_instruction(new_instruction)
|
2862
|
-
when DSPy::Module
|
2863
|
-
# For custom DSPy::Module classes, create new instance with updated predictors
|
2864
|
-
create_modified_module_instance(original_program, new_instruction)
|
2865
|
-
else
|
2866
|
-
# For other types (like test doubles), check available methods
|
2867
|
-
if original_program.respond_to?(:with_instruction)
|
2868
|
-
original_program.with_instruction(new_instruction)
|
2869
|
-
elsif original_program.respond_to?(:signature_class)
|
2870
|
-
# Create new DSPy::Predict with the same signature but new instruction
|
2871
|
-
signature_class = original_program.signature_class
|
2872
|
-
DSPy::Predict.new(signature_class).with_instruction(new_instruction)
|
2873
|
-
else
|
2874
|
-
# Fallback: return original if we can't modify
|
2875
|
-
emit_event('program_modification_fallback', {
|
2876
|
-
program_type: original_program.class.name,
|
2877
|
-
reason: 'No modification method available'
|
2878
|
-
})
|
2879
|
-
original_program
|
2880
|
-
end
|
2881
|
-
end
|
2882
|
-
rescue => e
|
2883
|
-
emit_event('program_modification_error', {
|
2884
|
-
error: e.message,
|
2885
|
-
program_type: original_program.class.name
|
2886
|
-
})
|
2887
|
-
# Return original program on error
|
2888
|
-
original_program
|
2889
|
-
end
|
2890
|
-
|
2891
|
-
# Create modified version of custom DSPy::Module instance (for main GEPA class)
|
2892
|
-
sig { params(original_module: DSPy::Module, new_instruction: String).returns(DSPy::Module) }
|
2893
|
-
def create_modified_module_instance(original_module, new_instruction)
|
2894
|
-
begin
|
2895
|
-
# Create a new instance of the same class
|
2896
|
-
new_module = original_module.class.new
|
2897
|
-
|
2898
|
-
# Try to find and update any internal predictors
|
2899
|
-
original_module.instance_variables.each do |var_name|
|
2900
|
-
var_value = original_module.instance_variable_get(var_name)
|
2901
|
-
|
2902
|
-
if var_value.is_a?(DSPy::Predict)
|
2903
|
-
# Update the instruction for internal predictors
|
2904
|
-
modified_predictor = var_value.with_instruction(new_instruction)
|
2905
|
-
new_module.instance_variable_set(var_name, modified_predictor)
|
2906
|
-
else
|
2907
|
-
# Copy other instance variables as-is
|
2908
|
-
new_module.instance_variable_set(var_name, var_value)
|
2909
|
-
end
|
2910
|
-
end
|
2911
|
-
|
2912
|
-
new_module
|
2913
|
-
rescue => e
|
2914
|
-
emit_event('module_modification_error', {
|
2915
|
-
error: e.message,
|
2916
|
-
module_class: original_module.class.name
|
2917
|
-
})
|
2918
|
-
# Fallback to original module
|
2919
|
-
original_module
|
2920
|
-
end
|
2921
|
-
end
|
2922
|
-
|
2923
|
-
# Simple evaluation for testing (different from base class evaluate_program)
|
2924
|
-
sig { params(program: T.untyped, trainset: T::Array[T.untyped]).returns(Float) }
|
2925
|
-
def simple_evaluate_program(program, trainset)
|
2926
|
-
return 0.0 unless @metric
|
2927
|
-
|
2928
|
-
scores = trainset.map do |example|
|
2929
|
-
prediction = program.call(**example.input_values)
|
2930
|
-
@metric.call(example, prediction).to_f
|
2931
|
-
rescue => e
|
2932
|
-
emit_event('evaluation_error', { error: e.message, example_id: example.object_id.to_s })
|
2933
|
-
0.0
|
2934
|
-
end
|
2935
|
-
|
2936
|
-
scores.sum / scores.size
|
2937
|
-
end
|
2938
|
-
|
2939
|
-
# Return basic result when simple optimization isn't applicable
|
2940
|
-
sig { params(program: T.untyped).returns(OptimizationResult) }
|
2941
|
-
def basic_result(program)
|
2942
|
-
OptimizationResult.new(
|
2943
|
-
optimized_program: program,
|
2944
|
-
scores: { gepa_score: 0.0 },
|
2945
|
-
history: { phase: 'Phase 1 - Basic Structure' },
|
2946
|
-
best_score_name: 'gepa_score',
|
2947
|
-
best_score_value: 0.0,
|
2948
|
-
metadata: {
|
2949
|
-
optimizer: 'GEPA',
|
2950
|
-
implementation_status: 'Phase 1 - Infrastructure Complete'
|
2951
|
-
}
|
2952
|
-
)
|
2953
|
-
end
|
2954
|
-
|
2955
2512
|
# Complete GEPA genetic algorithm optimization
|
2956
2513
|
sig do
|
2957
2514
|
params(
|
@@ -2968,11 +2525,11 @@ module DSPy
|
|
2968
2525
|
mutation_engine = create_mutation_engine
|
2969
2526
|
crossover_engine = create_crossover_engine
|
2970
2527
|
pareto_selector = create_pareto_selector(fitness_evaluator)
|
2971
|
-
|
2528
|
+
|
2972
2529
|
# Initialize trace collection for reflection
|
2973
2530
|
trace_collector = TraceCollector.new
|
2974
2531
|
optimization_run_id = "gepa-run-#{SecureRandom.hex(4)}"
|
2975
|
-
|
2532
|
+
|
2976
2533
|
emit_event('gepa_optimization_start', {
|
2977
2534
|
optimization_run_id: optimization_run_id,
|
2978
2535
|
num_generations: @config.num_generations,
|
@@ -2980,17 +2537,17 @@ module DSPy
|
|
2980
2537
|
mutation_rate: @config.mutation_rate,
|
2981
2538
|
crossover_rate: @config.crossover_rate
|
2982
2539
|
})
|
2983
|
-
|
2540
|
+
|
2984
2541
|
begin
|
2985
2542
|
# Run the complete genetic algorithm evolution
|
2986
2543
|
evolution_result = genetic_engine.run_evolution(program, trainset)
|
2987
|
-
|
2544
|
+
|
2988
2545
|
# Collect traces for reflection analysis
|
2989
2546
|
execution_traces = trace_collector.traces_for_run(optimization_run_id)
|
2990
|
-
|
2547
|
+
|
2991
2548
|
# Generate reflection insights on the optimization process
|
2992
2549
|
reflection_result = reflection_engine.reflect_with_llm(execution_traces)
|
2993
|
-
|
2550
|
+
|
2994
2551
|
# Evaluate final candidate on validation set if provided
|
2995
2552
|
final_validation_score = if valset && !valset.empty?
|
2996
2553
|
validation_fitness = fitness_evaluator.evaluate_candidate(evolution_result[:best_candidate], valset)
|
@@ -2998,7 +2555,7 @@ module DSPy
|
|
2998
2555
|
else
|
2999
2556
|
evolution_result[:best_fitness].overall_score
|
3000
2557
|
end
|
3001
|
-
|
2558
|
+
|
3002
2559
|
emit_event('gepa_optimization_complete', {
|
3003
2560
|
optimization_run_id: optimization_run_id,
|
3004
2561
|
best_fitness: evolution_result[:best_fitness].overall_score,
|
@@ -3006,7 +2563,7 @@ module DSPy
|
|
3006
2563
|
validation_score: final_validation_score,
|
3007
2564
|
reflection_confidence: reflection_result.confidence
|
3008
2565
|
})
|
3009
|
-
|
2566
|
+
|
3010
2567
|
# Create comprehensive optimization result
|
3011
2568
|
OptimizationResult.new(
|
3012
2569
|
optimized_program: evolution_result[:best_candidate],
|
@@ -3030,7 +2587,7 @@ module DSPy
|
|
3030
2587
|
best_score_value: evolution_result[:best_fitness].overall_score,
|
3031
2588
|
metadata: {
|
3032
2589
|
optimizer: 'GEPA',
|
3033
|
-
reflection_lm: @config.reflection_lm,
|
2590
|
+
reflection_lm: @config.reflection_lm&.model,
|
3034
2591
|
implementation_status: 'Phase 2 - Complete Implementation',
|
3035
2592
|
optimization_run_id: optimization_run_id,
|
3036
2593
|
reflection_insights: {
|
@@ -3047,7 +2604,7 @@ module DSPy
|
|
3047
2604
|
},
|
3048
2605
|
component_versions: {
|
3049
2606
|
genetic_engine: 'v2.0',
|
3050
|
-
fitness_evaluator: 'v2.0',
|
2607
|
+
fitness_evaluator: 'v2.0',
|
3051
2608
|
reflection_engine: 'v2.0',
|
3052
2609
|
mutation_engine: 'v2.0',
|
3053
2610
|
crossover_engine: 'v2.0',
|
@@ -3055,20 +2612,20 @@ module DSPy
|
|
3055
2612
|
}
|
3056
2613
|
}
|
3057
2614
|
)
|
3058
|
-
|
2615
|
+
|
3059
2616
|
rescue => e
|
3060
2617
|
emit_event('gepa_optimization_error', {
|
3061
2618
|
optimization_run_id: optimization_run_id,
|
3062
2619
|
error: e.message,
|
3063
2620
|
backtrace: e.backtrace&.take(5)
|
3064
2621
|
})
|
3065
|
-
|
2622
|
+
|
3066
2623
|
# Return fallback result on optimization failure
|
3067
2624
|
fallback_fitness = fitness_evaluator.evaluate_candidate(program, trainset)
|
3068
|
-
|
2625
|
+
|
3069
2626
|
OptimizationResult.new(
|
3070
2627
|
optimized_program: program,
|
3071
|
-
scores: {
|
2628
|
+
scores: {
|
3072
2629
|
fitness_score: fallback_fitness.overall_score,
|
3073
2630
|
primary_score: fallback_fitness.primary_score,
|
3074
2631
|
**fallback_fitness.secondary_scores
|
@@ -3079,11 +2636,11 @@ module DSPy
|
|
3079
2636
|
phase: 'Phase 2 - Error Recovery',
|
3080
2637
|
error: e.message
|
3081
2638
|
},
|
3082
|
-
best_score_name: 'fitness_score',
|
2639
|
+
best_score_name: 'fitness_score',
|
3083
2640
|
best_score_value: fallback_fitness.overall_score,
|
3084
2641
|
metadata: {
|
3085
2642
|
optimizer: 'GEPA',
|
3086
|
-
reflection_lm: @config.reflection_lm,
|
2643
|
+
reflection_lm: @config.reflection_lm&.model,
|
3087
2644
|
implementation_status: 'Phase 2 - Error Recovery',
|
3088
2645
|
optimization_run_id: optimization_run_id,
|
3089
2646
|
error_details: {
|
@@ -3095,48 +2652,48 @@ module DSPy
|
|
3095
2652
|
)
|
3096
2653
|
end
|
3097
2654
|
end
|
3098
|
-
|
2655
|
+
|
3099
2656
|
# Create and configure fitness evaluator
|
3100
2657
|
sig { returns(FitnessEvaluator) }
|
3101
2658
|
def create_fitness_evaluator
|
3102
2659
|
FitnessEvaluator.new(primary_metric: @metric, config: @config)
|
3103
2660
|
end
|
3104
|
-
|
2661
|
+
|
3105
2662
|
# Create and configure genetic engine
|
3106
2663
|
sig { params(fitness_evaluator: FitnessEvaluator).returns(GeneticEngine) }
|
3107
2664
|
def create_genetic_engine(fitness_evaluator)
|
3108
|
-
GeneticEngine.new(config: @config,
|
2665
|
+
GeneticEngine.new(config: @config, fitness_evaluator: fitness_evaluator)
|
3109
2666
|
end
|
3110
|
-
|
2667
|
+
|
3111
2668
|
# Create and configure reflection engine
|
3112
2669
|
sig { returns(ReflectionEngine) }
|
3113
2670
|
def create_reflection_engine
|
3114
2671
|
ReflectionEngine.new(@config)
|
3115
2672
|
end
|
3116
|
-
|
3117
|
-
# Create and configure mutation engine
|
2673
|
+
|
2674
|
+
# Create and configure mutation engine
|
3118
2675
|
sig { returns(MutationEngine) }
|
3119
2676
|
def create_mutation_engine
|
3120
2677
|
MutationEngine.new(config: @config)
|
3121
2678
|
end
|
3122
|
-
|
2679
|
+
|
3123
2680
|
# Create and configure crossover engine
|
3124
2681
|
sig { returns(CrossoverEngine) }
|
3125
2682
|
def create_crossover_engine
|
3126
2683
|
CrossoverEngine.new(config: @config)
|
3127
2684
|
end
|
3128
|
-
|
2685
|
+
|
3129
2686
|
# Create and configure pareto selector
|
3130
2687
|
sig { params(fitness_evaluator: FitnessEvaluator).returns(ParetoSelector) }
|
3131
2688
|
def create_pareto_selector(fitness_evaluator)
|
3132
2689
|
ParetoSelector.new(evaluator: fitness_evaluator, config: @config)
|
3133
2690
|
end
|
3134
|
-
|
2691
|
+
|
3135
2692
|
# Calculate execution timespan from traces
|
3136
2693
|
sig { params(traces: T::Array[ExecutionTrace]).returns(Float) }
|
3137
2694
|
def calculate_execution_timespan(traces)
|
3138
2695
|
return 0.0 if traces.size < 2
|
3139
|
-
|
2696
|
+
|
3140
2697
|
timestamps = traces.map(&:timestamp).sort
|
3141
2698
|
(timestamps.last - timestamps.first).to_f
|
3142
2699
|
end
|
@@ -3147,9 +2704,9 @@ module DSPy
|
|
3147
2704
|
module GEPAFeedbackMetric
|
3148
2705
|
extend T::Sig
|
3149
2706
|
extend T::Helpers
|
3150
|
-
|
2707
|
+
|
3151
2708
|
interface!
|
3152
|
-
|
2709
|
+
|
3153
2710
|
# Evaluates prediction and provides score with optional feedback
|
3154
2711
|
sig do
|
3155
2712
|
abstract
|
@@ -3166,11 +2723,11 @@ module DSPy
|
|
3166
2723
|
# Extended prediction result with score and feedback
|
3167
2724
|
class ScoreWithFeedback < T::Struct
|
3168
2725
|
extend T::Sig
|
3169
|
-
|
2726
|
+
|
3170
2727
|
const :score, Float
|
3171
2728
|
const :feedback, T.nilable(String)
|
3172
2729
|
const :prediction, DSPy::Prediction
|
3173
|
-
|
2730
|
+
|
3174
2731
|
sig { params(score: Float, prediction: DSPy::Prediction, feedback: T.nilable(String)).void }
|
3175
2732
|
def initialize(score:, prediction:, feedback: nil)
|
3176
2733
|
super
|
@@ -3180,7 +2737,7 @@ module DSPy
|
|
3180
2737
|
# Module Evaluator - Evaluates DSPy modules with metrics and feedback
|
3181
2738
|
class ModuleEvaluator
|
3182
2739
|
extend T::Sig
|
3183
|
-
|
2740
|
+
|
3184
2741
|
sig do
|
3185
2742
|
params(
|
3186
2743
|
student: T.untyped, # DSPy::Module or similar callable
|
@@ -3224,9 +2781,9 @@ module DSPy
|
|
3224
2781
|
def evaluate_batch(batch, candidate_instruction, capture_traces: true)
|
3225
2782
|
program = build_program(candidate_instruction)
|
3226
2783
|
results = []
|
3227
|
-
|
2784
|
+
|
3228
2785
|
batch.each do |example|
|
3229
|
-
begin
|
2786
|
+
begin
|
3230
2787
|
# Execute program on example
|
3231
2788
|
prediction = if program.respond_to?(:call)
|
3232
2789
|
program.call(**example.input_values)
|
@@ -3235,11 +2792,11 @@ module DSPy
|
|
3235
2792
|
else
|
3236
2793
|
raise "Program must respond to :call or :forward"
|
3237
2794
|
end
|
3238
|
-
|
2795
|
+
|
3239
2796
|
# Get collected traces (if trace collection is enabled)
|
3240
2797
|
# Note: TraceCollector automatically collects via event subscriptions
|
3241
2798
|
traces = capture_traces ? @trace_collector.traces : []
|
3242
|
-
|
2799
|
+
|
3243
2800
|
# Evaluate with metric
|
3244
2801
|
# Try with traces first (for GEPAFeedbackMetric), fallback to standard metric
|
3245
2802
|
begin
|
@@ -3257,7 +2814,7 @@ module DSPy
|
|
3257
2814
|
raise arg_error
|
3258
2815
|
end
|
3259
2816
|
end
|
3260
|
-
|
2817
|
+
|
3261
2818
|
# Ensure we always have a ScoreWithFeedback object
|
3262
2819
|
if score_result.is_a?(ScoreWithFeedback)
|
3263
2820
|
results << score_result
|
@@ -3269,14 +2826,14 @@ module DSPy
|
|
3269
2826
|
feedback: nil
|
3270
2827
|
)
|
3271
2828
|
end
|
3272
|
-
|
2829
|
+
|
3273
2830
|
rescue => e
|
3274
2831
|
DSPy.logger.error("Evaluation error: #{e.message}")
|
3275
2832
|
# Return zero score on failure
|
3276
2833
|
results << 0.0
|
3277
2834
|
end
|
3278
2835
|
end
|
3279
|
-
|
2836
|
+
|
3280
2837
|
results
|
3281
2838
|
end
|
3282
2839
|
|
@@ -3292,21 +2849,21 @@ module DSPy
|
|
3292
2849
|
end
|
3293
2850
|
def make_reflective_dataset(examples, predictions, scores, threshold: 0.5)
|
3294
2851
|
reflective_data = []
|
3295
|
-
|
2852
|
+
|
3296
2853
|
examples.zip(predictions, scores).each do |example, prediction, score|
|
3297
2854
|
# Extract score value
|
3298
2855
|
score_value = score.is_a?(ScoreWithFeedback) ? score.score : score
|
3299
|
-
|
2856
|
+
|
3300
2857
|
# Include failed predictions (below threshold)
|
3301
2858
|
next if score_value >= threshold
|
3302
|
-
|
2859
|
+
|
3303
2860
|
# Extract feedback if available
|
3304
2861
|
feedback = if score.is_a?(ScoreWithFeedback) && score.feedback
|
3305
2862
|
score.feedback
|
3306
2863
|
else
|
3307
2864
|
"Low performance (score: #{score_value.round(2)})"
|
3308
2865
|
end
|
3309
|
-
|
2866
|
+
|
3310
2867
|
reflective_data << {
|
3311
2868
|
'input' => example.input_values,
|
3312
2869
|
'expected' => example.expected_values,
|
@@ -3315,7 +2872,7 @@ module DSPy
|
|
3315
2872
|
'feedback' => feedback
|
3316
2873
|
}
|
3317
2874
|
end
|
3318
|
-
|
2875
|
+
|
3319
2876
|
reflective_data
|
3320
2877
|
end
|
3321
2878
|
|
@@ -3358,32 +2915,32 @@ module DSPy
|
|
3358
2915
|
end
|
3359
2916
|
def analyze_failures_and_propose(current_instruction, reflective_dataset)
|
3360
2917
|
return [current_instruction] if reflective_dataset.empty?
|
3361
|
-
|
2918
|
+
|
3362
2919
|
# Extract common failure patterns
|
3363
2920
|
feedback_texts = reflective_dataset.map { |data| data['feedback'] }.compact
|
3364
|
-
|
2921
|
+
|
3365
2922
|
# Simple heuristic-based proposals
|
3366
2923
|
proposals = []
|
3367
|
-
|
2924
|
+
|
3368
2925
|
# If many failures, suggest more detailed instruction
|
3369
2926
|
if reflective_dataset.size >= 3
|
3370
2927
|
proposals << "#{current_instruction} Please provide step-by-step reasoning."
|
3371
2928
|
end
|
3372
|
-
|
2929
|
+
|
3373
2930
|
# If feedback mentions specific issues, address them
|
3374
2931
|
if feedback_texts.any? { |fb| fb.include?('unclear') || fb.include?('ambiguous') }
|
3375
2932
|
proposals << "#{current_instruction} Be specific and clear in your response."
|
3376
2933
|
end
|
3377
|
-
|
2934
|
+
|
3378
2935
|
if feedback_texts.any? { |fb| fb.include?('incomplete') || fb.include?('missing') }
|
3379
2936
|
proposals << "#{current_instruction} Ensure your answer is complete and addresses all aspects."
|
3380
2937
|
end
|
3381
|
-
|
2938
|
+
|
3382
2939
|
# Always include at least one proposal
|
3383
2940
|
proposals << "#{current_instruction.strip}. Think carefully before responding." if proposals.empty?
|
3384
|
-
|
2941
|
+
|
3385
2942
|
proposals.uniq.take(3) # Return up to 3 proposals
|
3386
2943
|
end
|
3387
2944
|
end
|
3388
2945
|
end
|
3389
|
-
end
|
2946
|
+
end
|