dspy 0.28.2 → 0.29.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +2 -3
  3. data/lib/dspy/code_act.rb +14 -1
  4. data/lib/dspy/datasets/ade.rb +90 -0
  5. data/lib/dspy/datasets.rb +8 -0
  6. data/lib/dspy/lm.rb +4 -8
  7. data/lib/dspy/mixins/struct_builder.rb +17 -25
  8. data/lib/dspy/module.rb +12 -1
  9. data/lib/dspy/observability/async_span_processor.rb +67 -93
  10. data/lib/dspy/observability.rb +43 -1
  11. data/lib/dspy/predict.rb +10 -0
  12. data/lib/dspy/propose/dataset_summary_generator.rb +36 -3
  13. data/lib/dspy/propose/grounded_proposer.rb +118 -11
  14. data/lib/dspy/re_act.rb +13 -0
  15. data/lib/dspy/reflection_lm.rb +36 -0
  16. data/lib/dspy/teleprompt/gepa.rb +448 -2803
  17. data/lib/dspy/teleprompt/mipro_v2.rb +564 -65
  18. data/lib/dspy/teleprompt/utils.rb +8 -3
  19. data/lib/dspy/version.rb +2 -2
  20. data/lib/dspy.rb +3 -2
  21. data/lib/gepa/api.rb +61 -0
  22. data/lib/gepa/core/engine.rb +226 -0
  23. data/lib/gepa/core/evaluation_batch.rb +26 -0
  24. data/lib/gepa/core/result.rb +92 -0
  25. data/lib/gepa/core/state.rb +231 -0
  26. data/lib/gepa/logging/experiment_tracker.rb +54 -0
  27. data/lib/gepa/logging/logger.rb +57 -0
  28. data/lib/gepa/logging.rb +9 -0
  29. data/lib/gepa/proposer/base.rb +27 -0
  30. data/lib/gepa/proposer/merge_proposer.rb +424 -0
  31. data/lib/gepa/proposer/reflective_mutation/base.rb +48 -0
  32. data/lib/gepa/proposer/reflective_mutation/reflective_mutation.rb +188 -0
  33. data/lib/gepa/strategies/batch_sampler.rb +91 -0
  34. data/lib/gepa/strategies/candidate_selector.rb +97 -0
  35. data/lib/gepa/strategies/component_selector.rb +57 -0
  36. data/lib/gepa/strategies/instruction_proposal.rb +120 -0
  37. data/lib/gepa/telemetry.rb +122 -0
  38. data/lib/gepa/utils/pareto.rb +119 -0
  39. data/lib/gepa.rb +21 -0
  40. metadata +42 -4
  41. data/lib/dspy/teleprompt/simple_optimizer.rb +0 -503
@@ -1,2945 +1,590 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- require 'ostruct'
3
+ require 'logger'
4
+ require 'set'
4
5
  require 'sorbet-runtime'
5
6
  require_relative 'teleprompter'
6
- require_relative '../events/subscriber_mixin'
7
+ require_relative 'utils'
8
+ require_relative '../../gepa'
7
9
 
8
10
  module DSPy
9
11
  module Teleprompt
10
- # GEPA: Genetic-Pareto Reflective Prompt Evolution optimizer
11
- # Uses natural language reflection to evolve prompts through genetic algorithms
12
- # and Pareto frontier selection for maintaining diverse high-performing candidates
13
12
  class GEPA < Teleprompter
14
13
  extend T::Sig
14
+ DEFAULT_CONFIG = {
15
+ max_metric_calls: 32,
16
+ minibatch_size: 2,
17
+ perfect_score: 1.0,
18
+ skip_perfect_score: true,
19
+ use_merge: true,
20
+ max_merge_invocations: 5
21
+ }.freeze
15
22
 
16
- # Enum for mutation operation types
17
- class MutationType < T::Enum
18
- enums do
19
- Rewrite = new
20
- Expand = new
21
- Simplify = new
22
- Combine = new
23
- Rephrase = new
24
- end
25
- end
26
-
27
- # Enum for crossover operation types
28
- class CrossoverType < T::Enum
29
- enums do
30
- Uniform = new
31
- Blend = new
32
- Structured = new
33
- end
23
+ def self.configure
24
+ yield(default_config) if block_given?
34
25
  end
35
26
 
36
- # Immutable execution trace record using Ruby's Data class
37
- # Captures execution events for GEPA's reflective analysis
38
- class ExecutionTrace < Data.define(
39
- :trace_id,
40
- :event_name,
41
- :timestamp,
42
- :span_id,
43
- :attributes,
44
- :metadata
45
- )
46
- extend T::Sig
47
-
48
- # Type aliases for better type safety
49
- AttributesHash = T.type_alias { T::Hash[T.any(String, Symbol), T.untyped] }
50
- MetadataHash = T.type_alias { T::Hash[Symbol, T.untyped] }
51
-
52
- sig do
53
- params(
54
- trace_id: String,
55
- event_name: String,
56
- timestamp: Time,
57
- span_id: T.nilable(String),
58
- attributes: AttributesHash,
59
- metadata: T.nilable(MetadataHash)
60
- ).void
61
- end
62
-
63
- def initialize(trace_id:, event_name:, timestamp:, span_id: nil, attributes: {}, metadata: nil)
64
- # Freeze nested structures for true immutability
65
- frozen_attributes = attributes.freeze
66
- frozen_metadata = metadata&.freeze
67
-
68
- super(
69
- trace_id: trace_id,
70
- event_name: event_name,
71
- timestamp: timestamp,
72
- span_id: span_id,
73
- attributes: frozen_attributes,
74
- metadata: frozen_metadata
75
- )
76
- end
77
-
78
- # Check if this is an LLM-related trace
79
- sig { returns(T::Boolean) }
80
- def llm_trace?
81
- event_name.start_with?('llm.') || event_name.start_with?('lm.')
82
- end
83
-
84
- # Check if this is a module-related trace
85
- sig { returns(T::Boolean) }
86
- def module_trace?
87
- !llm_trace? && (
88
- event_name.include?('chain_of_thought') ||
89
- event_name.include?('react') ||
90
- event_name.include?('codeact') ||
91
- event_name.include?('predict')
92
- )
93
- end
94
-
95
- # Extract token usage from LLM traces
96
- sig { returns(Integer) }
97
- def token_usage
98
- return 0 unless llm_trace?
99
-
100
- # Try different token attribute keys
101
- [
102
- 'gen_ai.usage.total_tokens',
103
- 'gen_ai.usage.prompt_tokens',
104
- 'tokens',
105
- :tokens
106
- ].each do |key|
107
- value = attributes[key]
108
- return value.to_i if value
109
- end
110
-
111
- 0
112
- end
113
-
114
- # Convert to hash representation
115
- sig { returns(T::Hash[Symbol, T.untyped]) }
116
- def to_h
117
- {
118
- trace_id: trace_id,
119
- event_name: event_name,
120
- timestamp: timestamp,
121
- span_id: span_id,
122
- attributes: attributes,
123
- metadata: metadata
124
- }
125
- end
126
-
127
- # Extract prompt text from trace
128
- sig { returns(T.nilable(String)) }
129
- def prompt_text
130
- attributes[:prompt] || attributes['prompt']
131
- end
132
-
133
- # Extract response text from trace
134
- sig { returns(T.nilable(String)) }
135
- def response_text
136
- attributes[:response] || attributes['response']
137
- end
138
-
139
- # Get the model used in this trace
140
- sig { returns(T.nilable(String)) }
141
- def model_name
142
- attributes['gen_ai.request.model'] || attributes[:model]
143
- end
144
-
145
- # Get the signature class name
146
- sig { returns(T.nilable(String)) }
147
- def signature_name
148
- attributes['dspy.signature'] || attributes[:signature]
149
- end
150
- end
151
-
152
- # Immutable reflection analysis result using Ruby's Data class
153
- # Stores the output of GEPA's reflective analysis on execution traces
154
- class ReflectionResult < Data.define(
155
- :trace_id,
156
- :diagnosis,
157
- :improvements,
158
- :confidence,
159
- :reasoning,
160
- :suggested_mutations,
161
- :metadata
162
- )
163
- extend T::Sig
164
-
165
- # Type aliases for better type safety
166
- ImprovementsList = T.type_alias { T::Array[String] }
167
- MutationsList = T.type_alias { T::Array[Symbol] }
168
- MetadataHash = T.type_alias { T::Hash[Symbol, T.untyped] }
169
-
170
- sig do
171
- params(
172
- trace_id: String,
173
- diagnosis: String,
174
- improvements: ImprovementsList,
175
- confidence: Float,
176
- reasoning: String,
177
- suggested_mutations: MutationsList,
178
- metadata: MetadataHash
179
- ).void
180
- end
181
- def initialize(trace_id:, diagnosis:, improvements:, confidence:, reasoning:, suggested_mutations:, metadata:)
182
- # Validate confidence score
183
- if confidence < 0.0 || confidence > 1.0
184
- raise ArgumentError, "confidence must be between 0 and 1, got #{confidence}"
185
- end
186
-
187
- # Freeze nested structures for true immutability
188
- frozen_improvements = improvements.freeze
189
- frozen_mutations = suggested_mutations.freeze
190
- frozen_metadata = metadata.freeze
191
-
192
- super(
193
- trace_id: trace_id,
194
- diagnosis: diagnosis,
195
- improvements: frozen_improvements,
196
- confidence: confidence,
197
- reasoning: reasoning,
198
- suggested_mutations: frozen_mutations,
199
- metadata: frozen_metadata
200
- )
201
- end
202
-
203
- # Check if this reflection has high confidence (>= 0.8)
204
- sig { returns(T::Boolean) }
205
- def high_confidence?
206
- confidence >= 0.8
207
- end
208
-
209
- # Check if this reflection suggests actionable changes
210
- sig { returns(T::Boolean) }
211
- def actionable?
212
- improvements.any? || suggested_mutations.any?
213
- end
214
-
215
- # Get mutations sorted by priority (simple alphabetical for Phase 1)
216
- sig { returns(MutationsList) }
217
- def mutation_priority
218
- suggested_mutations.sort
219
- end
220
-
221
- # Convert to hash representation
222
- sig { returns(T::Hash[Symbol, T.untyped]) }
223
- def to_h
224
- {
225
- trace_id: trace_id,
226
- diagnosis: diagnosis,
227
- improvements: improvements,
228
- confidence: confidence,
229
- reasoning: reasoning,
230
- suggested_mutations: suggested_mutations,
231
- metadata: metadata
232
- }
233
- end
234
-
235
- # Generate a concise summary of this reflection
236
- sig { returns(String) }
237
- def summary
238
- confidence_pct = (confidence * 100).round
239
- mutation_list = suggested_mutations.map(&:to_s).join(', ')
240
-
241
- "#{diagnosis.split('.').first}. " \
242
- "Confidence: #{confidence_pct}%. " \
243
- "#{improvements.size} improvements suggested. " \
244
- "Mutations: #{mutation_list}."
245
- end
246
-
247
- # Check if reflection model was used
248
- sig { returns(T.nilable(String)) }
249
- def reflection_model
250
- metadata[:reflection_model]
251
- end
252
-
253
- # Get token usage from reflection analysis
254
- sig { returns(Integer) }
255
- def token_usage
256
- metadata[:token_usage] || 0
257
- end
258
-
259
- # Get analysis duration in milliseconds
260
- sig { returns(Integer) }
261
- def analysis_duration_ms
262
- metadata[:analysis_duration_ms] || 0
263
- end
264
- end
265
-
266
- # TraceCollector aggregates execution traces from DSPy events
267
- # Uses SubscriberMixin for class-level event subscriptions
268
- class TraceCollector
269
- include DSPy::Events::SubscriberMixin
270
- extend T::Sig
271
-
272
- sig { void }
273
- def initialize
274
- @traces = T.let([], T::Array[ExecutionTrace])
275
- @traces_mutex = T.let(Mutex.new, Mutex)
276
- setup_subscriptions
277
- end
278
-
279
- sig { returns(T::Array[ExecutionTrace]) }
280
- attr_reader :traces
281
-
282
- # Get count of collected traces
283
- sig { returns(Integer) }
284
- def collected_count
285
- @traces_mutex.synchronize { @traces.size }
286
- end
287
-
288
- # Collect trace from event data
289
- sig { params(event_name: String, event_data: T::Hash[T.any(String, Symbol), T.untyped]).void }
290
- def collect_trace(event_name, event_data)
291
- @traces_mutex.synchronize do
292
- trace_id = event_data['trace_id'] || event_data[:trace_id] || generate_trace_id
293
-
294
- # Avoid duplicates
295
- return if @traces.any? { |t| t.trace_id == trace_id }
296
-
297
- timestamp = event_data['timestamp'] || event_data[:timestamp] || Time.now
298
- span_id = event_data['span_id'] || event_data[:span_id]
299
- attributes = event_data['attributes'] || event_data[:attributes] || {}
300
- metadata = event_data['metadata'] || event_data[:metadata] || {}
301
-
302
- trace = ExecutionTrace.new(
303
- trace_id: trace_id,
304
- event_name: event_name,
305
- timestamp: timestamp,
306
- span_id: span_id,
307
- attributes: attributes,
308
- metadata: metadata
309
- )
310
-
311
- @traces << trace
312
- end
313
- end
314
-
315
- # Get traces for a specific optimization run
316
- sig { params(run_id: String).returns(T::Array[ExecutionTrace]) }
317
- def traces_for_run(run_id)
318
- @traces_mutex.synchronize do
319
- @traces.select do |trace|
320
- metadata = trace.metadata
321
- metadata && metadata[:optimization_run_id] == run_id
322
- end
323
- end
324
- end
325
-
326
- # Get only LLM traces
327
- sig { returns(T::Array[ExecutionTrace]) }
328
- def llm_traces
329
- @traces_mutex.synchronize { @traces.select(&:llm_trace?) }
330
- end
331
-
332
- # Get only module traces
333
- sig { returns(T::Array[ExecutionTrace]) }
334
- def module_traces
335
- @traces_mutex.synchronize { @traces.select(&:module_trace?) }
336
- end
337
-
338
- # Clear all collected traces
339
- sig { void }
340
- def clear
341
- @traces_mutex.synchronize { @traces.clear }
342
- end
343
-
344
- private
345
-
346
- # Set up event subscriptions using SubscriberMixin
347
- sig { void }
348
- def setup_subscriptions
349
- # Subscribe to LLM events
350
- self.class.add_subscription('llm.*') do |name, attrs|
351
- collect_trace(name, attrs)
352
- end
353
-
354
- # Subscribe to module events
355
- self.class.add_subscription('*.reasoning_complete') do |name, attrs|
356
- collect_trace(name, attrs)
357
- end
358
-
359
- self.class.add_subscription('*.predict_complete') do |name, attrs|
360
- collect_trace(name, attrs)
361
- end
362
- end
363
-
364
- # Generate unique trace ID
365
- sig { returns(String) }
366
- def generate_trace_id
367
- "gepa-trace-#{SecureRandom.hex(4)}"
368
- end
369
- end
370
-
371
- # ReflectionEngine performs natural language reflection on execution traces
372
- # This is the core component that analyzes traces and generates improvement insights
373
- class ReflectionEngine
374
- extend T::Sig
375
-
376
- sig { returns(GEPAConfig) }
377
- attr_reader :config
378
-
379
- sig { params(config: T.nilable(GEPAConfig)).void }
380
- def initialize(config = nil)
381
- @config = config || GEPAConfig.new
382
- end
383
-
384
- # Perform reflective analysis on execution traces
385
- sig { params(traces: T::Array[ExecutionTrace]).returns(ReflectionResult) }
386
- def reflect_on_traces(traces)
387
- reflection_id = generate_reflection_id
388
-
389
- if traces.empty?
390
- return ReflectionResult.new(
391
- trace_id: reflection_id,
392
- diagnosis: 'No traces available for analysis',
393
- improvements: [],
394
- confidence: 0.0,
395
- reasoning: 'Cannot provide reflection without execution traces',
396
- suggested_mutations: [],
397
- metadata: {
398
- reflection_model: @config.reflection_lm&.model,
399
- analysis_timestamp: Time.now,
400
- trace_count: 0
401
- }
402
- )
403
- end
404
-
405
- patterns = analyze_execution_patterns(traces)
406
- improvements = generate_improvement_suggestions(patterns)
407
- mutations = suggest_mutations(patterns)
408
-
409
- # For Phase 1, we generate a simple rule-based analysis
410
- # Future phases will use LLM-based reflection
411
- diagnosis = generate_diagnosis(patterns)
412
- reasoning = generate_reasoning(patterns, traces)
413
- confidence = calculate_confidence(patterns)
414
-
415
- ReflectionResult.new(
416
- trace_id: reflection_id,
417
- diagnosis: diagnosis,
418
- improvements: improvements,
419
- confidence: confidence,
420
- reasoning: reasoning,
421
- suggested_mutations: mutations,
422
- metadata: {
423
- reflection_model: @config.reflection_lm&.model,
424
- analysis_timestamp: Time.now,
425
- trace_count: traces.size,
426
- token_usage: 0 # Phase 1 doesn't use actual LLM reflection
427
- }
428
- )
429
- end
430
-
431
- # Analyze patterns in execution traces
432
- sig { params(traces: T::Array[ExecutionTrace]).returns(T::Hash[Symbol, T.untyped]) }
433
- def analyze_execution_patterns(traces)
434
- llm_traces = traces.select(&:llm_trace?)
435
- module_traces = traces.select(&:module_trace?)
436
-
437
- total_tokens = llm_traces.sum(&:token_usage)
438
- unique_models = llm_traces.map(&:model_name).compact.uniq
439
-
440
- {
441
- llm_traces_count: llm_traces.size,
442
- module_traces_count: module_traces.size,
443
- total_tokens: total_tokens,
444
- unique_models: unique_models,
445
- avg_response_length: calculate_avg_response_length(llm_traces),
446
- trace_timespan: calculate_timespan(traces)
447
- }
448
- end
449
-
450
- # Generate improvement suggestions based on patterns
451
- sig { params(patterns: T::Hash[Symbol, T.untyped]).returns(T::Array[String]) }
452
- def generate_improvement_suggestions(patterns)
453
- suggestions = []
454
-
455
- if patterns[:total_tokens] > 500
456
- suggestions << 'Consider reducing prompt length to lower token usage'
457
- end
458
-
459
- if patterns[:avg_response_length] < 10
460
- suggestions << 'Responses seem brief - consider asking for more detailed explanations'
461
- end
462
-
463
- if patterns[:llm_traces_count] > patterns[:module_traces_count] * 3
464
- suggestions << 'High LLM usage detected - consider optimizing reasoning chains'
465
- end
466
-
467
- if patterns[:unique_models].size > 1
468
- suggestions << 'Multiple models used - consider standardizing on one model for consistency'
469
- end
470
-
471
- suggestions << 'Add step-by-step reasoning instructions' if suggestions.empty?
472
- suggestions
473
- end
474
-
475
- # Suggest mutation operations based on patterns
476
- sig { params(patterns: T::Hash[Symbol, T.untyped]).returns(T::Array[Symbol]) }
477
- def suggest_mutations(patterns)
478
- mutations = []
479
-
480
- avg_length = patterns[:avg_response_length] || 0
481
- total_tokens = patterns[:total_tokens] || 0
482
- llm_count = patterns[:llm_traces_count] || 0
483
-
484
- mutations << :expand if avg_length < 15
485
- mutations << :simplify if total_tokens > 300
486
- mutations << :combine if llm_count > 2
487
- mutations << :rewrite if llm_count == 1
488
- mutations << :rephrase if mutations.empty?
489
-
490
- mutations.uniq
491
- end
492
-
493
- public
494
-
495
- # Perform LLM-based reflection on execution traces using DSPy::Predict
496
- sig { params(traces: T::Array[ExecutionTrace]).returns(ReflectionResult) }
497
- def reflect_with_llm(traces)
498
- return reflect_on_traces(traces) if traces.empty?
499
-
500
- begin
501
- # Use DSPy::Predict for analysis instead of raw prompts
502
- prediction = analyze_traces_with_dspy(traces)
503
- convert_prediction_to_reflection_result(prediction, traces)
504
- rescue => e
505
- # Fallback to rule-based analysis on LLM failure
506
- fallback_result = reflect_on_traces(traces)
507
- fallback_result.class.new(
508
- trace_id: fallback_result.trace_id,
509
- diagnosis: "LLM reflection failed (#{e.message}), using fallback analysis: #{fallback_result.diagnosis}",
510
- improvements: fallback_result.improvements,
511
- confidence: [fallback_result.confidence * 0.5, 0.5].min,
512
- reasoning: "Fallback to rule-based analysis after LLM error: #{fallback_result.reasoning}",
513
- suggested_mutations: fallback_result.suggested_mutations,
514
- metadata: fallback_result.metadata.merge(
515
- llm_error: e.message,
516
- fallback_used: true
517
- )
518
- )
519
- end
520
- end
521
-
522
- # Generate structured reflection prompt for LLM (public API)
523
- sig { params(traces: T::Array[ExecutionTrace]).returns(String) }
524
- def generate_reflection_prompt(traces)
525
- if traces.empty?
526
- return <<~PROMPT
527
- You are analyzing execution traces for a genetic algorithm-based prompt optimization system called GEPA.
528
-
529
- **Task**: Analyze execution patterns and provide optimization recommendations.
530
-
531
- **Context**: No execution traces available.
532
-
533
- Please provide your analysis in the following JSON format:
534
- {
535
- "diagnosis": "Brief description of what you observed",
536
- "improvements": ["List of actionable improvement suggestions"],
537
- "confidence": 0.0,
538
- "reasoning": "Your reasoning process",
539
- "suggested_mutations": ["expand", "rewrite", "simplify", "combine", "rephrase"],
540
- "insights": {
541
- "pattern_detected": "no_data",
542
- "optimization_opportunity": "data_collection"
543
- }
544
- }
545
- PROMPT
546
- end
547
-
548
- summary = trace_summary_for_reflection(traces)
549
- insights = extract_optimization_insights(traces)
550
-
551
- <<~PROMPT
552
- You are analyzing execution traces for a genetic algorithm-based prompt optimization system called GEPA.
553
-
554
- **Task**: Analyze execution patterns and provide optimization recommendations for prompt evolution.
555
-
556
- **Execution Summary**:
557
- #{summary}
558
-
559
- **Optimization Context**:
560
- - This is part of a genetic algorithm for prompt optimization
561
- - Available mutation types: rewrite, expand, simplify, combine, rephrase
562
- - Goal is to improve prompt effectiveness through iterative evolution
563
- - Focus on actionable insights that can guide mutation and crossover operations
564
-
565
- **Key Optimization Insights**:
566
- #{insights.map { |k, v| "- #{k}: #{v.is_a?(Hash) ? v.values.join(', ') : v}" }.join("\n")}
567
-
568
- **Sample Traces**:
569
- #{format_traces_for_prompt(traces.take(3))}
570
-
571
- Please analyze these execution patterns and provide optimization recommendations in the following JSON format:
572
- {
573
- "diagnosis": "Brief description of execution patterns and issues identified",
574
- "improvements": ["List of 2-4 specific, actionable improvement suggestions"],
575
- "confidence": 0.85,
576
- "reasoning": "Your detailed reasoning process for the analysis",
577
- "suggested_mutations": ["List of 2-3 mutation types that would be most beneficial"],
578
- "insights": {
579
- "pattern_detected": "primary_pattern_identified",
580
- "optimization_opportunity": "key_area_for_improvement"
581
- }
582
- }
583
-
584
- Focus on practical recommendations that will improve prompt performance through genetic algorithm evolution.
585
- PROMPT
586
- end
587
-
588
- # Parse LLM reflection response into ReflectionResult (public API)
589
- sig { params(response_text: String, original_traces: T::Array[ExecutionTrace]).returns(ReflectionResult) }
590
- def parse_llm_reflection(response_text, original_traces)
591
- reflection_id = generate_reflection_id
592
-
593
- begin
594
- parsed = JSON.parse(response_text)
595
-
596
- # Extract and validate components
597
- diagnosis = parsed['diagnosis'] || 'LLM reflection analysis'
598
- improvements = Array(parsed['improvements']).select { |i| i.is_a?(String) && !i.strip.empty? }
599
- confidence = [parsed['confidence'].to_f, 1.0].min
600
- reasoning = parsed['reasoning'] || 'LLM-based analysis of execution traces'
601
-
602
- # Validate and sanitize mutation suggestions
603
- raw_mutations = Array(parsed['suggested_mutations'])
604
- valid_mutations = raw_mutations.filter_map do |mut|
605
- mutation_symbol = mut.to_s.downcase.to_sym
606
- if [:rewrite, :expand, :simplify, :combine, :rephrase].include?(mutation_symbol)
607
- mutation_symbol
608
- end
609
- end.uniq
610
-
611
- # Ensure we have at least one valid mutation suggestion
612
- valid_mutations = [:rewrite] if valid_mutations.empty?
613
-
614
- ReflectionResult.new(
615
- trace_id: reflection_id,
616
- diagnosis: diagnosis,
617
- improvements: improvements,
618
- confidence: confidence,
619
- reasoning: reasoning,
620
- suggested_mutations: valid_mutations,
621
- metadata: {
622
- reflection_model: @config.reflection_lm&.model,
623
- analysis_timestamp: Time.now,
624
- trace_count: original_traces.size,
625
- token_usage: estimate_token_usage(response_text),
626
- llm_based: true,
627
- insights: parsed['insights'] || {}
628
- }
629
- )
630
-
631
- rescue JSON::ParserError => e
632
- # Handle malformed JSON response
633
- ReflectionResult.new(
634
- trace_id: reflection_id,
635
- diagnosis: "LLM reflection JSON parsing error: #{e.message}",
636
- improvements: ['Review prompt structure and LLM response format'],
637
- confidence: 0.3,
638
- reasoning: "Failed to parse LLM reflection response as valid JSON",
639
- suggested_mutations: [:rewrite],
640
- metadata: {
641
- reflection_model: @config.reflection_lm&.model,
642
- analysis_timestamp: Time.now,
643
- trace_count: original_traces.size,
644
- token_usage: 0,
645
- parsing_error: e.message,
646
- raw_response: response_text.length > 500 ? "#{response_text[0..500]}..." : response_text
647
- }
648
- )
649
- end
650
- end
651
-
652
- # Create comprehensive trace summary for reflection (public API)
653
- sig { params(traces: T::Array[ExecutionTrace]).returns(String) }
654
- def trace_summary_for_reflection(traces)
655
- return "No execution traces available" if traces.empty?
656
-
657
- llm_traces = traces.select(&:llm_trace?)
658
- module_traces = traces.select(&:module_trace?)
659
-
660
- total_tokens = llm_traces.sum(&:token_usage)
661
- unique_models = llm_traces.map(&:model_name).compact.uniq
662
- timespan = calculate_timespan(traces)
663
-
664
- avg_response_length = if llm_traces.any?
665
- total_length = llm_traces.sum { |t| t.response_text&.length || 0 }
666
- total_length / llm_traces.size
667
- else
668
- 0
669
- end
670
-
671
- <<~SUMMARY
672
- Total traces: #{traces.size}
673
- LLM interactions: #{llm_traces.size}
674
- Module calls: #{module_traces.size}
675
- Total tokens: #{total_tokens}
676
- Models used: #{unique_models.join(', ')}
677
- Average response length: #{avg_response_length} characters
678
- Execution timespan: #{timespan.round(2)} seconds
679
- SUMMARY
680
- end
681
-
682
- # Extract optimization insights from trace analysis (public API)
683
- sig { params(traces: T::Array[ExecutionTrace]).returns(T::Hash[Symbol, T.untyped]) }
684
- def extract_optimization_insights(traces)
685
- llm_traces = traces.select(&:llm_trace?)
686
-
687
- insights = {
688
- token_efficiency: analyze_token_efficiency(llm_traces),
689
- response_quality: analyze_response_quality(llm_traces),
690
- model_consistency: analyze_model_consistency(llm_traces)
691
- }
692
-
693
- insights
694
- end
695
-
696
- # Reflection with optimization context (public API)
697
- sig { params(traces: T::Array[ExecutionTrace], context: T::Hash[Symbol, T.untyped]).returns(ReflectionResult) }
698
- def reflection_with_context(traces, context)
699
- base_result = reflect_with_llm(traces)
700
-
701
- # Incorporate context into reasoning
702
- context_reasoning = "Generation #{context[:generation] || 'unknown'} analysis. "
703
- context_reasoning += "Population size: #{context[:population_size] || 'unknown'}. "
704
-
705
- if context[:current_best_score]
706
- context_reasoning += "Current best score: #{context[:current_best_score]}. "
707
- end
708
-
709
- # Adjust mutation suggestions based on history
710
- adjusted_mutations = adjust_mutations_for_history(
711
- base_result.suggested_mutations,
712
- context[:mutation_history] || [],
713
- context[:recent_performance_trend]
714
- )
715
-
716
- ReflectionResult.new(
717
- trace_id: base_result.trace_id,
718
- diagnosis: base_result.diagnosis,
719
- improvements: base_result.improvements,
720
- confidence: base_result.confidence,
721
- reasoning: context_reasoning + base_result.reasoning,
722
- suggested_mutations: adjusted_mutations,
723
- metadata: base_result.metadata.merge(optimization_context: context)
724
- )
725
- end
726
-
727
- public
728
-
729
- # Create signature for trace reflection analysis (public API)
730
- sig { returns(T.class_of(DSPy::Signature)) }
731
- def create_trace_reflection_signature
732
- @trace_reflection_signature ||= Class.new(DSPy::Signature) do
733
- description "Analyze execution traces from GEPA optimization system and provide actionable optimization insights"
734
-
735
- input do
736
- const :execution_summary, String, description: "Summary of execution traces and performance patterns"
737
- const :optimization_context, String, description: "Context about the genetic algorithm optimization goals"
738
- const :key_insights, String, description: "Key insights extracted from trace analysis"
739
- const :sample_traces, String, description: "Representative execution trace samples"
740
- end
741
-
742
- output do
743
- const :diagnosis, String, description: "Brief description of execution patterns and issues identified"
744
- const :improvements, T::Array[String], description: "List of 2-4 specific actionable improvement suggestions"
745
- const :confidence, Float, description: "Confidence level in analysis (0.0 to 1.0)"
746
- const :reasoning, String, description: "Detailed reasoning process for the analysis"
747
- const :suggested_mutations, T::Array[String], description: "List of 2-3 most beneficial mutation types from: rewrite, expand, simplify, combine, rephrase"
748
- const :pattern_detected, String, description: "Primary pattern identified in execution traces"
749
- const :optimization_opportunity, String, description: "Key area identified for performance improvement"
750
- end
751
- end
752
- end
753
-
754
- # Perform LLM analysis using DSPy::Predict (public API)
755
- sig { params(traces: T::Array[ExecutionTrace]).returns(T.untyped) }
756
- def analyze_traces_with_dspy(traces)
757
- raise ArgumentError, "reflection_lm must be configured on GEPAConfig for LLM-based reflection" unless @config.reflection_lm
758
-
759
- predictor = DSPy::Predict.new(create_trace_reflection_signature)
760
-
761
- # Configure predictor to use reflection-specific LM
762
- predictor.config.lm = @config.reflection_lm
763
-
764
- # Prepare input data
765
- summary = trace_summary_for_reflection(traces)
766
- insights = extract_optimization_insights(traces)
767
- insights_text = insights.map { |k, v| "- #{k}: #{v.is_a?(Hash) ? v.values.join(', ') : v}" }.join("\n")
768
-
769
- # Get LLM analysis
770
- T.unsafe(predictor.call(
771
- execution_summary: summary,
772
- optimization_context: "GEPA genetic algorithm for prompt optimization. Available mutations: rewrite, expand, simplify, combine, rephrase. Goal: improve prompt effectiveness through iterative evolution.",
773
- key_insights: insights_text,
774
- sample_traces: format_traces_for_prompt(traces.take(3))
775
- ))
776
- end
777
-
778
- # Convert DSPy prediction to ReflectionResult (public API)
779
- sig { params(prediction: T.untyped, original_traces: T::Array[ExecutionTrace]).returns(ReflectionResult) }
780
- def convert_prediction_to_reflection_result(prediction, original_traces)
781
- reflection_id = generate_reflection_id
782
-
783
- # Extract and validate prediction results
784
- diagnosis = prediction.diagnosis || 'DSPy reflection analysis'
785
- improvements = Array(prediction.improvements).select { |i| i.is_a?(String) && !i.strip.empty? }
786
- confidence = [[prediction.confidence&.to_f || 0.0, 1.0].min, 0.0].max
787
- reasoning = prediction.reasoning || 'DSPy-based analysis of execution traces'
788
-
789
- # Validate mutation suggestions
790
- valid_mutations = Array(prediction.suggested_mutations).filter_map do |mut|
791
- mutation_symbol = mut.to_s.downcase.to_sym
792
- if [:rewrite, :expand, :simplify, :combine, :rephrase].include?(mutation_symbol)
793
- mutation_symbol
794
- end
795
- end.uniq
796
-
797
- # Ensure we have at least one valid mutation suggestion
798
- valid_mutations = [:rewrite] if valid_mutations.empty?
799
-
800
- ReflectionResult.new(
801
- trace_id: reflection_id,
802
- diagnosis: diagnosis,
803
- improvements: improvements,
804
- confidence: confidence,
805
- reasoning: reasoning,
806
- suggested_mutations: valid_mutations,
807
- metadata: {
808
- reflection_model: @config.reflection_lm&.model,
809
- analysis_timestamp: Time.now,
810
- trace_count: original_traces.size,
811
- token_usage: estimate_token_usage(prediction.to_s),
812
- llm_based: true,
813
- dspy_prediction: true,
814
- insights: {
815
- pattern_detected: prediction.pattern_detected || "unknown_pattern",
816
- optimization_opportunity: prediction.optimization_opportunity || "general_optimization"
817
- }
818
- }
819
- )
820
- end
821
-
822
- private
823
-
824
- # Generate unique reflection ID
825
- sig { returns(String) }
826
- def generate_reflection_id
827
- "reflection-#{SecureRandom.hex(4)}"
828
- end
829
-
830
# Summarize execution health from aggregated trace patterns.
# Checks are ordered: token pressure first, then missing LLM activity, then terse responses.
sig { params(patterns: T::Hash[Symbol, T.untyped]).returns(String) }
def generate_diagnosis(patterns)
  return 'High token usage indicates potential inefficiency in prompt design' if patterns[:total_tokens] > 400
  return 'No LLM interactions found - execution may not be working as expected' if patterns[:llm_traces_count] == 0
  return 'Responses are unusually brief which may indicate prompt clarity issues' if patterns[:avg_response_length] < 10

  'Execution patterns appear normal with room for optimization'
end
843
-
844
# Compose a one-paragraph rationale from trace counts and token totals.
sig { params(patterns: T::Hash[Symbol, T.untyped], traces: T::Array[ExecutionTrace]).returns(String) }
def generate_reasoning(patterns, traces)
  sentences = [
    "Analyzed #{traces.size} execution traces",
    "#{patterns[:llm_traces_count]} LLM interactions",
    "#{patterns[:module_traces_count]} module operations",
    "Total token usage: #{patterns[:total_tokens]}"
  ]

  "#{sentences.join('. ')}."
end
856
-
857
# Heuristic confidence score: starts at 0.7, rises with trace volume
# (capped at +0.2) and drops by 0.1 when token usage is heavy; capped at 1.0.
sig { params(patterns: T::Hash[Symbol, T.untyped]).returns(Float) }
def calculate_confidence(patterns)
  confidence = 0.7

  # Each observed trace adds 0.02, up to ten traces.
  observed_traces = patterns[:llm_traces_count] + patterns[:module_traces_count]
  confidence += [observed_traces, 10].min * 0.02

  # Heavy token usage reduces trust in the analysis.
  confidence -= 0.1 if patterns[:total_tokens] > 1000

  [confidence, 1.0].min
end
870
-
871
# Mean response length (characters, integer division) across LLM traces;
# traces with no response text count as zero. Returns 0 for an empty list.
sig { params(llm_traces: T::Array[ExecutionTrace]).returns(Integer) }
def calculate_avg_response_length(llm_traces)
  return 0 if llm_traces.empty?

  combined_length = llm_traces.sum { |trace| trace.response_text&.length || 0 }
  combined_length / llm_traces.size
end
883
-
884
# Seconds between the earliest and latest trace timestamps; 0.0 when
# fewer than two traces are available.
sig { params(traces: T::Array[ExecutionTrace]).returns(Float) }
def calculate_timespan(traces)
  return 0.0 if traces.size < 2

  earliest, latest = traces.map(&:timestamp).minmax
  (latest - earliest).to_f
end
892
-
893
-
894
# Render traces as a numbered list of "[event] prompt → response" lines,
# with prompt and response each truncated to 100 characters.
sig { params(traces: T::Array[ExecutionTrace]).returns(String) }
def format_traces_for_prompt(traces)
  lines = traces.each_with_index.map do |trace, position|
    prompt = truncate_text(trace.prompt_text || 'N/A', 100)
    response = truncate_text(trace.response_text || 'N/A', 100)
    "#{position + 1}. [#{trace.event_name}] #{prompt} → #{response}"
  end

  lines.join("\n")
end
903
-
904
# Crude token estimate: assume roughly four characters per token, rounded up.
sig { params(text: String).returns(Integer) }
def estimate_token_usage(text)
  text.length.fdiv(4).ceil
end
910
-
911
# Classify average per-trace token usage as good (<=200), moderate (<=400)
# or poor (>400) and attach matching suggestions. Empty input yields a
# 'no_data' status without an average.
sig { params(llm_traces: T::Array[ExecutionTrace]).returns(T::Hash[Symbol, T.untyped]) }
def analyze_token_efficiency(llm_traces)
  return { status: 'no_data', suggestions: [] } if llm_traces.empty?

  avg_tokens = llm_traces.sum(&:token_usage).to_f / llm_traces.size

  status, suggestions =
    if avg_tokens > 400
      ['poor', ['Consider reducing prompt length', 'Optimize instruction clarity']]
    elsif avg_tokens > 200
      ['moderate', ['Monitor token usage trends', 'Consider prompt optimization']]
    else
      ['good', ['Token usage appears efficient']]
    end

  {
    status: status,
    average_tokens: avg_tokens,
    suggestions: suggestions
  }
end
939
-
940
# Judge response consistency by the variance of response lengths:
# variance above 1000 is flagged as inconsistent with remediation advice.
sig { params(llm_traces: T::Array[ExecutionTrace]).returns(T::Hash[Symbol, T.untyped]) }
def analyze_response_quality(llm_traces)
  return { consistency: 'no_data', recommendations: [] } if llm_traces.empty?

  lengths = llm_traces.map { |trace| trace.response_text&.length || 0 }
  variance = calculate_variance(lengths)
  inconsistent = variance > 1000

  recommendations =
    if inconsistent
      [
        'Add response format guidelines',
        'Consider structured output templates'
      ]
    else
      ['Response quality appears consistent']
    end

  {
    consistency: inconsistent ? 'inconsistent' : 'consistent',
    variance: variance,
    recommendations: recommendations
  }
end
965
-
966
# Report how many distinct models the LLM traces used and advise
# consolidating when more than one appears.
sig { params(llm_traces: T::Array[ExecutionTrace]).returns(T::Hash[Symbol, T.untyped]) }
def analyze_model_consistency(llm_traces)
  distinct_models = llm_traces.map(&:model_name).compact.uniq

  advice =
    if distinct_models.size > 1
      'Consider using single model for consistency'
    else
      'Model usage is consistent'
    end

  {
    unique_models: distinct_models.size,
    models_used: distinct_models,
    recommendation: advice
  }
end
977
-
978
# Filter suggested mutations against recent history so the optimizer does
# not repeat the same strategy: anything used twice in the last five steps
# is dropped, a declining trend swaps expansion for simplification, and
# :rewrite is the fallback when nothing survives.
sig { params(suggested: T::Array[Symbol], history: T::Array[Symbol], trend: T.nilable(String)).returns(T::Array[Symbol]) }
def adjust_mutations_for_history(suggested, history, trend)
  usage_counts = history.last(5).tally

  # Drop mutations applied twice or more recently.
  remaining = suggested.reject { |candidate| usage_counts.fetch(candidate, 0) >= 2 }

  if trend == 'declining'
    # Expansion tends to hurt a declining trend; steer toward reduction.
    remaining = remaining.reject { |candidate| candidate == :expand }
    unless remaining.include?(:simplify) || remaining.include?(:rephrase)
      remaining += [:simplify, :rephrase]
    end
  end

  remaining.empty? ? [:rewrite] : remaining.uniq
end
998
-
999
# Population variance of the given integers; 0.0 when fewer than two values.
sig { params(values: T::Array[Integer]).returns(Float) }
def calculate_variance(values)
  return 0.0 if values.size < 2

  mean = values.sum.fdiv(values.size)
  values.sum { |value| (value - mean)**2 } / values.size
end
1008
-
1009
# Clip text to at most `length` characters, appending "..." when clipped.
sig { params(text: String, length: Integer).returns(String) }
def truncate_text(text, length)
  text.length <= length ? text : "#{text[0...length]}..."
end
1015
- end
1016
-
1017
# GeneticEngine orchestrates the genetic algorithm for prompt evolution.
# Manages population, selection, and evolution across generations:
# seed a population of instruction variants, score each member with the
# FitnessEvaluator, then repeatedly keep the top half and refill with
# mutated copies. Population members are arbitrary programs (duck-typed);
# only those responding to #signature_class get instruction mutations.
class GeneticEngine
  extend T::Sig

  # Optimizer configuration (population size, generation count, etc.).
  sig { returns(GEPAConfig) }
  attr_reader :config

  # Evaluator used to score every candidate against the training set.
  sig { returns(FitnessEvaluator) }
  attr_reader :fitness_evaluator

  # Current candidate programs; index-aligned with @fitness_scores after
  # the most recent evaluate_population call.
  sig { returns(T::Array[T.untyped]) }
  attr_reader :population

  # Zero-based generation counter, incremented by evolve_generation.
  sig { returns(Integer) }
  attr_reader :generation

  sig { params(config: GEPAConfig, fitness_evaluator: FitnessEvaluator).void }
  def initialize(config:, fitness_evaluator:)
    @config = config
    @fitness_evaluator = fitness_evaluator
    @population = T.let([], T::Array[T.untyped])
    @generation = 0
    @fitness_scores = T.let([], T::Array[FitnessScore])
  end

  # Initialize population with diverse instruction variants.
  # The original program is always member zero; remaining slots are filled
  # with instruction variants, then padded (randomly, via Array#sample)
  # with mutated or plain duplicates until population_size is reached.
  # Resets the generation counter to 0.
  sig { params(program: T.untyped).void }
  def initialize_population(program)
    @population = []

    # Start with original program
    @population << program

    # Generate instruction variants to fill population if program has signature_class
    if program.respond_to?(:signature_class) && program.signature_class.respond_to?(:description)
      original_instruction = program.signature_class.description
      if original_instruction && !original_instruction.empty?
        variants = generate_instruction_variants(original_instruction)
      else
        variants = []
      end
    else
      variants = []
    end

    # Create program copies with different instructions
    variants.take(@config.population_size - 1).each do |variant|
      variant_program = create_program_with_instruction(program, variant)
      @population << variant_program
    end

    # If we need more candidates, duplicate and mutate
    while @population.size < @config.population_size
      base_program = @population.sample
      if base_program.respond_to?(:signature_class) && base_program.signature_class.respond_to?(:description)
        instruction_variants = generate_instruction_variants(base_program.signature_class.description)
        if instruction_variants.any?
          mutated = create_program_with_instruction(base_program, instruction_variants.first)
          @population << mutated
        else
          # If no variants available, just duplicate the base program
          @population << base_program
        end
      else
        # If no signature_class available, just duplicate the base program
        @population << base_program
      end
    end

    @generation = 0
  end

  # Evaluate all population members on the training set.
  # Side effect: caches the scores in @fitness_scores (used later by
  # get_best_candidate and run_evolution's final result).
  sig { params(trainset: T::Array[T.untyped]).returns(T::Array[FitnessScore]) }
  def evaluate_population(trainset)
    @fitness_scores = @population.map do |candidate|
      @fitness_evaluator.evaluate_candidate(candidate, trainset)
    end

    @fitness_scores
  end

  # Evolve to next generation using selection and mutation.
  # Truncation selection: the top 50% (at least one) survive unchanged,
  # and the rest of the population is refilled with mutations of randomly
  # chosen survivors. Increments @generation.
  sig { params(trainset: T::Array[T.untyped]).void }
  def evolve_generation(trainset)
    current_scores = evaluate_population(trainset)

    # Simple selection: keep top 50% and mutate them
    sorted_indices = (0...@population.size).sort_by { |i| -current_scores[i].overall_score }
    survivors = sorted_indices.take([@config.population_size / 2, 1].max)

    new_population = []

    # Keep best performers
    survivors.each { |i| new_population << @population[i] }

    # Fill rest with mutations of survivors
    while new_population.size < @config.population_size
      parent_index = survivors.sample
      parent = @population[parent_index]

      # Generate mutation if parent has signature_class
      if parent.respond_to?(:signature_class) && parent.signature_class.respond_to?(:description)
        variants = generate_instruction_variants(parent.signature_class.description)
        mutated = create_program_with_instruction(parent, variants.first || parent.signature_class.description)
        new_population << mutated
      else
        # If no signature_class, just duplicate the parent
        new_population << parent
      end
    end

    @population = new_population
    @generation += 1
  end

  # Run complete evolution process: initialize, evaluate generation 0,
  # then evolve num_generations times. Returns a summary hash with
  # :best_candidate, :best_fitness, :generation_history (per-generation
  # best/avg fitness and diversity), :generation_count and :final_population.
  sig { params(program: T.untyped, trainset: T::Array[T.untyped]).returns(T::Hash[Symbol, T.untyped]) }
  def run_evolution(program, trainset)
    initialize_population(program)

    history = []

    # Initial evaluation
    initial_scores = evaluate_population(trainset)
    best_initial = initial_scores.max_by(&:overall_score)
    avg_initial = initial_scores.map(&:overall_score).sum / initial_scores.size
    history << {
      generation: 0,
      best_fitness: best_initial.overall_score,
      avg_fitness: avg_initial,
      diversity: population_diversity
    }

    # Evolution loop
    @config.num_generations.times do
      evolve_generation(trainset)
      scores = evaluate_population(trainset)
      best_score = scores.max_by(&:overall_score)
      avg_score = scores.map(&:overall_score).sum / scores.size

      history << {
        generation: @generation,
        best_fitness: best_score.overall_score,
        avg_fitness: avg_score,
        diversity: population_diversity
      }
    end

    best_fitness_score = @fitness_scores.max_by(&:overall_score)
    {
      best_candidate: get_best_candidate,
      # Zero-score placeholder in the (unexpected) case nothing was scored.
      best_fitness: best_fitness_score || FitnessScore.new(
        primary_score: 0.0,
        secondary_scores: {},
        overall_score: 0.0,
        metadata: {}
      ),
      generation_history: history,
      generation_count: @generation,
      final_population: @population.dup
    }
  end

  # Get the best performing candidate from current population, based on
  # the scores cached by the last evaluate_population call. Falls back to
  # the first member when no scores exist yet.
  sig { returns(T.untyped) }
  def get_best_candidate
    return @population.first if @fitness_scores.empty?

    best_index = @fitness_scores.each_with_index.max_by { |score, _| score.overall_score }[1]
    @population[best_index]
  end

  # Measure diversity of instructions in current population as the ratio
  # of unique instructions to total instructions (0.0..1.0). Members
  # without a signature_class are ignored.
  sig { returns(Float) }
  def population_diversity
    return 0.0 if @population.empty?

    # Only calculate diversity for programs that have signature_class
    instructions = @population.filter_map do |program|
      if program.respond_to?(:signature_class) && program.signature_class.respond_to?(:description)
        program.signature_class.description
      else
        nil
      end
    end

    return 0.0 if instructions.empty?

    unique_instructions = instructions.uniq.size
    unique_instructions.to_f / instructions.size.to_f
  end

  private

  # Generate instruction variants (similar to simple optimization).
  # Appends/prepends canned phrasing ("step by step", "detailed", …) when
  # the original does not already contain the keyword; result is shuffled,
  # so output order is non-deterministic.
  sig { params(original_instruction: String).returns(T::Array[String]) }
  def generate_instruction_variants(original_instruction)
    variants = []

    # Add "step by step" variant
    unless original_instruction.include?("step")
      variants << "#{original_instruction} Think step by step."
    end

    # Add "detailed" variant
    unless original_instruction.include?("detail")
      variants << "#{original_instruction} Provide detailed reasoning."
    end

    # Add "careful" variant
    unless original_instruction.include?("careful")
      variants << "Be careful and accurate. #{original_instruction}"
    end

    # Add "examples" variant
    unless original_instruction.include?("example")
      variants << "#{original_instruction} Use examples in your response."
    end

    # Add "precise" variant
    unless original_instruction.include?("precise")
      variants << "Be precise and specific. #{original_instruction}"
    end

    variants.shuffle.take(5) # Return up to 5 variants, shuffled
  end

  # Create program copy with modified instruction using DSPy.rb dynamic capabilities.
  # Dispatches on the program type; any error falls back to returning the
  # original program unchanged (best-effort mutation, never raises).
  sig { params(original_program: T.untyped, new_instruction: String).returns(T.untyped) }
  def create_program_with_instruction(original_program, new_instruction)
    case original_program
    when DSPy::Predict
      # DSPy::Predict has built-in support for instruction modification
      original_program.with_instruction(new_instruction)
    when DSPy::Module
      # For custom DSPy::Module classes, create new instance with updated predictors
      create_modified_module(original_program, new_instruction)
    else
      # For other types (like test doubles), check available methods
      if original_program.respond_to?(:with_instruction)
        original_program.with_instruction(new_instruction)
      elsif original_program.respond_to?(:signature_class)
        # Create new DSPy::Predict with the same signature but new instruction
        signature_class = original_program.signature_class
        DSPy::Predict.new(signature_class).with_instruction(new_instruction)
      else
        # Fallback: return original if we can't modify
        original_program
      end
    end
  rescue => e
    # Return original program on error
    original_program
  end

  # Create modified version of custom DSPy::Module (for GeneticEngine).
  # Rebuilds the module via Class#new (NOTE(review): assumes a zero-arg
  # constructor — confirm for modules with required initializer args),
  # copying instance variables across and swapping any DSPy::Predict
  # values for copies carrying the new instruction.
  sig { params(original_module: DSPy::Module, new_instruction: String).returns(DSPy::Module) }
  def create_modified_module(original_module, new_instruction)
    begin
      # Create a new instance of the same class
      new_module = original_module.class.new

      # Try to find and update any internal predictors
      original_module.instance_variables.each do |var_name|
        var_value = original_module.instance_variable_get(var_name)

        if var_value.is_a?(DSPy::Predict)
          # Update the instruction for internal predictors
          modified_predictor = var_value.with_instruction(new_instruction)
          new_module.instance_variable_set(var_name, modified_predictor)
        else
          # Copy other instance variables as-is
          new_module.instance_variable_set(var_name, var_value)
        end
      end

      new_module
    rescue => e
      # Fallback to original module
      original_module
    end
  end
end
1302
-
1303
# FitnessScore represents multi-dimensional evaluation results.
# Immutable value object: all scores are validated into [0.0, 1.0] at
# construction, and the hash members are frozen. Instances order by
# overall_score via Comparable.
class FitnessScore < T::Struct
  extend T::Sig
  include Comparable

  const :primary_score, Float
  const :secondary_scores, T::Hash[Symbol, Float]
  const :overall_score, Float
  const :metadata, T::Hash[Symbol, T.untyped]

  sig do
    params(
      primary_score: Float,
      secondary_scores: T::Hash[Symbol, Float],
      overall_score: Float,
      metadata: T.nilable(T::Hash[Symbol, T.untyped])
    ).void
  end
  def initialize(primary_score:, secondary_scores:, overall_score:, metadata: nil)
    # Reject out-of-range primary/overall scores up front.
    [primary_score, overall_score].each do |score|
      raise ArgumentError, "Score must be between 0.0 and 1.0, got #{score}" if score < 0.0 || score > 1.0
    end

    # Each secondary metric must also be a valid unit-interval score.
    secondary_scores.each do |name, score|
      raise ArgumentError, "Secondary score #{name} must be between 0.0 and 1.0, got #{score}" if score < 0.0 || score > 1.0
    end

    super(
      primary_score: primary_score,
      secondary_scores: secondary_scores.freeze,
      overall_score: overall_score,
      metadata: (metadata || {}).freeze
    )
  end

  # Order by overall score; nil for non-FitnessScore operands so
  # Comparable raises its usual ArgumentError.
  sig { params(other: FitnessScore).returns(T.nilable(Integer)) }
  def <=>(other)
    other.is_a?(FitnessScore) ? overall_score <=> other.overall_score : nil
  end

  # Check if this score is dominated by another (for Pareto analysis):
  # strictly worse overall, or tied overall with no secondary metric
  # where this score is strictly better.
  sig { params(other: FitnessScore).returns(T::Boolean) }
  def dominated_by?(other)
    if overall_score != other.overall_score
      overall_score < other.overall_score
    else
      secondary_scores.all? do |metric, score|
        score <= (other.secondary_scores[metric] || 0.0)
      end
    end
  end

  # Unweighted mean of the primary score and the requested secondary
  # objectives (missing objectives count as 0.0). With no objectives,
  # just the primary score.
  sig { params(objectives: T::Array[Symbol]).returns(Float) }
  def score_for_objectives(objectives)
    return primary_score if objectives.empty?

    selected = objectives.map { |objective| secondary_scores[objective] || 0.0 }
    (primary_score + selected.sum) / (objectives.size + 1)
  end
end
1372
-
1373
# FitnessEvaluator provides multi-dimensional evaluation of prompt candidates.
# Runs each candidate over a training set, scoring it on the caller-supplied
# primary metric (70% weight) plus built-in secondary metrics — token
# efficiency, response consistency and latency (30% weight combined).
class FitnessEvaluator
  extend T::Sig

  # Caller-supplied metric: called as primary_metric.(example, prediction),
  # result coerced with #to_f.
  sig { returns(T.proc.params(arg0: T.untyped, arg1: T.untyped).returns(T.untyped)) }
  attr_reader :primary_metric

  sig { returns(GEPAConfig) }
  attr_reader :config

  # Secondary metric procs, keyed by metric name.
  sig { returns(T::Hash[Symbol, T.proc.params(arg0: T.untyped, arg1: T.untyped).returns(T.untyped)]) }
  attr_reader :secondary_metrics

  sig do
    params(
      primary_metric: T.proc.params(arg0: T.untyped, arg1: T.untyped).returns(T.untyped),
      config: GEPAConfig,
      secondary_metrics: T.nilable(T::Hash[Symbol, T.proc.params(arg0: T.untyped, arg1: T.untyped).returns(T.untyped)])
    ).void
  end
  def initialize(primary_metric:, config:, secondary_metrics: nil)
    @primary_metric = primary_metric
    @config = config
    @secondary_metrics = secondary_metrics || default_secondary_metrics
    @trace_collector = TraceCollector.new
  end

  # Evaluate a single candidate program.
  # Per-example failures are swallowed: a failing prediction scores 0.0 and
  # is recorded with its error message in the metadata errors_count.
  # NOTE(review): an empty trainset raises ZeroDivisionError at the
  # primary_score average — consider guarding upstream.
  sig { params(program: T.untyped, trainset: T::Array[T.untyped]).returns(FitnessScore) }
  def evaluate_candidate(program, trainset)
    start_time = Time.now
    predictions = []
    traces = []

    # Collect primary metric scores and execution data
    primary_scores = trainset.map do |example|
      prediction_start = Time.now
      prediction = program.call(**example.input_values)
      prediction_time = Time.now - prediction_start

      predictions << {
        prediction: prediction,
        latency: prediction_time,
        example: example
      }

      @primary_metric.call(example, prediction).to_f
    rescue => e
      # Handle prediction errors
      predictions << {
        prediction: nil,
        latency: 0.0,
        example: example,
        error: e.message
      }
      0.0
    end

    primary_score = primary_scores.sum / primary_scores.size

    # Calculate secondary metrics
    secondary_scores = {}

    # Token efficiency (mock data for now - will be replaced with real trace collection)
    # NOTE(review): token counts here are random placeholders (50-149 per
    # prediction), so :token_efficiency is non-deterministic between runs.
    mock_traces = predictions.map.with_index do |pred, i|
      OpenStruct.new(token_usage: 50 + rand(100))
    end
    secondary_scores[:token_efficiency] = calculate_token_efficiency(mock_traces, predictions.size)

    # Response consistency - use first output field for any signature
    response_texts = predictions.map do |p|
      pred = p[:prediction]
      if pred && pred.respond_to?(:class) && pred.class.respond_to?(:props)
        # Get first output field name and value
        first_field = pred.class.props.keys.first
        first_field ? (pred.send(first_field)&.to_s || '') : ''
      else
        ''
      end
    end
    secondary_scores[:consistency] = calculate_consistency(response_texts)

    # Latency performance
    latencies = predictions.map { |p| p[:latency] }
    secondary_scores[:latency] = calculate_latency_score(latencies)

    # Calculate weighted overall score
    overall_score = calculate_overall_score(primary_score, secondary_scores)

    FitnessScore.new(
      primary_score: primary_score,
      secondary_scores: secondary_scores,
      overall_score: overall_score,
      metadata: {
        evaluation_time: Time.now - start_time,
        examples_count: trainset.size,
        errors_count: predictions.count { |p| p[:error] }
      }
    )
  end

  # Evaluate multiple candidates in batch (sequentially).
  sig { params(programs: T::Array[T.untyped], trainset: T::Array[T.untyped]).returns(T::Array[FitnessScore]) }
  def batch_evaluate(programs, trainset)
    programs.map { |program| evaluate_candidate(program, trainset) }
  end

  # Compare two fitness scores (positive if first is better).
  sig { params(score1: FitnessScore, score2: FitnessScore).returns(Float) }
  def compare_candidates(score1, score2)
    score1.overall_score - score2.overall_score
  end

  # Rank candidates by fitness (returns indices sorted by fitness, best first).
  sig { params(scores: T::Array[FitnessScore]).returns(T::Array[Integer]) }
  def rank_candidates(scores)
    scores.each_with_index.sort_by { |score, _| -score.overall_score }.map(&:last)
  end

  private

  # Default secondary metrics for fitness evaluation.
  # NOTE(review): evaluate_candidate calls the calculate_* helpers directly
  # rather than through these procs, so custom secondary_metrics passed to
  # the constructor are currently not consulted during evaluation.
  sig { returns(T::Hash[Symbol, T.proc.params(arg0: T.untyped, arg1: T.untyped).returns(T.untyped)]) }
  def default_secondary_metrics
    {
      token_efficiency: proc { |traces, count| calculate_token_efficiency(traces, count) },
      consistency: proc { |responses| calculate_consistency(responses) },
      latency: proc { |latencies| calculate_latency_score(latencies) }
    }
  end

  # Calculate token usage efficiency (lower usage = higher score).
  # Score is baseline/(baseline + avg), so 100 avg tokens per example ≈ 0.5.
  sig { params(traces: T::Array[T.untyped], example_count: Integer).returns(Float) }
  def calculate_token_efficiency(traces, example_count)
    return 1.0 if traces.empty? || example_count == 0

    total_tokens = traces.sum(&:token_usage)
    avg_tokens_per_example = total_tokens.to_f / example_count

    # Efficiency decreases as token usage increases
    # Assume 100 tokens per example is baseline (score 0.5)
    baseline_tokens = 100.0
    efficiency = baseline_tokens / (baseline_tokens + avg_tokens_per_example)

    [efficiency, 1.0].min
  end

  # Calculate consistency of responses (similar structure = higher score).
  # Mean pairwise Jaccard similarity over lowercase word sets; one or zero
  # responses are considered perfectly consistent.
  sig { params(responses: T::Array[String]).returns(Float) }
  def calculate_consistency(responses)
    return 1.0 if responses.empty? || responses.size == 1

    # Simple consistency measure: average word overlap between responses
    word_sets = responses.map { |response| response.downcase.split.to_set }

    total_similarity = 0.0
    comparisons = 0

    word_sets.each_with_index do |set1, i|
      word_sets[(i+1)..-1].each do |set2|
        intersection = set1 & set2
        union = set1 | set2

        similarity = union.empty? ? 0.0 : intersection.size.to_f / union.size
        total_similarity += similarity
        comparisons += 1
      end
    end

    comparisons == 0 ? 1.0 : total_similarity / comparisons
  end

  # Calculate latency performance score (faster = higher score).
  # Score is baseline/(baseline + avg), so a 2-second average ≈ 0.5.
  sig { params(latencies: T::Array[Float]).returns(Float) }
  def calculate_latency_score(latencies)
    return 1.0 if latencies.empty?

    avg_latency = latencies.sum / latencies.size

    # Penalize high latencies (assume 2 seconds is baseline for 0.5 score)
    baseline_latency = 2.0
    latency_score = baseline_latency / (baseline_latency + avg_latency)

    [latency_score, 1.0].min
  end

  # Calculate weighted overall score combining primary and secondary metrics.
  sig { params(primary_score: Float, secondary_scores: T::Hash[Symbol, Float]).returns(Float) }
  def calculate_overall_score(primary_score, secondary_scores)
    # Weight primary metric at 70%, secondary metrics at 30%
    primary_weight = 0.7
    secondary_weight = 0.3

    return primary_score if secondary_scores.empty?

    avg_secondary = secondary_scores.values.sum / secondary_scores.size
    overall = (primary_score * primary_weight) + (avg_secondary * secondary_weight)

    [overall, 1.0].min
  end
end
1574
-
1575
# InstructionProposer: Analyzes execution traces and generates improved
# instructions using LLM reflection. Builds an ad-hoc DSPy::Signature and
# DSPy::Predict call per proposal; every failure path falls back to the
# original instruction so proposing never raises.
class InstructionProposer
  extend T::Sig

  sig { params(config: GEPAConfig).void }
  def initialize(config:)
    @config = config
  end

  # Generate improved instruction based on execution traces and failures.
  # Returns the original instruction unchanged when there is nothing to
  # analyze or when the LLM-backed reflection raises.
  sig { params(original_instruction: String, execution_traces: T::Array[ExecutionTrace], failed_examples: T::Array[T.untyped]).returns(String) }
  def propose_instruction(original_instruction:, execution_traces:, failed_examples:)
    if execution_traces.empty? && failed_examples.empty?
      # No traces or failures to analyze, return original
      return original_instruction
    end

    # Use LLM-based reflection to generate improved instruction
    reflect_and_propose(
      original_instruction: original_instruction,
      execution_traces: execution_traces,
      failed_examples: failed_examples
    )
  rescue => e
    # Fallback to original instruction on error
    original_instruction
  end

  private

  sig { returns(GEPAConfig) }
  attr_reader :config

  # Use LLM reflection to propose improved instruction.
  # Summarizes traces and failures into text, feeds them to a dynamically
  # built improvement signature, and returns the model's improved
  # instruction (or the original when the call fails or yields nothing).
  sig { params(original_instruction: String, execution_traces: T::Array[ExecutionTrace], failed_examples: T::Array[T.untyped]).returns(String) }
  def reflect_and_propose(original_instruction:, execution_traces:, failed_examples:)
    # Create signature for instruction improvement
    improvement_signature = create_instruction_improvement_signature

    # Create predictor for instruction proposal
    proposer = DSPy::Predict.new(improvement_signature)

    # Analyze traces and failures
    trace_analysis = analyze_execution_traces(execution_traces)
    failure_analysis = analyze_failed_examples(failed_examples)

    # Generate improved instruction
    result = proposer.call(
      original_instruction: original_instruction,
      trace_analysis: trace_analysis,
      failure_analysis: failure_analysis,
      improvement_context: "GEPA prompt optimization for better performance"
    )

    result.improved_instruction || original_instruction
  rescue => e
    # Return original instruction if LLM call fails
    original_instruction
  end

  # Create signature for instruction improvement.
  # Returns a fresh anonymous DSPy::Signature subclass each call; inputs
  # are the textual analyses, outputs are the improved instruction plus
  # the model's reasoning and confidence.
  sig { returns(T.class_of(DSPy::Signature)) }
  def create_instruction_improvement_signature
    Class.new(DSPy::Signature) do
      description "Analyze execution traces and propose improved instructions for better AI system performance"

      input do
        const :original_instruction, String, description: "The current instruction/prompt being used"
        const :trace_analysis, String, description: "Analysis of execution traces showing patterns and issues"
        const :failure_analysis, String, description: "Analysis of failed examples and their patterns"
        const :improvement_context, String, description: "Context about what kind of improvement is needed"
      end

      output do
        const :improved_instruction, String, description: "Improved instruction that addresses identified issues"
        const :reasoning, String, description: "Explanation of why this improvement should work better"
        const :confidence, Float, description: "Confidence in the improvement (0.0-1.0)"
      end
    end
  end

  # Analyze execution traces to identify patterns.
  # Produces a multi-line human-readable summary: counts, token totals,
  # average response length, models used and the execution timespan.
  sig { params(traces: T::Array[ExecutionTrace]).returns(String) }
  def analyze_execution_traces(traces)
    return "No execution traces available" if traces.empty?

    llm_traces = traces.select(&:llm_trace?)
    module_traces = traces.select(&:module_trace?)

    analysis = []
    analysis << "Execution Trace Analysis:"
    analysis << "- Total traces: #{traces.size}"
    analysis << "- LLM interactions: #{llm_traces.size}"
    analysis << "- Module calls: #{module_traces.size}"

    if llm_traces.any?
      token_usage = llm_traces.sum(&:token_usage)
      # Integer division: average is truncated to whole characters.
      avg_response_length = llm_traces.map { |t| t.attributes['response']&.to_s&.length || 0 }.sum / llm_traces.size

      analysis << "- Total tokens used: #{token_usage}"
      analysis << "- Average response length: #{avg_response_length} characters"

      # Identify models used
      models = llm_traces.map { |t| t.attributes['gen_ai.request.model'] }.compact.uniq
      analysis << "- Models used: #{models.join(', ')}" if models.any?
    end

    # Analyze timing patterns
    if traces.size > 1
      timespan = traces.max_by(&:timestamp).timestamp - traces.min_by(&:timestamp).timestamp
      analysis << "- Execution timespan: #{timespan.round(2)} seconds"
    end

    analysis.join("\n")
  end

  # Analyze failed examples to identify failure patterns.
  # Lists the failure count, the input fields involved, and up to three
  # truncated input/expected samples for context.
  sig { params(failed_examples: T::Array[T.untyped]).returns(String) }
  def analyze_failed_examples(failed_examples)
    return "No failed examples to analyze" if failed_examples.empty?

    analysis = []
    analysis << "Failure Pattern Analysis:"
    analysis << "- Failed examples count: #{failed_examples.size}"

    # Group failures by type if possible
    if failed_examples.first.respond_to?(:input)
      input_patterns = failed_examples.map { |ex| ex.input.keys }.flatten.uniq
      analysis << "- Input fields involved: #{input_patterns.join(', ')}"
    end

    # Sample some failure cases for context
    sample_size = [failed_examples.size, 3].min
    analysis << "- Sample failures:"
    failed_examples.take(sample_size).each_with_index do |example, idx|
      if example.respond_to?(:input) && example.respond_to?(:expected_values)
        input_summary = example.input.values.first.to_s[0..50] + "..."
        expected = example.expected_values.values.first.to_s[0..30] + "..."
        analysis << "  #{idx + 1}. Input: #{input_summary} | Expected: #{expected}"
      end
    end

    analysis.join("\n")
  end
end
1720
-
1721
- # MutationEngine: Handles LLM-based prompt transformations for genetic evolution
1722
- class MutationEngine
1723
- extend T::Sig
1724
-
1725
- sig { returns(GEPAConfig) }
1726
- attr_reader :config
1727
-
1728
- sig { returns(InstructionProposer) }
1729
- attr_reader :instruction_proposer
1730
-
1731
- sig { params(config: GEPAConfig).void }
1732
- def initialize(config:)
1733
- @config = config
1734
- @instruction_proposer = InstructionProposer.new(config: config)
1735
- end
1736
-
1737
- # Mutate a single program with LLM-based instruction proposal
1738
- sig { params(program: T.untyped, execution_traces: T::Array[ExecutionTrace], failed_examples: T::Array[T.untyped]).returns(T.untyped) }
1739
- def mutate_program(program, execution_traces: [], failed_examples: [])
1740
- return program if rand > @config.mutation_rate
27
+ def self.default_config
28
+ @default_config ||= DEFAULT_CONFIG.dup
29
+ end
1741
30
 
1742
- begin
1743
- original_instruction = extract_instruction(program)
31
+ class NullExperimentTracker
32
+ extend T::Sig
33
+ attr_reader :events
1744
34
 
1745
- # Use LLM-based instruction proposal instead of hardcoded mutations
1746
- improved_instruction = @instruction_proposer.propose_instruction(
1747
- original_instruction: original_instruction,
1748
- execution_traces: execution_traces,
1749
- failed_examples: failed_examples
1750
- )
35
+ def initialize
36
+ @events = []
37
+ end
1751
38
 
1752
- create_mutated_program(program, improved_instruction)
1753
- rescue => e
1754
- emit_event('mutation_error', {
1755
- error: e.message,
1756
- program_type: program.class.name
1757
- })
1758
- # Return original program on mutation failure
1759
- program
1760
- end
39
+ sig { params(metrics: T::Hash[Symbol, T.untyped], step: T.nilable(Integer)).void }
40
+ def log_metrics(metrics, step: nil)
41
+ @events << { metrics: metrics, step: step }
1761
42
  end
43
+ end
1762
44
 
1763
- # Batch mutation of multiple programs with shared execution context
1764
- sig { params(programs: T::Array[T.untyped], execution_traces: T::Array[ExecutionTrace], failed_examples: T::Array[T.untyped]).returns(T::Array[T.untyped]) }
1765
- def batch_mutate(programs, execution_traces: [], failed_examples: [])
1766
- return [] if programs.empty?
45
+ class NullLogger
46
+ extend T::Sig
47
+ attr_reader :messages
1767
48
 
1768
- programs.map { |program| mutate_program(program, execution_traces: execution_traces, failed_examples: failed_examples) }
49
+ def initialize
50
+ @messages = []
1769
51
  end
1770
52
 
1771
- # Emit events for logging and monitoring
1772
- sig { params(event_name: String, data: T::Hash[Symbol, T.untyped]).void }
1773
- def emit_event(event_name, data = {})
1774
- # For now, just a placeholder - could integrate with DSPy event system
1775
- # In full implementation, this would emit events for monitoring
53
+ sig { params(message: String).void }
54
+ def log(message)
55
+ @messages << message
56
+ DSPy.log('gepa.log', message: message)
1776
57
  end
58
+ end
1777
59
 
1778
- private
1779
-
1780
- # Extract instruction text from program
1781
- sig { params(program: T.untyped).returns(String) }
1782
- def extract_instruction(program)
1783
- if program.signature_class&.description
1784
- program.signature_class.description
1785
- else
1786
- "Analyze the input and complete the task accurately"
1787
- end
1788
- end
60
+ class PredictAdapter
61
+ extend T::Sig
1789
62
 
1790
- # Apply specific mutation type to instruction
1791
- sig { params(instruction: String, mutation_type: MutationType).returns(String) }
1792
- def apply_mutation(instruction, mutation_type)
1793
- case mutation_type
1794
- when MutationType::Rewrite
1795
- apply_rewrite_mutation(instruction)
1796
- when MutationType::Expand
1797
- apply_expand_mutation(instruction)
1798
- when MutationType::Simplify
1799
- apply_simplify_mutation(instruction)
1800
- when MutationType::Combine
1801
- apply_combine_mutation(instruction)
1802
- when MutationType::Rephrase
1803
- apply_rephrase_mutation(instruction)
1804
- else
1805
- instruction
1806
- end
63
+ ReflectionLMType = T.type_alias do
64
+ T.any(DSPy::ReflectionLM, T.proc.params(arg0: String).returns(String))
1807
65
  end
1808
66
 
1809
- # Rewrite the instruction with different phrasing
1810
- sig { params(instruction: String).returns(String) }
1811
- def apply_rewrite_mutation(instruction)
1812
- # Simple rewrite patterns for now - in full implementation would use LLM
1813
- patterns = [
1814
- -> (inst) { "Carefully #{inst.downcase}" },
1815
- -> (inst) { "Please #{inst.downcase}" },
1816
- -> (inst) { "#{inst} with precision" }
1817
- ]
1818
-
1819
- patterns.sample.call(instruction)
67
+ FeedbackFnType = T.type_alias do
68
+ T.proc.params(
69
+ predictor_output: T.untyped,
70
+ predictor_inputs: T::Hash[T.any(String, Symbol), T.untyped],
71
+ module_inputs: DSPy::Example,
72
+ module_outputs: T.untyped,
73
+ captured_trace: T::Array[T::Hash[Symbol, T.untyped]]
74
+ ).returns(T.untyped)
1820
75
  end
1821
76
 
1822
- # Expand instruction with additional context
1823
- sig { params(instruction: String).returns(String) }
1824
- def apply_expand_mutation(instruction)
1825
- expansions = [
1826
- "Think step by step.",
1827
- "Provide detailed reasoning.",
1828
- "Consider all aspects carefully.",
1829
- "Explain your thought process."
1830
- ]
1831
-
1832
- "#{instruction} #{expansions.sample}"
77
+ sig do
78
+ params(
79
+ student: DSPy::Module,
80
+ metric: T.proc.params(arg0: DSPy::Example, arg1: T.untyped).returns(T.untyped),
81
+ reflection_lm: T.nilable(ReflectionLMType),
82
+ feedback_map: T::Hash[String, FeedbackFnType]
83
+ ).void
1833
84
  end
85
+ def initialize(student, metric, reflection_lm: nil, feedback_map: {})
86
+ @student = student
87
+ @metric = metric
88
+ @reflection_lm = reflection_lm
89
+ @feedback_map = feedback_map.transform_keys(&:to_s)
1834
90
 
1835
- # Simplify instruction by removing complex terms
1836
- sig { params(instruction: String).returns(String) }
1837
- def apply_simplify_mutation(instruction)
1838
- # Remove common complexity words
1839
- simplified = instruction.gsub(/\b(carefully|detailed|comprehensive|thorough)\b/i, '')
1840
- .gsub(/\s+/, ' ')
1841
- .strip
1842
-
1843
- simplified.empty? ? instruction : simplified
91
+ @predictor_entries = resolve_predictors(@student)
92
+ @predictor_names = @predictor_entries.map(&:first)
1844
93
  end
1845
94
 
1846
- # Combine instruction with complementary strategies
1847
- sig { params(instruction: String).returns(String) }
1848
- def apply_combine_mutation(instruction)
1849
- strategies = [
1850
- "Break down the problem systematically.",
1851
- "Use logical reasoning.",
1852
- "Apply domain knowledge.",
1853
- "Consider edge cases."
1854
- ]
1855
-
1856
- "#{instruction} #{strategies.sample}"
95
+ sig { returns(T::Hash[String, String]) }
96
+ def seed_candidate
97
+ @predictor_entries.each_with_object({}) do |(name, predictor), memo|
98
+ memo[name] = extract_instruction(predictor)
99
+ end
1857
100
  end
1858
101
 
1859
- # Rephrase instruction with synonyms
1860
- sig { params(instruction: String).returns(String) }
1861
- def apply_rephrase_mutation(instruction)
1862
- # Simple synonym replacement - in full implementation would use LLM
1863
- synonyms = {
1864
- 'solve' => 'resolve',
1865
- 'answer' => 'respond to',
1866
- 'analyze' => 'examine',
1867
- 'calculate' => 'compute',
1868
- 'determine' => 'identify'
1869
- }
1870
-
1871
- result = instruction.dup
1872
- synonyms.each do |original, replacement|
1873
- result.gsub!(/\b#{original}\b/i, replacement) if rand < 0.3
102
+ sig do
103
+ params(candidate: T::Hash[String, String], recorder: T.nilable(T.untyped)).returns(DSPy::Module)
104
+ end
105
+ def build_program(candidate, recorder: nil)
106
+ program = clone_module(@student)
107
+ duplicate_predictors!(program)
108
+
109
+ predictor_map = resolve_predictors(program).to_h
110
+ candidate.each do |name, new_instruction|
111
+ predictor = predictor_map[name]
112
+ next unless predictor
113
+
114
+ updated = apply_instruction_to_predictor(predictor, new_instruction)
115
+ if predictor.equal?(program)
116
+ program = updated
117
+ elsif !updated.equal?(predictor)
118
+ replace_reference(program, predictor, updated)
119
+ end
120
+ predictor_map[name] = updated
1874
121
  end
1875
122
 
1876
- result
123
+ wrap_predictors_for_tracing!(program, recorder: recorder) if recorder
124
+ program
1877
125
  end
1878
126
 
1879
- # Create new program with mutated instruction
1880
- sig { params(original_program: T.untyped, new_instruction: String).returns(T.untyped) }
1881
- def create_mutated_program(original_program, new_instruction)
1882
- case original_program
1883
- when DSPy::Predict
1884
- # DSPy::Predict has built-in support for instruction modification
1885
- original_program.with_instruction(new_instruction)
1886
- when DSPy::Module
1887
- # For custom DSPy::Module classes, we need to create a new instance
1888
- # and update any internal predictors that have instruction-based signatures
1889
- create_mutated_module(original_program, new_instruction)
127
+ sig do
128
+ params(
129
+ batch: T::Array[DSPy::Example],
130
+ candidate: T::Hash[String, String],
131
+ capture_traces: T::Boolean
132
+ ).returns(::GEPA::Core::EvaluationBatch)
133
+ end
134
+ def evaluate(batch, candidate, capture_traces: false)
135
+ recorder = capture_traces ? TraceRecorder.new : nil
136
+ program = build_program(candidate, recorder: recorder)
137
+
138
+ if capture_traces
139
+ trajectories = batch.map do |example|
140
+ recorder&.start_example
141
+ prediction = program.call(**example.input_values)
142
+ result = @metric.call(example, prediction)
143
+ score, feedback = extract_score_and_feedback(result)
144
+ trace_entries = recorder ? recorder.finish_example : []
145
+
146
+ {
147
+ example: example,
148
+ prediction: prediction,
149
+ score: score,
150
+ feedback: feedback,
151
+ trace: trace_entries
152
+ }
153
+ end
154
+
155
+ scores = trajectories.map { |row| row[:score] }
156
+ outputs = trajectories.map { |row| row[:prediction] }
157
+ ::GEPA::Core::EvaluationBatch.new(outputs: outputs, scores: scores, trajectories: trajectories)
1890
158
  else
1891
- # For other types (like test doubles), check if they respond to with_instruction
1892
- if original_program.respond_to?(:with_instruction)
1893
- original_program.with_instruction(new_instruction)
1894
- elsif original_program.respond_to?(:signature_class)
1895
- # Try to create a new DSPy::Predict with the same signature but new instruction
1896
- signature_class = original_program.signature_class
1897
- DSPy::Predict.new(signature_class).with_instruction(new_instruction)
1898
- else
1899
- # Fallback: return original if we can't mutate
1900
- emit_event('mutation_fallback', {
1901
- program_type: original_program.class.name,
1902
- reason: 'No mutation method available'
1903
- })
1904
- original_program
159
+ evaluator = DSPy::Evaluate.new(program, metric: nil, num_threads: nil, max_errors: batch.length * 100, provide_traceback: false)
160
+ results = batch.map do |example|
161
+ prediction = program.call(**example.input_values)
162
+ result = @metric.call(example, prediction)
163
+ score, = extract_score_and_feedback(result)
164
+ [prediction, score]
1905
165
  end
166
+ outputs = results.map(&:first)
167
+ scores = results.map(&:last)
168
+ ::GEPA::Core::EvaluationBatch.new(outputs: outputs, scores: scores, trajectories: nil)
1906
169
  end
1907
- rescue => e
1908
- emit_event('mutation_error', {
1909
- error: e.message,
1910
- program_type: original_program.class.name,
1911
- backtrace: e.backtrace&.first(3)
1912
- })
1913
- # Return original program on error
1914
- original_program
1915
170
  end
1916
171
 
1917
- # Create mutated version of custom DSPy::Module
1918
- sig { params(original_module: DSPy::Module, new_instruction: String).returns(DSPy::Module) }
1919
- def create_mutated_module(original_module, new_instruction)
1920
- # For custom modules, we need to create a new instance
1921
- # This is a simplified approach - in practice, modules might need
1922
- # more sophisticated copying of their internal state
1923
- begin
1924
- # Create a new instance of the same class
1925
- new_module = original_module.class.new
1926
-
1927
- # Try to find and update any internal predictors
1928
- original_module.instance_variables.each do |var_name|
1929
- var_value = original_module.instance_variable_get(var_name)
1930
-
1931
- if var_value.is_a?(DSPy::Predict)
1932
- # Update the instruction for internal predictors
1933
- mutated_predictor = var_value.with_instruction(new_instruction)
1934
- new_module.instance_variable_set(var_name, mutated_predictor)
1935
- else
1936
- # Copy other instance variables as-is
1937
- new_module.instance_variable_set(var_name, var_value)
172
+ sig do
173
+ params(
174
+ candidate: T::Hash[String, String],
175
+ eval_batch: ::GEPA::Core::EvaluationBatch,
176
+ components_to_update: T::Array[String]
177
+ ).returns(T::Hash[String, T::Array[T::Hash[String, T.untyped]]])
178
+ end
179
+ def make_reflective_dataset(candidate, eval_batch, components_to_update)
180
+ return {} unless eval_batch.trajectories
181
+
182
+ components_to_update.each_with_object({}) do |component, memo|
183
+ rows = eval_batch.trajectories.flat_map do |trajectory|
184
+ example = trajectory[:example]
185
+ expected = serialize_struct(example.expected)
186
+ actual_program_output = serialize_prediction(trajectory[:prediction])
187
+ diff = build_diff(expected, actual_program_output)
188
+ default_feedback = trajectory[:feedback] || "Score: #{trajectory[:score]}"
189
+ default_score = trajectory[:score]
190
+ full_trace = Array(trajectory[:trace])
191
+
192
+ full_trace.filter_map do |entry|
193
+ next unless entry[:predictor_name] == component
194
+
195
+ raw_inputs = entry[:inputs] || {}
196
+ raw_output = entry[:output]
197
+ inputs = serialize_struct(raw_inputs)
198
+ outputs = serialize_prediction(raw_output)
199
+
200
+ feedback_text = default_feedback
201
+ score_value = default_score
202
+ score_overridden = false
203
+
204
+ if (feedback_fn = @feedback_map[component])
205
+ feedback_result = feedback_fn.call(
206
+ predictor_output: raw_output,
207
+ predictor_inputs: raw_inputs,
208
+ module_inputs: example,
209
+ module_outputs: trajectory[:prediction],
210
+ captured_trace: full_trace
211
+ )
212
+ override_score, override_feedback = extract_score_and_feedback(feedback_result)
213
+ feedback_text = override_feedback if override_feedback
214
+ unless override_score.nil?
215
+ score_value = override_score
216
+ score_overridden = true
217
+ end
218
+ end
219
+
220
+ row = {
221
+ 'Inputs' => inputs,
222
+ 'Expected' => expected,
223
+ 'Generated Outputs' => outputs,
224
+ 'Diff' => diff,
225
+ 'Feedback' => feedback_text
226
+ }
227
+ row['Score'] = score_value if score_overridden
228
+ row
1938
229
  end
1939
230
  end
1940
-
1941
- new_module
1942
- rescue => e
1943
- emit_event('module_mutation_error', {
1944
- error: e.message,
1945
- module_class: original_module.class.name
1946
- })
1947
- # Fallback to original module
1948
- original_module
231
+ memo[component] = rows unless rows.empty?
1949
232
  end
1950
233
  end
1951
234
 
1952
- # Select mutation type based on context and configuration
1953
- sig { params(instruction: T.nilable(String)).returns(MutationType) }
1954
- def select_mutation_type(instruction = nil)
1955
- # Adaptive selection based on instruction characteristics
1956
- if instruction && instruction.length < 20
1957
- # Short instructions benefit from expansion
1958
- [MutationType::Expand, MutationType::Combine].sample
1959
- elsif instruction && instruction.length > 100
1960
- # Long instructions benefit from simplification
1961
- [MutationType::Simplify, MutationType::Rephrase].sample
235
+ sig do
236
+ params(
237
+ candidate: T::Hash[String, String],
238
+ reflective_dataset: T::Hash[String, T::Array[T::Hash[String, T.untyped]]],
239
+ components_to_update: T::Array[String]
240
+ ).returns(T::Hash[String, String])
241
+ end
242
+ def propose_new_texts(candidate, reflective_dataset, components_to_update)
243
+ if @reflection_lm
244
+ components_to_update.to_h do |name|
245
+ response = ::GEPA::Strategies::InstructionProposalSignature.run(
246
+ @reflection_lm,
247
+ {
248
+ 'current_instruction_doc' => candidate[name],
249
+ 'dataset_with_feedback' => reflective_dataset.fetch(name, [])
250
+ }
251
+ )
252
+ [name, response.fetch('new_instruction')]
253
+ end
1962
254
  else
1963
- # Balanced selection from all types
1964
- @config.mutation_types.sample
255
+ components_to_update.to_h do |name|
256
+ [name, "#{candidate[name]} improved"]
257
+ end
1965
258
  end
1966
259
  end
1967
260
 
1968
- # Calculate diversity of mutations applied
1969
- sig { params(mutations: T::Array[MutationType]).returns(Float) }
1970
- def mutation_diversity(mutations)
1971
- return 0.0 if mutations.empty?
1972
-
1973
- unique_types = mutations.uniq.size
1974
- total_types = @config.mutation_types.size
1975
-
1976
- unique_types.to_f / total_types
1977
- end
1978
- end
1979
-
1980
- # CrossoverEngine: Handles genetic recombination of prompts for diversity
1981
- class CrossoverEngine
1982
- extend T::Sig
261
+ private
1983
262
 
1984
- # Struct for instruction components
1985
- class InstructionComponents < T::Struct
1986
- prop :action, String
1987
- prop :modifiers, String
263
+ sig { params(program: DSPy::Module).returns(T::Array[[String, DSPy::Module]]) }
264
+ def resolve_predictors(program)
265
+ pairs = program.named_predictors
266
+ pairs = [['self', program]] if pairs.empty?
267
+ pairs
1988
268
  end
1989
269
 
1990
- sig { returns(GEPAConfig) }
1991
- attr_reader :config
1992
-
1993
- sig { params(config: GEPAConfig).void }
1994
- def initialize(config:)
1995
- @config = config
270
+ sig { params(mod: DSPy::Module).returns(DSPy::Module) }
271
+ def clone_module(mod)
272
+ safe_clone(mod)
1996
273
  end
1997
274
 
1998
- # Perform crossover between two parent programs
1999
- sig { params(parent_a: T.untyped, parent_b: T.untyped).returns(T::Array[T.untyped]) }
2000
- def crossover_programs(parent_a, parent_b)
2001
- return [parent_a, parent_b] if rand > @config.crossover_rate
2002
-
2003
- begin
2004
- instruction_a = extract_instruction(parent_a)
2005
- instruction_b = extract_instruction(parent_b)
2006
-
2007
- crossover_type = select_crossover_type(instruction_a, instruction_b)
2008
- offspring_instructions = apply_crossover(instruction_a, instruction_b, crossover_type)
2009
-
2010
- offspring = [
2011
- create_crossover_program(parent_a, offspring_instructions[0]),
2012
- create_crossover_program(parent_b, offspring_instructions[1])
2013
- ]
2014
-
2015
- offspring
2016
- rescue => e
2017
- # Return original parents on crossover failure
2018
- [parent_a, parent_b]
275
+ sig { params(program: DSPy::Module).void }
276
+ def duplicate_predictors!(program)
277
+ resolve_predictors(program).each do |name, predictor|
278
+ next unless @predictor_names.include?(name)
279
+ next if predictor.equal?(program)
280
+ clone = safe_clone(predictor)
281
+ replace_reference(program, predictor, clone)
2019
282
  end
2020
283
  end
2021
284
 
2022
- # Batch crossover for entire population
2023
- sig { params(population: T::Array[T.untyped]).returns(T::Array[T.untyped]) }
2024
- def batch_crossover(population)
2025
- return [] if population.empty?
2026
- return [population.first] if population.size == 1
2027
-
2028
- offspring = []
2029
-
2030
- # Pair up population for crossover
2031
- population.each_slice(2) do |pair|
2032
- if pair.size == 2
2033
- crossed = crossover_programs(pair[0], pair[1])
2034
- offspring.concat(crossed)
2035
- else
2036
- offspring << pair[0] # Unpaired individual passes through
2037
- end
2038
- end
2039
-
2040
- offspring
285
+ sig do
286
+ params(container: T.untyped, target: T.untyped, replacement: T.untyped, visited: T::Set[Integer]).returns(T.untyped)
2041
287
  end
288
+ def replace_in_object(container, target, replacement, visited)
289
+ return replacement if container.equal?(target)
290
+ return container if visited.include?(container.object_id)
2042
291
 
2043
- private
2044
-
2045
- # Extract instruction text from program
2046
- sig { params(program: T.untyped).returns(String) }
2047
- def extract_instruction(program)
2048
- if program.signature_class&.description
2049
- program.signature_class.description
2050
- else
2051
- "Analyze the input and complete the task accurately"
2052
- end
2053
- end
292
+ visited.add(container.object_id)
2054
293
 
2055
- # Apply specific crossover type to two instructions
2056
- sig { params(instruction_a: String, instruction_b: String, crossover_type: CrossoverType).returns(T::Array[String]) }
2057
- def apply_crossover(instruction_a, instruction_b, crossover_type)
2058
- case crossover_type
2059
- when CrossoverType::Uniform
2060
- uniform_crossover(instruction_a, instruction_b)
2061
- when CrossoverType::Blend
2062
- blend_crossover(instruction_a, instruction_b)
2063
- when CrossoverType::Structured
2064
- structured_crossover(instruction_a, instruction_b)
294
+ case container
295
+ when Array
296
+ modified = false
297
+ new_array = container.map do |value|
298
+ new_value = replace_in_object(value, target, replacement, visited)
299
+ modified ||= !new_value.equal?(value)
300
+ new_value
301
+ end
302
+ modified ? new_array : container
303
+ when Hash
304
+ modified = false
305
+ new_hash = container.each_with_object({}) do |(key, value), memo|
306
+ new_value = replace_in_object(value, target, replacement, visited)
307
+ modified ||= !new_value.equal?(value)
308
+ memo[key] = new_value
309
+ end
310
+ modified ? new_hash : container
2065
311
  else
2066
- [instruction_a, instruction_b]
312
+ container
2067
313
  end
2068
314
  end
2069
315
 
2070
- # Uniform crossover: Exchange elements randomly at word level
2071
- sig { params(instruction_a: String, instruction_b: String).returns(T::Array[String]) }
2072
- def uniform_crossover(instruction_a, instruction_b)
2073
- return [instruction_a, instruction_b] if instruction_a == instruction_b
316
+ sig { params(owner: T.untyped, target: T.untyped, replacement: T.untyped).void }
317
+ def replace_reference(owner, target, replacement)
318
+ return if owner.equal?(target)
2074
319
 
2075
- words_a = instruction_a.split
2076
- words_b = instruction_b.split
320
+ Array(owner.instance_variables).each do |ivar|
321
+ value = owner.instance_variable_get(ivar)
322
+ next if value.nil?
2077
323
 
2078
- # Create offspring by randomly selecting words from parents
2079
- offspring_a_words = []
2080
- offspring_b_words = []
2081
-
2082
- max_length = [words_a.size, words_b.size].max
2083
-
2084
- max_length.times do |i|
2085
- word_a = words_a[i]
2086
- word_b = words_b[i]
2087
-
2088
- if rand < 0.5
2089
- offspring_a_words << (word_a || word_b)
2090
- offspring_b_words << (word_b || word_a)
2091
- else
2092
- offspring_a_words << (word_b || word_a)
2093
- offspring_b_words << (word_a || word_b)
324
+ new_value = replace_in_object(value, target, replacement, ::Set.new)
325
+ unless new_value.equal?(value)
326
+ owner.instance_variable_set(ivar, new_value)
2094
327
  end
2095
328
  end
2096
-
2097
- [
2098
- offspring_a_words.compact.join(' '),
2099
- offspring_b_words.compact.join(' ')
2100
- ]
2101
329
  end
2102
330
 
2103
- # Blend crossover: Semantically combine instructions
2104
- sig { params(instruction_a: String, instruction_b: String).returns(T::Array[String]) }
2105
- def blend_crossover(instruction_a, instruction_b)
2106
- # Simple blending patterns - in full implementation would use LLM
2107
- patterns = [
2108
- -> (a, b) { "#{a} and #{b}" },
2109
- -> (a, b) { "#{a}, specifically #{b}" },
2110
- -> (a, b) { "#{b} while #{a.downcase}" },
2111
- -> (a, b) { "Combine #{a.downcase} with #{b.downcase}" }
2112
- ]
2113
-
2114
- pattern = patterns.sample
2115
-
2116
- [
2117
- pattern.call(instruction_a, instruction_b),
2118
- pattern.call(instruction_b, instruction_a)
2119
- ]
2120
- end
2121
-
2122
- # Structured crossover: Maintain grammatical and logical structure
2123
- sig { params(instruction_a: String, instruction_b: String).returns(T::Array[String]) }
2124
- def structured_crossover(instruction_a, instruction_b)
2125
- # Extract structural components
2126
- components_a = extract_components(instruction_a)
2127
- components_b = extract_components(instruction_b)
2128
-
2129
- # Cross structural components
2130
- offspring_a = combine_components(components_a.action, components_b.modifiers)
2131
- offspring_b = combine_components(components_b.action, components_a.modifiers)
2132
-
2133
- [offspring_a, offspring_b]
2134
- end
2135
-
2136
- # Extract structural components from instruction
2137
- sig { params(instruction: String).returns(InstructionComponents) }
2138
- def extract_components(instruction)
2139
- words = instruction.split
2140
-
2141
- # Simple heuristic: first verb-like word is action, rest are modifiers
2142
- action_idx = words.find_index { |word| verb_like?(word) } || 0
2143
-
2144
- InstructionComponents.new(
2145
- action: words[action_idx] || words.first || "complete",
2146
- modifiers: (words - [words[action_idx]]).join(' ')
2147
- )
2148
- end
331
+ sig { params(program: DSPy::Module, recorder: T.nilable(T.untyped)).void }
332
+ def wrap_predictors_for_tracing!(program, recorder: nil)
333
+ return unless recorder
2149
334
 
2150
- # Combine action and modifiers into coherent instruction
2151
- sig { params(action: String, modifiers: String).returns(String) }
2152
- def combine_components(action, modifiers)
2153
- if modifiers.empty?
2154
- "#{action.capitalize} the task"
2155
- else
2156
- "#{action.capitalize} #{modifiers}"
335
+ resolve_predictors(program).each do |name, predictor|
336
+ wrap_predictor_for_tracing(program, predictor, name, recorder)
2157
337
  end
2158
338
  end
2159
339
 
2160
- # Simple heuristic to identify verb-like words
2161
- sig { params(word: String).returns(T::Boolean) }
2162
- def verb_like?(word)
2163
- verb_patterns = %w[solve answer calculate determine analyze compute resolve examine]
2164
- verb_patterns.any? { |pattern| word.downcase.include?(pattern) }
2165
- end
340
+ sig { params(program: DSPy::Module, predictor: DSPy::Module, name: String, recorder: T.untyped).void }
341
+ def wrap_predictor_for_tracing(program, predictor, name, recorder)
342
+ original_forward = predictor.method(:forward_untyped)
343
+ recorder_ref = recorder
344
+ predictor_name = name
2166
345
 
2167
- # Create new program with crossover instruction
2168
- sig { params(original_program: T.untyped, new_instruction: String).returns(T.untyped) }
2169
- def create_crossover_program(original_program, new_instruction)
2170
- # For now, return the original program as we don't modify instruction in place
2171
- # In full implementation, would create new program instance with modified instruction
2172
- original_program
346
+ predictor.define_singleton_method(:forward_untyped) do |**input_values|
347
+ result = original_forward.call(**input_values)
348
+ recorder_ref.record(
349
+ predictor_name: predictor_name,
350
+ inputs: input_values.dup,
351
+ output: result
352
+ )
353
+ result
354
+ end
2173
355
  end
2174
356
 
2175
- # Select crossover type based on instruction characteristics
2176
- sig { params(instruction_a: T.nilable(String), instruction_b: T.nilable(String)).returns(CrossoverType) }
2177
- def select_crossover_type(instruction_a = nil, instruction_b = nil)
2178
- # Adaptive selection based on instruction characteristics
2179
- if instruction_a && instruction_b
2180
- combined_length = instruction_a.length + instruction_b.length
2181
-
2182
- if combined_length < 40
2183
- # Short instructions benefit from blending
2184
- [CrossoverType::Blend, CrossoverType::Uniform].sample
2185
- elsif combined_length > 200
2186
- # Long instructions benefit from structured crossover
2187
- [CrossoverType::Structured, CrossoverType::Uniform].sample
2188
- else
2189
- # Balanced selection
2190
- @config.crossover_types.sample
2191
- end
357
+ sig { params(predictor: DSPy::Module, instruction: String).returns(DSPy::Module) }
358
+ def apply_instruction_to_predictor(predictor, instruction)
359
+ if predictor.respond_to?(:with_instruction)
360
+ predictor.with_instruction(instruction)
361
+ elsif predictor.respond_to?(:prompt) && predictor.prompt.respond_to?(:with_instruction)
362
+ predictor.with_prompt(predictor.prompt.with_instruction(instruction))
2192
363
  else
2193
- @config.crossover_types.sample
364
+ duplicate = safe_clone(predictor)
365
+ signature = DSPy::Teleprompt::Utils.get_signature(duplicate)
366
+ updated_signature = signature.with_instructions(instruction)
367
+ DSPy::Teleprompt::Utils.set_signature(duplicate, updated_signature)
368
+ duplicate
2194
369
  end
2195
370
  end
2196
371
 
2197
- # Calculate diversity of crossover operations
2198
- sig { params(crossovers: T::Array[CrossoverType]).returns(Float) }
2199
- def crossover_diversity(crossovers)
2200
- return 0.0 if crossovers.empty?
2201
-
2202
- unique_types = crossovers.uniq.size
2203
- total_types = @config.crossover_types.size
2204
-
2205
- unique_types.to_f / total_types
2206
- end
2207
- end
2208
-
2209
- # ParetoSelector: Multi-objective optimization using Pareto frontier analysis
2210
- class ParetoSelector
2211
- extend T::Sig
2212
-
2213
- sig { returns(FitnessEvaluator) }
2214
- attr_reader :evaluator
2215
-
2216
- sig { returns(GEPAConfig) }
2217
- attr_reader :config
2218
-
2219
- sig { params(evaluator: FitnessEvaluator, config: GEPAConfig).void }
2220
- def initialize(evaluator:, config:)
2221
- @evaluator = evaluator
2222
- @config = config
372
+ sig { params(object: T.untyped).returns(T.untyped) }
373
+ def safe_clone(object)
374
+ object.clone
375
+ rescue TypeError
376
+ object.dup
2223
377
  end
2224
378
 
2225
- # Select parents for breeding using Pareto-based selection
2226
- sig { params(population_with_scores: T::Array[T::Array[T.untyped]], count: Integer).returns(T::Array[T.untyped]) }
2227
- def select_parents(population_with_scores, count:)
2228
- return [] if population_with_scores.empty?
2229
- return population_with_scores.map(&:first) if count >= population_with_scores.size
2230
-
2231
- # Combine tournament and Pareto-based selection for parent selection
2232
- selected = []
2233
-
2234
- count.times do
2235
- parent = tournament_selection(population_with_scores)
2236
- selected << parent
379
+ class TraceRecorder
380
+ def initialize
381
+ @current_trace = nil
2237
382
  end
2238
383
 
2239
- selected
2240
- end
2241
-
2242
- # Select survivors for next generation balancing elite and diversity
2243
- sig { params(population_with_scores: T::Array[T::Array[T.untyped]], count: Integer).returns(T::Array[T.untyped]) }
2244
- def select_survivors(population_with_scores, count:)
2245
- return [] if population_with_scores.empty?
2246
- return population_with_scores.map(&:first) if count >= population_with_scores.size
2247
-
2248
- scores = population_with_scores.map(&:last)
2249
-
2250
- # Find Pareto frontier first
2251
- pareto_frontier = find_pareto_frontier(scores)
2252
- frontier_indices = scores.each_index.select { |i| pareto_frontier.include?(scores[i]) }
2253
- frontier_programs = frontier_indices.map { |i| population_with_scores[i].first }
2254
-
2255
- if frontier_programs.size >= count
2256
- # Use diversity selection within frontier
2257
- frontier_with_scores = frontier_indices.map { |i| population_with_scores[i] }
2258
- return diversity_selection(frontier_with_scores, count: count)
2259
- else
2260
- # Include all frontier + fill remaining with elite selection
2261
- remaining_count = count - frontier_programs.size
2262
- remaining_population = population_with_scores.reject.with_index { |_, i| frontier_indices.include?(i) }
2263
-
2264
- additional = elite_selection(remaining_population, count: remaining_count)
2265
- frontier_programs + additional
384
+ def start_example
385
+ @current_trace = []
2266
386
  end
2267
- end
2268
-
2269
- private
2270
-
2271
- # Find Pareto frontier (non-dominated solutions)
2272
- sig { params(fitness_scores: T::Array[FitnessScore]).returns(T::Array[FitnessScore]) }
2273
- def find_pareto_frontier(fitness_scores)
2274
- return [] if fitness_scores.empty?
2275
- return fitness_scores if fitness_scores.size == 1
2276
-
2277
- frontier = []
2278
-
2279
- fitness_scores.each do |candidate|
2280
- # Check if candidate is dominated by any other solution
2281
- is_dominated = fitness_scores.any? do |other|
2282
- other != candidate && candidate.dominated_by?(other)
2283
- end
2284
387
 
2285
- frontier << candidate unless is_dominated
388
+ def record(entry)
389
+ return unless @current_trace
390
+ @current_trace << entry
2286
391
  end
2287
392
 
2288
- frontier
2289
- end
2290
-
2291
- # Calculate crowding distance for diversity preservation
2292
- sig { params(fitness_scores: T::Array[FitnessScore]).returns(T::Hash[FitnessScore, Float]) }
2293
- def calculate_crowding_distance(fitness_scores)
2294
- distances = {}
2295
-
2296
- # Initialize distances for all solutions
2297
- fitness_scores.each { |score| distances[score] = 0.0 }
2298
-
2299
- return distances if fitness_scores.size <= 2
2300
-
2301
- # Calculate crowding distance for each objective
2302
- objectives = [:primary_score, :overall_score]
2303
- secondary_objectives = fitness_scores.first.secondary_scores.keys
2304
- all_objectives = objectives + secondary_objectives
2305
-
2306
- all_objectives.each do |objective|
2307
- # Sort by current objective
2308
- sorted_scores = fitness_scores.sort_by do |score|
2309
- case objective
2310
- when :primary_score
2311
- score.primary_score
2312
- when :overall_score
2313
- score.overall_score
2314
- else
2315
- score.secondary_scores[objective] || 0.0
2316
- end
2317
- end
2318
-
2319
- # Set boundary solutions to high distance
2320
- distances[sorted_scores.first] = Float::INFINITY if sorted_scores.size > 0
2321
- distances[sorted_scores.last] = Float::INFINITY if sorted_scores.size > 1
2322
-
2323
- next if sorted_scores.size <= 2
2324
-
2325
- # Calculate range for normalization
2326
- min_val = get_objective_value(sorted_scores.first, objective)
2327
- max_val = get_objective_value(sorted_scores.last, objective)
2328
- range = max_val - min_val
2329
-
2330
- next if range <= 0
2331
-
2332
- # Calculate crowding distance for intermediate solutions
2333
- (1...(sorted_scores.size - 1)).each do |i|
2334
- prev_val = get_objective_value(sorted_scores[i - 1], objective)
2335
- next_val = get_objective_value(sorted_scores[i + 1], objective)
2336
-
2337
- distances[sorted_scores[i]] += (next_val - prev_val) / range
2338
- end
393
+ def finish_example
394
+ trace = @current_trace || []
395
+ @current_trace = nil
396
+ trace
2339
397
  end
2340
-
2341
- distances
2342
398
  end
2343
399
 
2344
- # Get objective value from fitness score
2345
- sig { params(score: FitnessScore, objective: Symbol).returns(Float) }
2346
- def get_objective_value(score, objective)
2347
- case objective
2348
- when :primary_score
2349
- score.primary_score
2350
- when :overall_score
2351
- score.overall_score
400
+ sig { params(program: DSPy::Module).returns(String) }
401
+ def extract_instruction(program)
402
+ if program.respond_to?(:prompt) && program.prompt.respond_to?(:instruction)
403
+ program.prompt.instruction
404
+ elsif program.respond_to?(:instruction)
405
+ program.instruction
2352
406
  else
2353
- score.secondary_scores[objective] || 0.0
407
+ raise ArgumentError, "Program must expose prompt.instruction or #instruction"
2354
408
  end
2355
409
  end
2356
410
 
2357
- # Tournament selection with Pareto preference
2358
- sig { params(population_with_scores: T::Array[T::Array[T.untyped]]).returns(T.untyped) }
2359
- def tournament_selection(population_with_scores)
2360
- return population_with_scores.first.first if population_with_scores.size == 1
2361
-
2362
- tournament_size = [3, population_with_scores.size].min
2363
- tournament = population_with_scores.sample(tournament_size)
2364
-
2365
- # Select best from tournament based on Pareto dominance and crowding
2366
- best_program, best_score = tournament.first
2367
-
2368
- tournament[1..].each do |program, score|
2369
- if score.dominated_by?(best_score)
2370
- # Current best dominates this candidate, keep current
2371
- next
2372
- elsif best_score.dominated_by?(score)
2373
- # This candidate dominates current best, replace
2374
- best_program, best_score = program, score
2375
- else
2376
- # Non-dominated comparison, use overall score as tiebreaker
2377
- if score.overall_score > best_score.overall_score
2378
- best_program, best_score = program, score
2379
- end
411
+ sig { params(struct: T.untyped).returns(T::Hash[Symbol, T.untyped]) }
412
+ def serialize_struct(struct)
413
+ if struct.respond_to?(:to_h)
414
+ struct.to_h
415
+ elsif struct.instance_variables.any?
416
+ struct.instance_variables.each_with_object({}) do |ivar, memo|
417
+ key = ivar.to_s.delete_prefix('@').to_sym
418
+ memo[key] = struct.instance_variable_get(ivar)
2380
419
  end
420
+ else
421
+ {}
2381
422
  end
2382
-
2383
- best_program
2384
423
  end
2385
424
 
2386
- # Diversity-based selection using crowding distance
2387
- sig { params(population_with_scores: T::Array[T::Array[T.untyped]], count: Integer).returns(T::Array[T.untyped]) }
2388
- def diversity_selection(population_with_scores, count:)
2389
- return population_with_scores.map(&:first) if count >= population_with_scores.size
2390
-
2391
- scores = population_with_scores.map(&:last)
2392
- distances = calculate_crowding_distance(scores)
2393
-
2394
- # Sort by crowding distance (descending - prefer more diverse)
2395
- sorted_pairs = population_with_scores.sort_by { |_, score| -distances[score] }
2396
-
2397
- sorted_pairs.take(count).map(&:first)
2398
- end
2399
-
2400
- # Elite selection based on overall fitness
2401
- sig { params(population_with_scores: T::Array[T::Array[T.untyped]], count: Integer).returns(T::Array[T.untyped]) }
2402
- def elite_selection(population_with_scores, count:)
2403
- return population_with_scores.map(&:first) if count >= population_with_scores.size
2404
-
2405
- # Sort by overall score (descending - best first)
2406
- sorted_pairs = population_with_scores.sort_by { |_, score| -score.overall_score }
2407
-
2408
- sorted_pairs.take(count).map(&:first)
425
+ sig { params(prediction: T.untyped).returns(T::Hash[Symbol, T.untyped]) }
426
+ def serialize_prediction(prediction)
427
+ case prediction
428
+ when DSPy::Prediction
429
+ prediction.to_h
430
+ when Hash
431
+ prediction
432
+ else
433
+ serialize_struct(prediction)
434
+ end
2409
435
  end
2410
- end
2411
-
2412
- # Configuration for GEPA optimization
2413
- class GEPAConfig < Config
2414
- extend T::Sig
2415
-
2416
- sig { returns(DSPy::LM) }
2417
- attr_accessor :reflection_lm
2418
-
2419
- sig { returns(Integer) }
2420
- attr_accessor :num_generations
2421
-
2422
- sig { returns(Integer) }
2423
- attr_accessor :population_size
2424
-
2425
- sig { returns(Float) }
2426
- attr_accessor :mutation_rate
2427
436
 
2428
- sig { returns(T::Boolean) }
2429
- attr_accessor :use_pareto_selection
437
+ sig { params(expected: T::Hash[Symbol, T.untyped], actual: T::Hash[Symbol, T.untyped]).returns(T::Hash[Symbol, T.untyped]) }
438
+ def build_diff(expected, actual)
439
+ keys = expected.keys | actual.keys
440
+ keys.each_with_object({}) do |key, memo|
441
+ exp = expected[key]
442
+ act = actual[key]
443
+ next if exp == act
2430
444
 
2431
- sig { returns(T::Array[MutationType]) }
2432
- attr_accessor :mutation_types
2433
- sig { returns(Float) }
2434
- attr_accessor :crossover_rate
2435
- sig { returns(T::Array[CrossoverType]) }
2436
- attr_accessor :crossover_types
2437
-
2438
- sig { void }
2439
- def initialize
2440
- super
2441
- # reflection_lm must be explicitly set by user - no default provided
2442
- @reflection_lm = nil
2443
- @num_generations = 10
2444
- @population_size = 8
2445
- @mutation_rate = 0.7
2446
- @use_pareto_selection = true
2447
- @mutation_types = [MutationType::Rewrite, MutationType::Expand, MutationType::Simplify, MutationType::Combine, MutationType::Rephrase]
2448
- @crossover_rate = 0.6
2449
- @crossover_types = [CrossoverType::Uniform, CrossoverType::Blend, CrossoverType::Structured]
445
+ memo[key] = { expected: exp, actual: act }
446
+ end
2450
447
  end
2451
448
 
2452
- sig { returns(T::Hash[Symbol, T.untyped]) }
2453
- def to_h
2454
- super.merge({
2455
- reflection_lm: @reflection_lm&.model, # Serialize the model name for hash representation
2456
- num_generations: @num_generations,
2457
- population_size: @population_size,
2458
- mutation_rate: @mutation_rate,
2459
- use_pareto_selection: @use_pareto_selection,
2460
- mutation_types: @mutation_types,
2461
- crossover_rate: @crossover_rate,
2462
- crossover_types: @crossover_types
2463
- })
449
+ sig { params(result: T.untyped).returns([Float, T.nilable(String)]) }
450
+ def extract_score_and_feedback(result)
451
+ case result
452
+ when DSPy::Prediction
453
+ score = result.respond_to?(:score) ? result.score : 0.0
454
+ feedback = result.respond_to?(:feedback) ? result.feedback : nil
455
+ [score.to_f, feedback]
456
+ when Hash
457
+ [result[:score].to_f, result[:feedback]]
458
+ else
459
+ [result.to_f, nil]
460
+ end
2464
461
  end
2465
462
  end
2466
463
 
2467
- sig { returns(GEPAConfig) }
2468
- attr_reader :config
2469
-
2470
464
  sig do
2471
465
  params(
2472
- metric: T.nilable(T.proc.params(arg0: T.untyped, arg1: T.untyped).returns(T.untyped)),
2473
- config: T.nilable(GEPAConfig)
466
+ metric: T.proc.params(arg0: DSPy::Example, arg1: T.untyped).returns(T.untyped),
467
+ reflection_lm: T.nilable(T.untyped),
468
+ feedback_map: T.nilable(T::Hash[String, PredictAdapter::FeedbackFnType]),
469
+ adapter_builder: T.nilable(T.proc.returns(T.untyped)),
470
+ config: T.nilable(T::Hash[Symbol, T.untyped])
2474
471
  ).void
2475
472
  end
2476
- def initialize(metric: nil, config: nil)
2477
- @config = config || GEPAConfig.new
2478
-
2479
- # Validate that reflection_lm is configured
2480
- unless @config.reflection_lm
2481
- raise ArgumentError, "reflection_lm must be configured for GEPA optimization. Set config.reflection_lm to a DSPy::LM instance."
2482
- end
2483
-
2484
- super(metric: metric, config: @config)
473
+ def initialize(metric:, reflection_lm: nil, feedback_map: nil, adapter_builder: nil, config: nil)
474
+ super(metric: metric)
475
+ @metric = metric
476
+ @reflection_lm = reflection_lm
477
+ @feedback_map = (feedback_map || {}).transform_keys(&:to_s)
478
+ @adapter_builder = adapter_builder || method(:build_adapter)
479
+ @gepa_config = self.class.default_config.merge(config || {})
2485
480
  end
2486
481
 
2487
- # Main optimization method
2488
482
  sig do
2489
- params(
2490
- program: T.untyped,
483
+ override.params(
484
+ program: DSPy::Module,
2491
485
  trainset: T::Array[T.untyped],
2492
486
  valset: T.nilable(T::Array[T.untyped])
2493
487
  ).returns(OptimizationResult)
2494
488
  end
2495
-
2496
489
  def compile(program, trainset:, valset: nil)
2497
490
  validate_inputs(program, trainset, valset)
2498
491
 
2499
- instrument_step('gepa_compile', {
2500
- trainset_size: trainset.size,
2501
- valset_size: valset&.size || 0,
2502
- num_generations: @config.num_generations,
2503
- population_size: @config.population_size
2504
- }) do
2505
- # Always perform full GEPA genetic algorithm optimization
2506
- perform_gepa_optimization(program, trainset, valset)
2507
- end
2508
- end
2509
-
2510
- private
2511
-
2512
- # Complete GEPA genetic algorithm optimization
2513
- sig do
2514
- params(
2515
- program: T.untyped,
2516
- trainset: T::Array[T.untyped],
2517
- valset: T.nilable(T::Array[T.untyped])
2518
- ).returns(OptimizationResult)
2519
- end
2520
- def perform_gepa_optimization(program, trainset, valset)
2521
- # Initialize all GEPA components
2522
- fitness_evaluator = create_fitness_evaluator
2523
- genetic_engine = create_genetic_engine(fitness_evaluator)
2524
- reflection_engine = create_reflection_engine
2525
- mutation_engine = create_mutation_engine
2526
- crossover_engine = create_crossover_engine
2527
- pareto_selector = create_pareto_selector(fitness_evaluator)
2528
-
2529
- # Initialize trace collection for reflection
2530
- trace_collector = TraceCollector.new
2531
- optimization_run_id = "gepa-run-#{SecureRandom.hex(4)}"
2532
-
2533
- emit_event('gepa_optimization_start', {
2534
- optimization_run_id: optimization_run_id,
2535
- num_generations: @config.num_generations,
2536
- population_size: @config.population_size,
2537
- mutation_rate: @config.mutation_rate,
2538
- crossover_rate: @config.crossover_rate
2539
- })
2540
-
2541
- begin
2542
- # Run the complete genetic algorithm evolution
2543
- evolution_result = genetic_engine.run_evolution(program, trainset)
2544
-
2545
- # Collect traces for reflection analysis
2546
- execution_traces = trace_collector.traces_for_run(optimization_run_id)
2547
-
2548
- # Generate reflection insights on the optimization process
2549
- reflection_result = reflection_engine.reflect_with_llm(execution_traces)
2550
-
2551
- # Evaluate final candidate on validation set if provided
2552
- final_validation_score = if valset && !valset.empty?
2553
- validation_fitness = fitness_evaluator.evaluate_candidate(evolution_result[:best_candidate], valset)
2554
- validation_fitness.overall_score
2555
- else
2556
- evolution_result[:best_fitness].overall_score
2557
- end
2558
-
2559
- emit_event('gepa_optimization_complete', {
2560
- optimization_run_id: optimization_run_id,
2561
- best_fitness: evolution_result[:best_fitness].overall_score,
2562
- final_generation: evolution_result[:generation_count],
2563
- validation_score: final_validation_score,
2564
- reflection_confidence: reflection_result.confidence
2565
- })
2566
-
2567
- # Create comprehensive optimization result
2568
- OptimizationResult.new(
2569
- optimized_program: evolution_result[:best_candidate],
2570
- scores: {
2571
- fitness_score: evolution_result[:best_fitness].overall_score,
2572
- validation_score: final_validation_score,
2573
- primary_score: evolution_result[:best_fitness].primary_score,
2574
- **evolution_result[:best_fitness].secondary_scores
2575
- },
2576
- history: {
2577
- num_generations: evolution_result[:generation_count],
2578
- population_size: @config.population_size,
2579
- generation_history: evolution_result[:generation_history],
2580
- final_population: evolution_result[:final_population],
2581
- phase: 'Phase 2 - Complete GEPA',
2582
- mutation_rate: @config.mutation_rate,
2583
- crossover_rate: @config.crossover_rate,
2584
- selection_strategy: @config.use_pareto_selection ? 'pareto' : 'tournament'
2585
- },
2586
- best_score_name: 'fitness_score',
2587
- best_score_value: evolution_result[:best_fitness].overall_score,
2588
- metadata: {
2589
- optimizer: 'GEPA',
2590
- reflection_lm: @config.reflection_lm&.model,
2591
- implementation_status: 'Phase 2 - Complete Implementation',
2592
- optimization_run_id: optimization_run_id,
2593
- reflection_insights: {
2594
- diagnosis: reflection_result.diagnosis,
2595
- improvements: reflection_result.improvements,
2596
- confidence: reflection_result.confidence,
2597
- suggested_mutations: reflection_result.suggested_mutations
2598
- },
2599
- trace_analysis: {
2600
- total_traces: execution_traces.size,
2601
- llm_traces: execution_traces.count(&:llm_trace?),
2602
- module_traces: execution_traces.count(&:module_trace?),
2603
- execution_timespan: calculate_execution_timespan(execution_traces)
2604
- },
2605
- component_versions: {
2606
- genetic_engine: 'v2.0',
2607
- fitness_evaluator: 'v2.0',
2608
- reflection_engine: 'v2.0',
2609
- mutation_engine: 'v2.0',
2610
- crossover_engine: 'v2.0',
2611
- pareto_selector: 'v2.0'
2612
- }
2613
- }
2614
- )
2615
-
2616
- rescue => e
2617
- emit_event('gepa_optimization_error', {
2618
- optimization_run_id: optimization_run_id,
2619
- error: e.message,
2620
- backtrace: e.backtrace&.take(5)
2621
- })
2622
-
2623
- # Return fallback result on optimization failure
2624
- fallback_fitness = fitness_evaluator.evaluate_candidate(program, trainset)
2625
-
2626
- OptimizationResult.new(
2627
- optimized_program: program,
2628
- scores: {
2629
- fitness_score: fallback_fitness.overall_score,
2630
- primary_score: fallback_fitness.primary_score,
2631
- **fallback_fitness.secondary_scores
2632
- },
2633
- history: {
2634
- num_generations: 0,
2635
- population_size: @config.population_size,
2636
- phase: 'Phase 2 - Error Recovery',
2637
- error: e.message
2638
- },
2639
- best_score_name: 'fitness_score',
2640
- best_score_value: fallback_fitness.overall_score,
2641
- metadata: {
2642
- optimizer: 'GEPA',
2643
- reflection_lm: @config.reflection_lm&.model,
2644
- implementation_status: 'Phase 2 - Error Recovery',
2645
- optimization_run_id: optimization_run_id,
2646
- error_details: {
2647
- message: e.message,
2648
- class: e.class.name,
2649
- recovery_strategy: 'fallback_to_original'
2650
- }
2651
- }
2652
- )
2653
- end
2654
- end
2655
-
2656
- # Create and configure fitness evaluator
2657
- sig { returns(FitnessEvaluator) }
2658
- def create_fitness_evaluator
2659
- FitnessEvaluator.new(primary_metric: @metric, config: @config)
2660
- end
2661
-
2662
- # Create and configure genetic engine
2663
- sig { params(fitness_evaluator: FitnessEvaluator).returns(GeneticEngine) }
2664
- def create_genetic_engine(fitness_evaluator)
2665
- GeneticEngine.new(config: @config, fitness_evaluator: fitness_evaluator)
2666
- end
2667
-
2668
- # Create and configure reflection engine
2669
- sig { returns(ReflectionEngine) }
2670
- def create_reflection_engine
2671
- ReflectionEngine.new(@config)
2672
- end
2673
-
2674
- # Create and configure mutation engine
2675
- sig { returns(MutationEngine) }
2676
- def create_mutation_engine
2677
- MutationEngine.new(config: @config)
2678
- end
2679
-
2680
- # Create and configure crossover engine
2681
- sig { returns(CrossoverEngine) }
2682
- def create_crossover_engine
2683
- CrossoverEngine.new(config: @config)
2684
- end
2685
-
2686
- # Create and configure pareto selector
2687
- sig { params(fitness_evaluator: FitnessEvaluator).returns(ParetoSelector) }
2688
- def create_pareto_selector(fitness_evaluator)
2689
- ParetoSelector.new(evaluator: fitness_evaluator, config: @config)
2690
- end
2691
-
2692
- # Calculate execution timespan from traces
2693
- sig { params(traces: T::Array[ExecutionTrace]).returns(Float) }
2694
- def calculate_execution_timespan(traces)
2695
- return 0.0 if traces.size < 2
2696
-
2697
- timestamps = traces.map(&:timestamp).sort
2698
- (timestamps.last - timestamps.first).to_f
2699
- end
2700
- end
2701
-
2702
- # GEPA Feedback Metric Protocol
2703
- # Defines interface for providing scores with optional textual feedback
2704
- module GEPAFeedbackMetric
2705
- extend T::Sig
2706
- extend T::Helpers
2707
-
2708
- interface!
492
+ typed_trainset = ensure_typed_examples(trainset)
493
+ typed_valset = valset ? ensure_typed_examples(valset) : typed_trainset
2709
494
 
2710
- # Evaluates prediction and provides score with optional feedback
2711
- sig do
2712
- abstract
2713
- .params(
2714
- example: DSPy::Example,
2715
- prediction: DSPy::Prediction,
2716
- trace: T.nilable(T::Array[ExecutionTrace])
495
+ adapter = @adapter_builder.call(
496
+ program,
497
+ @metric,
498
+ reflection_lm: @reflection_lm,
499
+ feedback_map: @feedback_map
2717
500
  )
2718
- .returns(ScoreWithFeedback)
2719
- end
2720
- def call(example, prediction, trace = nil); end
2721
- end
2722
-
2723
- # Extended prediction result with score and feedback
2724
- class ScoreWithFeedback < T::Struct
2725
- extend T::Sig
2726
-
2727
- const :score, Float
2728
- const :feedback, T.nilable(String)
2729
- const :prediction, DSPy::Prediction
2730
-
2731
- sig { params(score: Float, prediction: DSPy::Prediction, feedback: T.nilable(String)).void }
2732
- def initialize(score:, prediction:, feedback: nil)
2733
- super
2734
- end
2735
- end
2736
-
2737
- # Module Evaluator - Evaluates DSPy modules with metrics and feedback
2738
- class ModuleEvaluator
2739
- extend T::Sig
2740
-
2741
- sig do
2742
- params(
2743
- student: T.untyped, # DSPy::Module or similar callable
2744
- metric: T.untyped,
2745
- feedback_map: T::Hash[String, String],
2746
- custom_instruction_proposer: T.nilable(T.untyped)
2747
- ).void
2748
- end
2749
- def initialize(student:, metric:, feedback_map: {}, custom_instruction_proposer: nil)
2750
- @student = student
2751
- @metric = metric
2752
- @feedback_map = feedback_map
2753
- @custom_instruction_proposer = custom_instruction_proposer
2754
- @trace_collector = GEPA::TraceCollector.new
2755
- end
2756
-
2757
- # Build program with candidate instruction
2758
- sig { params(candidate_instruction: String).returns(T.untyped) }
2759
- def build_program(candidate_instruction)
2760
- # For DSPy::Module compatibility, we'll need to create a new instance
2761
- # with modified signature description
2762
- if @student.respond_to?(:signature_class) && @student.signature_class.respond_to?(:description=)
2763
- modified_student = @student.class.new
2764
- modified_student.signature_class.description = candidate_instruction
2765
- modified_student
2766
- else
2767
- # Fallback: return student as-is for non-standard modules
2768
- @student
2769
- end
2770
- end
2771
-
2772
- # Evaluate program on batch with trace capture
2773
- sig do
2774
- params(
2775
- batch: T::Array[DSPy::Example],
2776
- candidate_instruction: String,
2777
- capture_traces: T::Boolean
501
+ seed_candidate = adapter.seed_candidate
502
+
503
+ cand_selector = ::GEPA::Strategies::ParetoCandidateSelector.new
504
+ comp_selector = ::GEPA::Strategies::RoundRobinReflectionComponentSelector.new
505
+ batch_sampler = ::GEPA::Strategies::EpochShuffledBatchSampler.new([@gepa_config[:minibatch_size], typed_trainset.size].min)
506
+
507
+ telemetry_context = ::GEPA::Telemetry.build_context
508
+
509
+ logger = ::GEPA::Logging::BufferingLogger.new
510
+ tracker = ::GEPA::Logging::ExperimentTracker.new
511
+
512
+ reflective = ::GEPA::Proposer::ReflectiveMutationProposer.new(
513
+ logger: logger,
514
+ trainset: typed_trainset,
515
+ adapter: adapter,
516
+ candidate_selector: cand_selector,
517
+ module_selector: comp_selector,
518
+ batch_sampler: batch_sampler,
519
+ perfect_score: @gepa_config[:perfect_score],
520
+ skip_perfect_score: @gepa_config[:skip_perfect_score],
521
+ experiment_tracker: tracker,
522
+ reflection_lm: nil,
523
+ telemetry: telemetry_context
2778
524
  )
2779
- .returns(T::Array[T.any(Float, ScoreWithFeedback)])
2780
- end
2781
- def evaluate_batch(batch, candidate_instruction, capture_traces: true)
2782
- program = build_program(candidate_instruction)
2783
- results = []
2784
-
2785
- batch.each do |example|
2786
- begin
2787
- # Execute program on example
2788
- prediction = if program.respond_to?(:call)
2789
- program.call(**example.input_values)
2790
- elsif program.respond_to?(:forward)
2791
- program.forward(**example.input_values)
2792
- else
2793
- raise "Program must respond to :call or :forward"
2794
- end
2795
-
2796
- # Get collected traces (if trace collection is enabled)
2797
- # Note: TraceCollector automatically collects via event subscriptions
2798
- traces = capture_traces ? @trace_collector.traces : []
2799
-
2800
- # Evaluate with metric
2801
- # Try with traces first (for GEPAFeedbackMetric), fallback to standard metric
2802
- begin
2803
- # Check if metric can accept 3 parameters (example, prediction, traces)
2804
- if @metric.respond_to?(:arity) && (@metric.arity == 3 || @metric.arity < 0)
2805
- score_result = @metric.call(example, prediction, traces)
2806
- else
2807
- score_result = @metric.call(example, prediction)
2808
- end
2809
- rescue ArgumentError => arg_error
2810
- # If 3-arg call fails, try 2-arg call
2811
- if arg_error.message.include?('wrong number of arguments')
2812
- score_result = @metric.call(example, prediction)
2813
- else
2814
- raise arg_error
2815
- end
2816
- end
2817
525
 
2818
- # Ensure we always have a ScoreWithFeedback object
2819
- if score_result.is_a?(ScoreWithFeedback)
2820
- results << score_result
2821
- else
2822
- # Wrap plain float scores in ScoreWithFeedback
2823
- results << ScoreWithFeedback.new(
2824
- score: score_result.to_f,
2825
- prediction: prediction,
2826
- feedback: nil
2827
- )
2828
- end
2829
-
2830
- rescue => e
2831
- DSPy.logger.error("Evaluation error: #{e.message}")
2832
- # Return zero score on failure
2833
- results << 0.0
2834
- end
526
+ evaluator = lambda do |dataset, candidate|
527
+ batch = adapter.evaluate(dataset, candidate, capture_traces: false)
528
+ [batch.outputs, batch.scores]
529
+ end
530
+
531
+ merge_proposer = nil
532
+ if @gepa_config[:use_merge]
533
+ merge_proposer = ::GEPA::Proposer::MergeProposer.new(
534
+ logger: logger,
535
+ valset: typed_valset,
536
+ evaluator: evaluator,
537
+ use_merge: true,
538
+ max_merge_invocations: @gepa_config[:max_merge_invocations],
539
+ rng: Random.new(0),
540
+ telemetry: telemetry_context
541
+ )
2835
542
  end
2836
543
 
2837
- results
2838
- end
2839
-
2840
- # Create reflective dataset from failed predictions
2841
- sig do
2842
- params(
2843
- examples: T::Array[DSPy::Example],
2844
- predictions: T::Array[DSPy::Prediction],
2845
- scores: T::Array[T.any(Float, ScoreWithFeedback)],
2846
- threshold: Float
544
+ engine = ::GEPA::Core::Engine.new(
545
+ evaluator: evaluator,
546
+ valset: typed_valset,
547
+ seed_candidate: seed_candidate,
548
+ max_metric_calls: @gepa_config[:max_metric_calls],
549
+ perfect_score: @gepa_config[:perfect_score],
550
+ seed: 0,
551
+ reflective_proposer: reflective,
552
+ logger: logger,
553
+ experiment_tracker: tracker,
554
+ merge_proposer: merge_proposer,
555
+ run_dir: nil,
556
+ track_best_outputs: false,
557
+ display_progress_bar: false,
558
+ telemetry: telemetry_context,
559
+ raise_on_exception: true
2847
560
  )
2848
- .returns(T::Array[T::Hash[String, T.untyped]])
2849
- end
2850
- def make_reflective_dataset(examples, predictions, scores, threshold: 0.5)
2851
- reflective_data = []
2852
-
2853
- examples.zip(predictions, scores).each do |example, prediction, score|
2854
- # Extract score value
2855
- score_value = score.is_a?(ScoreWithFeedback) ? score.score : score
2856
-
2857
- # Include failed predictions (below threshold)
2858
- next if score_value >= threshold
2859
-
2860
- # Extract feedback if available
2861
- feedback = if score.is_a?(ScoreWithFeedback) && score.feedback
2862
- score.feedback
2863
- else
2864
- "Low performance (score: #{score_value.round(2)})"
2865
- end
2866
-
2867
- reflective_data << {
2868
- 'input' => example.input_values,
2869
- 'expected' => example.expected_values,
2870
- 'prediction' => extract_prediction_values(prediction),
2871
- 'score' => score_value,
2872
- 'feedback' => feedback
2873
- }
2874
- end
2875
561
 
2876
- reflective_data
2877
- end
2878
-
2879
- # Propose new instruction texts based on reflective dataset
2880
- sig do
2881
- params(
2882
- current_instruction: String,
2883
- reflective_dataset: T::Array[T::Hash[String, T.untyped]],
2884
- components_to_update: T::Array[String]
562
+ state = engine.run
563
+ result = ::GEPA::Core::Result.from_state(state)
564
+ best_program = adapter.build_program(result.best_candidate)
565
+
566
+ OptimizationResult.new(
567
+ optimized_program: best_program,
568
+ scores: { best: result.val_aggregate_scores[result.best_idx] },
569
+ history: { total_candidates: result.num_candidates },
570
+ best_score_name: 'best',
571
+ best_score_value: result.val_aggregate_scores[result.best_idx],
572
+ metadata: { candidates: result.num_candidates }
2885
573
  )
2886
- .returns(T::Array[String])
2887
- end
2888
- def propose_new_texts(current_instruction, reflective_dataset, components_to_update = ['instruction'])
2889
- if @custom_instruction_proposer
2890
- # Use custom proposer if provided
2891
- proposed = @custom_instruction_proposer.call(current_instruction, reflective_dataset)
2892
- [proposed].compact
2893
- else
2894
- # Use built-in proposal logic
2895
- analyze_failures_and_propose(current_instruction, reflective_dataset)
2896
- end
2897
574
  end
2898
575
 
2899
576
  private
2900
577
 
2901
- # Extract prediction values for reflective analysis
2902
- sig { params(prediction: DSPy::Prediction).returns(T::Hash[String, T.untyped]) }
2903
- def extract_prediction_values(prediction)
2904
- # DSPy::Prediction implements to_h which returns the underlying struct's data
2905
- prediction.to_h.transform_keys(&:to_s)
2906
- end
2907
-
2908
- # Analyze failures and propose improvements
2909
578
  sig do
2910
579
  params(
2911
- current_instruction: String,
2912
- reflective_dataset: T::Array[T::Hash[String, T.untyped]]
2913
- )
2914
- .returns(T::Array[String])
2915
- end
2916
- def analyze_failures_and_propose(current_instruction, reflective_dataset)
2917
- return [current_instruction] if reflective_dataset.empty?
2918
-
2919
- # Extract common failure patterns
2920
- feedback_texts = reflective_dataset.map { |data| data['feedback'] }.compact
2921
-
2922
- # Simple heuristic-based proposals
2923
- proposals = []
2924
-
2925
- # If many failures, suggest more detailed instruction
2926
- if reflective_dataset.size >= 3
2927
- proposals << "#{current_instruction} Please provide step-by-step reasoning."
2928
- end
2929
-
2930
- # If feedback mentions specific issues, address them
2931
- if feedback_texts.any? { |fb| fb.include?('unclear') || fb.include?('ambiguous') }
2932
- proposals << "#{current_instruction} Be specific and clear in your response."
2933
- end
2934
-
2935
- if feedback_texts.any? { |fb| fb.include?('incomplete') || fb.include?('missing') }
2936
- proposals << "#{current_instruction} Ensure your answer is complete and addresses all aspects."
2937
- end
2938
-
2939
- # Always include at least one proposal
2940
- proposals << "#{current_instruction.strip}. Think carefully before responding." if proposals.empty?
2941
-
2942
- proposals.uniq.take(3) # Return up to 3 proposals
580
+ program: DSPy::Module,
581
+ metric: T.proc.params(arg0: DSPy::Example, arg1: T.untyped).returns(T.untyped),
582
+ reflection_lm: T.nilable(T.untyped),
583
+ feedback_map: T::Hash[String, PredictAdapter::FeedbackFnType]
584
+ ).returns(PredictAdapter)
585
+ end
586
+ def build_adapter(program, metric, reflection_lm: nil, feedback_map: {})
587
+ PredictAdapter.new(program, metric, reflection_lm: reflection_lm, feedback_map: feedback_map)
2943
588
  end
2944
589
  end
2945
590
  end