dspy 0.28.1 → 0.29.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +2 -3
- data/lib/dspy/callbacks.rb +222 -0
- data/lib/dspy/chain_of_thought.rb +2 -1
- data/lib/dspy/code_act.rb +14 -1
- data/lib/dspy/datasets/ade.rb +90 -0
- data/lib/dspy/datasets.rb +8 -0
- data/lib/dspy/lm.rb +9 -12
- data/lib/dspy/mixins/struct_builder.rb +17 -25
- data/lib/dspy/module.rb +45 -1
- data/lib/dspy/observability/async_span_processor.rb +67 -93
- data/lib/dspy/observability.rb +43 -1
- data/lib/dspy/predict.rb +17 -0
- data/lib/dspy/prompt.rb +90 -20
- data/lib/dspy/propose/dataset_summary_generator.rb +210 -0
- data/lib/dspy/propose/grounded_proposer.rb +320 -66
- data/lib/dspy/re_act.rb +13 -0
- data/lib/dspy/reflection_lm.rb +36 -0
- data/lib/dspy/teleprompt/bootstrap_strategy.rb +26 -0
- data/lib/dspy/teleprompt/gepa.rb +448 -2803
- data/lib/dspy/teleprompt/mipro_v2.rb +624 -100
- data/lib/dspy/teleprompt/utils.rb +349 -42
- data/lib/dspy/version.rb +2 -2
- data/lib/dspy.rb +4 -2
- data/lib/gepa/api.rb +61 -0
- data/lib/gepa/core/engine.rb +226 -0
- data/lib/gepa/core/evaluation_batch.rb +26 -0
- data/lib/gepa/core/result.rb +92 -0
- data/lib/gepa/core/state.rb +231 -0
- data/lib/gepa/logging/experiment_tracker.rb +54 -0
- data/lib/gepa/logging/logger.rb +57 -0
- data/lib/gepa/logging.rb +9 -0
- data/lib/gepa/proposer/base.rb +27 -0
- data/lib/gepa/proposer/merge_proposer.rb +424 -0
- data/lib/gepa/proposer/reflective_mutation/base.rb +48 -0
- data/lib/gepa/proposer/reflective_mutation/reflective_mutation.rb +188 -0
- data/lib/gepa/strategies/batch_sampler.rb +91 -0
- data/lib/gepa/strategies/candidate_selector.rb +97 -0
- data/lib/gepa/strategies/component_selector.rb +57 -0
- data/lib/gepa/strategies/instruction_proposal.rb +120 -0
- data/lib/gepa/telemetry.rb +122 -0
- data/lib/gepa/utils/pareto.rb +119 -0
- data/lib/gepa.rb +21 -0
- metadata +59 -4
- data/lib/dspy/teleprompt/simple_optimizer.rb +0 -497
data/lib/dspy/teleprompt/gepa.rb
CHANGED
@@ -1,2945 +1,590 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
-
require '
|
3
|
+
require 'logger'
|
4
|
+
require 'set'
|
4
5
|
require 'sorbet-runtime'
|
5
6
|
require_relative 'teleprompter'
|
6
|
-
require_relative '
|
7
|
+
require_relative 'utils'
|
8
|
+
require_relative '../../gepa'
|
7
9
|
|
8
10
|
module DSPy
|
9
11
|
module Teleprompt
|
10
|
-
# GEPA: Genetic-Pareto Reflective Prompt Evolution optimizer
|
11
|
-
# Uses natural language reflection to evolve prompts through genetic algorithms
|
12
|
-
# and Pareto frontier selection for maintaining diverse high-performing candidates
|
13
12
|
class GEPA < Teleprompter
|
14
13
|
extend T::Sig
|
14
|
+
DEFAULT_CONFIG = {
|
15
|
+
max_metric_calls: 32,
|
16
|
+
minibatch_size: 2,
|
17
|
+
perfect_score: 1.0,
|
18
|
+
skip_perfect_score: true,
|
19
|
+
use_merge: true,
|
20
|
+
max_merge_invocations: 5
|
21
|
+
}.freeze
|
15
22
|
|
16
|
-
|
17
|
-
|
18
|
-
enums do
|
19
|
-
Rewrite = new
|
20
|
-
Expand = new
|
21
|
-
Simplify = new
|
22
|
-
Combine = new
|
23
|
-
Rephrase = new
|
24
|
-
end
|
25
|
-
end
|
26
|
-
|
27
|
-
# Enum for crossover operation types
|
28
|
-
class CrossoverType < T::Enum
|
29
|
-
enums do
|
30
|
-
Uniform = new
|
31
|
-
Blend = new
|
32
|
-
Structured = new
|
33
|
-
end
|
23
|
+
def self.configure
|
24
|
+
yield(default_config) if block_given?
|
34
25
|
end
|
35
26
|
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
:trace_id,
|
40
|
-
:event_name,
|
41
|
-
:timestamp,
|
42
|
-
:span_id,
|
43
|
-
:attributes,
|
44
|
-
:metadata
|
45
|
-
)
|
46
|
-
extend T::Sig
|
47
|
-
|
48
|
-
# Type aliases for better type safety
|
49
|
-
AttributesHash = T.type_alias { T::Hash[T.any(String, Symbol), T.untyped] }
|
50
|
-
MetadataHash = T.type_alias { T::Hash[Symbol, T.untyped] }
|
51
|
-
|
52
|
-
sig do
|
53
|
-
params(
|
54
|
-
trace_id: String,
|
55
|
-
event_name: String,
|
56
|
-
timestamp: Time,
|
57
|
-
span_id: T.nilable(String),
|
58
|
-
attributes: AttributesHash,
|
59
|
-
metadata: T.nilable(MetadataHash)
|
60
|
-
).void
|
61
|
-
end
|
62
|
-
|
63
|
-
def initialize(trace_id:, event_name:, timestamp:, span_id: nil, attributes: {}, metadata: nil)
|
64
|
-
# Freeze nested structures for true immutability
|
65
|
-
frozen_attributes = attributes.freeze
|
66
|
-
frozen_metadata = metadata&.freeze
|
67
|
-
|
68
|
-
super(
|
69
|
-
trace_id: trace_id,
|
70
|
-
event_name: event_name,
|
71
|
-
timestamp: timestamp,
|
72
|
-
span_id: span_id,
|
73
|
-
attributes: frozen_attributes,
|
74
|
-
metadata: frozen_metadata
|
75
|
-
)
|
76
|
-
end
|
77
|
-
|
78
|
-
# Check if this is an LLM-related trace
|
79
|
-
sig { returns(T::Boolean) }
|
80
|
-
def llm_trace?
|
81
|
-
event_name.start_with?('llm.') || event_name.start_with?('lm.')
|
82
|
-
end
|
83
|
-
|
84
|
-
# Check if this is a module-related trace
|
85
|
-
sig { returns(T::Boolean) }
|
86
|
-
def module_trace?
|
87
|
-
!llm_trace? && (
|
88
|
-
event_name.include?('chain_of_thought') ||
|
89
|
-
event_name.include?('react') ||
|
90
|
-
event_name.include?('codeact') ||
|
91
|
-
event_name.include?('predict')
|
92
|
-
)
|
93
|
-
end
|
94
|
-
|
95
|
-
# Extract token usage from LLM traces
|
96
|
-
sig { returns(Integer) }
|
97
|
-
def token_usage
|
98
|
-
return 0 unless llm_trace?
|
99
|
-
|
100
|
-
# Try different token attribute keys
|
101
|
-
[
|
102
|
-
'gen_ai.usage.total_tokens',
|
103
|
-
'gen_ai.usage.prompt_tokens',
|
104
|
-
'tokens',
|
105
|
-
:tokens
|
106
|
-
].each do |key|
|
107
|
-
value = attributes[key]
|
108
|
-
return value.to_i if value
|
109
|
-
end
|
110
|
-
|
111
|
-
0
|
112
|
-
end
|
113
|
-
|
114
|
-
# Convert to hash representation
|
115
|
-
sig { returns(T::Hash[Symbol, T.untyped]) }
|
116
|
-
def to_h
|
117
|
-
{
|
118
|
-
trace_id: trace_id,
|
119
|
-
event_name: event_name,
|
120
|
-
timestamp: timestamp,
|
121
|
-
span_id: span_id,
|
122
|
-
attributes: attributes,
|
123
|
-
metadata: metadata
|
124
|
-
}
|
125
|
-
end
|
126
|
-
|
127
|
-
# Extract prompt text from trace
|
128
|
-
sig { returns(T.nilable(String)) }
|
129
|
-
def prompt_text
|
130
|
-
attributes[:prompt] || attributes['prompt']
|
131
|
-
end
|
132
|
-
|
133
|
-
# Extract response text from trace
|
134
|
-
sig { returns(T.nilable(String)) }
|
135
|
-
def response_text
|
136
|
-
attributes[:response] || attributes['response']
|
137
|
-
end
|
138
|
-
|
139
|
-
# Get the model used in this trace
|
140
|
-
sig { returns(T.nilable(String)) }
|
141
|
-
def model_name
|
142
|
-
attributes['gen_ai.request.model'] || attributes[:model]
|
143
|
-
end
|
144
|
-
|
145
|
-
# Get the signature class name
|
146
|
-
sig { returns(T.nilable(String)) }
|
147
|
-
def signature_name
|
148
|
-
attributes['dspy.signature'] || attributes[:signature]
|
149
|
-
end
|
150
|
-
end
|
151
|
-
|
152
|
-
# Immutable reflection analysis result using Ruby's Data class
|
153
|
-
# Stores the output of GEPA's reflective analysis on execution traces
|
154
|
-
class ReflectionResult < Data.define(
|
155
|
-
:trace_id,
|
156
|
-
:diagnosis,
|
157
|
-
:improvements,
|
158
|
-
:confidence,
|
159
|
-
:reasoning,
|
160
|
-
:suggested_mutations,
|
161
|
-
:metadata
|
162
|
-
)
|
163
|
-
extend T::Sig
|
164
|
-
|
165
|
-
# Type aliases for better type safety
|
166
|
-
ImprovementsList = T.type_alias { T::Array[String] }
|
167
|
-
MutationsList = T.type_alias { T::Array[Symbol] }
|
168
|
-
MetadataHash = T.type_alias { T::Hash[Symbol, T.untyped] }
|
169
|
-
|
170
|
-
sig do
|
171
|
-
params(
|
172
|
-
trace_id: String,
|
173
|
-
diagnosis: String,
|
174
|
-
improvements: ImprovementsList,
|
175
|
-
confidence: Float,
|
176
|
-
reasoning: String,
|
177
|
-
suggested_mutations: MutationsList,
|
178
|
-
metadata: MetadataHash
|
179
|
-
).void
|
180
|
-
end
|
181
|
-
def initialize(trace_id:, diagnosis:, improvements:, confidence:, reasoning:, suggested_mutations:, metadata:)
|
182
|
-
# Validate confidence score
|
183
|
-
if confidence < 0.0 || confidence > 1.0
|
184
|
-
raise ArgumentError, "confidence must be between 0 and 1, got #{confidence}"
|
185
|
-
end
|
186
|
-
|
187
|
-
# Freeze nested structures for true immutability
|
188
|
-
frozen_improvements = improvements.freeze
|
189
|
-
frozen_mutations = suggested_mutations.freeze
|
190
|
-
frozen_metadata = metadata.freeze
|
191
|
-
|
192
|
-
super(
|
193
|
-
trace_id: trace_id,
|
194
|
-
diagnosis: diagnosis,
|
195
|
-
improvements: frozen_improvements,
|
196
|
-
confidence: confidence,
|
197
|
-
reasoning: reasoning,
|
198
|
-
suggested_mutations: frozen_mutations,
|
199
|
-
metadata: frozen_metadata
|
200
|
-
)
|
201
|
-
end
|
202
|
-
|
203
|
-
# Check if this reflection has high confidence (>= 0.8)
|
204
|
-
sig { returns(T::Boolean) }
|
205
|
-
def high_confidence?
|
206
|
-
confidence >= 0.8
|
207
|
-
end
|
208
|
-
|
209
|
-
# Check if this reflection suggests actionable changes
|
210
|
-
sig { returns(T::Boolean) }
|
211
|
-
def actionable?
|
212
|
-
improvements.any? || suggested_mutations.any?
|
213
|
-
end
|
214
|
-
|
215
|
-
# Get mutations sorted by priority (simple alphabetical for Phase 1)
|
216
|
-
sig { returns(MutationsList) }
|
217
|
-
def mutation_priority
|
218
|
-
suggested_mutations.sort
|
219
|
-
end
|
220
|
-
|
221
|
-
# Convert to hash representation
|
222
|
-
sig { returns(T::Hash[Symbol, T.untyped]) }
|
223
|
-
def to_h
|
224
|
-
{
|
225
|
-
trace_id: trace_id,
|
226
|
-
diagnosis: diagnosis,
|
227
|
-
improvements: improvements,
|
228
|
-
confidence: confidence,
|
229
|
-
reasoning: reasoning,
|
230
|
-
suggested_mutations: suggested_mutations,
|
231
|
-
metadata: metadata
|
232
|
-
}
|
233
|
-
end
|
234
|
-
|
235
|
-
# Generate a concise summary of this reflection
|
236
|
-
sig { returns(String) }
|
237
|
-
def summary
|
238
|
-
confidence_pct = (confidence * 100).round
|
239
|
-
mutation_list = suggested_mutations.map(&:to_s).join(', ')
|
240
|
-
|
241
|
-
"#{diagnosis.split('.').first}. " \
|
242
|
-
"Confidence: #{confidence_pct}%. " \
|
243
|
-
"#{improvements.size} improvements suggested. " \
|
244
|
-
"Mutations: #{mutation_list}."
|
245
|
-
end
|
246
|
-
|
247
|
-
# Check if reflection model was used
|
248
|
-
sig { returns(T.nilable(String)) }
|
249
|
-
def reflection_model
|
250
|
-
metadata[:reflection_model]
|
251
|
-
end
|
252
|
-
|
253
|
-
# Get token usage from reflection analysis
|
254
|
-
sig { returns(Integer) }
|
255
|
-
def token_usage
|
256
|
-
metadata[:token_usage] || 0
|
257
|
-
end
|
258
|
-
|
259
|
-
# Get analysis duration in milliseconds
|
260
|
-
sig { returns(Integer) }
|
261
|
-
def analysis_duration_ms
|
262
|
-
metadata[:analysis_duration_ms] || 0
|
263
|
-
end
|
264
|
-
end
|
265
|
-
|
266
|
-
# TraceCollector aggregates execution traces from DSPy events
|
267
|
-
# Uses SubscriberMixin for class-level event subscriptions
|
268
|
-
class TraceCollector
|
269
|
-
include DSPy::Events::SubscriberMixin
|
270
|
-
extend T::Sig
|
271
|
-
|
272
|
-
sig { void }
|
273
|
-
def initialize
|
274
|
-
@traces = T.let([], T::Array[ExecutionTrace])
|
275
|
-
@traces_mutex = T.let(Mutex.new, Mutex)
|
276
|
-
setup_subscriptions
|
277
|
-
end
|
278
|
-
|
279
|
-
sig { returns(T::Array[ExecutionTrace]) }
|
280
|
-
attr_reader :traces
|
281
|
-
|
282
|
-
# Get count of collected traces
|
283
|
-
sig { returns(Integer) }
|
284
|
-
def collected_count
|
285
|
-
@traces_mutex.synchronize { @traces.size }
|
286
|
-
end
|
287
|
-
|
288
|
-
# Collect trace from event data
|
289
|
-
sig { params(event_name: String, event_data: T::Hash[T.any(String, Symbol), T.untyped]).void }
|
290
|
-
def collect_trace(event_name, event_data)
|
291
|
-
@traces_mutex.synchronize do
|
292
|
-
trace_id = event_data['trace_id'] || event_data[:trace_id] || generate_trace_id
|
293
|
-
|
294
|
-
# Avoid duplicates
|
295
|
-
return if @traces.any? { |t| t.trace_id == trace_id }
|
296
|
-
|
297
|
-
timestamp = event_data['timestamp'] || event_data[:timestamp] || Time.now
|
298
|
-
span_id = event_data['span_id'] || event_data[:span_id]
|
299
|
-
attributes = event_data['attributes'] || event_data[:attributes] || {}
|
300
|
-
metadata = event_data['metadata'] || event_data[:metadata] || {}
|
301
|
-
|
302
|
-
trace = ExecutionTrace.new(
|
303
|
-
trace_id: trace_id,
|
304
|
-
event_name: event_name,
|
305
|
-
timestamp: timestamp,
|
306
|
-
span_id: span_id,
|
307
|
-
attributes: attributes,
|
308
|
-
metadata: metadata
|
309
|
-
)
|
310
|
-
|
311
|
-
@traces << trace
|
312
|
-
end
|
313
|
-
end
|
314
|
-
|
315
|
-
# Get traces for a specific optimization run
|
316
|
-
sig { params(run_id: String).returns(T::Array[ExecutionTrace]) }
|
317
|
-
def traces_for_run(run_id)
|
318
|
-
@traces_mutex.synchronize do
|
319
|
-
@traces.select do |trace|
|
320
|
-
metadata = trace.metadata
|
321
|
-
metadata && metadata[:optimization_run_id] == run_id
|
322
|
-
end
|
323
|
-
end
|
324
|
-
end
|
325
|
-
|
326
|
-
# Get only LLM traces
|
327
|
-
sig { returns(T::Array[ExecutionTrace]) }
|
328
|
-
def llm_traces
|
329
|
-
@traces_mutex.synchronize { @traces.select(&:llm_trace?) }
|
330
|
-
end
|
331
|
-
|
332
|
-
# Get only module traces
|
333
|
-
sig { returns(T::Array[ExecutionTrace]) }
|
334
|
-
def module_traces
|
335
|
-
@traces_mutex.synchronize { @traces.select(&:module_trace?) }
|
336
|
-
end
|
337
|
-
|
338
|
-
# Clear all collected traces
|
339
|
-
sig { void }
|
340
|
-
def clear
|
341
|
-
@traces_mutex.synchronize { @traces.clear }
|
342
|
-
end
|
343
|
-
|
344
|
-
private
|
345
|
-
|
346
|
-
# Set up event subscriptions using SubscriberMixin
|
347
|
-
sig { void }
|
348
|
-
def setup_subscriptions
|
349
|
-
# Subscribe to LLM events
|
350
|
-
self.class.add_subscription('llm.*') do |name, attrs|
|
351
|
-
collect_trace(name, attrs)
|
352
|
-
end
|
353
|
-
|
354
|
-
# Subscribe to module events
|
355
|
-
self.class.add_subscription('*.reasoning_complete') do |name, attrs|
|
356
|
-
collect_trace(name, attrs)
|
357
|
-
end
|
358
|
-
|
359
|
-
self.class.add_subscription('*.predict_complete') do |name, attrs|
|
360
|
-
collect_trace(name, attrs)
|
361
|
-
end
|
362
|
-
end
|
363
|
-
|
364
|
-
# Generate unique trace ID
|
365
|
-
sig { returns(String) }
|
366
|
-
def generate_trace_id
|
367
|
-
"gepa-trace-#{SecureRandom.hex(4)}"
|
368
|
-
end
|
369
|
-
end
|
370
|
-
|
371
|
-
# ReflectionEngine performs natural language reflection on execution traces
|
372
|
-
# This is the core component that analyzes traces and generates improvement insights
|
373
|
-
class ReflectionEngine
|
374
|
-
extend T::Sig
|
375
|
-
|
376
|
-
sig { returns(GEPAConfig) }
|
377
|
-
attr_reader :config
|
378
|
-
|
379
|
-
sig { params(config: T.nilable(GEPAConfig)).void }
|
380
|
-
def initialize(config = nil)
|
381
|
-
@config = config || GEPAConfig.new
|
382
|
-
end
|
383
|
-
|
384
|
-
# Perform reflective analysis on execution traces
|
385
|
-
sig { params(traces: T::Array[ExecutionTrace]).returns(ReflectionResult) }
|
386
|
-
def reflect_on_traces(traces)
|
387
|
-
reflection_id = generate_reflection_id
|
388
|
-
|
389
|
-
if traces.empty?
|
390
|
-
return ReflectionResult.new(
|
391
|
-
trace_id: reflection_id,
|
392
|
-
diagnosis: 'No traces available for analysis',
|
393
|
-
improvements: [],
|
394
|
-
confidence: 0.0,
|
395
|
-
reasoning: 'Cannot provide reflection without execution traces',
|
396
|
-
suggested_mutations: [],
|
397
|
-
metadata: {
|
398
|
-
reflection_model: @config.reflection_lm&.model,
|
399
|
-
analysis_timestamp: Time.now,
|
400
|
-
trace_count: 0
|
401
|
-
}
|
402
|
-
)
|
403
|
-
end
|
404
|
-
|
405
|
-
patterns = analyze_execution_patterns(traces)
|
406
|
-
improvements = generate_improvement_suggestions(patterns)
|
407
|
-
mutations = suggest_mutations(patterns)
|
408
|
-
|
409
|
-
# For Phase 1, we generate a simple rule-based analysis
|
410
|
-
# Future phases will use LLM-based reflection
|
411
|
-
diagnosis = generate_diagnosis(patterns)
|
412
|
-
reasoning = generate_reasoning(patterns, traces)
|
413
|
-
confidence = calculate_confidence(patterns)
|
414
|
-
|
415
|
-
ReflectionResult.new(
|
416
|
-
trace_id: reflection_id,
|
417
|
-
diagnosis: diagnosis,
|
418
|
-
improvements: improvements,
|
419
|
-
confidence: confidence,
|
420
|
-
reasoning: reasoning,
|
421
|
-
suggested_mutations: mutations,
|
422
|
-
metadata: {
|
423
|
-
reflection_model: @config.reflection_lm&.model,
|
424
|
-
analysis_timestamp: Time.now,
|
425
|
-
trace_count: traces.size,
|
426
|
-
token_usage: 0 # Phase 1 doesn't use actual LLM reflection
|
427
|
-
}
|
428
|
-
)
|
429
|
-
end
|
430
|
-
|
431
|
-
# Analyze patterns in execution traces
|
432
|
-
sig { params(traces: T::Array[ExecutionTrace]).returns(T::Hash[Symbol, T.untyped]) }
|
433
|
-
def analyze_execution_patterns(traces)
|
434
|
-
llm_traces = traces.select(&:llm_trace?)
|
435
|
-
module_traces = traces.select(&:module_trace?)
|
436
|
-
|
437
|
-
total_tokens = llm_traces.sum(&:token_usage)
|
438
|
-
unique_models = llm_traces.map(&:model_name).compact.uniq
|
439
|
-
|
440
|
-
{
|
441
|
-
llm_traces_count: llm_traces.size,
|
442
|
-
module_traces_count: module_traces.size,
|
443
|
-
total_tokens: total_tokens,
|
444
|
-
unique_models: unique_models,
|
445
|
-
avg_response_length: calculate_avg_response_length(llm_traces),
|
446
|
-
trace_timespan: calculate_timespan(traces)
|
447
|
-
}
|
448
|
-
end
|
449
|
-
|
450
|
-
# Generate improvement suggestions based on patterns
|
451
|
-
sig { params(patterns: T::Hash[Symbol, T.untyped]).returns(T::Array[String]) }
|
452
|
-
def generate_improvement_suggestions(patterns)
|
453
|
-
suggestions = []
|
454
|
-
|
455
|
-
if patterns[:total_tokens] > 500
|
456
|
-
suggestions << 'Consider reducing prompt length to lower token usage'
|
457
|
-
end
|
458
|
-
|
459
|
-
if patterns[:avg_response_length] < 10
|
460
|
-
suggestions << 'Responses seem brief - consider asking for more detailed explanations'
|
461
|
-
end
|
462
|
-
|
463
|
-
if patterns[:llm_traces_count] > patterns[:module_traces_count] * 3
|
464
|
-
suggestions << 'High LLM usage detected - consider optimizing reasoning chains'
|
465
|
-
end
|
466
|
-
|
467
|
-
if patterns[:unique_models].size > 1
|
468
|
-
suggestions << 'Multiple models used - consider standardizing on one model for consistency'
|
469
|
-
end
|
470
|
-
|
471
|
-
suggestions << 'Add step-by-step reasoning instructions' if suggestions.empty?
|
472
|
-
suggestions
|
473
|
-
end
|
474
|
-
|
475
|
-
# Suggest mutation operations based on patterns
|
476
|
-
sig { params(patterns: T::Hash[Symbol, T.untyped]).returns(T::Array[Symbol]) }
|
477
|
-
def suggest_mutations(patterns)
|
478
|
-
mutations = []
|
479
|
-
|
480
|
-
avg_length = patterns[:avg_response_length] || 0
|
481
|
-
total_tokens = patterns[:total_tokens] || 0
|
482
|
-
llm_count = patterns[:llm_traces_count] || 0
|
483
|
-
|
484
|
-
mutations << :expand if avg_length < 15
|
485
|
-
mutations << :simplify if total_tokens > 300
|
486
|
-
mutations << :combine if llm_count > 2
|
487
|
-
mutations << :rewrite if llm_count == 1
|
488
|
-
mutations << :rephrase if mutations.empty?
|
489
|
-
|
490
|
-
mutations.uniq
|
491
|
-
end
|
492
|
-
|
493
|
-
public
|
494
|
-
|
495
|
-
# Perform LLM-based reflection on execution traces using DSPy::Predict
|
496
|
-
sig { params(traces: T::Array[ExecutionTrace]).returns(ReflectionResult) }
|
497
|
-
def reflect_with_llm(traces)
|
498
|
-
return reflect_on_traces(traces) if traces.empty?
|
499
|
-
|
500
|
-
begin
|
501
|
-
# Use DSPy::Predict for analysis instead of raw prompts
|
502
|
-
prediction = analyze_traces_with_dspy(traces)
|
503
|
-
convert_prediction_to_reflection_result(prediction, traces)
|
504
|
-
rescue => e
|
505
|
-
# Fallback to rule-based analysis on LLM failure
|
506
|
-
fallback_result = reflect_on_traces(traces)
|
507
|
-
fallback_result.class.new(
|
508
|
-
trace_id: fallback_result.trace_id,
|
509
|
-
diagnosis: "LLM reflection failed (#{e.message}), using fallback analysis: #{fallback_result.diagnosis}",
|
510
|
-
improvements: fallback_result.improvements,
|
511
|
-
confidence: [fallback_result.confidence * 0.5, 0.5].min,
|
512
|
-
reasoning: "Fallback to rule-based analysis after LLM error: #{fallback_result.reasoning}",
|
513
|
-
suggested_mutations: fallback_result.suggested_mutations,
|
514
|
-
metadata: fallback_result.metadata.merge(
|
515
|
-
llm_error: e.message,
|
516
|
-
fallback_used: true
|
517
|
-
)
|
518
|
-
)
|
519
|
-
end
|
520
|
-
end
|
521
|
-
|
522
|
-
# Generate structured reflection prompt for LLM (public API)
|
523
|
-
sig { params(traces: T::Array[ExecutionTrace]).returns(String) }
|
524
|
-
def generate_reflection_prompt(traces)
|
525
|
-
if traces.empty?
|
526
|
-
return <<~PROMPT
|
527
|
-
You are analyzing execution traces for a genetic algorithm-based prompt optimization system called GEPA.
|
528
|
-
|
529
|
-
**Task**: Analyze execution patterns and provide optimization recommendations.
|
530
|
-
|
531
|
-
**Context**: No execution traces available.
|
532
|
-
|
533
|
-
Please provide your analysis in the following JSON format:
|
534
|
-
{
|
535
|
-
"diagnosis": "Brief description of what you observed",
|
536
|
-
"improvements": ["List of actionable improvement suggestions"],
|
537
|
-
"confidence": 0.0,
|
538
|
-
"reasoning": "Your reasoning process",
|
539
|
-
"suggested_mutations": ["expand", "rewrite", "simplify", "combine", "rephrase"],
|
540
|
-
"insights": {
|
541
|
-
"pattern_detected": "no_data",
|
542
|
-
"optimization_opportunity": "data_collection"
|
543
|
-
}
|
544
|
-
}
|
545
|
-
PROMPT
|
546
|
-
end
|
547
|
-
|
548
|
-
summary = trace_summary_for_reflection(traces)
|
549
|
-
insights = extract_optimization_insights(traces)
|
550
|
-
|
551
|
-
<<~PROMPT
|
552
|
-
You are analyzing execution traces for a genetic algorithm-based prompt optimization system called GEPA.
|
553
|
-
|
554
|
-
**Task**: Analyze execution patterns and provide optimization recommendations for prompt evolution.
|
555
|
-
|
556
|
-
**Execution Summary**:
|
557
|
-
#{summary}
|
558
|
-
|
559
|
-
**Optimization Context**:
|
560
|
-
- This is part of a genetic algorithm for prompt optimization
|
561
|
-
- Available mutation types: rewrite, expand, simplify, combine, rephrase
|
562
|
-
- Goal is to improve prompt effectiveness through iterative evolution
|
563
|
-
- Focus on actionable insights that can guide mutation and crossover operations
|
564
|
-
|
565
|
-
**Key Optimization Insights**:
|
566
|
-
#{insights.map { |k, v| "- #{k}: #{v.is_a?(Hash) ? v.values.join(', ') : v}" }.join("\n")}
|
567
|
-
|
568
|
-
**Sample Traces**:
|
569
|
-
#{format_traces_for_prompt(traces.take(3))}
|
570
|
-
|
571
|
-
Please analyze these execution patterns and provide optimization recommendations in the following JSON format:
|
572
|
-
{
|
573
|
-
"diagnosis": "Brief description of execution patterns and issues identified",
|
574
|
-
"improvements": ["List of 2-4 specific, actionable improvement suggestions"],
|
575
|
-
"confidence": 0.85,
|
576
|
-
"reasoning": "Your detailed reasoning process for the analysis",
|
577
|
-
"suggested_mutations": ["List of 2-3 mutation types that would be most beneficial"],
|
578
|
-
"insights": {
|
579
|
-
"pattern_detected": "primary_pattern_identified",
|
580
|
-
"optimization_opportunity": "key_area_for_improvement"
|
581
|
-
}
|
582
|
-
}
|
583
|
-
|
584
|
-
Focus on practical recommendations that will improve prompt performance through genetic algorithm evolution.
|
585
|
-
PROMPT
|
586
|
-
end
|
587
|
-
|
588
|
-
# Parse LLM reflection response into ReflectionResult (public API)
|
589
|
-
sig { params(response_text: String, original_traces: T::Array[ExecutionTrace]).returns(ReflectionResult) }
|
590
|
-
def parse_llm_reflection(response_text, original_traces)
|
591
|
-
reflection_id = generate_reflection_id
|
592
|
-
|
593
|
-
begin
|
594
|
-
parsed = JSON.parse(response_text)
|
595
|
-
|
596
|
-
# Extract and validate components
|
597
|
-
diagnosis = parsed['diagnosis'] || 'LLM reflection analysis'
|
598
|
-
improvements = Array(parsed['improvements']).select { |i| i.is_a?(String) && !i.strip.empty? }
|
599
|
-
confidence = [parsed['confidence'].to_f, 1.0].min
|
600
|
-
reasoning = parsed['reasoning'] || 'LLM-based analysis of execution traces'
|
601
|
-
|
602
|
-
# Validate and sanitize mutation suggestions
|
603
|
-
raw_mutations = Array(parsed['suggested_mutations'])
|
604
|
-
valid_mutations = raw_mutations.filter_map do |mut|
|
605
|
-
mutation_symbol = mut.to_s.downcase.to_sym
|
606
|
-
if [:rewrite, :expand, :simplify, :combine, :rephrase].include?(mutation_symbol)
|
607
|
-
mutation_symbol
|
608
|
-
end
|
609
|
-
end.uniq
|
610
|
-
|
611
|
-
# Ensure we have at least one valid mutation suggestion
|
612
|
-
valid_mutations = [:rewrite] if valid_mutations.empty?
|
613
|
-
|
614
|
-
ReflectionResult.new(
|
615
|
-
trace_id: reflection_id,
|
616
|
-
diagnosis: diagnosis,
|
617
|
-
improvements: improvements,
|
618
|
-
confidence: confidence,
|
619
|
-
reasoning: reasoning,
|
620
|
-
suggested_mutations: valid_mutations,
|
621
|
-
metadata: {
|
622
|
-
reflection_model: @config.reflection_lm&.model,
|
623
|
-
analysis_timestamp: Time.now,
|
624
|
-
trace_count: original_traces.size,
|
625
|
-
token_usage: estimate_token_usage(response_text),
|
626
|
-
llm_based: true,
|
627
|
-
insights: parsed['insights'] || {}
|
628
|
-
}
|
629
|
-
)
|
630
|
-
|
631
|
-
rescue JSON::ParserError => e
|
632
|
-
# Handle malformed JSON response
|
633
|
-
ReflectionResult.new(
|
634
|
-
trace_id: reflection_id,
|
635
|
-
diagnosis: "LLM reflection JSON parsing error: #{e.message}",
|
636
|
-
improvements: ['Review prompt structure and LLM response format'],
|
637
|
-
confidence: 0.3,
|
638
|
-
reasoning: "Failed to parse LLM reflection response as valid JSON",
|
639
|
-
suggested_mutations: [:rewrite],
|
640
|
-
metadata: {
|
641
|
-
reflection_model: @config.reflection_lm&.model,
|
642
|
-
analysis_timestamp: Time.now,
|
643
|
-
trace_count: original_traces.size,
|
644
|
-
token_usage: 0,
|
645
|
-
parsing_error: e.message,
|
646
|
-
raw_response: response_text.length > 500 ? "#{response_text[0..500]}..." : response_text
|
647
|
-
}
|
648
|
-
)
|
649
|
-
end
|
650
|
-
end
|
651
|
-
|
652
|
-
# Create comprehensive trace summary for reflection (public API)
|
653
|
-
sig { params(traces: T::Array[ExecutionTrace]).returns(String) }
|
654
|
-
def trace_summary_for_reflection(traces)
|
655
|
-
return "No execution traces available" if traces.empty?
|
656
|
-
|
657
|
-
llm_traces = traces.select(&:llm_trace?)
|
658
|
-
module_traces = traces.select(&:module_trace?)
|
659
|
-
|
660
|
-
total_tokens = llm_traces.sum(&:token_usage)
|
661
|
-
unique_models = llm_traces.map(&:model_name).compact.uniq
|
662
|
-
timespan = calculate_timespan(traces)
|
663
|
-
|
664
|
-
avg_response_length = if llm_traces.any?
|
665
|
-
total_length = llm_traces.sum { |t| t.response_text&.length || 0 }
|
666
|
-
total_length / llm_traces.size
|
667
|
-
else
|
668
|
-
0
|
669
|
-
end
|
670
|
-
|
671
|
-
<<~SUMMARY
|
672
|
-
Total traces: #{traces.size}
|
673
|
-
LLM interactions: #{llm_traces.size}
|
674
|
-
Module calls: #{module_traces.size}
|
675
|
-
Total tokens: #{total_tokens}
|
676
|
-
Models used: #{unique_models.join(', ')}
|
677
|
-
Average response length: #{avg_response_length} characters
|
678
|
-
Execution timespan: #{timespan.round(2)} seconds
|
679
|
-
SUMMARY
|
680
|
-
end
|
681
|
-
|
682
|
-
# Extract optimization insights from trace analysis (public API)
|
683
|
-
sig { params(traces: T::Array[ExecutionTrace]).returns(T::Hash[Symbol, T.untyped]) }
|
684
|
-
def extract_optimization_insights(traces)
|
685
|
-
llm_traces = traces.select(&:llm_trace?)
|
686
|
-
|
687
|
-
insights = {
|
688
|
-
token_efficiency: analyze_token_efficiency(llm_traces),
|
689
|
-
response_quality: analyze_response_quality(llm_traces),
|
690
|
-
model_consistency: analyze_model_consistency(llm_traces)
|
691
|
-
}
|
692
|
-
|
693
|
-
insights
|
694
|
-
end
|
695
|
-
|
696
|
-
# Reflection with optimization context (public API)
|
697
|
-
sig { params(traces: T::Array[ExecutionTrace], context: T::Hash[Symbol, T.untyped]).returns(ReflectionResult) }
|
698
|
-
def reflection_with_context(traces, context)
|
699
|
-
base_result = reflect_with_llm(traces)
|
700
|
-
|
701
|
-
# Incorporate context into reasoning
|
702
|
-
context_reasoning = "Generation #{context[:generation] || 'unknown'} analysis. "
|
703
|
-
context_reasoning += "Population size: #{context[:population_size] || 'unknown'}. "
|
704
|
-
|
705
|
-
if context[:current_best_score]
|
706
|
-
context_reasoning += "Current best score: #{context[:current_best_score]}. "
|
707
|
-
end
|
708
|
-
|
709
|
-
# Adjust mutation suggestions based on history
|
710
|
-
adjusted_mutations = adjust_mutations_for_history(
|
711
|
-
base_result.suggested_mutations,
|
712
|
-
context[:mutation_history] || [],
|
713
|
-
context[:recent_performance_trend]
|
714
|
-
)
|
715
|
-
|
716
|
-
ReflectionResult.new(
|
717
|
-
trace_id: base_result.trace_id,
|
718
|
-
diagnosis: base_result.diagnosis,
|
719
|
-
improvements: base_result.improvements,
|
720
|
-
confidence: base_result.confidence,
|
721
|
-
reasoning: context_reasoning + base_result.reasoning,
|
722
|
-
suggested_mutations: adjusted_mutations,
|
723
|
-
metadata: base_result.metadata.merge(optimization_context: context)
|
724
|
-
)
|
725
|
-
end
|
726
|
-
|
727
|
-
public
|
728
|
-
|
729
|
-
# Create signature for trace reflection analysis (public API)
|
730
|
-
sig { returns(T.class_of(DSPy::Signature)) }
|
731
|
-
def create_trace_reflection_signature
|
732
|
-
@trace_reflection_signature ||= Class.new(DSPy::Signature) do
|
733
|
-
description "Analyze execution traces from GEPA optimization system and provide actionable optimization insights"
|
734
|
-
|
735
|
-
input do
|
736
|
-
const :execution_summary, String, description: "Summary of execution traces and performance patterns"
|
737
|
-
const :optimization_context, String, description: "Context about the genetic algorithm optimization goals"
|
738
|
-
const :key_insights, String, description: "Key insights extracted from trace analysis"
|
739
|
-
const :sample_traces, String, description: "Representative execution trace samples"
|
740
|
-
end
|
741
|
-
|
742
|
-
output do
|
743
|
-
const :diagnosis, String, description: "Brief description of execution patterns and issues identified"
|
744
|
-
const :improvements, T::Array[String], description: "List of 2-4 specific actionable improvement suggestions"
|
745
|
-
const :confidence, Float, description: "Confidence level in analysis (0.0 to 1.0)"
|
746
|
-
const :reasoning, String, description: "Detailed reasoning process for the analysis"
|
747
|
-
const :suggested_mutations, T::Array[String], description: "List of 2-3 most beneficial mutation types from: rewrite, expand, simplify, combine, rephrase"
|
748
|
-
const :pattern_detected, String, description: "Primary pattern identified in execution traces"
|
749
|
-
const :optimization_opportunity, String, description: "Key area identified for performance improvement"
|
750
|
-
end
|
751
|
-
end
|
752
|
-
end
|
753
|
-
|
754
|
-
# Perform LLM analysis using DSPy::Predict (public API)
|
755
|
-
sig { params(traces: T::Array[ExecutionTrace]).returns(T.untyped) }
|
756
|
-
def analyze_traces_with_dspy(traces)
  # Runs the trace-reflection signature through a DSPy::Predict that is wired
  # to the reflection LM from the config and returns the raw prediction.
  # Raises ArgumentError when no reflection LM is configured.
  reflection_lm = @config.reflection_lm
  raise ArgumentError, "reflection_lm must be configured on GEPAConfig for LLM-based reflection" unless reflection_lm

  reflector = DSPy::Predict.new(create_trace_reflection_signature)
  reflector.config.lm = reflection_lm

  execution_summary = trace_summary_for_reflection(traces)

  # One bullet per insight; hash-valued insights are flattened to their values.
  insight_lines = extract_optimization_insights(traces).map do |name, value|
    rendered = value.is_a?(Hash) ? value.values.join(', ') : value
    "- #{name}: #{rendered}"
  end

  analysis = reflector.call(
    execution_summary: execution_summary,
    optimization_context: "GEPA genetic algorithm for prompt optimization. Available mutations: rewrite, expand, simplify, combine, rephrase. Goal: improve prompt effectiveness through iterative evolution.",
    key_insights: insight_lines.join("\n"),
    # Only the first three traces are included as samples.
    sample_traces: format_traces_for_prompt(traces.take(3))
  )

  T.unsafe(analysis)
end
|
777
|
-
|
778
|
-
# Convert DSPy prediction to ReflectionResult (public API)
|
779
|
-
sig { params(prediction: T.untyped, original_traces: T::Array[ExecutionTrace]).returns(ReflectionResult) }
|
780
|
-
def convert_prediction_to_reflection_result(prediction, original_traces)
  # Turns a raw reflection prediction into a ReflectionResult, substituting
  # defaults for missing fields and discarding values that fail validation.
  #
  # prediction      - object responding to the reflection output fields
  # original_traces - traces that were analysed (only their count is recorded)
  reflection_id = generate_reflection_id

  diagnosis = prediction.diagnosis || 'DSPy reflection analysis'
  # Keep only non-blank string suggestions.
  improvements = Array(prediction.improvements).select { |i| i.is_a?(String) && !i.strip.empty? }
  # Clamp into [0.0, 1.0]; a missing confidence counts as 0.0.
  confidence = [[prediction.confidence&.to_f || 0.0, 1.0].min, 0.0].max
  reasoning = prediction.reasoning || 'DSPy-based analysis of execution traces'

  # Mutation names come from model output, so they are matched as strings
  # against a whitelist (ignoring case and surrounding whitespace) and only
  # converted to symbols once they are known to be valid.
  allowed_mutations = %w[rewrite expand simplify combine rephrase]
  valid_mutations = Array(prediction.suggested_mutations).filter_map do |mut|
    name = mut.to_s.strip.downcase
    name.to_sym if allowed_mutations.include?(name)
  end.uniq

  # Always hand back at least one usable mutation.
  valid_mutations = [:rewrite] if valid_mutations.empty?

  ReflectionResult.new(
    trace_id: reflection_id,
    diagnosis: diagnosis,
    improvements: improvements,
    confidence: confidence,
    reasoning: reasoning,
    suggested_mutations: valid_mutations,
    metadata: {
      reflection_model: @config.reflection_lm&.model,
      analysis_timestamp: Time.now,
      trace_count: original_traces.size,
      token_usage: estimate_token_usage(prediction.to_s),
      llm_based: true,
      dspy_prediction: true,
      insights: {
        pattern_detected: prediction.pattern_detected || "unknown_pattern",
        optimization_opportunity: prediction.optimization_opportunity || "general_optimization"
      }
    }
  )
end
|
821
|
-
|
822
|
-
private
|
823
|
-
|
824
|
-
# Generate unique reflection ID
|
825
|
-
sig { returns(String) }
|
826
|
-
def generate_reflection_id
  # "reflection-" followed by 4 random bytes rendered as 8 hex characters.
  suffix = SecureRandom.hex(4)
  "reflection-#{suffix}"
end
|
829
|
-
|
830
|
-
# Generate diagnosis text
|
831
|
-
sig { params(patterns: T::Hash[Symbol, T.untyped]).returns(String) }
|
832
|
-
def generate_diagnosis(patterns)
  # Returns a one-line diagnosis for the pattern summary. The checks run in
  # priority order and the first one that matches decides the message.
  return 'High token usage indicates potential inefficiency in prompt design' if patterns[:total_tokens] > 400
  return 'No LLM interactions found - execution may not be working as expected' if patterns[:llm_traces_count] == 0
  return 'Responses are unusually brief which may indicate prompt clarity issues' if patterns[:avg_response_length] < 10

  'Execution patterns appear normal with room for optimization'
end
|
843
|
-
|
844
|
-
# Generate reasoning text
|
845
|
-
sig { params(patterns: T::Hash[Symbol, T.untyped], traces: T::Array[ExecutionTrace]).returns(String) }
|
846
|
-
def generate_reasoning(patterns, traces)
  # Builds a short, period-separated summary of what was analysed.
  sentences = [
    "Analyzed #{traces.size} execution traces",
    "#{patterns[:llm_traces_count]} LLM interactions",
    "#{patterns[:module_traces_count]} module operations",
    "Total token usage: #{patterns[:total_tokens]}"
  ]

  "#{sentences.join('. ')}."
end
|
856
|
-
|
857
|
-
# Calculate confidence based on patterns
|
858
|
-
sig { params(patterns: T::Hash[Symbol, T.untyped]).returns(Float) }
|
859
|
-
def calculate_confidence(patterns)
  # Starts from 0.7, adds 0.02 per observed trace (at most 10 traces count),
  # subtracts 0.1 when more than 1000 tokens were used, and caps at 1.0.
  observed_traces = patterns[:llm_traces_count] + patterns[:module_traces_count]
  trace_bonus = [observed_traces, 10].min * 0.02
  token_penalty = patterns[:total_tokens] > 1000 ? -0.1 : 0.0

  score = 0.7 + trace_bonus + token_penalty
  score > 1.0 ? 1.0 : score
end
|
870
|
-
|
871
|
-
# Calculate average response length from LLM traces
|
872
|
-
sig { params(llm_traces: T::Array[ExecutionTrace]).returns(Integer) }
|
873
|
-
def calculate_avg_response_length(llm_traces)
  # Mean response length in characters, using integer division; traces
  # without a response count as length 0. Returns 0 for an empty list.
  return 0 if llm_traces.empty?

  combined_length = llm_traces.inject(0) do |running_total, trace|
    text = trace.response_text
    running_total + (text ? text.length : 0)
  end

  combined_length / llm_traces.size
end
|
883
|
-
|
884
|
-
# Calculate timespan of traces
|
885
|
-
sig { params(traces: T::Array[ExecutionTrace]).returns(Float) }
|
886
|
-
def calculate_timespan(traces)
  # Difference between the latest and earliest trace timestamp as a Float;
  # 0.0 when fewer than two traces are given.
  return 0.0 if traces.size < 2

  earliest, latest = traces.map(&:timestamp).minmax
  (latest - earliest).to_f
end
|
892
|
-
|
893
|
-
|
894
|
-
# Format traces for inclusion in prompt
|
895
|
-
sig { params(traces: T::Array[ExecutionTrace]).returns(String) }
|
896
|
-
def format_traces_for_prompt(traces)
  # Renders one numbered line per trace: event name plus prompt and response
  # previews, each cut to 100 characters ('N/A' when the text is missing).
  lines = traces.each_with_index.map do |trace, position|
    prompt_part = truncate_text(trace.prompt_text || 'N/A', 100)
    response_part = truncate_text(trace.response_text || 'N/A', 100)
    "#{position + 1}. [#{trace.event_name}] #{prompt_part} → #{response_part}"
  end

  lines.join("\n")
end
|
903
|
-
|
904
|
-
# Estimate token usage from response
|
905
|
-
sig { params(text: String).returns(Integer) }
|
906
|
-
def estimate_token_usage(text)
  # Rough heuristic: about four characters per token, rounded up.
  text.length.fdiv(4).ceil
end
|
910
|
-
|
911
|
-
# Analyze token efficiency patterns
|
912
|
-
sig { params(llm_traces: T::Array[ExecutionTrace]).returns(T::Hash[Symbol, T.untyped]) }
|
913
|
-
def analyze_token_efficiency(llm_traces)
  # Classifies average token usage per trace as 'poor' (> 400),
  # 'moderate' (> 200) or 'good', with matching suggestions.
  return { status: 'no_data', suggestions: [] } if llm_traces.empty?

  average = llm_traces.sum(&:token_usage).to_f / llm_traces.size

  status, suggestions =
    if average > 400
      ['poor', ['Consider reducing prompt length', 'Optimize instruction clarity']]
    elsif average > 200
      ['moderate', ['Monitor token usage trends', 'Consider prompt optimization']]
    else
      ['good', ['Token usage appears efficient']]
    end

  { status: status, average_tokens: average, suggestions: suggestions }
end
|
939
|
-
|
940
|
-
# Analyze response quality patterns
|
941
|
-
sig { params(llm_traces: T::Array[ExecutionTrace]).returns(T::Hash[Symbol, T.untyped]) }
|
942
|
-
def analyze_response_quality(llm_traces)
  # Flags responses as 'inconsistent' when the variance of their lengths
  # exceeds 1000; missing responses count as length 0.
  return { consistency: 'no_data', recommendations: [] } if llm_traces.empty?

  lengths = llm_traces.map { |trace| trace.response_text&.length || 0 }
  variance = calculate_variance(lengths)
  inconsistent = variance > 1000

  {
    consistency: inconsistent ? 'inconsistent' : 'consistent',
    variance: variance,
    recommendations:
      if inconsistent
        ['Add response format guidelines', 'Consider structured output templates']
      else
        ['Response quality appears consistent']
      end
  }
end
|
965
|
-
|
966
|
-
# Analyze model consistency
|
967
|
-
sig { params(llm_traces: T::Array[ExecutionTrace]).returns(T::Hash[Symbol, T.untyped]) }
|
968
|
-
def analyze_model_consistency(llm_traces)
  # Reports which distinct (non-nil) model names appear in the traces and
  # recommends a single model when more than one was used.
  distinct_models = llm_traces.map(&:model_name).uniq.compact

  advice =
    if distinct_models.size > 1
      'Consider using single model for consistency'
    else
      'Model usage is consistent'
    end

  { unique_models: distinct_models.size, models_used: distinct_models, recommendation: advice }
end
|
977
|
-
|
978
|
-
# Adjust mutations based on history to avoid repetition
|
979
|
-
sig { params(suggested: T::Array[Symbol], history: T::Array[Symbol], trend: T.nilable(String)).returns(T::Array[Symbol]) }
|
980
|
-
def adjust_mutations_for_history(suggested, history, trend)
  # Filters the suggested mutations against recent history so the same
  # mutation is not repeated too often, and steers away from :expand when
  # the trend is 'declining'. Never returns an empty list.
  recent_counts = history.last(5).tally

  # Drop anything already used twice or more within the last five entries.
  candidates = suggested.select { |mutation| recent_counts.fetch(mutation, 0) < 2 }

  if trend == 'declining'
    candidates -= [:expand]
    # Offer both conservative mutations when neither is present yet.
    candidates += [:simplify, :rephrase] if (candidates & [:simplify, :rephrase]).empty?
  end

  return [:rewrite] if candidates.empty?

  candidates.uniq
end
|
998
|
-
|
999
|
-
# Calculate variance for array of numbers
|
1000
|
-
sig { params(values: T::Array[Integer]).returns(Float) }
|
1001
|
-
def calculate_variance(values)
  # Population variance (divides by the number of values, not n - 1).
  # Returns 0.0 when fewer than two values are given.
  count = values.size
  return 0.0 if count < 2

  mean = values.sum.to_f / count
  squared_deviations = values.map { |value| (value - mean) ** 2 }
  squared_deviations.sum / count
end
|
1008
|
-
|
1009
|
-
# Truncate text to specified length with ellipsis
|
1010
|
-
sig { params(text: String, length: Integer).returns(String) }
|
1011
|
-
def truncate_text(text, length)
  # Returns text unchanged when it fits; otherwise its first `length`
  # characters followed by "...".
  if text.length > length
    "#{text[0...length]}..."
  else
    text
  end
end
|
1015
|
-
end
|
1016
|
-
|
1017
|
-
# GeneticEngine orchestrates the genetic algorithm for prompt evolution
|
1018
|
-
# Manages population, selection, and evolution across generations
|
1019
|
-
class GeneticEngine
  extend T::Sig

  sig { returns(GEPAConfig) }
  attr_reader :config

  sig { returns(FitnessEvaluator) }
  attr_reader :fitness_evaluator

  sig { returns(T::Array[T.untyped]) }
  attr_reader :population

  sig { returns(Integer) }
  attr_reader :generation

  # config            - read for population_size and num_generations
  # fitness_evaluator - scores candidates through evaluate_candidate
  sig { params(config: GEPAConfig, fitness_evaluator: FitnessEvaluator).void }
  def initialize(config:, fitness_evaluator:)
    @config = config
    @fitness_evaluator = fitness_evaluator
    @population = T.let([], T::Array[T.untyped])
    @generation = 0
    @fitness_scores = T.let([], T::Array[FitnessScore])
  end

  # Seed the population with the original program plus instruction variants,
  # padding with mutated (or duplicated) members until population_size is
  # reached. Resets the generation counter and any previously cached scores.
  sig { params(program: T.untyped).void }
  def initialize_population(program)
    @population = [program]
    # Scores cached for an earlier population do not describe these candidates.
    @fitness_scores = []

    seed_instruction = instruction_for(program)
    variants = seed_instruction ? generate_instruction_variants(seed_instruction) : []

    # Clamp at zero so a population_size of 0 does not make take raise.
    open_slots = [@config.population_size - 1, 0].max
    variants.take(open_slots).each do |variant|
      @population << create_program_with_instruction(program, variant)
    end

    # If we still need more candidates, mutate a random member; members whose
    # instruction cannot be read (or that yield no variant) are duplicated.
    while @population.size < @config.population_size
      base_program = @population.sample
      instruction = instruction_for(base_program)
      variant = instruction && generate_instruction_variants(instruction).first
      @population << (variant ? create_program_with_instruction(base_program, variant) : base_program)
    end

    @generation = 0
  end

  # Evaluate all population members on the training set and cache the scores
  # (index-aligned with the population).
  sig { params(trainset: T::Array[T.untyped]).returns(T::Array[FitnessScore]) }
  def evaluate_population(trainset)
    @fitness_scores = @population.map do |candidate|
      @fitness_evaluator.evaluate_candidate(candidate, trainset)
    end

    @fitness_scores
  end

  # Evolve to the next generation: keep the top half (at least one member),
  # best first, and fill the remaining slots with mutations of survivors.
  sig { params(trainset: T::Array[T.untyped]).void }
  def evolve_generation(trainset)
    current_scores = evaluate_population(trainset)

    ranked_indices = (0...@population.size).sort_by { |i| -current_scores[i].overall_score }
    survivors = ranked_indices.take([@config.population_size / 2, 1].max)

    new_population = survivors.map { |i| @population[i] }

    while new_population.size < @config.population_size
      parent = @population[survivors.sample]
      instruction = instruction_for(parent)

      new_population <<
        if instruction
          # Fall back to the unchanged instruction when no variant is produced.
          variant = generate_instruction_variants(instruction).first || instruction
          create_program_with_instruction(parent, variant)
        else
          # No readable instruction: carry the parent over as-is.
          parent
        end
    end

    @population = new_population
    # The cached scores were indexed against the previous population. Clearing
    # them makes get_best_candidate fall back to the first member, which is the
    # top-ranked survivor, until evaluate_population runs again.
    @fitness_scores = []
    @generation += 1
  end

  # Run the complete evolution process and return a summary hash with the
  # best candidate, its fitness, per-generation history, the generation count
  # and a copy of the final population.
  sig { params(program: T.untyped, trainset: T::Array[T.untyped]).returns(T::Hash[Symbol, T.untyped]) }
  def run_evolution(program, trainset)
    initialize_population(program)

    history = []

    # Initial evaluation
    initial_scores = evaluate_population(trainset)
    best_initial = initial_scores.max_by(&:overall_score)
    avg_initial = initial_scores.map(&:overall_score).sum / initial_scores.size
    history << {
      generation: 0,
      best_fitness: best_initial.overall_score,
      avg_fitness: avg_initial,
      diversity: population_diversity
    }

    # Evolution loop: each round re-scores the evolved population so the
    # cached scores match the population that is reported.
    @config.num_generations.times do
      evolve_generation(trainset)
      scores = evaluate_population(trainset)
      best_score = scores.max_by(&:overall_score)
      avg_score = scores.map(&:overall_score).sum / scores.size

      history << {
        generation: @generation,
        best_fitness: best_score.overall_score,
        avg_fitness: avg_score,
        diversity: population_diversity
      }
    end

    best_fitness_score = @fitness_scores.max_by(&:overall_score)
    {
      best_candidate: get_best_candidate,
      best_fitness: best_fitness_score || FitnessScore.new(
        primary_score: 0.0,
        secondary_scores: {},
        overall_score: 0.0,
        metadata: {}
      ),
      generation_history: history,
      generation_count: @generation,
      final_population: @population.dup
    }
  end

  # Best performing candidate according to the cached scores, or the first
  # population member when no scores are cached.
  sig { returns(T.untyped) }
  def get_best_candidate
    return @population.first if @fitness_scores.empty?

    best_index = @fitness_scores.each_with_index.max_by { |score, _| score.overall_score }[1]
    @population[best_index]
  end

  # Ratio of unique instructions to all readable instructions in the current
  # population; 0.0 when the population is empty or none can be read.
  sig { returns(Float) }
  def population_diversity
    return 0.0 if @population.empty?

    # Only programs that expose signature_class.description take part.
    instructions = @population.filter_map do |program|
      if program.respond_to?(:signature_class) && program.signature_class.respond_to?(:description)
        program.signature_class.description
      end
    end

    return 0.0 if instructions.empty?

    instructions.uniq.size.to_f / instructions.size.to_f
  end

  private

  # Instruction text of a program, or nil when the program does not expose
  # signature_class.description or the description is not a non-empty String.
  sig { params(program: T.untyped).returns(T.nilable(String)) }
  def instruction_for(program)
    return nil unless program.respond_to?(:signature_class)

    signature = program.signature_class
    return nil unless signature.respond_to?(:description)

    text = signature.description
    text.is_a?(String) && !text.empty? ? text : nil
  end

  # Generate up to five instruction variants in random order. A variant is
  # skipped when the instruction already contains its keyword.
  sig { params(original_instruction: String).returns(T::Array[String]) }
  def generate_instruction_variants(original_instruction)
    variants = []

    variants << "#{original_instruction} Think step by step." unless original_instruction.include?("step")
    variants << "#{original_instruction} Provide detailed reasoning." unless original_instruction.include?("detail")
    variants << "Be careful and accurate. #{original_instruction}" unless original_instruction.include?("careful")
    variants << "#{original_instruction} Use examples in your response." unless original_instruction.include?("example")
    variants << "Be precise and specific. #{original_instruction}" unless original_instruction.include?("precise")

    variants.shuffle.take(5)
  end

  # Create a program copy that uses new_instruction. Best effort: any error
  # while building the copy returns the original program unchanged.
  sig { params(original_program: T.untyped, new_instruction: String).returns(T.untyped) }
  def create_program_with_instruction(original_program, new_instruction)
    case original_program
    when DSPy::Predict
      original_program.with_instruction(new_instruction)
    when DSPy::Module
      # Custom modules: rebuild the instance with updated internal predictors.
      create_modified_module(original_program, new_instruction)
    else
      # Other types (like test doubles): use whatever they support.
      if original_program.respond_to?(:with_instruction)
        original_program.with_instruction(new_instruction)
      elsif original_program.respond_to?(:signature_class)
        # New DSPy::Predict with the same signature but the new instruction.
        DSPy::Predict.new(original_program.signature_class).with_instruction(new_instruction)
      else
        # Nothing to modify.
        original_program
      end
    end
  rescue StandardError
    original_program
  end

  # Create a modified copy of a custom DSPy::Module: a new instance of the
  # same class (built with no constructor arguments) whose DSPy::Predict
  # instance variables carry the new instruction; all other instance
  # variables are copied by reference. Falls back to the original on error.
  sig { params(original_module: DSPy::Module, new_instruction: String).returns(DSPy::Module) }
  def create_modified_module(original_module, new_instruction)
    new_module = original_module.class.new

    original_module.instance_variables.each do |var_name|
      var_value = original_module.instance_variable_get(var_name)
      replacement = var_value.is_a?(DSPy::Predict) ? var_value.with_instruction(new_instruction) : var_value
      new_module.instance_variable_set(var_name, replacement)
    end

    new_module
  rescue StandardError
    original_module
  end
end
|
1302
|
-
|
1303
|
-
# FitnessScore represents multi-dimensional evaluation results
|
1304
|
-
class FitnessScore < T::Struct
  extend T::Sig
  include Comparable

  const :primary_score, Float
  const :secondary_scores, T::Hash[Symbol, Float]
  const :overall_score, Float
  const :metadata, T::Hash[Symbol, T.untyped]

  # Validates that every score lies in [0.0, 1.0] and stores frozen copies of
  # the hashes, so the caller's own hash objects are left untouched.
  #
  # Raises ArgumentError when any score is out of range (NaN included).
  sig do
    params(
      primary_score: Float,
      secondary_scores: T::Hash[Symbol, Float],
      overall_score: Float,
      metadata: T.nilable(T::Hash[Symbol, T.untyped])
    ).void
  end
  def initialize(primary_score:, secondary_scores:, overall_score:, metadata: nil)
    # between? is false for NaN, so NaN is rejected together with values
    # outside the range.
    [primary_score, overall_score].each do |score|
      unless score.between?(0.0, 1.0)
        raise ArgumentError, "Score must be between 0.0 and 1.0, got #{score}"
      end
    end

    secondary_scores.each do |name, score|
      unless score.between?(0.0, 1.0)
        raise ArgumentError, "Secondary score #{name} must be between 0.0 and 1.0, got #{score}"
      end
    end

    super(
      primary_score: primary_score,
      # dup before freezing: freezing the argument itself would make the
      # caller's hash immutable as a side effect.
      secondary_scores: secondary_scores.dup.freeze,
      overall_score: overall_score,
      metadata: (metadata || {}).dup.freeze
    )
  end

  # Ordering for Comparable: scores are ordered by overall_score only.
  sig { params(other: FitnessScore).returns(T.nilable(Integer)) }
  def <=>(other)
    return nil unless other.is_a?(FitnessScore)
    overall_score <=> other.overall_score
  end

  # Check if this score is dominated by another (for Pareto analysis).
  # A lower overall_score is always dominated and a higher one never is; on a
  # tie, every secondary metric of this score must be <= the other's (metrics
  # missing on the other side count as 0.0).
  # NOTE(review): two identical scores therefore report each other as
  # dominated - confirm callers expect this non-strict definition.
  sig { params(other: FitnessScore).returns(T::Boolean) }
  def dominated_by?(other)
    return false if overall_score > other.overall_score
    return true if overall_score < other.overall_score

    secondary_scores.all? do |metric, score|
      other_score = other.secondary_scores[metric] || 0.0
      score <= other_score
    end
  end

  # Mean of primary_score and the named secondary scores (a missing secondary
  # score counts as 0.0); just primary_score when no objectives are given.
  sig { params(objectives: T::Array[Symbol]).returns(Float) }
  def score_for_objectives(objectives)
    relevant_scores = objectives.map { |obj| secondary_scores[obj] || 0.0 }
    return primary_score if relevant_scores.empty?

    (primary_score + relevant_scores.sum) / (objectives.size + 1)
  end
end
|
1372
|
-
|
1373
|
-
# FitnessEvaluator provides multi-dimensional evaluation of prompt candidates
|
1374
|
-
class FitnessEvaluator
|
1375
|
-
extend T::Sig
|
1376
|
-
|
1377
|
-
sig { returns(T.proc.params(arg0: T.untyped, arg1: T.untyped).returns(T.untyped)) }
|
1378
|
-
attr_reader :primary_metric
|
1379
|
-
|
1380
|
-
sig { returns(GEPAConfig) }
|
1381
|
-
attr_reader :config
|
1382
|
-
|
1383
|
-
sig { returns(T::Hash[Symbol, T.proc.params(arg0: T.untyped, arg1: T.untyped).returns(T.untyped)]) }
|
1384
|
-
attr_reader :secondary_metrics
|
1385
|
-
|
1386
|
-
sig do
|
1387
|
-
params(
|
1388
|
-
primary_metric: T.proc.params(arg0: T.untyped, arg1: T.untyped).returns(T.untyped),
|
1389
|
-
config: GEPAConfig,
|
1390
|
-
secondary_metrics: T.nilable(T::Hash[Symbol, T.proc.params(arg0: T.untyped, arg1: T.untyped).returns(T.untyped)])
|
1391
|
-
).void
|
1392
|
-
end
|
1393
|
-
def initialize(primary_metric:, config:, secondary_metrics: nil)
  # Stores the metric callable and config, and creates a fresh TraceCollector.
  @config = config
  @primary_metric = primary_metric
  # Use the built-in secondary metrics when the caller supplies none.
  @secondary_metrics = secondary_metrics || default_secondary_metrics
  @trace_collector = TraceCollector.new
end
|
1399
|
-
|
1400
|
-
# Evaluate a single candidate program
|
1401
|
-
sig { params(program: T.untyped, trainset: T::Array[T.untyped]).returns(FitnessScore) }
|
1402
|
-
def evaluate_candidate(program, trainset)
  # Runs program on every example of trainset, scores each prediction with
  # the primary metric and returns a FitnessScore combining the mean primary
  # score with token-efficiency, consistency and latency scores.
  #
  # An example whose prediction or metric raises scores 0.0 and is counted in
  # metadata[:errors_count]. Boolean metric results are mapped to 1.0 / 0.0.
  # An empty trainset yields a primary score of 0.0.
  #
  # NOTE(review): this method calls the calculate_* helpers directly and never
  # reads @secondary_metrics - confirm whether custom metrics should apply.
  started_at = Process.clock_gettime(Process::CLOCK_MONOTONIC)
  predictions = []

  primary_scores = trainset.map do |example|
    # Exactly one record per example, filled in as far as execution gets, so
    # a failing metric does not add a second record for the same example.
    record = { prediction: nil, latency: 0.0, example: example }
    predictions << record

    begin
      call_started = Process.clock_gettime(Process::CLOCK_MONOTONIC)
      record[:prediction] = program.call(**example.input_values)
      record[:latency] = Process.clock_gettime(Process::CLOCK_MONOTONIC) - call_started

      metric_value = @primary_metric.call(example, record[:prediction])
      case metric_value
      when true then 1.0
      when false, nil then 0.0
      else metric_value.to_f
      end
    rescue => e
      record[:error] = e.message
      0.0
    end
  end

  primary_score = primary_scores.empty? ? 0.0 : primary_scores.sum / primary_scores.size

  secondary_scores = {}

  # Token efficiency (mock data for now - random usage between 50 and 149
  # tokens per prediction, so this component is not deterministic).
  mock_trace = Struct.new(:token_usage)
  mock_traces = predictions.map { mock_trace.new(50 + rand(100)) }
  secondary_scores[:token_efficiency] = calculate_token_efficiency(mock_traces, predictions.size)

  # Response consistency - compares the first declared prop of each
  # prediction; predictions without props contribute an empty string.
  response_texts = predictions.map do |record|
    pred = record[:prediction]
    next '' unless pred && pred.class.respond_to?(:props)

    first_field = pred.class.props.keys.first
    first_field ? pred.send(first_field).to_s : ''
  end
  secondary_scores[:consistency] = calculate_consistency(response_texts)

  # Latency performance
  latencies = predictions.map { |record| record[:latency] }
  secondary_scores[:latency] = calculate_latency_score(latencies)

  overall_score = calculate_overall_score(primary_score, secondary_scores)

  FitnessScore.new(
    primary_score: primary_score,
    secondary_scores: secondary_scores,
    overall_score: overall_score,
    metadata: {
      evaluation_time: Process.clock_gettime(Process::CLOCK_MONOTONIC) - started_at,
      examples_count: trainset.size,
      errors_count: predictions.count { |record| record[:error] }
    }
  )
end
|
1473
|
-
|
1474
|
-
# Evaluate multiple candidates in batch
|
1475
|
-
sig { params(programs: T::Array[T.untyped], trainset: T::Array[T.untyped]).returns(T::Array[FitnessScore]) }
|
1476
|
-
def batch_evaluate(programs, trainset)
  # Scores each program on the same trainset, preserving input order.
  programs.each_with_object([]) do |candidate, scores|
    scores << evaluate_candidate(candidate, trainset)
  end
end
|
1479
|
-
|
1480
|
-
# Compare two fitness scores (positive if first is better)
|
1481
|
-
sig { params(score1: FitnessScore, score2: FitnessScore).returns(Float) }
|
1482
|
-
def compare_candidates(score1, score2)
  # Signed difference of the overall scores; positive when score1 is better.
  first_overall = score1.overall_score
  second_overall = score2.overall_score
  first_overall - second_overall
end
|
1485
|
-
|
1486
|
-
# Rank candidates by fitness (returns indices sorted by fitness, best first)
|
1487
|
-
sig { params(scores: T::Array[FitnessScore]).returns(T::Array[Integer]) }
|
1488
|
-
def rank_candidates(scores)
  # Indices into scores, ordered from highest to lowest overall_score.
  scores.each_index.sort_by { |index| -scores[index].overall_score }
end
|
1491
|
-
|
1492
|
-
private
|
1493
|
-
|
1494
|
-
# Default secondary metrics for fitness evaluation
|
1495
|
-
sig { returns(T::Hash[Symbol, T.proc.params(arg0: T.untyped, arg1: T.untyped).returns(T.untyped)]) }
|
1496
|
-
def default_secondary_metrics
  # Built-in secondary metrics, keyed by name; each proc forwards to the
  # matching calculate_* helper.
  metrics = {}
  metrics[:token_efficiency] = proc { |traces, count| calculate_token_efficiency(traces, count) }
  metrics[:consistency] = proc { |responses| calculate_consistency(responses) }
  metrics[:latency] = proc { |latencies| calculate_latency_score(latencies) }
  metrics
end
|
1503
|
-
|
1504
|
-
# Calculate token usage efficiency (lower usage = higher score)
|
1505
|
-
sig { params(traces: T::Array[T.untyped], example_count: Integer).returns(Float) }
|
1506
|
-
def calculate_token_efficiency(traces, example_count)
  # Maps average token usage per example to a score that falls as usage
  # rises: 100 tokens per example scores 0.5. Returns 1.0 when there are no
  # traces or no examples.
  return 1.0 if traces.empty? || example_count == 0

  baseline = 100.0
  tokens_per_example = traces.sum(&:token_usage).to_f / example_count
  ratio = baseline / (baseline + tokens_per_example)

  # Never report more than a perfect score.
  ratio > 1.0 ? 1.0 : ratio
end
|
1519
|
-
|
1520
|
-
# Calculate consistency of responses (similar structure = higher score)
|
1521
|
-
sig { params(responses: T::Array[String]).returns(Float) }
|
1522
|
-
def calculate_consistency(responses)
|
1523
|
-
return 1.0 if responses.empty? || responses.size == 1
|
1524
|
-
|
1525
|
-
# Simple consistency measure: average word overlap between responses
|
1526
|
-
word_sets = responses.map { |response| response.downcase.split.to_set }
|
1527
|
-
|
1528
|
-
total_similarity = 0.0
|
1529
|
-
comparisons = 0
|
1530
|
-
|
1531
|
-
word_sets.each_with_index do |set1, i|
|
1532
|
-
word_sets[(i+1)..-1].each do |set2|
|
1533
|
-
intersection = set1 & set2
|
1534
|
-
union = set1 | set2
|
1535
|
-
|
1536
|
-
similarity = union.empty? ? 0.0 : intersection.size.to_f / union.size
|
1537
|
-
total_similarity += similarity
|
1538
|
-
comparisons += 1
|
1539
|
-
end
|
1540
|
-
end
|
1541
|
-
|
1542
|
-
comparisons == 0 ? 1.0 : total_similarity / comparisons
|
1543
|
-
end
|
1544
|
-
|
1545
|
-
# Calculate latency performance score (faster = higher score)
|
1546
|
-
sig { params(latencies: T::Array[Float]).returns(Float) }
|
1547
|
-
def calculate_latency_score(latencies)
|
1548
|
-
return 1.0 if latencies.empty?
|
1549
|
-
|
1550
|
-
avg_latency = latencies.sum / latencies.size
|
1551
|
-
|
1552
|
-
# Penalize high latencies (assume 2 seconds is baseline for 0.5 score)
|
1553
|
-
baseline_latency = 2.0
|
1554
|
-
latency_score = baseline_latency / (baseline_latency + avg_latency)
|
1555
|
-
|
1556
|
-
[latency_score, 1.0].min
|
1557
|
-
end
|
1558
|
-
|
1559
|
-
# Calculate weighted overall score combining primary and secondary metrics
|
1560
|
-
sig { params(primary_score: Float, secondary_scores: T::Hash[Symbol, Float]).returns(Float) }
|
1561
|
-
def calculate_overall_score(primary_score, secondary_scores)
|
1562
|
-
# Weight primary metric at 70%, secondary metrics at 30%
|
1563
|
-
primary_weight = 0.7
|
1564
|
-
secondary_weight = 0.3
|
1565
|
-
|
1566
|
-
return primary_score if secondary_scores.empty?
|
1567
|
-
|
1568
|
-
avg_secondary = secondary_scores.values.sum / secondary_scores.size
|
1569
|
-
overall = (primary_score * primary_weight) + (avg_secondary * secondary_weight)
|
1570
|
-
|
1571
|
-
[overall, 1.0].min
|
1572
|
-
end
|
1573
|
-
end
|
1574
|
-
|
1575
|
-
# InstructionProposer: Analyzes execution traces and generates improved instructions using LLM reflection
|
1576
|
-
class InstructionProposer
|
1577
|
-
extend T::Sig
|
1578
|
-
|
1579
|
-
sig { params(config: GEPAConfig).void }
|
1580
|
-
def initialize(config:)
|
1581
|
-
@config = config
|
1582
|
-
end
|
1583
|
-
|
1584
|
-
# Generate improved instruction based on execution traces and failures
|
1585
|
-
sig { params(original_instruction: String, execution_traces: T::Array[ExecutionTrace], failed_examples: T::Array[T.untyped]).returns(String) }
|
1586
|
-
def propose_instruction(original_instruction:, execution_traces:, failed_examples:)
|
1587
|
-
if execution_traces.empty? && failed_examples.empty?
|
1588
|
-
# No traces or failures to analyze, return original
|
1589
|
-
return original_instruction
|
1590
|
-
end
|
1591
|
-
|
1592
|
-
# Use LLM-based reflection to generate improved instruction
|
1593
|
-
reflect_and_propose(
|
1594
|
-
original_instruction: original_instruction,
|
1595
|
-
execution_traces: execution_traces,
|
1596
|
-
failed_examples: failed_examples
|
1597
|
-
)
|
1598
|
-
rescue => e
|
1599
|
-
# Fallback to original instruction on error
|
1600
|
-
original_instruction
|
1601
|
-
end
|
1602
|
-
|
1603
|
-
private
|
1604
|
-
|
1605
|
-
sig { returns(GEPAConfig) }
|
1606
|
-
attr_reader :config
|
1607
|
-
|
1608
|
-
# Use LLM reflection to propose improved instruction
|
1609
|
-
sig { params(original_instruction: String, execution_traces: T::Array[ExecutionTrace], failed_examples: T::Array[T.untyped]).returns(String) }
|
1610
|
-
def reflect_and_propose(original_instruction:, execution_traces:, failed_examples:)
|
1611
|
-
# Create signature for instruction improvement
|
1612
|
-
improvement_signature = create_instruction_improvement_signature
|
1613
|
-
|
1614
|
-
# Create predictor for instruction proposal
|
1615
|
-
proposer = DSPy::Predict.new(improvement_signature)
|
1616
|
-
|
1617
|
-
# Analyze traces and failures
|
1618
|
-
trace_analysis = analyze_execution_traces(execution_traces)
|
1619
|
-
failure_analysis = analyze_failed_examples(failed_examples)
|
1620
|
-
|
1621
|
-
# Generate improved instruction
|
1622
|
-
result = proposer.call(
|
1623
|
-
original_instruction: original_instruction,
|
1624
|
-
trace_analysis: trace_analysis,
|
1625
|
-
failure_analysis: failure_analysis,
|
1626
|
-
improvement_context: "GEPA prompt optimization for better performance"
|
1627
|
-
)
|
1628
|
-
|
1629
|
-
result.improved_instruction || original_instruction
|
1630
|
-
rescue => e
|
1631
|
-
# Return original instruction if LLM call fails
|
1632
|
-
original_instruction
|
1633
|
-
end
|
1634
|
-
|
1635
|
-
# Create signature for instruction improvement
|
1636
|
-
sig { returns(T.class_of(DSPy::Signature)) }
|
1637
|
-
def create_instruction_improvement_signature
|
1638
|
-
Class.new(DSPy::Signature) do
|
1639
|
-
description "Analyze execution traces and propose improved instructions for better AI system performance"
|
1640
|
-
|
1641
|
-
input do
|
1642
|
-
const :original_instruction, String, description: "The current instruction/prompt being used"
|
1643
|
-
const :trace_analysis, String, description: "Analysis of execution traces showing patterns and issues"
|
1644
|
-
const :failure_analysis, String, description: "Analysis of failed examples and their patterns"
|
1645
|
-
const :improvement_context, String, description: "Context about what kind of improvement is needed"
|
1646
|
-
end
|
1647
|
-
|
1648
|
-
output do
|
1649
|
-
const :improved_instruction, String, description: "Improved instruction that addresses identified issues"
|
1650
|
-
const :reasoning, String, description: "Explanation of why this improvement should work better"
|
1651
|
-
const :confidence, Float, description: "Confidence in the improvement (0.0-1.0)"
|
1652
|
-
end
|
1653
|
-
end
|
1654
|
-
end
|
1655
|
-
|
1656
|
-
# Analyze execution traces to identify patterns
|
1657
|
-
sig { params(traces: T::Array[ExecutionTrace]).returns(String) }
|
1658
|
-
def analyze_execution_traces(traces)
|
1659
|
-
return "No execution traces available" if traces.empty?
|
1660
|
-
|
1661
|
-
llm_traces = traces.select(&:llm_trace?)
|
1662
|
-
module_traces = traces.select(&:module_trace?)
|
1663
|
-
|
1664
|
-
analysis = []
|
1665
|
-
analysis << "Execution Trace Analysis:"
|
1666
|
-
analysis << "- Total traces: #{traces.size}"
|
1667
|
-
analysis << "- LLM interactions: #{llm_traces.size}"
|
1668
|
-
analysis << "- Module calls: #{module_traces.size}"
|
1669
|
-
|
1670
|
-
if llm_traces.any?
|
1671
|
-
token_usage = llm_traces.sum(&:token_usage)
|
1672
|
-
avg_response_length = llm_traces.map { |t| t.attributes['response']&.to_s&.length || 0 }.sum / llm_traces.size
|
1673
|
-
|
1674
|
-
analysis << "- Total tokens used: #{token_usage}"
|
1675
|
-
analysis << "- Average response length: #{avg_response_length} characters"
|
1676
|
-
|
1677
|
-
# Identify models used
|
1678
|
-
models = llm_traces.map { |t| t.attributes['gen_ai.request.model'] }.compact.uniq
|
1679
|
-
analysis << "- Models used: #{models.join(', ')}" if models.any?
|
1680
|
-
end
|
1681
|
-
|
1682
|
-
# Analyze timing patterns
|
1683
|
-
if traces.size > 1
|
1684
|
-
timespan = traces.max_by(&:timestamp).timestamp - traces.min_by(&:timestamp).timestamp
|
1685
|
-
analysis << "- Execution timespan: #{timespan.round(2)} seconds"
|
1686
|
-
end
|
1687
|
-
|
1688
|
-
analysis.join("\n")
|
1689
|
-
end
|
1690
|
-
|
1691
|
-
# Analyze failed examples to identify failure patterns
|
1692
|
-
sig { params(failed_examples: T::Array[T.untyped]).returns(String) }
|
1693
|
-
def analyze_failed_examples(failed_examples)
|
1694
|
-
return "No failed examples to analyze" if failed_examples.empty?
|
1695
|
-
|
1696
|
-
analysis = []
|
1697
|
-
analysis << "Failure Pattern Analysis:"
|
1698
|
-
analysis << "- Failed examples count: #{failed_examples.size}"
|
1699
|
-
|
1700
|
-
# Group failures by type if possible
|
1701
|
-
if failed_examples.first.respond_to?(:input)
|
1702
|
-
input_patterns = failed_examples.map { |ex| ex.input.keys }.flatten.uniq
|
1703
|
-
analysis << "- Input fields involved: #{input_patterns.join(', ')}"
|
1704
|
-
end
|
1705
|
-
|
1706
|
-
# Sample some failure cases for context
|
1707
|
-
sample_size = [failed_examples.size, 3].min
|
1708
|
-
analysis << "- Sample failures:"
|
1709
|
-
failed_examples.take(sample_size).each_with_index do |example, idx|
|
1710
|
-
if example.respond_to?(:input) && example.respond_to?(:expected_values)
|
1711
|
-
input_summary = example.input.values.first.to_s[0..50] + "..."
|
1712
|
-
expected = example.expected_values.values.first.to_s[0..30] + "..."
|
1713
|
-
analysis << " #{idx + 1}. Input: #{input_summary} | Expected: #{expected}"
|
1714
|
-
end
|
1715
|
-
end
|
1716
|
-
|
1717
|
-
analysis.join("\n")
|
1718
|
-
end
|
1719
|
-
end
|
1720
|
-
|
1721
|
-
# MutationEngine: Handles LLM-based prompt transformations for genetic evolution
|
1722
|
-
class MutationEngine
|
1723
|
-
extend T::Sig
|
1724
|
-
|
1725
|
-
sig { returns(GEPAConfig) }
|
1726
|
-
attr_reader :config
|
1727
|
-
|
1728
|
-
sig { returns(InstructionProposer) }
|
1729
|
-
attr_reader :instruction_proposer
|
1730
|
-
|
1731
|
-
sig { params(config: GEPAConfig).void }
|
1732
|
-
def initialize(config:)
|
1733
|
-
@config = config
|
1734
|
-
@instruction_proposer = InstructionProposer.new(config: config)
|
1735
|
-
end
|
1736
|
-
|
1737
|
-
# Mutate a single program with LLM-based instruction proposal
|
1738
|
-
sig { params(program: T.untyped, execution_traces: T::Array[ExecutionTrace], failed_examples: T::Array[T.untyped]).returns(T.untyped) }
|
1739
|
-
def mutate_program(program, execution_traces: [], failed_examples: [])
|
1740
|
-
return program if rand > @config.mutation_rate
|
27
|
+
def self.default_config
|
28
|
+
@default_config ||= DEFAULT_CONFIG.dup
|
29
|
+
end
|
1741
30
|
|
1742
|
-
|
1743
|
-
|
31
|
+
class NullExperimentTracker
|
32
|
+
extend T::Sig
|
33
|
+
attr_reader :events
|
1744
34
|
|
1745
|
-
|
1746
|
-
|
1747
|
-
|
1748
|
-
execution_traces: execution_traces,
|
1749
|
-
failed_examples: failed_examples
|
1750
|
-
)
|
35
|
+
def initialize
|
36
|
+
@events = []
|
37
|
+
end
|
1751
38
|
|
1752
|
-
|
1753
|
-
|
1754
|
-
|
1755
|
-
error: e.message,
|
1756
|
-
program_type: program.class.name
|
1757
|
-
})
|
1758
|
-
# Return original program on mutation failure
|
1759
|
-
program
|
1760
|
-
end
|
39
|
+
sig { params(metrics: T::Hash[Symbol, T.untyped], step: T.nilable(Integer)).void }
|
40
|
+
def log_metrics(metrics, step: nil)
|
41
|
+
@events << { metrics: metrics, step: step }
|
1761
42
|
end
|
43
|
+
end
|
1762
44
|
|
1763
|
-
|
1764
|
-
|
1765
|
-
|
1766
|
-
return [] if programs.empty?
|
45
|
+
class NullLogger
|
46
|
+
extend T::Sig
|
47
|
+
attr_reader :messages
|
1767
48
|
|
1768
|
-
|
49
|
+
def initialize
|
50
|
+
@messages = []
|
1769
51
|
end
|
1770
52
|
|
1771
|
-
|
1772
|
-
|
1773
|
-
|
1774
|
-
|
1775
|
-
# In full implementation, this would emit events for monitoring
|
53
|
+
sig { params(message: String).void }
|
54
|
+
def log(message)
|
55
|
+
@messages << message
|
56
|
+
DSPy.log('gepa.log', message: message)
|
1776
57
|
end
|
58
|
+
end
|
1777
59
|
|
1778
|
-
|
1779
|
-
|
1780
|
-
# Extract instruction text from program
|
1781
|
-
sig { params(program: T.untyped).returns(String) }
|
1782
|
-
def extract_instruction(program)
|
1783
|
-
if program.signature_class&.description
|
1784
|
-
program.signature_class.description
|
1785
|
-
else
|
1786
|
-
"Analyze the input and complete the task accurately"
|
1787
|
-
end
|
1788
|
-
end
|
60
|
+
class PredictAdapter
|
61
|
+
extend T::Sig
|
1789
62
|
|
1790
|
-
|
1791
|
-
|
1792
|
-
def apply_mutation(instruction, mutation_type)
|
1793
|
-
case mutation_type
|
1794
|
-
when MutationType::Rewrite
|
1795
|
-
apply_rewrite_mutation(instruction)
|
1796
|
-
when MutationType::Expand
|
1797
|
-
apply_expand_mutation(instruction)
|
1798
|
-
when MutationType::Simplify
|
1799
|
-
apply_simplify_mutation(instruction)
|
1800
|
-
when MutationType::Combine
|
1801
|
-
apply_combine_mutation(instruction)
|
1802
|
-
when MutationType::Rephrase
|
1803
|
-
apply_rephrase_mutation(instruction)
|
1804
|
-
else
|
1805
|
-
instruction
|
1806
|
-
end
|
63
|
+
ReflectionLMType = T.type_alias do
|
64
|
+
T.any(DSPy::ReflectionLM, T.proc.params(arg0: String).returns(String))
|
1807
65
|
end
|
1808
66
|
|
1809
|
-
|
1810
|
-
|
1811
|
-
|
1812
|
-
|
1813
|
-
|
1814
|
-
|
1815
|
-
|
1816
|
-
|
1817
|
-
]
|
1818
|
-
|
1819
|
-
patterns.sample.call(instruction)
|
67
|
+
FeedbackFnType = T.type_alias do
|
68
|
+
T.proc.params(
|
69
|
+
predictor_output: T.untyped,
|
70
|
+
predictor_inputs: T::Hash[T.any(String, Symbol), T.untyped],
|
71
|
+
module_inputs: DSPy::Example,
|
72
|
+
module_outputs: T.untyped,
|
73
|
+
captured_trace: T::Array[T::Hash[Symbol, T.untyped]]
|
74
|
+
).returns(T.untyped)
|
1820
75
|
end
|
1821
76
|
|
1822
|
-
|
1823
|
-
|
1824
|
-
|
1825
|
-
|
1826
|
-
|
1827
|
-
|
1828
|
-
|
1829
|
-
"Explain your thought process."
|
1830
|
-
]
|
1831
|
-
|
1832
|
-
"#{instruction} #{expansions.sample}"
|
77
|
+
sig do
|
78
|
+
params(
|
79
|
+
student: DSPy::Module,
|
80
|
+
metric: T.proc.params(arg0: DSPy::Example, arg1: T.untyped).returns(T.untyped),
|
81
|
+
reflection_lm: T.nilable(ReflectionLMType),
|
82
|
+
feedback_map: T::Hash[String, FeedbackFnType]
|
83
|
+
).void
|
1833
84
|
end
|
85
|
+
def initialize(student, metric, reflection_lm: nil, feedback_map: {})
|
86
|
+
@student = student
|
87
|
+
@metric = metric
|
88
|
+
@reflection_lm = reflection_lm
|
89
|
+
@feedback_map = feedback_map.transform_keys(&:to_s)
|
1834
90
|
|
1835
|
-
|
1836
|
-
|
1837
|
-
def apply_simplify_mutation(instruction)
|
1838
|
-
# Remove common complexity words
|
1839
|
-
simplified = instruction.gsub(/\b(carefully|detailed|comprehensive|thorough)\b/i, '')
|
1840
|
-
.gsub(/\s+/, ' ')
|
1841
|
-
.strip
|
1842
|
-
|
1843
|
-
simplified.empty? ? instruction : simplified
|
91
|
+
@predictor_entries = resolve_predictors(@student)
|
92
|
+
@predictor_names = @predictor_entries.map(&:first)
|
1844
93
|
end
|
1845
94
|
|
1846
|
-
|
1847
|
-
|
1848
|
-
|
1849
|
-
|
1850
|
-
|
1851
|
-
"Use logical reasoning.",
|
1852
|
-
"Apply domain knowledge.",
|
1853
|
-
"Consider edge cases."
|
1854
|
-
]
|
1855
|
-
|
1856
|
-
"#{instruction} #{strategies.sample}"
|
95
|
+
sig { returns(T::Hash[String, String]) }
|
96
|
+
def seed_candidate
|
97
|
+
@predictor_entries.each_with_object({}) do |(name, predictor), memo|
|
98
|
+
memo[name] = extract_instruction(predictor)
|
99
|
+
end
|
1857
100
|
end
|
1858
101
|
|
1859
|
-
|
1860
|
-
|
1861
|
-
|
1862
|
-
|
1863
|
-
|
1864
|
-
|
1865
|
-
|
1866
|
-
|
1867
|
-
|
1868
|
-
|
1869
|
-
|
1870
|
-
|
1871
|
-
|
1872
|
-
|
1873
|
-
|
102
|
+
sig do
|
103
|
+
params(candidate: T::Hash[String, String], recorder: T.nilable(T.untyped)).returns(DSPy::Module)
|
104
|
+
end
|
105
|
+
def build_program(candidate, recorder: nil)
|
106
|
+
program = clone_module(@student)
|
107
|
+
duplicate_predictors!(program)
|
108
|
+
|
109
|
+
predictor_map = resolve_predictors(program).to_h
|
110
|
+
candidate.each do |name, new_instruction|
|
111
|
+
predictor = predictor_map[name]
|
112
|
+
next unless predictor
|
113
|
+
|
114
|
+
updated = apply_instruction_to_predictor(predictor, new_instruction)
|
115
|
+
if predictor.equal?(program)
|
116
|
+
program = updated
|
117
|
+
elsif !updated.equal?(predictor)
|
118
|
+
replace_reference(program, predictor, updated)
|
119
|
+
end
|
120
|
+
predictor_map[name] = updated
|
1874
121
|
end
|
1875
122
|
|
1876
|
-
|
123
|
+
wrap_predictors_for_tracing!(program, recorder: recorder) if recorder
|
124
|
+
program
|
1877
125
|
end
|
1878
126
|
|
1879
|
-
|
1880
|
-
|
1881
|
-
|
1882
|
-
|
1883
|
-
|
1884
|
-
|
1885
|
-
|
1886
|
-
|
1887
|
-
|
1888
|
-
|
1889
|
-
|
127
|
+
sig do
|
128
|
+
params(
|
129
|
+
batch: T::Array[DSPy::Example],
|
130
|
+
candidate: T::Hash[String, String],
|
131
|
+
capture_traces: T::Boolean
|
132
|
+
).returns(::GEPA::Core::EvaluationBatch)
|
133
|
+
end
|
134
|
+
def evaluate(batch, candidate, capture_traces: false)
|
135
|
+
recorder = capture_traces ? TraceRecorder.new : nil
|
136
|
+
program = build_program(candidate, recorder: recorder)
|
137
|
+
|
138
|
+
if capture_traces
|
139
|
+
trajectories = batch.map do |example|
|
140
|
+
recorder&.start_example
|
141
|
+
prediction = program.call(**example.input_values)
|
142
|
+
result = @metric.call(example, prediction)
|
143
|
+
score, feedback = extract_score_and_feedback(result)
|
144
|
+
trace_entries = recorder ? recorder.finish_example : []
|
145
|
+
|
146
|
+
{
|
147
|
+
example: example,
|
148
|
+
prediction: prediction,
|
149
|
+
score: score,
|
150
|
+
feedback: feedback,
|
151
|
+
trace: trace_entries
|
152
|
+
}
|
153
|
+
end
|
154
|
+
|
155
|
+
scores = trajectories.map { |row| row[:score] }
|
156
|
+
outputs = trajectories.map { |row| row[:prediction] }
|
157
|
+
::GEPA::Core::EvaluationBatch.new(outputs: outputs, scores: scores, trajectories: trajectories)
|
1890
158
|
else
|
1891
|
-
|
1892
|
-
|
1893
|
-
|
1894
|
-
|
1895
|
-
|
1896
|
-
|
1897
|
-
DSPy::Predict.new(signature_class).with_instruction(new_instruction)
|
1898
|
-
else
|
1899
|
-
# Fallback: return original if we can't mutate
|
1900
|
-
emit_event('mutation_fallback', {
|
1901
|
-
program_type: original_program.class.name,
|
1902
|
-
reason: 'No mutation method available'
|
1903
|
-
})
|
1904
|
-
original_program
|
159
|
+
evaluator = DSPy::Evaluate.new(program, metric: nil, num_threads: nil, max_errors: batch.length * 100, provide_traceback: false)
|
160
|
+
results = batch.map do |example|
|
161
|
+
prediction = program.call(**example.input_values)
|
162
|
+
result = @metric.call(example, prediction)
|
163
|
+
score, = extract_score_and_feedback(result)
|
164
|
+
[prediction, score]
|
1905
165
|
end
|
166
|
+
outputs = results.map(&:first)
|
167
|
+
scores = results.map(&:last)
|
168
|
+
::GEPA::Core::EvaluationBatch.new(outputs: outputs, scores: scores, trajectories: nil)
|
1906
169
|
end
|
1907
|
-
rescue => e
|
1908
|
-
emit_event('mutation_error', {
|
1909
|
-
error: e.message,
|
1910
|
-
program_type: original_program.class.name,
|
1911
|
-
backtrace: e.backtrace&.first(3)
|
1912
|
-
})
|
1913
|
-
# Return original program on error
|
1914
|
-
original_program
|
1915
170
|
end
|
1916
171
|
|
1917
|
-
|
1918
|
-
|
1919
|
-
|
1920
|
-
|
1921
|
-
|
1922
|
-
|
1923
|
-
|
1924
|
-
|
1925
|
-
|
1926
|
-
|
1927
|
-
|
1928
|
-
|
1929
|
-
|
1930
|
-
|
1931
|
-
|
1932
|
-
|
1933
|
-
|
1934
|
-
|
1935
|
-
|
1936
|
-
|
1937
|
-
|
172
|
+
sig do
|
173
|
+
params(
|
174
|
+
candidate: T::Hash[String, String],
|
175
|
+
eval_batch: ::GEPA::Core::EvaluationBatch,
|
176
|
+
components_to_update: T::Array[String]
|
177
|
+
).returns(T::Hash[String, T::Array[T::Hash[String, T.untyped]]])
|
178
|
+
end
|
179
|
+
def make_reflective_dataset(candidate, eval_batch, components_to_update)
|
180
|
+
return {} unless eval_batch.trajectories
|
181
|
+
|
182
|
+
components_to_update.each_with_object({}) do |component, memo|
|
183
|
+
rows = eval_batch.trajectories.flat_map do |trajectory|
|
184
|
+
example = trajectory[:example]
|
185
|
+
expected = serialize_struct(example.expected)
|
186
|
+
actual_program_output = serialize_prediction(trajectory[:prediction])
|
187
|
+
diff = build_diff(expected, actual_program_output)
|
188
|
+
default_feedback = trajectory[:feedback] || "Score: #{trajectory[:score]}"
|
189
|
+
default_score = trajectory[:score]
|
190
|
+
full_trace = Array(trajectory[:trace])
|
191
|
+
|
192
|
+
full_trace.filter_map do |entry|
|
193
|
+
next unless entry[:predictor_name] == component
|
194
|
+
|
195
|
+
raw_inputs = entry[:inputs] || {}
|
196
|
+
raw_output = entry[:output]
|
197
|
+
inputs = serialize_struct(raw_inputs)
|
198
|
+
outputs = serialize_prediction(raw_output)
|
199
|
+
|
200
|
+
feedback_text = default_feedback
|
201
|
+
score_value = default_score
|
202
|
+
score_overridden = false
|
203
|
+
|
204
|
+
if (feedback_fn = @feedback_map[component])
|
205
|
+
feedback_result = feedback_fn.call(
|
206
|
+
predictor_output: raw_output,
|
207
|
+
predictor_inputs: raw_inputs,
|
208
|
+
module_inputs: example,
|
209
|
+
module_outputs: trajectory[:prediction],
|
210
|
+
captured_trace: full_trace
|
211
|
+
)
|
212
|
+
override_score, override_feedback = extract_score_and_feedback(feedback_result)
|
213
|
+
feedback_text = override_feedback if override_feedback
|
214
|
+
unless override_score.nil?
|
215
|
+
score_value = override_score
|
216
|
+
score_overridden = true
|
217
|
+
end
|
218
|
+
end
|
219
|
+
|
220
|
+
row = {
|
221
|
+
'Inputs' => inputs,
|
222
|
+
'Expected' => expected,
|
223
|
+
'Generated Outputs' => outputs,
|
224
|
+
'Diff' => diff,
|
225
|
+
'Feedback' => feedback_text
|
226
|
+
}
|
227
|
+
row['Score'] = score_value if score_overridden
|
228
|
+
row
|
1938
229
|
end
|
1939
230
|
end
|
1940
|
-
|
1941
|
-
new_module
|
1942
|
-
rescue => e
|
1943
|
-
emit_event('module_mutation_error', {
|
1944
|
-
error: e.message,
|
1945
|
-
module_class: original_module.class.name
|
1946
|
-
})
|
1947
|
-
# Fallback to original module
|
1948
|
-
original_module
|
231
|
+
memo[component] = rows unless rows.empty?
|
1949
232
|
end
|
1950
233
|
end
|
1951
234
|
|
1952
|
-
|
1953
|
-
|
1954
|
-
|
1955
|
-
|
1956
|
-
|
1957
|
-
|
1958
|
-
|
1959
|
-
|
1960
|
-
|
1961
|
-
|
235
|
+
sig do
|
236
|
+
params(
|
237
|
+
candidate: T::Hash[String, String],
|
238
|
+
reflective_dataset: T::Hash[String, T::Array[T::Hash[String, T.untyped]]],
|
239
|
+
components_to_update: T::Array[String]
|
240
|
+
).returns(T::Hash[String, String])
|
241
|
+
end
|
242
|
+
def propose_new_texts(candidate, reflective_dataset, components_to_update)
|
243
|
+
if @reflection_lm
|
244
|
+
components_to_update.to_h do |name|
|
245
|
+
response = ::GEPA::Strategies::InstructionProposalSignature.run(
|
246
|
+
@reflection_lm,
|
247
|
+
{
|
248
|
+
'current_instruction_doc' => candidate[name],
|
249
|
+
'dataset_with_feedback' => reflective_dataset.fetch(name, [])
|
250
|
+
}
|
251
|
+
)
|
252
|
+
[name, response.fetch('new_instruction')]
|
253
|
+
end
|
1962
254
|
else
|
1963
|
-
|
1964
|
-
|
255
|
+
components_to_update.to_h do |name|
|
256
|
+
[name, "#{candidate[name]} improved"]
|
257
|
+
end
|
1965
258
|
end
|
1966
259
|
end
|
1967
260
|
|
1968
|
-
|
1969
|
-
sig { params(mutations: T::Array[MutationType]).returns(Float) }
|
1970
|
-
def mutation_diversity(mutations)
|
1971
|
-
return 0.0 if mutations.empty?
|
1972
|
-
|
1973
|
-
unique_types = mutations.uniq.size
|
1974
|
-
total_types = @config.mutation_types.size
|
1975
|
-
|
1976
|
-
unique_types.to_f / total_types
|
1977
|
-
end
|
1978
|
-
end
|
1979
|
-
|
1980
|
-
# CrossoverEngine: Handles genetic recombination of prompts for diversity
|
1981
|
-
class CrossoverEngine
|
1982
|
-
extend T::Sig
|
261
|
+
private
|
1983
262
|
|
1984
|
-
|
1985
|
-
|
1986
|
-
|
1987
|
-
|
263
|
+
sig { params(program: DSPy::Module).returns(T::Array[[String, DSPy::Module]]) }
|
264
|
+
def resolve_predictors(program)
|
265
|
+
pairs = program.named_predictors
|
266
|
+
pairs = [['self', program]] if pairs.empty?
|
267
|
+
pairs
|
1988
268
|
end
|
1989
269
|
|
1990
|
-
sig { returns(
|
1991
|
-
|
1992
|
-
|
1993
|
-
sig { params(config: GEPAConfig).void }
|
1994
|
-
def initialize(config:)
|
1995
|
-
@config = config
|
270
|
+
sig { params(mod: DSPy::Module).returns(DSPy::Module) }
|
271
|
+
def clone_module(mod)
|
272
|
+
safe_clone(mod)
|
1996
273
|
end
|
1997
274
|
|
1998
|
-
|
1999
|
-
|
2000
|
-
|
2001
|
-
|
2002
|
-
|
2003
|
-
|
2004
|
-
|
2005
|
-
instruction_b = extract_instruction(parent_b)
|
2006
|
-
|
2007
|
-
crossover_type = select_crossover_type(instruction_a, instruction_b)
|
2008
|
-
offspring_instructions = apply_crossover(instruction_a, instruction_b, crossover_type)
|
2009
|
-
|
2010
|
-
offspring = [
|
2011
|
-
create_crossover_program(parent_a, offspring_instructions[0]),
|
2012
|
-
create_crossover_program(parent_b, offspring_instructions[1])
|
2013
|
-
]
|
2014
|
-
|
2015
|
-
offspring
|
2016
|
-
rescue => e
|
2017
|
-
# Return original parents on crossover failure
|
2018
|
-
[parent_a, parent_b]
|
275
|
+
sig { params(program: DSPy::Module).void }
|
276
|
+
def duplicate_predictors!(program)
|
277
|
+
resolve_predictors(program).each do |name, predictor|
|
278
|
+
next unless @predictor_names.include?(name)
|
279
|
+
next if predictor.equal?(program)
|
280
|
+
clone = safe_clone(predictor)
|
281
|
+
replace_reference(program, predictor, clone)
|
2019
282
|
end
|
2020
283
|
end
|
2021
284
|
|
2022
|
-
|
2023
|
-
|
2024
|
-
def batch_crossover(population)
|
2025
|
-
return [] if population.empty?
|
2026
|
-
return [population.first] if population.size == 1
|
2027
|
-
|
2028
|
-
offspring = []
|
2029
|
-
|
2030
|
-
# Pair up population for crossover
|
2031
|
-
population.each_slice(2) do |pair|
|
2032
|
-
if pair.size == 2
|
2033
|
-
crossed = crossover_programs(pair[0], pair[1])
|
2034
|
-
offspring.concat(crossed)
|
2035
|
-
else
|
2036
|
-
offspring << pair[0] # Unpaired individual passes through
|
2037
|
-
end
|
2038
|
-
end
|
2039
|
-
|
2040
|
-
offspring
|
285
|
+
sig do
|
286
|
+
params(container: T.untyped, target: T.untyped, replacement: T.untyped, visited: T::Set[Integer]).returns(T.untyped)
|
2041
287
|
end
|
288
|
+
def replace_in_object(container, target, replacement, visited)
|
289
|
+
return replacement if container.equal?(target)
|
290
|
+
return container if visited.include?(container.object_id)
|
2042
291
|
|
2043
|
-
|
2044
|
-
|
2045
|
-
# Extract instruction text from program
|
2046
|
-
sig { params(program: T.untyped).returns(String) }
|
2047
|
-
def extract_instruction(program)
|
2048
|
-
if program.signature_class&.description
|
2049
|
-
program.signature_class.description
|
2050
|
-
else
|
2051
|
-
"Analyze the input and complete the task accurately"
|
2052
|
-
end
|
2053
|
-
end
|
292
|
+
visited.add(container.object_id)
|
2054
293
|
|
2055
|
-
|
2056
|
-
|
2057
|
-
|
2058
|
-
|
2059
|
-
|
2060
|
-
|
2061
|
-
|
2062
|
-
|
2063
|
-
|
2064
|
-
|
294
|
+
case container
|
295
|
+
when Array
|
296
|
+
modified = false
|
297
|
+
new_array = container.map do |value|
|
298
|
+
new_value = replace_in_object(value, target, replacement, visited)
|
299
|
+
modified ||= !new_value.equal?(value)
|
300
|
+
new_value
|
301
|
+
end
|
302
|
+
modified ? new_array : container
|
303
|
+
when Hash
|
304
|
+
modified = false
|
305
|
+
new_hash = container.each_with_object({}) do |(key, value), memo|
|
306
|
+
new_value = replace_in_object(value, target, replacement, visited)
|
307
|
+
modified ||= !new_value.equal?(value)
|
308
|
+
memo[key] = new_value
|
309
|
+
end
|
310
|
+
modified ? new_hash : container
|
2065
311
|
else
|
2066
|
-
|
312
|
+
container
|
2067
313
|
end
|
2068
314
|
end
|
2069
315
|
|
2070
|
-
|
2071
|
-
|
2072
|
-
|
2073
|
-
return [instruction_a, instruction_b] if instruction_a == instruction_b
|
316
|
+
sig { params(owner: T.untyped, target: T.untyped, replacement: T.untyped).void }
|
317
|
+
def replace_reference(owner, target, replacement)
|
318
|
+
return if owner.equal?(target)
|
2074
319
|
|
2075
|
-
|
2076
|
-
|
320
|
+
Array(owner.instance_variables).each do |ivar|
|
321
|
+
value = owner.instance_variable_get(ivar)
|
322
|
+
next if value.nil?
|
2077
323
|
|
2078
|
-
|
2079
|
-
|
2080
|
-
|
2081
|
-
|
2082
|
-
max_length = [words_a.size, words_b.size].max
|
2083
|
-
|
2084
|
-
max_length.times do |i|
|
2085
|
-
word_a = words_a[i]
|
2086
|
-
word_b = words_b[i]
|
2087
|
-
|
2088
|
-
if rand < 0.5
|
2089
|
-
offspring_a_words << (word_a || word_b)
|
2090
|
-
offspring_b_words << (word_b || word_a)
|
2091
|
-
else
|
2092
|
-
offspring_a_words << (word_b || word_a)
|
2093
|
-
offspring_b_words << (word_a || word_b)
|
324
|
+
new_value = replace_in_object(value, target, replacement, ::Set.new)
|
325
|
+
unless new_value.equal?(value)
|
326
|
+
owner.instance_variable_set(ivar, new_value)
|
2094
327
|
end
|
2095
328
|
end
|
2096
|
-
|
2097
|
-
[
|
2098
|
-
offspring_a_words.compact.join(' '),
|
2099
|
-
offspring_b_words.compact.join(' ')
|
2100
|
-
]
|
2101
329
|
end
|
2102
330
|
|
2103
|
-
|
2104
|
-
|
2105
|
-
|
2106
|
-
# Simple blending patterns - in full implementation would use LLM
|
2107
|
-
patterns = [
|
2108
|
-
-> (a, b) { "#{a} and #{b}" },
|
2109
|
-
-> (a, b) { "#{a}, specifically #{b}" },
|
2110
|
-
-> (a, b) { "#{b} while #{a.downcase}" },
|
2111
|
-
-> (a, b) { "Combine #{a.downcase} with #{b.downcase}" }
|
2112
|
-
]
|
2113
|
-
|
2114
|
-
pattern = patterns.sample
|
2115
|
-
|
2116
|
-
[
|
2117
|
-
pattern.call(instruction_a, instruction_b),
|
2118
|
-
pattern.call(instruction_b, instruction_a)
|
2119
|
-
]
|
2120
|
-
end
|
2121
|
-
|
2122
|
-
# Structured crossover: Maintain grammatical and logical structure
|
2123
|
-
sig { params(instruction_a: String, instruction_b: String).returns(T::Array[String]) }
|
2124
|
-
def structured_crossover(instruction_a, instruction_b)
|
2125
|
-
# Extract structural components
|
2126
|
-
components_a = extract_components(instruction_a)
|
2127
|
-
components_b = extract_components(instruction_b)
|
2128
|
-
|
2129
|
-
# Cross structural components
|
2130
|
-
offspring_a = combine_components(components_a.action, components_b.modifiers)
|
2131
|
-
offspring_b = combine_components(components_b.action, components_a.modifiers)
|
2132
|
-
|
2133
|
-
[offspring_a, offspring_b]
|
2134
|
-
end
|
2135
|
-
|
2136
|
-
# Extract structural components from instruction
|
2137
|
-
sig { params(instruction: String).returns(InstructionComponents) }
|
2138
|
-
def extract_components(instruction)
|
2139
|
-
words = instruction.split
|
2140
|
-
|
2141
|
-
# Simple heuristic: first verb-like word is action, rest are modifiers
|
2142
|
-
action_idx = words.find_index { |word| verb_like?(word) } || 0
|
2143
|
-
|
2144
|
-
InstructionComponents.new(
|
2145
|
-
action: words[action_idx] || words.first || "complete",
|
2146
|
-
modifiers: (words - [words[action_idx]]).join(' ')
|
2147
|
-
)
|
2148
|
-
end
|
331
|
+
sig { params(program: DSPy::Module, recorder: T.nilable(T.untyped)).void }
|
332
|
+
def wrap_predictors_for_tracing!(program, recorder: nil)
|
333
|
+
return unless recorder
|
2149
334
|
|
2150
|
-
|
2151
|
-
|
2152
|
-
def combine_components(action, modifiers)
|
2153
|
-
if modifiers.empty?
|
2154
|
-
"#{action.capitalize} the task"
|
2155
|
-
else
|
2156
|
-
"#{action.capitalize} #{modifiers}"
|
335
|
+
resolve_predictors(program).each do |name, predictor|
|
336
|
+
wrap_predictor_for_tracing(program, predictor, name, recorder)
|
2157
337
|
end
|
2158
338
|
end
|
2159
339
|
|
2160
|
-
|
2161
|
-
|
2162
|
-
|
2163
|
-
|
2164
|
-
|
2165
|
-
end
|
340
|
+
sig { params(program: DSPy::Module, predictor: DSPy::Module, name: String, recorder: T.untyped).void }
|
341
|
+
def wrap_predictor_for_tracing(program, predictor, name, recorder)
|
342
|
+
original_forward = predictor.method(:forward_untyped)
|
343
|
+
recorder_ref = recorder
|
344
|
+
predictor_name = name
|
2166
345
|
|
2167
|
-
|
2168
|
-
|
2169
|
-
|
2170
|
-
|
2171
|
-
|
2172
|
-
|
346
|
+
predictor.define_singleton_method(:forward_untyped) do |**input_values|
|
347
|
+
result = original_forward.call(**input_values)
|
348
|
+
recorder_ref.record(
|
349
|
+
predictor_name: predictor_name,
|
350
|
+
inputs: input_values.dup,
|
351
|
+
output: result
|
352
|
+
)
|
353
|
+
result
|
354
|
+
end
|
2173
355
|
end
|
2174
356
|
|
2175
|
-
|
2176
|
-
|
2177
|
-
|
2178
|
-
|
2179
|
-
|
2180
|
-
|
2181
|
-
|
2182
|
-
if combined_length < 40
|
2183
|
-
# Short instructions benefit from blending
|
2184
|
-
[CrossoverType::Blend, CrossoverType::Uniform].sample
|
2185
|
-
elsif combined_length > 200
|
2186
|
-
# Long instructions benefit from structured crossover
|
2187
|
-
[CrossoverType::Structured, CrossoverType::Uniform].sample
|
2188
|
-
else
|
2189
|
-
# Balanced selection
|
2190
|
-
@config.crossover_types.sample
|
2191
|
-
end
|
357
|
+
sig { params(predictor: DSPy::Module, instruction: String).returns(DSPy::Module) }
|
358
|
+
def apply_instruction_to_predictor(predictor, instruction)
|
359
|
+
if predictor.respond_to?(:with_instruction)
|
360
|
+
predictor.with_instruction(instruction)
|
361
|
+
elsif predictor.respond_to?(:prompt) && predictor.prompt.respond_to?(:with_instruction)
|
362
|
+
predictor.with_prompt(predictor.prompt.with_instruction(instruction))
|
2192
363
|
else
|
2193
|
-
|
364
|
+
duplicate = safe_clone(predictor)
|
365
|
+
signature = DSPy::Teleprompt::Utils.get_signature(duplicate)
|
366
|
+
updated_signature = signature.with_instructions(instruction)
|
367
|
+
DSPy::Teleprompt::Utils.set_signature(duplicate, updated_signature)
|
368
|
+
duplicate
|
2194
369
|
end
|
2195
370
|
end
|
2196
371
|
|
2197
|
-
|
2198
|
-
|
2199
|
-
|
2200
|
-
|
2201
|
-
|
2202
|
-
unique_types = crossovers.uniq.size
|
2203
|
-
total_types = @config.crossover_types.size
|
2204
|
-
|
2205
|
-
unique_types.to_f / total_types
|
2206
|
-
end
|
2207
|
-
end
|
2208
|
-
|
2209
|
-
# ParetoSelector: Multi-objective optimization using Pareto frontier analysis
|
2210
|
-
class ParetoSelector
|
2211
|
-
extend T::Sig
|
2212
|
-
|
2213
|
-
sig { returns(FitnessEvaluator) }
|
2214
|
-
attr_reader :evaluator
|
2215
|
-
|
2216
|
-
sig { returns(GEPAConfig) }
|
2217
|
-
attr_reader :config
|
2218
|
-
|
2219
|
-
sig { params(evaluator: FitnessEvaluator, config: GEPAConfig).void }
|
2220
|
-
def initialize(evaluator:, config:)
|
2221
|
-
@evaluator = evaluator
|
2222
|
-
@config = config
|
372
|
+
sig { params(object: T.untyped).returns(T.untyped) }
|
373
|
+
def safe_clone(object)
|
374
|
+
object.clone
|
375
|
+
rescue TypeError
|
376
|
+
object.dup
|
2223
377
|
end
|
2224
378
|
|
2225
|
-
|
2226
|
-
|
2227
|
-
|
2228
|
-
return [] if population_with_scores.empty?
|
2229
|
-
return population_with_scores.map(&:first) if count >= population_with_scores.size
|
2230
|
-
|
2231
|
-
# Combine tournament and Pareto-based selection for parent selection
|
2232
|
-
selected = []
|
2233
|
-
|
2234
|
-
count.times do
|
2235
|
-
parent = tournament_selection(population_with_scores)
|
2236
|
-
selected << parent
|
379
|
+
class TraceRecorder
|
380
|
+
def initialize
|
381
|
+
@current_trace = nil
|
2237
382
|
end
|
2238
383
|
|
2239
|
-
|
2240
|
-
|
2241
|
-
|
2242
|
-
# Select survivors for next generation balancing elite and diversity
|
2243
|
-
sig { params(population_with_scores: T::Array[T::Array[T.untyped]], count: Integer).returns(T::Array[T.untyped]) }
|
2244
|
-
def select_survivors(population_with_scores, count:)
|
2245
|
-
return [] if population_with_scores.empty?
|
2246
|
-
return population_with_scores.map(&:first) if count >= population_with_scores.size
|
2247
|
-
|
2248
|
-
scores = population_with_scores.map(&:last)
|
2249
|
-
|
2250
|
-
# Find Pareto frontier first
|
2251
|
-
pareto_frontier = find_pareto_frontier(scores)
|
2252
|
-
frontier_indices = scores.each_index.select { |i| pareto_frontier.include?(scores[i]) }
|
2253
|
-
frontier_programs = frontier_indices.map { |i| population_with_scores[i].first }
|
2254
|
-
|
2255
|
-
if frontier_programs.size >= count
|
2256
|
-
# Use diversity selection within frontier
|
2257
|
-
frontier_with_scores = frontier_indices.map { |i| population_with_scores[i] }
|
2258
|
-
return diversity_selection(frontier_with_scores, count: count)
|
2259
|
-
else
|
2260
|
-
# Include all frontier + fill remaining with elite selection
|
2261
|
-
remaining_count = count - frontier_programs.size
|
2262
|
-
remaining_population = population_with_scores.reject.with_index { |_, i| frontier_indices.include?(i) }
|
2263
|
-
|
2264
|
-
additional = elite_selection(remaining_population, count: remaining_count)
|
2265
|
-
frontier_programs + additional
|
384
|
+
def start_example
|
385
|
+
@current_trace = []
|
2266
386
|
end
|
2267
|
-
end
|
2268
|
-
|
2269
|
-
private
|
2270
|
-
|
2271
|
-
# Find Pareto frontier (non-dominated solutions)
|
2272
|
-
sig { params(fitness_scores: T::Array[FitnessScore]).returns(T::Array[FitnessScore]) }
|
2273
|
-
def find_pareto_frontier(fitness_scores)
|
2274
|
-
return [] if fitness_scores.empty?
|
2275
|
-
return fitness_scores if fitness_scores.size == 1
|
2276
|
-
|
2277
|
-
frontier = []
|
2278
|
-
|
2279
|
-
fitness_scores.each do |candidate|
|
2280
|
-
# Check if candidate is dominated by any other solution
|
2281
|
-
is_dominated = fitness_scores.any? do |other|
|
2282
|
-
other != candidate && candidate.dominated_by?(other)
|
2283
|
-
end
|
2284
387
|
|
2285
|
-
|
388
|
+
def record(entry)
|
389
|
+
return unless @current_trace
|
390
|
+
@current_trace << entry
|
2286
391
|
end
|
2287
392
|
|
2288
|
-
|
2289
|
-
|
2290
|
-
|
2291
|
-
|
2292
|
-
sig { params(fitness_scores: T::Array[FitnessScore]).returns(T::Hash[FitnessScore, Float]) }
|
2293
|
-
def calculate_crowding_distance(fitness_scores)
|
2294
|
-
distances = {}
|
2295
|
-
|
2296
|
-
# Initialize distances for all solutions
|
2297
|
-
fitness_scores.each { |score| distances[score] = 0.0 }
|
2298
|
-
|
2299
|
-
return distances if fitness_scores.size <= 2
|
2300
|
-
|
2301
|
-
# Calculate crowding distance for each objective
|
2302
|
-
objectives = [:primary_score, :overall_score]
|
2303
|
-
secondary_objectives = fitness_scores.first.secondary_scores.keys
|
2304
|
-
all_objectives = objectives + secondary_objectives
|
2305
|
-
|
2306
|
-
all_objectives.each do |objective|
|
2307
|
-
# Sort by current objective
|
2308
|
-
sorted_scores = fitness_scores.sort_by do |score|
|
2309
|
-
case objective
|
2310
|
-
when :primary_score
|
2311
|
-
score.primary_score
|
2312
|
-
when :overall_score
|
2313
|
-
score.overall_score
|
2314
|
-
else
|
2315
|
-
score.secondary_scores[objective] || 0.0
|
2316
|
-
end
|
2317
|
-
end
|
2318
|
-
|
2319
|
-
# Set boundary solutions to high distance
|
2320
|
-
distances[sorted_scores.first] = Float::INFINITY if sorted_scores.size > 0
|
2321
|
-
distances[sorted_scores.last] = Float::INFINITY if sorted_scores.size > 1
|
2322
|
-
|
2323
|
-
next if sorted_scores.size <= 2
|
2324
|
-
|
2325
|
-
# Calculate range for normalization
|
2326
|
-
min_val = get_objective_value(sorted_scores.first, objective)
|
2327
|
-
max_val = get_objective_value(sorted_scores.last, objective)
|
2328
|
-
range = max_val - min_val
|
2329
|
-
|
2330
|
-
next if range <= 0
|
2331
|
-
|
2332
|
-
# Calculate crowding distance for intermediate solutions
|
2333
|
-
(1...(sorted_scores.size - 1)).each do |i|
|
2334
|
-
prev_val = get_objective_value(sorted_scores[i - 1], objective)
|
2335
|
-
next_val = get_objective_value(sorted_scores[i + 1], objective)
|
2336
|
-
|
2337
|
-
distances[sorted_scores[i]] += (next_val - prev_val) / range
|
2338
|
-
end
|
393
|
+
def finish_example
|
394
|
+
trace = @current_trace || []
|
395
|
+
@current_trace = nil
|
396
|
+
trace
|
2339
397
|
end
|
2340
|
-
|
2341
|
-
distances
|
2342
398
|
end
|
2343
399
|
|
2344
|
-
|
2345
|
-
|
2346
|
-
|
2347
|
-
|
2348
|
-
|
2349
|
-
|
2350
|
-
when :overall_score
|
2351
|
-
score.overall_score
|
400
|
+
sig { params(program: DSPy::Module).returns(String) }
|
401
|
+
def extract_instruction(program)
|
402
|
+
if program.respond_to?(:prompt) && program.prompt.respond_to?(:instruction)
|
403
|
+
program.prompt.instruction
|
404
|
+
elsif program.respond_to?(:instruction)
|
405
|
+
program.instruction
|
2352
406
|
else
|
2353
|
-
|
407
|
+
raise ArgumentError, "Program must expose prompt.instruction or #instruction"
|
2354
408
|
end
|
2355
409
|
end
|
2356
410
|
|
2357
|
-
|
2358
|
-
|
2359
|
-
|
2360
|
-
|
2361
|
-
|
2362
|
-
|
2363
|
-
|
2364
|
-
|
2365
|
-
# Select best from tournament based on Pareto dominance and crowding
|
2366
|
-
best_program, best_score = tournament.first
|
2367
|
-
|
2368
|
-
tournament[1..].each do |program, score|
|
2369
|
-
if score.dominated_by?(best_score)
|
2370
|
-
# Current best dominates this candidate, keep current
|
2371
|
-
next
|
2372
|
-
elsif best_score.dominated_by?(score)
|
2373
|
-
# This candidate dominates current best, replace
|
2374
|
-
best_program, best_score = program, score
|
2375
|
-
else
|
2376
|
-
# Non-dominated comparison, use overall score as tiebreaker
|
2377
|
-
if score.overall_score > best_score.overall_score
|
2378
|
-
best_program, best_score = program, score
|
2379
|
-
end
|
411
|
+
sig { params(struct: T.untyped).returns(T::Hash[Symbol, T.untyped]) }
|
412
|
+
def serialize_struct(struct)
|
413
|
+
if struct.respond_to?(:to_h)
|
414
|
+
struct.to_h
|
415
|
+
elsif struct.instance_variables.any?
|
416
|
+
struct.instance_variables.each_with_object({}) do |ivar, memo|
|
417
|
+
key = ivar.to_s.delete_prefix('@').to_sym
|
418
|
+
memo[key] = struct.instance_variable_get(ivar)
|
2380
419
|
end
|
420
|
+
else
|
421
|
+
{}
|
2381
422
|
end
|
2382
|
-
|
2383
|
-
best_program
|
2384
423
|
end
|
2385
424
|
|
2386
|
-
|
2387
|
-
|
2388
|
-
|
2389
|
-
|
2390
|
-
|
2391
|
-
|
2392
|
-
|
2393
|
-
|
2394
|
-
|
2395
|
-
|
2396
|
-
|
2397
|
-
sorted_pairs.take(count).map(&:first)
|
2398
|
-
end
|
2399
|
-
|
2400
|
-
# Elite selection based on overall fitness
|
2401
|
-
sig { params(population_with_scores: T::Array[T::Array[T.untyped]], count: Integer).returns(T::Array[T.untyped]) }
|
2402
|
-
def elite_selection(population_with_scores, count:)
|
2403
|
-
return population_with_scores.map(&:first) if count >= population_with_scores.size
|
2404
|
-
|
2405
|
-
# Sort by overall score (descending - best first)
|
2406
|
-
sorted_pairs = population_with_scores.sort_by { |_, score| -score.overall_score }
|
2407
|
-
|
2408
|
-
sorted_pairs.take(count).map(&:first)
|
425
|
+
sig { params(prediction: T.untyped).returns(T::Hash[Symbol, T.untyped]) }
|
426
|
+
def serialize_prediction(prediction)
|
427
|
+
case prediction
|
428
|
+
when DSPy::Prediction
|
429
|
+
prediction.to_h
|
430
|
+
when Hash
|
431
|
+
prediction
|
432
|
+
else
|
433
|
+
serialize_struct(prediction)
|
434
|
+
end
|
2409
435
|
end
|
2410
|
-
end
|
2411
|
-
|
2412
|
-
# Configuration for GEPA optimization
|
2413
|
-
class GEPAConfig < Config
|
2414
|
-
extend T::Sig
|
2415
|
-
|
2416
|
-
sig { returns(DSPy::LM) }
|
2417
|
-
attr_accessor :reflection_lm
|
2418
|
-
|
2419
|
-
sig { returns(Integer) }
|
2420
|
-
attr_accessor :num_generations
|
2421
|
-
|
2422
|
-
sig { returns(Integer) }
|
2423
|
-
attr_accessor :population_size
|
2424
|
-
|
2425
|
-
sig { returns(Float) }
|
2426
|
-
attr_accessor :mutation_rate
|
2427
436
|
|
2428
|
-
sig { returns(T::
|
2429
|
-
|
437
|
+
sig { params(expected: T::Hash[Symbol, T.untyped], actual: T::Hash[Symbol, T.untyped]).returns(T::Hash[Symbol, T.untyped]) }
|
438
|
+
def build_diff(expected, actual)
|
439
|
+
keys = expected.keys | actual.keys
|
440
|
+
keys.each_with_object({}) do |key, memo|
|
441
|
+
exp = expected[key]
|
442
|
+
act = actual[key]
|
443
|
+
next if exp == act
|
2430
444
|
|
2431
|
-
|
2432
|
-
|
2433
|
-
sig { returns(Float) }
|
2434
|
-
attr_accessor :crossover_rate
|
2435
|
-
sig { returns(T::Array[CrossoverType]) }
|
2436
|
-
attr_accessor :crossover_types
|
2437
|
-
|
2438
|
-
sig { void }
|
2439
|
-
def initialize
|
2440
|
-
super
|
2441
|
-
# reflection_lm must be explicitly set by user - no default provided
|
2442
|
-
@reflection_lm = nil
|
2443
|
-
@num_generations = 10
|
2444
|
-
@population_size = 8
|
2445
|
-
@mutation_rate = 0.7
|
2446
|
-
@use_pareto_selection = true
|
2447
|
-
@mutation_types = [MutationType::Rewrite, MutationType::Expand, MutationType::Simplify, MutationType::Combine, MutationType::Rephrase]
|
2448
|
-
@crossover_rate = 0.6
|
2449
|
-
@crossover_types = [CrossoverType::Uniform, CrossoverType::Blend, CrossoverType::Structured]
|
445
|
+
memo[key] = { expected: exp, actual: act }
|
446
|
+
end
|
2450
447
|
end
|
2451
448
|
|
2452
|
-
sig { returns(
|
2453
|
-
def
|
2454
|
-
|
2455
|
-
|
2456
|
-
|
2457
|
-
|
2458
|
-
|
2459
|
-
|
2460
|
-
|
2461
|
-
|
2462
|
-
|
2463
|
-
|
449
|
+
sig { params(result: T.untyped).returns([Float, T.nilable(String)]) }
|
450
|
+
def extract_score_and_feedback(result)
|
451
|
+
case result
|
452
|
+
when DSPy::Prediction
|
453
|
+
score = result.respond_to?(:score) ? result.score : 0.0
|
454
|
+
feedback = result.respond_to?(:feedback) ? result.feedback : nil
|
455
|
+
[score.to_f, feedback]
|
456
|
+
when Hash
|
457
|
+
[result[:score].to_f, result[:feedback]]
|
458
|
+
else
|
459
|
+
[result.to_f, nil]
|
460
|
+
end
|
2464
461
|
end
|
2465
462
|
end
|
2466
463
|
|
2467
|
-
sig { returns(GEPAConfig) }
|
2468
|
-
attr_reader :config
|
2469
|
-
|
2470
464
|
sig do
|
2471
465
|
params(
|
2472
|
-
metric: T.
|
2473
|
-
|
466
|
+
metric: T.proc.params(arg0: DSPy::Example, arg1: T.untyped).returns(T.untyped),
|
467
|
+
reflection_lm: T.nilable(T.untyped),
|
468
|
+
feedback_map: T.nilable(T::Hash[String, PredictAdapter::FeedbackFnType]),
|
469
|
+
adapter_builder: T.nilable(T.proc.returns(T.untyped)),
|
470
|
+
config: T.nilable(T::Hash[Symbol, T.untyped])
|
2474
471
|
).void
|
2475
472
|
end
|
2476
|
-
def initialize(metric: nil, config: nil)
|
2477
|
-
|
2478
|
-
|
2479
|
-
|
2480
|
-
|
2481
|
-
|
2482
|
-
|
2483
|
-
|
2484
|
-
super(metric: metric, config: @config)
|
473
|
+
def initialize(metric:, reflection_lm: nil, feedback_map: nil, adapter_builder: nil, config: nil)
|
474
|
+
super(metric: metric)
|
475
|
+
@metric = metric
|
476
|
+
@reflection_lm = reflection_lm
|
477
|
+
@feedback_map = (feedback_map || {}).transform_keys(&:to_s)
|
478
|
+
@adapter_builder = adapter_builder || method(:build_adapter)
|
479
|
+
@gepa_config = self.class.default_config.merge(config || {})
|
2485
480
|
end
|
2486
481
|
|
2487
|
-
# Main optimization method
|
2488
482
|
sig do
|
2489
|
-
params(
|
2490
|
-
program:
|
483
|
+
override.params(
|
484
|
+
program: DSPy::Module,
|
2491
485
|
trainset: T::Array[T.untyped],
|
2492
486
|
valset: T.nilable(T::Array[T.untyped])
|
2493
487
|
).returns(OptimizationResult)
|
2494
488
|
end
|
2495
|
-
|
2496
489
|
def compile(program, trainset:, valset: nil)
|
2497
490
|
validate_inputs(program, trainset, valset)
|
2498
491
|
|
2499
|
-
|
2500
|
-
|
2501
|
-
valset_size: valset&.size || 0,
|
2502
|
-
num_generations: @config.num_generations,
|
2503
|
-
population_size: @config.population_size
|
2504
|
-
}) do
|
2505
|
-
# Always perform full GEPA genetic algorithm optimization
|
2506
|
-
perform_gepa_optimization(program, trainset, valset)
|
2507
|
-
end
|
2508
|
-
end
|
2509
|
-
|
2510
|
-
private
|
2511
|
-
|
2512
|
-
# Complete GEPA genetic algorithm optimization
|
2513
|
-
sig do
|
2514
|
-
params(
|
2515
|
-
program: T.untyped,
|
2516
|
-
trainset: T::Array[T.untyped],
|
2517
|
-
valset: T.nilable(T::Array[T.untyped])
|
2518
|
-
).returns(OptimizationResult)
|
2519
|
-
end
|
2520
|
-
def perform_gepa_optimization(program, trainset, valset)
|
2521
|
-
# Initialize all GEPA components
|
2522
|
-
fitness_evaluator = create_fitness_evaluator
|
2523
|
-
genetic_engine = create_genetic_engine(fitness_evaluator)
|
2524
|
-
reflection_engine = create_reflection_engine
|
2525
|
-
mutation_engine = create_mutation_engine
|
2526
|
-
crossover_engine = create_crossover_engine
|
2527
|
-
pareto_selector = create_pareto_selector(fitness_evaluator)
|
2528
|
-
|
2529
|
-
# Initialize trace collection for reflection
|
2530
|
-
trace_collector = TraceCollector.new
|
2531
|
-
optimization_run_id = "gepa-run-#{SecureRandom.hex(4)}"
|
2532
|
-
|
2533
|
-
emit_event('gepa_optimization_start', {
|
2534
|
-
optimization_run_id: optimization_run_id,
|
2535
|
-
num_generations: @config.num_generations,
|
2536
|
-
population_size: @config.population_size,
|
2537
|
-
mutation_rate: @config.mutation_rate,
|
2538
|
-
crossover_rate: @config.crossover_rate
|
2539
|
-
})
|
2540
|
-
|
2541
|
-
begin
|
2542
|
-
# Run the complete genetic algorithm evolution
|
2543
|
-
evolution_result = genetic_engine.run_evolution(program, trainset)
|
2544
|
-
|
2545
|
-
# Collect traces for reflection analysis
|
2546
|
-
execution_traces = trace_collector.traces_for_run(optimization_run_id)
|
2547
|
-
|
2548
|
-
# Generate reflection insights on the optimization process
|
2549
|
-
reflection_result = reflection_engine.reflect_with_llm(execution_traces)
|
2550
|
-
|
2551
|
-
# Evaluate final candidate on validation set if provided
|
2552
|
-
final_validation_score = if valset && !valset.empty?
|
2553
|
-
validation_fitness = fitness_evaluator.evaluate_candidate(evolution_result[:best_candidate], valset)
|
2554
|
-
validation_fitness.overall_score
|
2555
|
-
else
|
2556
|
-
evolution_result[:best_fitness].overall_score
|
2557
|
-
end
|
2558
|
-
|
2559
|
-
emit_event('gepa_optimization_complete', {
|
2560
|
-
optimization_run_id: optimization_run_id,
|
2561
|
-
best_fitness: evolution_result[:best_fitness].overall_score,
|
2562
|
-
final_generation: evolution_result[:generation_count],
|
2563
|
-
validation_score: final_validation_score,
|
2564
|
-
reflection_confidence: reflection_result.confidence
|
2565
|
-
})
|
2566
|
-
|
2567
|
-
# Create comprehensive optimization result
|
2568
|
-
OptimizationResult.new(
|
2569
|
-
optimized_program: evolution_result[:best_candidate],
|
2570
|
-
scores: {
|
2571
|
-
fitness_score: evolution_result[:best_fitness].overall_score,
|
2572
|
-
validation_score: final_validation_score,
|
2573
|
-
primary_score: evolution_result[:best_fitness].primary_score,
|
2574
|
-
**evolution_result[:best_fitness].secondary_scores
|
2575
|
-
},
|
2576
|
-
history: {
|
2577
|
-
num_generations: evolution_result[:generation_count],
|
2578
|
-
population_size: @config.population_size,
|
2579
|
-
generation_history: evolution_result[:generation_history],
|
2580
|
-
final_population: evolution_result[:final_population],
|
2581
|
-
phase: 'Phase 2 - Complete GEPA',
|
2582
|
-
mutation_rate: @config.mutation_rate,
|
2583
|
-
crossover_rate: @config.crossover_rate,
|
2584
|
-
selection_strategy: @config.use_pareto_selection ? 'pareto' : 'tournament'
|
2585
|
-
},
|
2586
|
-
best_score_name: 'fitness_score',
|
2587
|
-
best_score_value: evolution_result[:best_fitness].overall_score,
|
2588
|
-
metadata: {
|
2589
|
-
optimizer: 'GEPA',
|
2590
|
-
reflection_lm: @config.reflection_lm&.model,
|
2591
|
-
implementation_status: 'Phase 2 - Complete Implementation',
|
2592
|
-
optimization_run_id: optimization_run_id,
|
2593
|
-
reflection_insights: {
|
2594
|
-
diagnosis: reflection_result.diagnosis,
|
2595
|
-
improvements: reflection_result.improvements,
|
2596
|
-
confidence: reflection_result.confidence,
|
2597
|
-
suggested_mutations: reflection_result.suggested_mutations
|
2598
|
-
},
|
2599
|
-
trace_analysis: {
|
2600
|
-
total_traces: execution_traces.size,
|
2601
|
-
llm_traces: execution_traces.count(&:llm_trace?),
|
2602
|
-
module_traces: execution_traces.count(&:module_trace?),
|
2603
|
-
execution_timespan: calculate_execution_timespan(execution_traces)
|
2604
|
-
},
|
2605
|
-
component_versions: {
|
2606
|
-
genetic_engine: 'v2.0',
|
2607
|
-
fitness_evaluator: 'v2.0',
|
2608
|
-
reflection_engine: 'v2.0',
|
2609
|
-
mutation_engine: 'v2.0',
|
2610
|
-
crossover_engine: 'v2.0',
|
2611
|
-
pareto_selector: 'v2.0'
|
2612
|
-
}
|
2613
|
-
}
|
2614
|
-
)
|
2615
|
-
|
2616
|
-
rescue => e
|
2617
|
-
emit_event('gepa_optimization_error', {
|
2618
|
-
optimization_run_id: optimization_run_id,
|
2619
|
-
error: e.message,
|
2620
|
-
backtrace: e.backtrace&.take(5)
|
2621
|
-
})
|
2622
|
-
|
2623
|
-
# Return fallback result on optimization failure
|
2624
|
-
fallback_fitness = fitness_evaluator.evaluate_candidate(program, trainset)
|
2625
|
-
|
2626
|
-
OptimizationResult.new(
|
2627
|
-
optimized_program: program,
|
2628
|
-
scores: {
|
2629
|
-
fitness_score: fallback_fitness.overall_score,
|
2630
|
-
primary_score: fallback_fitness.primary_score,
|
2631
|
-
**fallback_fitness.secondary_scores
|
2632
|
-
},
|
2633
|
-
history: {
|
2634
|
-
num_generations: 0,
|
2635
|
-
population_size: @config.population_size,
|
2636
|
-
phase: 'Phase 2 - Error Recovery',
|
2637
|
-
error: e.message
|
2638
|
-
},
|
2639
|
-
best_score_name: 'fitness_score',
|
2640
|
-
best_score_value: fallback_fitness.overall_score,
|
2641
|
-
metadata: {
|
2642
|
-
optimizer: 'GEPA',
|
2643
|
-
reflection_lm: @config.reflection_lm&.model,
|
2644
|
-
implementation_status: 'Phase 2 - Error Recovery',
|
2645
|
-
optimization_run_id: optimization_run_id,
|
2646
|
-
error_details: {
|
2647
|
-
message: e.message,
|
2648
|
-
class: e.class.name,
|
2649
|
-
recovery_strategy: 'fallback_to_original'
|
2650
|
-
}
|
2651
|
-
}
|
2652
|
-
)
|
2653
|
-
end
|
2654
|
-
end
|
2655
|
-
|
2656
|
-
# Create and configure fitness evaluator
|
2657
|
-
sig { returns(FitnessEvaluator) }
|
2658
|
-
def create_fitness_evaluator
|
2659
|
-
FitnessEvaluator.new(primary_metric: @metric, config: @config)
|
2660
|
-
end
|
2661
|
-
|
2662
|
-
# Create and configure genetic engine
|
2663
|
-
sig { params(fitness_evaluator: FitnessEvaluator).returns(GeneticEngine) }
|
2664
|
-
def create_genetic_engine(fitness_evaluator)
|
2665
|
-
GeneticEngine.new(config: @config, fitness_evaluator: fitness_evaluator)
|
2666
|
-
end
|
2667
|
-
|
2668
|
-
# Create and configure reflection engine
|
2669
|
-
sig { returns(ReflectionEngine) }
|
2670
|
-
def create_reflection_engine
|
2671
|
-
ReflectionEngine.new(@config)
|
2672
|
-
end
|
2673
|
-
|
2674
|
-
# Create and configure mutation engine
|
2675
|
-
sig { returns(MutationEngine) }
|
2676
|
-
def create_mutation_engine
|
2677
|
-
MutationEngine.new(config: @config)
|
2678
|
-
end
|
2679
|
-
|
2680
|
-
# Create and configure crossover engine
|
2681
|
-
sig { returns(CrossoverEngine) }
|
2682
|
-
def create_crossover_engine
|
2683
|
-
CrossoverEngine.new(config: @config)
|
2684
|
-
end
|
2685
|
-
|
2686
|
-
# Create and configure pareto selector
|
2687
|
-
sig { params(fitness_evaluator: FitnessEvaluator).returns(ParetoSelector) }
|
2688
|
-
def create_pareto_selector(fitness_evaluator)
|
2689
|
-
ParetoSelector.new(evaluator: fitness_evaluator, config: @config)
|
2690
|
-
end
|
2691
|
-
|
2692
|
-
# Calculate execution timespan from traces
|
2693
|
-
sig { params(traces: T::Array[ExecutionTrace]).returns(Float) }
|
2694
|
-
def calculate_execution_timespan(traces)
|
2695
|
-
return 0.0 if traces.size < 2
|
2696
|
-
|
2697
|
-
timestamps = traces.map(&:timestamp).sort
|
2698
|
-
(timestamps.last - timestamps.first).to_f
|
2699
|
-
end
|
2700
|
-
end
|
2701
|
-
|
2702
|
-
# GEPA Feedback Metric Protocol
|
2703
|
-
# Defines interface for providing scores with optional textual feedback
|
2704
|
-
module GEPAFeedbackMetric
|
2705
|
-
extend T::Sig
|
2706
|
-
extend T::Helpers
|
2707
|
-
|
2708
|
-
interface!
|
492
|
+
typed_trainset = ensure_typed_examples(trainset)
|
493
|
+
typed_valset = valset ? ensure_typed_examples(valset) : typed_trainset
|
2709
494
|
|
2710
|
-
|
2711
|
-
|
2712
|
-
|
2713
|
-
|
2714
|
-
|
2715
|
-
prediction: DSPy::Prediction,
|
2716
|
-
trace: T.nilable(T::Array[ExecutionTrace])
|
495
|
+
adapter = @adapter_builder.call(
|
496
|
+
program,
|
497
|
+
@metric,
|
498
|
+
reflection_lm: @reflection_lm,
|
499
|
+
feedback_map: @feedback_map
|
2717
500
|
)
|
2718
|
-
.
|
2719
|
-
|
2720
|
-
|
2721
|
-
|
2722
|
-
|
2723
|
-
|
2724
|
-
|
2725
|
-
|
2726
|
-
|
2727
|
-
|
2728
|
-
|
2729
|
-
|
2730
|
-
|
2731
|
-
|
2732
|
-
|
2733
|
-
|
2734
|
-
|
2735
|
-
|
2736
|
-
|
2737
|
-
|
2738
|
-
|
2739
|
-
|
2740
|
-
|
2741
|
-
sig do
|
2742
|
-
params(
|
2743
|
-
student: T.untyped, # DSPy::Module or similar callable
|
2744
|
-
metric: T.untyped,
|
2745
|
-
feedback_map: T::Hash[String, String],
|
2746
|
-
custom_instruction_proposer: T.nilable(T.untyped)
|
2747
|
-
).void
|
2748
|
-
end
|
2749
|
-
def initialize(student:, metric:, feedback_map: {}, custom_instruction_proposer: nil)
|
2750
|
-
@student = student
|
2751
|
-
@metric = metric
|
2752
|
-
@feedback_map = feedback_map
|
2753
|
-
@custom_instruction_proposer = custom_instruction_proposer
|
2754
|
-
@trace_collector = GEPA::TraceCollector.new
|
2755
|
-
end
|
2756
|
-
|
2757
|
-
# Build program with candidate instruction
|
2758
|
-
sig { params(candidate_instruction: String).returns(T.untyped) }
|
2759
|
-
def build_program(candidate_instruction)
|
2760
|
-
# For DSPy::Module compatibility, we'll need to create a new instance
|
2761
|
-
# with modified signature description
|
2762
|
-
if @student.respond_to?(:signature_class) && @student.signature_class.respond_to?(:description=)
|
2763
|
-
modified_student = @student.class.new
|
2764
|
-
modified_student.signature_class.description = candidate_instruction
|
2765
|
-
modified_student
|
2766
|
-
else
|
2767
|
-
# Fallback: return student as-is for non-standard modules
|
2768
|
-
@student
|
2769
|
-
end
|
2770
|
-
end
|
2771
|
-
|
2772
|
-
# Evaluate program on batch with trace capture
|
2773
|
-
sig do
|
2774
|
-
params(
|
2775
|
-
batch: T::Array[DSPy::Example],
|
2776
|
-
candidate_instruction: String,
|
2777
|
-
capture_traces: T::Boolean
|
501
|
+
seed_candidate = adapter.seed_candidate
|
502
|
+
|
503
|
+
cand_selector = ::GEPA::Strategies::ParetoCandidateSelector.new
|
504
|
+
comp_selector = ::GEPA::Strategies::RoundRobinReflectionComponentSelector.new
|
505
|
+
batch_sampler = ::GEPA::Strategies::EpochShuffledBatchSampler.new([@gepa_config[:minibatch_size], typed_trainset.size].min)
|
506
|
+
|
507
|
+
telemetry_context = ::GEPA::Telemetry.build_context
|
508
|
+
|
509
|
+
logger = ::GEPA::Logging::BufferingLogger.new
|
510
|
+
tracker = ::GEPA::Logging::ExperimentTracker.new
|
511
|
+
|
512
|
+
reflective = ::GEPA::Proposer::ReflectiveMutationProposer.new(
|
513
|
+
logger: logger,
|
514
|
+
trainset: typed_trainset,
|
515
|
+
adapter: adapter,
|
516
|
+
candidate_selector: cand_selector,
|
517
|
+
module_selector: comp_selector,
|
518
|
+
batch_sampler: batch_sampler,
|
519
|
+
perfect_score: @gepa_config[:perfect_score],
|
520
|
+
skip_perfect_score: @gepa_config[:skip_perfect_score],
|
521
|
+
experiment_tracker: tracker,
|
522
|
+
reflection_lm: nil,
|
523
|
+
telemetry: telemetry_context
|
2778
524
|
)
|
2779
|
-
.returns(T::Array[T.any(Float, ScoreWithFeedback)])
|
2780
|
-
end
|
2781
|
-
def evaluate_batch(batch, candidate_instruction, capture_traces: true)
|
2782
|
-
program = build_program(candidate_instruction)
|
2783
|
-
results = []
|
2784
|
-
|
2785
|
-
batch.each do |example|
|
2786
|
-
begin
|
2787
|
-
# Execute program on example
|
2788
|
-
prediction = if program.respond_to?(:call)
|
2789
|
-
program.call(**example.input_values)
|
2790
|
-
elsif program.respond_to?(:forward)
|
2791
|
-
program.forward(**example.input_values)
|
2792
|
-
else
|
2793
|
-
raise "Program must respond to :call or :forward"
|
2794
|
-
end
|
2795
|
-
|
2796
|
-
# Get collected traces (if trace collection is enabled)
|
2797
|
-
# Note: TraceCollector automatically collects via event subscriptions
|
2798
|
-
traces = capture_traces ? @trace_collector.traces : []
|
2799
|
-
|
2800
|
-
# Evaluate with metric
|
2801
|
-
# Try with traces first (for GEPAFeedbackMetric), fallback to standard metric
|
2802
|
-
begin
|
2803
|
-
# Check if metric can accept 3 parameters (example, prediction, traces)
|
2804
|
-
if @metric.respond_to?(:arity) && (@metric.arity == 3 || @metric.arity < 0)
|
2805
|
-
score_result = @metric.call(example, prediction, traces)
|
2806
|
-
else
|
2807
|
-
score_result = @metric.call(example, prediction)
|
2808
|
-
end
|
2809
|
-
rescue ArgumentError => arg_error
|
2810
|
-
# If 3-arg call fails, try 2-arg call
|
2811
|
-
if arg_error.message.include?('wrong number of arguments')
|
2812
|
-
score_result = @metric.call(example, prediction)
|
2813
|
-
else
|
2814
|
-
raise arg_error
|
2815
|
-
end
|
2816
|
-
end
|
2817
525
|
|
2818
|
-
|
2819
|
-
|
2820
|
-
|
2821
|
-
|
2822
|
-
|
2823
|
-
|
2824
|
-
|
2825
|
-
|
2826
|
-
|
2827
|
-
|
2828
|
-
|
2829
|
-
|
2830
|
-
|
2831
|
-
|
2832
|
-
|
2833
|
-
|
2834
|
-
end
|
526
|
+
evaluator = lambda do |dataset, candidate|
|
527
|
+
batch = adapter.evaluate(dataset, candidate, capture_traces: false)
|
528
|
+
[batch.outputs, batch.scores]
|
529
|
+
end
|
530
|
+
|
531
|
+
merge_proposer = nil
|
532
|
+
if @gepa_config[:use_merge]
|
533
|
+
merge_proposer = ::GEPA::Proposer::MergeProposer.new(
|
534
|
+
logger: logger,
|
535
|
+
valset: typed_valset,
|
536
|
+
evaluator: evaluator,
|
537
|
+
use_merge: true,
|
538
|
+
max_merge_invocations: @gepa_config[:max_merge_invocations],
|
539
|
+
rng: Random.new(0),
|
540
|
+
telemetry: telemetry_context
|
541
|
+
)
|
2835
542
|
end
|
2836
543
|
|
2837
|
-
|
2838
|
-
|
2839
|
-
|
2840
|
-
|
2841
|
-
|
2842
|
-
|
2843
|
-
|
2844
|
-
|
2845
|
-
|
2846
|
-
|
544
|
+
engine = ::GEPA::Core::Engine.new(
|
545
|
+
evaluator: evaluator,
|
546
|
+
valset: typed_valset,
|
547
|
+
seed_candidate: seed_candidate,
|
548
|
+
max_metric_calls: @gepa_config[:max_metric_calls],
|
549
|
+
perfect_score: @gepa_config[:perfect_score],
|
550
|
+
seed: 0,
|
551
|
+
reflective_proposer: reflective,
|
552
|
+
logger: logger,
|
553
|
+
experiment_tracker: tracker,
|
554
|
+
merge_proposer: merge_proposer,
|
555
|
+
run_dir: nil,
|
556
|
+
track_best_outputs: false,
|
557
|
+
display_progress_bar: false,
|
558
|
+
telemetry: telemetry_context,
|
559
|
+
raise_on_exception: true
|
2847
560
|
)
|
2848
|
-
.returns(T::Array[T::Hash[String, T.untyped]])
|
2849
|
-
end
|
2850
|
-
def make_reflective_dataset(examples, predictions, scores, threshold: 0.5)
|
2851
|
-
reflective_data = []
|
2852
|
-
|
2853
|
-
examples.zip(predictions, scores).each do |example, prediction, score|
|
2854
|
-
# Extract score value
|
2855
|
-
score_value = score.is_a?(ScoreWithFeedback) ? score.score : score
|
2856
|
-
|
2857
|
-
# Include failed predictions (below threshold)
|
2858
|
-
next if score_value >= threshold
|
2859
|
-
|
2860
|
-
# Extract feedback if available
|
2861
|
-
feedback = if score.is_a?(ScoreWithFeedback) && score.feedback
|
2862
|
-
score.feedback
|
2863
|
-
else
|
2864
|
-
"Low performance (score: #{score_value.round(2)})"
|
2865
|
-
end
|
2866
|
-
|
2867
|
-
reflective_data << {
|
2868
|
-
'input' => example.input_values,
|
2869
|
-
'expected' => example.expected_values,
|
2870
|
-
'prediction' => extract_prediction_values(prediction),
|
2871
|
-
'score' => score_value,
|
2872
|
-
'feedback' => feedback
|
2873
|
-
}
|
2874
|
-
end
|
2875
561
|
|
2876
|
-
|
2877
|
-
|
2878
|
-
|
2879
|
-
|
2880
|
-
|
2881
|
-
|
2882
|
-
|
2883
|
-
|
2884
|
-
|
562
|
+
state = engine.run
|
563
|
+
result = ::GEPA::Core::Result.from_state(state)
|
564
|
+
best_program = adapter.build_program(result.best_candidate)
|
565
|
+
|
566
|
+
OptimizationResult.new(
|
567
|
+
optimized_program: best_program,
|
568
|
+
scores: { best: result.val_aggregate_scores[result.best_idx] },
|
569
|
+
history: { total_candidates: result.num_candidates },
|
570
|
+
best_score_name: 'best',
|
571
|
+
best_score_value: result.val_aggregate_scores[result.best_idx],
|
572
|
+
metadata: { candidates: result.num_candidates }
|
2885
573
|
)
|
2886
|
-
.returns(T::Array[String])
|
2887
|
-
end
|
2888
|
-
def propose_new_texts(current_instruction, reflective_dataset, components_to_update = ['instruction'])
|
2889
|
-
if @custom_instruction_proposer
|
2890
|
-
# Use custom proposer if provided
|
2891
|
-
proposed = @custom_instruction_proposer.call(current_instruction, reflective_dataset)
|
2892
|
-
[proposed].compact
|
2893
|
-
else
|
2894
|
-
# Use built-in proposal logic
|
2895
|
-
analyze_failures_and_propose(current_instruction, reflective_dataset)
|
2896
|
-
end
|
2897
574
|
end
|
2898
575
|
|
2899
576
|
private
|
2900
577
|
|
2901
|
-
# Extract prediction values for reflective analysis
|
2902
|
-
sig { params(prediction: DSPy::Prediction).returns(T::Hash[String, T.untyped]) }
|
2903
|
-
def extract_prediction_values(prediction)
|
2904
|
-
# DSPy::Prediction implements to_h which returns the underlying struct's data
|
2905
|
-
prediction.to_h.transform_keys(&:to_s)
|
2906
|
-
end
|
2907
|
-
|
2908
|
-
# Analyze failures and propose improvements
|
2909
578
|
sig do
|
2910
579
|
params(
|
2911
|
-
|
2912
|
-
|
2913
|
-
|
2914
|
-
|
2915
|
-
|
2916
|
-
|
2917
|
-
|
2918
|
-
|
2919
|
-
# Extract common failure patterns
|
2920
|
-
feedback_texts = reflective_dataset.map { |data| data['feedback'] }.compact
|
2921
|
-
|
2922
|
-
# Simple heuristic-based proposals
|
2923
|
-
proposals = []
|
2924
|
-
|
2925
|
-
# If many failures, suggest more detailed instruction
|
2926
|
-
if reflective_dataset.size >= 3
|
2927
|
-
proposals << "#{current_instruction} Please provide step-by-step reasoning."
|
2928
|
-
end
|
2929
|
-
|
2930
|
-
# If feedback mentions specific issues, address them
|
2931
|
-
if feedback_texts.any? { |fb| fb.include?('unclear') || fb.include?('ambiguous') }
|
2932
|
-
proposals << "#{current_instruction} Be specific and clear in your response."
|
2933
|
-
end
|
2934
|
-
|
2935
|
-
if feedback_texts.any? { |fb| fb.include?('incomplete') || fb.include?('missing') }
|
2936
|
-
proposals << "#{current_instruction} Ensure your answer is complete and addresses all aspects."
|
2937
|
-
end
|
2938
|
-
|
2939
|
-
# Always include at least one proposal
|
2940
|
-
proposals << "#{current_instruction.strip}. Think carefully before responding." if proposals.empty?
|
2941
|
-
|
2942
|
-
proposals.uniq.take(3) # Return up to 3 proposals
|
580
|
+
program: DSPy::Module,
|
581
|
+
metric: T.proc.params(arg0: DSPy::Example, arg1: T.untyped).returns(T.untyped),
|
582
|
+
reflection_lm: T.nilable(T.untyped),
|
583
|
+
feedback_map: T::Hash[String, PredictAdapter::FeedbackFnType]
|
584
|
+
).returns(PredictAdapter)
|
585
|
+
end
|
586
|
+
def build_adapter(program, metric, reflection_lm: nil, feedback_map: {})
|
587
|
+
PredictAdapter.new(program, metric, reflection_lm: reflection_lm, feedback_map: feedback_map)
|
2943
588
|
end
|
2944
589
|
end
|
2945
590
|
end
|