dspy 0.29.1 → 0.30.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/LICENSE +45 -0
- data/README.md +159 -95
- data/lib/dspy/callbacks.rb +93 -19
- data/lib/dspy/context.rb +101 -5
- data/lib/dspy/errors.rb +19 -1
- data/lib/dspy/{datasets.rb → evals/version.rb} +2 -3
- data/lib/dspy/{evaluate.rb → evals.rb} +373 -110
- data/lib/dspy/mixins/instruction_updatable.rb +22 -0
- data/lib/dspy/module.rb +213 -17
- data/lib/dspy/observability.rb +40 -182
- data/lib/dspy/predict.rb +10 -2
- data/lib/dspy/propose/dataset_summary_generator.rb +28 -18
- data/lib/dspy/re_act.rb +21 -0
- data/lib/dspy/schema/sorbet_json_schema.rb +302 -0
- data/lib/dspy/schema/version.rb +7 -0
- data/lib/dspy/schema.rb +4 -0
- data/lib/dspy/structured_outputs_prompt.rb +48 -0
- data/lib/dspy/support/warning_filters.rb +27 -0
- data/lib/dspy/teleprompt/gepa.rb +9 -588
- data/lib/dspy/teleprompt/instruction_updates.rb +94 -0
- data/lib/dspy/teleprompt/teleprompter.rb +6 -6
- data/lib/dspy/teleprompt/utils.rb +5 -65
- data/lib/dspy/type_system/sorbet_json_schema.rb +2 -299
- data/lib/dspy/version.rb +1 -1
- data/lib/dspy.rb +39 -7
- metadata +18 -61
- data/lib/dspy/code_act.rb +0 -477
- data/lib/dspy/datasets/ade.rb +0 -90
- data/lib/dspy/observability/async_span_processor.rb +0 -250
- data/lib/dspy/observability/observation_type.rb +0 -65
- data/lib/dspy/optimizers/gaussian_process.rb +0 -141
- data/lib/dspy/teleprompt/mipro_v2.rb +0 -1672
- data/lib/gepa/api.rb +0 -61
- data/lib/gepa/core/engine.rb +0 -226
- data/lib/gepa/core/evaluation_batch.rb +0 -26
- data/lib/gepa/core/result.rb +0 -92
- data/lib/gepa/core/state.rb +0 -231
- data/lib/gepa/logging/experiment_tracker.rb +0 -54
- data/lib/gepa/logging/logger.rb +0 -57
- data/lib/gepa/logging.rb +0 -9
- data/lib/gepa/proposer/base.rb +0 -27
- data/lib/gepa/proposer/merge_proposer.rb +0 -424
- data/lib/gepa/proposer/reflective_mutation/base.rb +0 -48
- data/lib/gepa/proposer/reflective_mutation/reflective_mutation.rb +0 -188
- data/lib/gepa/strategies/batch_sampler.rb +0 -91
- data/lib/gepa/strategies/candidate_selector.rb +0 -97
- data/lib/gepa/strategies/component_selector.rb +0 -57
- data/lib/gepa/strategies/instruction_proposal.rb +0 -120
- data/lib/gepa/telemetry.rb +0 -122
- data/lib/gepa/utils/pareto.rb +0 -119
- data/lib/gepa.rb +0 -21
|
@@ -1,12 +1,16 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
+
require 'json'
|
|
4
|
+
require 'polars'
|
|
5
|
+
require 'concurrent'
|
|
3
6
|
require 'sorbet-runtime'
|
|
4
7
|
require_relative 'example'
|
|
8
|
+
require_relative 'callbacks'
|
|
5
9
|
|
|
6
10
|
module DSPy
|
|
7
11
|
# Core evaluation framework for DSPy programs
|
|
8
12
|
# Supports single evaluations, batch evaluations, and optimization workflows
|
|
9
|
-
class
|
|
13
|
+
class Evals
|
|
10
14
|
extend T::Sig
|
|
11
15
|
|
|
12
16
|
# Result of evaluating a single example
|
|
@@ -76,6 +80,9 @@ module DSPy
|
|
|
76
80
|
sig { returns(Float) }
|
|
77
81
|
attr_reader :pass_rate
|
|
78
82
|
|
|
83
|
+
sig { returns(Float) }
|
|
84
|
+
attr_reader :score
|
|
85
|
+
|
|
79
86
|
sig do
|
|
80
87
|
params(
|
|
81
88
|
results: T::Array[EvaluationResult],
|
|
@@ -88,6 +95,8 @@ module DSPy
|
|
|
88
95
|
@total_examples = results.length
|
|
89
96
|
@passed_examples = results.count(&:passed)
|
|
90
97
|
@pass_rate = @total_examples > 0 ? @passed_examples.to_f / @total_examples : 0.0
|
|
98
|
+
score_avg = aggregated_metrics[:score_avg] || @pass_rate
|
|
99
|
+
@score = (score_avg * 100).round(2)
|
|
91
100
|
end
|
|
92
101
|
|
|
93
102
|
sig { returns(T::Hash[Symbol, T.untyped]) }
|
|
@@ -96,10 +105,47 @@ module DSPy
|
|
|
96
105
|
total_examples: @total_examples,
|
|
97
106
|
passed_examples: @passed_examples,
|
|
98
107
|
pass_rate: @pass_rate,
|
|
108
|
+
score: @score,
|
|
99
109
|
aggregated_metrics: @aggregated_metrics,
|
|
100
110
|
results: @results.map(&:to_h)
|
|
101
111
|
}
|
|
102
112
|
end
|
|
113
|
+
|
|
114
|
+
sig { returns(Polars::DataFrame) }
|
|
115
|
+
def to_polars
|
|
116
|
+
rows = @results.each_with_index.map do |result, index|
|
|
117
|
+
{
|
|
118
|
+
"index" => index,
|
|
119
|
+
"passed" => result.passed,
|
|
120
|
+
"score" => result.metrics[:score],
|
|
121
|
+
"example" => serialize_for_polars(result.example),
|
|
122
|
+
"prediction" => serialize_for_polars(result.prediction),
|
|
123
|
+
"metrics" => serialize_for_polars(result.metrics),
|
|
124
|
+
"trace" => serialize_for_polars(result.trace)
|
|
125
|
+
}
|
|
126
|
+
end
|
|
127
|
+
|
|
128
|
+
Polars::DataFrame.new(rows)
|
|
129
|
+
end
|
|
130
|
+
|
|
131
|
+
private
|
|
132
|
+
|
|
133
|
+
def serialize_for_polars(value)
|
|
134
|
+
case value
|
|
135
|
+
when NilClass, TrueClass, FalseClass, Numeric, String
|
|
136
|
+
value
|
|
137
|
+
when Hash
|
|
138
|
+
JSON.generate(value)
|
|
139
|
+
when Array
|
|
140
|
+
JSON.generate(value)
|
|
141
|
+
else
|
|
142
|
+
if value.respond_to?(:to_h)
|
|
143
|
+
JSON.generate(value.to_h)
|
|
144
|
+
else
|
|
145
|
+
value.to_s
|
|
146
|
+
end
|
|
147
|
+
end
|
|
148
|
+
end
|
|
103
149
|
end
|
|
104
150
|
|
|
105
151
|
sig { returns(T.untyped) }
|
|
@@ -117,26 +163,70 @@ module DSPy
|
|
|
117
163
|
sig { returns(T::Boolean) }
|
|
118
164
|
attr_reader :provide_traceback
|
|
119
165
|
|
|
166
|
+
sig { returns(Float) }
|
|
167
|
+
attr_reader :failure_score
|
|
168
|
+
|
|
169
|
+
sig { returns(T.nilable(EvaluationResult)) }
|
|
170
|
+
attr_reader :last_example_result
|
|
171
|
+
|
|
172
|
+
sig { returns(T.nilable(BatchEvaluationResult)) }
|
|
173
|
+
attr_reader :last_batch_result
|
|
174
|
+
|
|
175
|
+
include DSPy::Callbacks
|
|
176
|
+
|
|
177
|
+
create_before_callback :call, wrap: false
|
|
178
|
+
create_after_callback :call, wrap: false
|
|
179
|
+
create_before_callback :evaluate, wrap: false
|
|
180
|
+
create_after_callback :evaluate, wrap: false
|
|
181
|
+
|
|
182
|
+
class << self
|
|
183
|
+
def before_example(callback = nil, &block)
|
|
184
|
+
before(callback, target: :call, &block)
|
|
185
|
+
end
|
|
186
|
+
|
|
187
|
+
def after_example(callback = nil, &block)
|
|
188
|
+
after(callback, target: :call, &block)
|
|
189
|
+
end
|
|
190
|
+
|
|
191
|
+
def before_batch(callback = nil, &block)
|
|
192
|
+
before(callback, target: :evaluate, &block)
|
|
193
|
+
end
|
|
194
|
+
|
|
195
|
+
def after_batch(callback = nil, &block)
|
|
196
|
+
after(callback, target: :evaluate, &block)
|
|
197
|
+
end
|
|
198
|
+
|
|
199
|
+
def reset_callbacks!
|
|
200
|
+
@callbacks = {}
|
|
201
|
+
end
|
|
202
|
+
end
|
|
203
|
+
|
|
120
204
|
sig do
|
|
121
205
|
params(
|
|
122
206
|
program: T.untyped,
|
|
123
207
|
metric: T.nilable(T.proc.params(arg0: T.untyped, arg1: T.untyped).returns(T::Boolean)),
|
|
124
208
|
num_threads: T.nilable(Integer),
|
|
125
209
|
max_errors: T.nilable(Integer),
|
|
210
|
+
failure_score: T.nilable(Numeric),
|
|
126
211
|
provide_traceback: T::Boolean
|
|
127
212
|
).void
|
|
128
213
|
end
|
|
129
|
-
def initialize(program, metric: nil, num_threads: 1, max_errors: 5, provide_traceback: true)
|
|
214
|
+
def initialize(program, metric: nil, num_threads: 1, max_errors: 5, failure_score: 0.0, provide_traceback: true)
|
|
130
215
|
@program = program
|
|
131
216
|
@metric = metric
|
|
132
217
|
@num_threads = num_threads || 1
|
|
133
218
|
@max_errors = max_errors || 5
|
|
134
219
|
@provide_traceback = provide_traceback
|
|
220
|
+
@failure_score = failure_score ? failure_score.to_f : 0.0
|
|
221
|
+
@last_example_result = nil
|
|
222
|
+
@last_batch_result = nil
|
|
135
223
|
end
|
|
136
224
|
|
|
137
225
|
# Evaluate program on a single example
|
|
138
226
|
sig { params(example: T.untyped, trace: T.nilable(T.untyped)).returns(EvaluationResult) }
|
|
139
227
|
def call(example, trace: nil)
|
|
228
|
+
run_callbacks(:before, :call, example: example)
|
|
229
|
+
|
|
140
230
|
DSPy::Context.with_span(
|
|
141
231
|
operation: 'evaluation.example',
|
|
142
232
|
'dspy.module' => 'Evaluator',
|
|
@@ -144,59 +234,15 @@ module DSPy
|
|
|
144
234
|
'evaluation.has_metric' => !@metric.nil?
|
|
145
235
|
) do
|
|
146
236
|
begin
|
|
147
|
-
|
|
148
|
-
input_values = extract_input_values(example)
|
|
149
|
-
|
|
150
|
-
# Run prediction
|
|
151
|
-
prediction = @program.call(**input_values)
|
|
152
|
-
|
|
153
|
-
# Calculate metrics if provided
|
|
154
|
-
metrics = {}
|
|
155
|
-
passed = true
|
|
156
|
-
|
|
157
|
-
if @metric
|
|
158
|
-
begin
|
|
159
|
-
metric_result = @metric.call(example, prediction)
|
|
160
|
-
if metric_result.is_a?(Hash)
|
|
161
|
-
metrics = metric_result
|
|
162
|
-
passed = metrics[:passed] || metrics['passed'] || true
|
|
163
|
-
else
|
|
164
|
-
passed = !!metric_result
|
|
165
|
-
metrics[:passed] = passed
|
|
166
|
-
end
|
|
167
|
-
rescue => e
|
|
168
|
-
passed = false
|
|
169
|
-
metrics[:error] = e.message
|
|
170
|
-
metrics[:passed] = false
|
|
171
|
-
end
|
|
172
|
-
end
|
|
173
|
-
|
|
174
|
-
EvaluationResult.new(
|
|
175
|
-
example: example,
|
|
176
|
-
prediction: prediction,
|
|
177
|
-
trace: trace,
|
|
178
|
-
metrics: metrics,
|
|
179
|
-
passed: passed
|
|
180
|
-
)
|
|
237
|
+
perform_call(example, trace: trace)
|
|
181
238
|
rescue => e
|
|
182
|
-
|
|
183
|
-
error_metrics = {
|
|
184
|
-
error: e.message,
|
|
185
|
-
passed: false
|
|
186
|
-
}
|
|
187
|
-
|
|
188
|
-
if @provide_traceback
|
|
189
|
-
error_metrics[:traceback] = e.backtrace&.first(10) || []
|
|
190
|
-
end
|
|
191
|
-
|
|
192
|
-
EvaluationResult.new(
|
|
193
|
-
example: example,
|
|
194
|
-
prediction: nil,
|
|
195
|
-
trace: trace,
|
|
196
|
-
metrics: error_metrics,
|
|
197
|
-
passed: false
|
|
198
|
-
)
|
|
239
|
+
build_error_result(example, e, trace: trace)
|
|
199
240
|
end
|
|
241
|
+
end.then do |result|
|
|
242
|
+
@last_example_result = result
|
|
243
|
+
emit_example_observation(example, result)
|
|
244
|
+
run_callbacks(:after, :call, example: example, result: result)
|
|
245
|
+
result
|
|
200
246
|
end
|
|
201
247
|
end
|
|
202
248
|
|
|
@@ -210,6 +256,8 @@ module DSPy
|
|
|
210
256
|
).returns(BatchEvaluationResult)
|
|
211
257
|
end
|
|
212
258
|
def evaluate(devset, display_progress: true, display_table: false, return_outputs: true)
|
|
259
|
+
run_callbacks(:before, :evaluate, devset: devset)
|
|
260
|
+
|
|
213
261
|
DSPy::Context.with_span(
|
|
214
262
|
operation: 'evaluation.batch',
|
|
215
263
|
'dspy.module' => 'Evaluator',
|
|
@@ -218,56 +266,28 @@ module DSPy
|
|
|
218
266
|
'evaluation.has_metric' => !@metric.nil?,
|
|
219
267
|
'evaluation.num_threads' => @num_threads
|
|
220
268
|
) do
|
|
221
|
-
results = []
|
|
222
|
-
errors = 0
|
|
223
|
-
|
|
224
269
|
if display_progress
|
|
225
270
|
puts "Evaluating #{devset.length} examples..."
|
|
226
271
|
end
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
result = call(example)
|
|
233
|
-
results << result
|
|
234
|
-
|
|
235
|
-
unless result.passed
|
|
236
|
-
errors += 1
|
|
237
|
-
end
|
|
238
|
-
|
|
239
|
-
if display_progress && (index + 1) % 10 == 0
|
|
240
|
-
puts "Processed #{index + 1}/#{devset.length} examples (#{results.count(&:passed)} passed)"
|
|
241
|
-
end
|
|
242
|
-
|
|
243
|
-
rescue => e
|
|
244
|
-
errors += 1
|
|
245
|
-
puts "Error processing example #{index}: #{e.message}" if display_progress
|
|
246
|
-
|
|
247
|
-
# Create error result
|
|
248
|
-
error_result = EvaluationResult.new(
|
|
249
|
-
example: example,
|
|
250
|
-
prediction: nil,
|
|
251
|
-
trace: nil,
|
|
252
|
-
metrics: { error: e.message, passed: false },
|
|
253
|
-
passed: false
|
|
254
|
-
)
|
|
255
|
-
results << error_result
|
|
256
|
-
end
|
|
272
|
+
|
|
273
|
+
results = if parallel_execution?
|
|
274
|
+
evaluate_in_parallel(devset, display_progress: display_progress)
|
|
275
|
+
else
|
|
276
|
+
evaluate_sequential(devset, display_progress: display_progress)
|
|
257
277
|
end
|
|
258
|
-
|
|
278
|
+
|
|
259
279
|
# Aggregate metrics
|
|
260
280
|
aggregated_metrics = aggregate_metrics(results)
|
|
261
|
-
|
|
281
|
+
|
|
262
282
|
batch_result = BatchEvaluationResult.new(
|
|
263
283
|
results: results,
|
|
264
284
|
aggregated_metrics: aggregated_metrics
|
|
265
285
|
)
|
|
266
|
-
|
|
286
|
+
|
|
267
287
|
if display_table
|
|
268
288
|
display_results_table(batch_result)
|
|
269
289
|
end
|
|
270
|
-
|
|
290
|
+
|
|
271
291
|
# Emit batch completion event
|
|
272
292
|
DSPy.log('evaluation.batch_complete', **{
|
|
273
293
|
'evaluation.program_class' => @program.class.name,
|
|
@@ -276,17 +296,192 @@ module DSPy
|
|
|
276
296
|
'evaluation.pass_rate' => batch_result.pass_rate,
|
|
277
297
|
'evaluation.aggregated_metrics' => aggregated_metrics
|
|
278
298
|
})
|
|
279
|
-
|
|
299
|
+
|
|
280
300
|
if display_progress
|
|
281
301
|
puts "Evaluation complete: #{batch_result.passed_examples}/#{batch_result.total_examples} passed (#{(batch_result.pass_rate * 100).round(1)}%)"
|
|
282
302
|
end
|
|
283
|
-
|
|
303
|
+
|
|
304
|
+
batch_result
|
|
305
|
+
end.then do |batch_result|
|
|
306
|
+
@last_batch_result = batch_result
|
|
307
|
+
emit_batch_observation(devset, batch_result)
|
|
308
|
+
run_callbacks(:after, :evaluate, devset: devset, result: batch_result)
|
|
284
309
|
batch_result
|
|
285
310
|
end
|
|
286
311
|
end
|
|
287
312
|
|
|
288
313
|
private
|
|
289
314
|
|
|
315
|
+
def parallel_execution?
|
|
316
|
+
(@num_threads || 1) > 1
|
|
317
|
+
end
|
|
318
|
+
|
|
319
|
+
def evaluate_sequential(devset, display_progress:)
|
|
320
|
+
results = []
|
|
321
|
+
errors = 0
|
|
322
|
+
passed_count = 0
|
|
323
|
+
|
|
324
|
+
devset.each_with_index do |example, index|
|
|
325
|
+
break if errors >= @max_errors
|
|
326
|
+
|
|
327
|
+
result = safe_call(example)
|
|
328
|
+
results << result
|
|
329
|
+
|
|
330
|
+
if result.passed
|
|
331
|
+
passed_count += 1
|
|
332
|
+
else
|
|
333
|
+
errors += 1
|
|
334
|
+
end
|
|
335
|
+
|
|
336
|
+
if display_progress && (index + 1) % 10 == 0
|
|
337
|
+
log_progress(index + 1, devset.length, passed_count)
|
|
338
|
+
end
|
|
339
|
+
end
|
|
340
|
+
|
|
341
|
+
results
|
|
342
|
+
end
|
|
343
|
+
|
|
344
|
+
def evaluate_in_parallel(devset, display_progress:)
|
|
345
|
+
total = devset.length
|
|
346
|
+
results = Array.new(total)
|
|
347
|
+
errors = 0
|
|
348
|
+
processed = 0
|
|
349
|
+
passed_count = 0
|
|
350
|
+
|
|
351
|
+
executor = Concurrent::ThreadPoolExecutor.new(
|
|
352
|
+
min_threads: @num_threads,
|
|
353
|
+
max_threads: @num_threads,
|
|
354
|
+
max_queue: [total, 1].max,
|
|
355
|
+
idletime: 60
|
|
356
|
+
)
|
|
357
|
+
|
|
358
|
+
enumerator = devset.each_with_index
|
|
359
|
+
|
|
360
|
+
loop do
|
|
361
|
+
break if errors >= @max_errors
|
|
362
|
+
|
|
363
|
+
batch = []
|
|
364
|
+
@num_threads.times do
|
|
365
|
+
begin
|
|
366
|
+
example = enumerator.next
|
|
367
|
+
batch << { example: example[0], index: example[1] }
|
|
368
|
+
rescue StopIteration
|
|
369
|
+
break
|
|
370
|
+
end
|
|
371
|
+
end
|
|
372
|
+
|
|
373
|
+
break if batch.empty?
|
|
374
|
+
|
|
375
|
+
futures = batch.map do |item|
|
|
376
|
+
Concurrent::Promises.future_on(executor) do
|
|
377
|
+
[:ok, item[:index], safe_call(item[:example])]
|
|
378
|
+
rescue => e
|
|
379
|
+
[:error, item[:index], e]
|
|
380
|
+
end
|
|
381
|
+
end
|
|
382
|
+
|
|
383
|
+
futures.each do |future|
|
|
384
|
+
status, index, payload = future.value!
|
|
385
|
+
example = batch.find { |entry| entry[:index] == index }[:example]
|
|
386
|
+
|
|
387
|
+
result = if status == :ok
|
|
388
|
+
payload
|
|
389
|
+
else
|
|
390
|
+
errors += 1
|
|
391
|
+
puts "Error processing example #{index}: #{payload.message}" if display_progress
|
|
392
|
+
build_error_result(example, payload)
|
|
393
|
+
end
|
|
394
|
+
|
|
395
|
+
results[index] = result
|
|
396
|
+
processed += 1
|
|
397
|
+
if result.passed
|
|
398
|
+
passed_count += 1
|
|
399
|
+
else
|
|
400
|
+
errors += 1 unless status == :error
|
|
401
|
+
end
|
|
402
|
+
|
|
403
|
+
if display_progress && (processed % 10).zero?
|
|
404
|
+
log_progress(processed, total, passed_count)
|
|
405
|
+
end
|
|
406
|
+
end
|
|
407
|
+
end
|
|
408
|
+
|
|
409
|
+
executor.shutdown
|
|
410
|
+
executor.wait_for_termination
|
|
411
|
+
|
|
412
|
+
results.compact
|
|
413
|
+
end
|
|
414
|
+
|
|
415
|
+
def safe_call(example)
|
|
416
|
+
call(example)
|
|
417
|
+
rescue => e
|
|
418
|
+
build_error_result(example, e)
|
|
419
|
+
end
|
|
420
|
+
|
|
421
|
+
def perform_call(example, trace:)
|
|
422
|
+
# Extract input from example - support both hash and object formats
|
|
423
|
+
input_values = extract_input_values(example)
|
|
424
|
+
|
|
425
|
+
# Run prediction
|
|
426
|
+
prediction = @program.call(**input_values)
|
|
427
|
+
|
|
428
|
+
# Calculate metrics if provided
|
|
429
|
+
metrics = {}
|
|
430
|
+
passed = true
|
|
431
|
+
|
|
432
|
+
if @metric
|
|
433
|
+
begin
|
|
434
|
+
metric_result = @metric.call(example, prediction)
|
|
435
|
+
if metric_result.is_a?(Hash)
|
|
436
|
+
metrics = symbolize_keys(metric_result)
|
|
437
|
+
passed_flag = metrics.key?(:passed) ? metrics[:passed] : metrics['passed']
|
|
438
|
+
passed = passed_flag.nil? ? true : !!passed_flag
|
|
439
|
+
else
|
|
440
|
+
passed = !!metric_result
|
|
441
|
+
metrics[:passed] = passed
|
|
442
|
+
end
|
|
443
|
+
rescue => e
|
|
444
|
+
passed = false
|
|
445
|
+
metrics[:error] = e.message
|
|
446
|
+
metrics[:passed] = false
|
|
447
|
+
metrics[:score] = @failure_score
|
|
448
|
+
end
|
|
449
|
+
end
|
|
450
|
+
|
|
451
|
+
metrics[:passed] = passed unless metrics.key?(:passed)
|
|
452
|
+
metrics[:score] = normalize_score(metrics[:score], passed) if metrics.key?(:score)
|
|
453
|
+
metrics[:score] ||= passed ? 1.0 : 0.0
|
|
454
|
+
|
|
455
|
+
EvaluationResult.new(
|
|
456
|
+
example: example,
|
|
457
|
+
prediction: prediction,
|
|
458
|
+
trace: trace,
|
|
459
|
+
metrics: metrics,
|
|
460
|
+
passed: passed
|
|
461
|
+
)
|
|
462
|
+
end
|
|
463
|
+
|
|
464
|
+
def build_error_result(example, error, trace: nil)
|
|
465
|
+
metrics = {
|
|
466
|
+
error: error.message,
|
|
467
|
+
passed: false,
|
|
468
|
+
score: @failure_score
|
|
469
|
+
}
|
|
470
|
+
metrics[:traceback] = error.backtrace&.first(10) || [] if @provide_traceback
|
|
471
|
+
|
|
472
|
+
EvaluationResult.new(
|
|
473
|
+
example: example,
|
|
474
|
+
prediction: nil,
|
|
475
|
+
trace: trace,
|
|
476
|
+
metrics: metrics,
|
|
477
|
+
passed: false
|
|
478
|
+
)
|
|
479
|
+
end
|
|
480
|
+
|
|
481
|
+
def log_progress(processed, total, passed_count)
|
|
482
|
+
puts "Processed #{processed}/#{total} examples (#{passed_count} passed)"
|
|
483
|
+
end
|
|
484
|
+
|
|
290
485
|
# Extract input values from example in various formats
|
|
291
486
|
sig { params(example: T.untyped).returns(T::Hash[Symbol, T.untyped]) }
|
|
292
487
|
def extract_input_values(example)
|
|
@@ -376,36 +571,49 @@ module DSPy
|
|
|
376
571
|
def aggregate_metrics(results)
|
|
377
572
|
return {} if results.empty?
|
|
378
573
|
|
|
379
|
-
|
|
574
|
+
total = results.length
|
|
575
|
+
passed = results.count(&:passed)
|
|
576
|
+
|
|
380
577
|
aggregated = {
|
|
381
|
-
total_examples:
|
|
382
|
-
passed_examples:
|
|
578
|
+
total_examples: total,
|
|
579
|
+
passed_examples: passed,
|
|
383
580
|
failed_examples: results.count { |r| !r.passed }
|
|
384
581
|
}
|
|
385
|
-
|
|
386
|
-
|
|
582
|
+
|
|
583
|
+
score_values = results.filter_map do |result|
|
|
584
|
+
score = result.metrics[:score]
|
|
585
|
+
score if score.is_a?(Numeric)
|
|
586
|
+
end
|
|
587
|
+
|
|
588
|
+
if score_values.any?
|
|
589
|
+
aggregated[:score_sum] = score_values.sum
|
|
590
|
+
aggregated[:score_avg] = score_values.sum.to_f / score_values.length
|
|
591
|
+
aggregated[:score_min] = score_values.min
|
|
592
|
+
aggregated[:score_max] = score_values.max
|
|
593
|
+
else
|
|
594
|
+
aggregated[:score_avg] = passed.positive? && total.positive? ? passed.to_f / total : 0.0
|
|
595
|
+
end
|
|
596
|
+
|
|
597
|
+
# Aggregate other numeric metrics
|
|
387
598
|
numeric_metrics = {}
|
|
388
599
|
results.each do |result|
|
|
389
600
|
result.metrics.each do |key, value|
|
|
390
|
-
next if [:error, :traceback, :passed].include?(key)
|
|
601
|
+
next if [:error, :traceback, :passed, :score].include?(key)
|
|
391
602
|
next unless value.is_a?(Numeric)
|
|
392
|
-
|
|
603
|
+
|
|
393
604
|
numeric_metrics[key] ||= []
|
|
394
605
|
numeric_metrics[key] << value
|
|
395
606
|
end
|
|
396
607
|
end
|
|
397
|
-
|
|
398
|
-
# Calculate averages for numeric metrics
|
|
608
|
+
|
|
399
609
|
numeric_metrics.each do |key, values|
|
|
400
610
|
aggregated[:"#{key}_avg"] = values.sum.to_f / values.length
|
|
401
611
|
aggregated[:"#{key}_min"] = values.min
|
|
402
612
|
aggregated[:"#{key}_max"] = values.max
|
|
403
613
|
end
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
aggregated[:passed_examples].to_f / aggregated[:total_examples] : 0.0
|
|
408
|
-
|
|
614
|
+
|
|
615
|
+
aggregated[:pass_rate] = total.positive? ? passed.to_f / total : 0.0
|
|
616
|
+
|
|
409
617
|
aggregated
|
|
410
618
|
end
|
|
411
619
|
|
|
@@ -429,6 +637,61 @@ module DSPy
|
|
|
429
637
|
|
|
430
638
|
puts "=" * 50
|
|
431
639
|
end
|
|
640
|
+
|
|
641
|
+
def emit_example_observation(example, result)
|
|
642
|
+
DSPy.event('evals.example.complete', {
|
|
643
|
+
program: @program.class.name,
|
|
644
|
+
example_id: extract_example_id(example),
|
|
645
|
+
passed: result.passed,
|
|
646
|
+
score: result.metrics[:score],
|
|
647
|
+
error: result.metrics[:error]
|
|
648
|
+
})
|
|
649
|
+
rescue => e
|
|
650
|
+
DSPy.log('evals.example.observation_error', error: e.message)
|
|
651
|
+
end
|
|
652
|
+
|
|
653
|
+
def emit_batch_observation(devset, batch_result)
|
|
654
|
+
DSPy.event('evals.batch.complete', {
|
|
655
|
+
program: @program.class.name,
|
|
656
|
+
dataset_size: devset.length,
|
|
657
|
+
total_examples: batch_result.total_examples,
|
|
658
|
+
passed_examples: batch_result.passed_examples,
|
|
659
|
+
pass_rate: batch_result.pass_rate,
|
|
660
|
+
score: batch_result.score
|
|
661
|
+
})
|
|
662
|
+
rescue => e
|
|
663
|
+
DSPy.log('evals.batch.observation_error', error: e.message)
|
|
664
|
+
end
|
|
665
|
+
|
|
666
|
+
def extract_example_id(example)
|
|
667
|
+
if example.respond_to?(:id)
|
|
668
|
+
example.id
|
|
669
|
+
elsif example.is_a?(Hash)
|
|
670
|
+
example[:id] || example['id']
|
|
671
|
+
else
|
|
672
|
+
nil
|
|
673
|
+
end
|
|
674
|
+
rescue
|
|
675
|
+
nil
|
|
676
|
+
end
|
|
677
|
+
|
|
678
|
+
def symbolize_keys(hash)
|
|
679
|
+
hash.each_with_object({}) do |(key, value), memo|
|
|
680
|
+
memo[key.respond_to?(:to_sym) ? key.to_sym : key] = value
|
|
681
|
+
end
|
|
682
|
+
end
|
|
683
|
+
|
|
684
|
+
def normalize_score(value, passed)
|
|
685
|
+
case value
|
|
686
|
+
when Numeric
|
|
687
|
+
value.to_f
|
|
688
|
+
when TrueClass, FalseClass
|
|
689
|
+
value ? 1.0 : 0.0
|
|
690
|
+
else
|
|
691
|
+
passed ? 1.0 : 0.0
|
|
692
|
+
end
|
|
693
|
+
end
|
|
694
|
+
|
|
432
695
|
end
|
|
433
696
|
|
|
434
697
|
# Common metric functions for evaluation
|
|
@@ -447,7 +710,7 @@ module DSPy
|
|
|
447
710
|
expected = extract_field(example, field)
|
|
448
711
|
actual = extract_field(prediction, field)
|
|
449
712
|
|
|
450
|
-
|
|
713
|
+
next false if expected.nil? || actual.nil?
|
|
451
714
|
|
|
452
715
|
if case_sensitive
|
|
453
716
|
expected.to_s == actual.to_s
|
|
@@ -469,7 +732,7 @@ module DSPy
|
|
|
469
732
|
expected = extract_field(example, field)
|
|
470
733
|
actual = extract_field(prediction, field)
|
|
471
734
|
|
|
472
|
-
|
|
735
|
+
next false if expected.nil? || actual.nil?
|
|
473
736
|
|
|
474
737
|
if case_sensitive
|
|
475
738
|
actual.to_s.include?(expected.to_s)
|
|
@@ -491,7 +754,7 @@ module DSPy
|
|
|
491
754
|
expected = extract_field(example, field)
|
|
492
755
|
actual = extract_field(prediction, field)
|
|
493
756
|
|
|
494
|
-
|
|
757
|
+
next { passed: false, error: "Missing values" } if expected.nil? || actual.nil?
|
|
495
758
|
|
|
496
759
|
begin
|
|
497
760
|
expected_num = Float(expected)
|
|
@@ -554,4 +817,4 @@ module DSPy
|
|
|
554
817
|
end
|
|
555
818
|
end
|
|
556
819
|
end
|
|
557
|
-
end
|
|
820
|
+
end
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'sorbet-runtime'
|
|
4
|
+
require_relative '../errors'
|
|
5
|
+
|
|
6
|
+
module DSPy
|
|
7
|
+
module Mixins
|
|
8
|
+
module InstructionUpdatable
|
|
9
|
+
extend T::Sig
|
|
10
|
+
|
|
11
|
+
sig { params(new_instruction: String).returns(T.untyped) }
|
|
12
|
+
def with_instruction(new_instruction)
|
|
13
|
+
raise DSPy::InstructionUpdateError.missing_instruction_capability(self.class)
|
|
14
|
+
end
|
|
15
|
+
|
|
16
|
+
sig { params(few_shot_examples: T::Array[T.untyped]).returns(T.untyped) }
|
|
17
|
+
def with_examples(few_shot_examples)
|
|
18
|
+
raise DSPy::InstructionUpdateError.missing_examples_capability(self.class)
|
|
19
|
+
end
|
|
20
|
+
end
|
|
21
|
+
end
|
|
22
|
+
end
|