braintrust 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +148 -24
- data/lib/braintrust/api/internal/btql.rb +124 -0
- data/lib/braintrust/api/internal/experiments.rb +19 -0
- data/lib/braintrust/api/internal/projects.rb +19 -0
- data/lib/braintrust/contrib/rails/server/application_controller.rb +34 -0
- data/lib/braintrust/contrib/rails/server/engine.rb +72 -0
- data/lib/braintrust/contrib/rails/server/eval_controller.rb +36 -0
- data/lib/braintrust/contrib/rails/server/generator.rb +43 -0
- data/lib/braintrust/contrib/rails/server/health_controller.rb +15 -0
- data/lib/braintrust/contrib/rails/server/list_controller.rb +16 -0
- data/lib/braintrust/contrib/rails/server/routes.rb +8 -0
- data/lib/braintrust/contrib/rails/server.rb +20 -0
- data/lib/braintrust/dataset.rb +6 -3
- data/lib/braintrust/eval/context.rb +131 -0
- data/lib/braintrust/eval/evaluator.rb +11 -5
- data/lib/braintrust/eval/functions.rb +10 -166
- data/lib/braintrust/eval/runner.rb +165 -145
- data/lib/braintrust/eval/scorer.rb +24 -96
- data/lib/braintrust/eval/trace.rb +129 -0
- data/lib/braintrust/eval.rb +60 -132
- data/lib/braintrust/functions.rb +168 -0
- data/lib/braintrust/internal/callable.rb +83 -0
- data/lib/braintrust/logger.rb +9 -0
- data/lib/braintrust/scorer.rb +173 -0
- data/lib/braintrust/server/handlers/eval.rb +8 -168
- data/lib/braintrust/server/handlers/list.rb +3 -41
- data/lib/braintrust/server/rack.rb +2 -0
- data/lib/braintrust/server/services/eval_service.rb +214 -0
- data/lib/braintrust/server/services/list_service.rb +64 -0
- data/lib/braintrust/task.rb +108 -0
- data/lib/braintrust/trace/span_processor.rb +0 -5
- data/lib/braintrust/version.rb +1 -1
- metadata +18 -1
|
@@ -1,59 +1,52 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
require_relative "case"
|
|
4
|
-
require_relative "cases"
|
|
5
|
-
require_relative "scorer"
|
|
6
4
|
require_relative "result"
|
|
7
5
|
require_relative "summary"
|
|
6
|
+
require_relative "trace"
|
|
8
7
|
require_relative "../internal/thread_pool"
|
|
8
|
+
require_relative "../api/internal/btql"
|
|
9
9
|
|
|
10
10
|
require "opentelemetry/sdk"
|
|
11
11
|
require "json"
|
|
12
12
|
|
|
13
13
|
module Braintrust
|
|
14
14
|
module Eval
|
|
15
|
-
# Internal runner class that performs the execution of the Eval and returns the result
|
|
15
|
+
# Internal runner class that performs the execution of the Eval and returns the result.
|
|
16
|
+
# Receives a fully-normalized Context — all callables are already typed wrappers.
|
|
16
17
|
class Runner
|
|
17
18
|
# Maximum parallelism allowed (mirrors Internal::ThreadPool::MAX_PARALLELISM)
|
|
18
19
|
MAX_PARALLELISM = Internal::ThreadPool::MAX_PARALLELISM
|
|
19
20
|
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
@
|
|
26
|
-
|
|
27
|
-
@
|
|
28
|
-
@scorers = normalize_scorers(scorers)
|
|
29
|
-
@state = state
|
|
30
|
-
@tracer_provider = tracer_provider || OpenTelemetry.tracer_provider
|
|
31
|
-
@tracer = @tracer_provider.tracer("braintrust-eval")
|
|
32
|
-
@parent_attr = parent ? "#{parent[:object_type]}:#{parent[:object_id]}" : nil
|
|
33
|
-
@generation = parent&.dig(:generation)
|
|
34
|
-
@on_progress = on_progress
|
|
21
|
+
# Per-case mutable accumulator. Built from Case, populated by task and scoring stages.
|
|
22
|
+
CaseContext = Struct.new(:input, :expected, :output, :metadata, :tags, :trace, :origin, keyword_init: true)
|
|
23
|
+
|
|
24
|
+
# @param eval_context [Context] Normalized eval context
|
|
25
|
+
def initialize(eval_context)
|
|
26
|
+
@eval_context = eval_context
|
|
27
|
+
tracer_provider = eval_context.tracer_provider || OpenTelemetry.tracer_provider
|
|
28
|
+
@tracer = tracer_provider.tracer("braintrust-eval")
|
|
35
29
|
|
|
36
30
|
# Mutex for thread-safe score collection
|
|
37
31
|
@score_mutex = Mutex.new
|
|
38
32
|
end
|
|
39
33
|
|
|
40
34
|
# Run evaluation and return Result
|
|
41
|
-
# @param cases [Array, Enumerable] Test cases
|
|
42
35
|
# @param parallelism [Integer] Number of parallel workers (default: 1)
|
|
43
36
|
# @return [Result]
|
|
44
|
-
def run(
|
|
37
|
+
def run(parallelism: 1)
|
|
45
38
|
start_time = Time.now
|
|
46
|
-
|
|
39
|
+
eval_cases = eval_context.cases
|
|
47
40
|
errors = Queue.new
|
|
48
41
|
@scores = {} # Reset for each run: { scorer_name => Array<Numeric> }
|
|
49
42
|
|
|
50
43
|
if parallelism && parallelism > 1
|
|
51
|
-
Internal::ThreadPool.each(
|
|
52
|
-
|
|
44
|
+
Internal::ThreadPool.each(eval_cases, parallelism: parallelism) do |eval_case|
|
|
45
|
+
run_eval_case(build_case_context(eval_case), errors)
|
|
53
46
|
end
|
|
54
47
|
else
|
|
55
|
-
|
|
56
|
-
|
|
48
|
+
eval_cases.each do |eval_case|
|
|
49
|
+
run_eval_case(build_case_context(eval_case), errors)
|
|
57
50
|
end
|
|
58
51
|
end
|
|
59
52
|
|
|
@@ -64,15 +57,15 @@ module Braintrust
|
|
|
64
57
|
duration = Time.now - start_time
|
|
65
58
|
|
|
66
59
|
# Generate permalink (only when state and experiment are available)
|
|
67
|
-
permalink = if
|
|
68
|
-
|
|
60
|
+
permalink = if eval_context.state && eval_context.experiment_id
|
|
61
|
+
eval_context.state.object_permalink(object_type: "experiment", object_id: eval_context.experiment_id)
|
|
69
62
|
end
|
|
70
63
|
|
|
71
64
|
Result.new(
|
|
72
|
-
experiment_id: experiment_id,
|
|
73
|
-
experiment_name: experiment_name,
|
|
74
|
-
project_id: project_id,
|
|
75
|
-
project_name: project_name,
|
|
65
|
+
experiment_id: eval_context.experiment_id,
|
|
66
|
+
experiment_name: eval_context.experiment_name,
|
|
67
|
+
project_id: eval_context.project_id,
|
|
68
|
+
project_name: eval_context.project_name,
|
|
76
69
|
permalink: permalink,
|
|
77
70
|
errors: error_array,
|
|
78
71
|
duration: duration,
|
|
@@ -82,86 +75,73 @@ module Braintrust
|
|
|
82
75
|
|
|
83
76
|
private
|
|
84
77
|
|
|
85
|
-
attr_reader :
|
|
86
|
-
:task, :scorers, :tracer, :parent_attr
|
|
78
|
+
attr_reader :eval_context, :tracer
|
|
87
79
|
|
|
88
80
|
# Run a single test case with OpenTelemetry tracing
|
|
89
81
|
# Creates eval span (parent) with task and score as children
|
|
90
|
-
# @param
|
|
82
|
+
# @param case_context [CaseContext] The per-case accumulator
|
|
91
83
|
# @param errors [Queue] Thread-safe error collection queue
|
|
92
|
-
def
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
# Set
|
|
97
|
-
eval_span.set_attribute("braintrust.
|
|
84
|
+
def run_eval_case(case_context, errors)
|
|
85
|
+
# Each eval case starts its own trace — detach from any ambient span context
|
|
86
|
+
eval_span = tracer.start_root_span("eval")
|
|
87
|
+
OpenTelemetry::Trace.with_span(eval_span) do
|
|
88
|
+
# Set attributes known before task execution
|
|
89
|
+
eval_span.set_attribute("braintrust.parent", eval_context.parent_span_attr) if eval_context.parent_span_attr
|
|
90
|
+
set_json_attr(eval_span, "braintrust.span_attributes", build_span_attributes("eval"))
|
|
91
|
+
set_json_attr(eval_span, "braintrust.input_json", {input: case_context.input})
|
|
92
|
+
set_json_attr(eval_span, "braintrust.expected", case_context.expected) if case_context.expected
|
|
93
|
+
set_json_attr(eval_span, "braintrust.metadata", case_context.metadata) if case_context.metadata
|
|
94
|
+
eval_span.set_attribute("braintrust.tags", case_context.tags) if case_context.tags
|
|
95
|
+
eval_span.set_attribute("braintrust.origin", case_context.origin) if case_context.origin
|
|
98
96
|
|
|
99
97
|
# Run task
|
|
100
|
-
output = nil
|
|
101
98
|
begin
|
|
102
|
-
output = run_task(
|
|
99
|
+
case_context.output = run_task(case_context)
|
|
103
100
|
rescue => e
|
|
104
101
|
# Error already recorded on task span, set eval span status
|
|
105
102
|
eval_span.status = OpenTelemetry::Trace::Status.error(e.message)
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
"id" => eval_span.context.hex_span_id,
|
|
110
|
-
"error" => e.message
|
|
111
|
-
}
|
|
112
|
-
if test_case.origin
|
|
113
|
-
error_progress["origin"] = test_case.origin.is_a?(String) ? JSON.parse(test_case.origin) : test_case.origin
|
|
114
|
-
end
|
|
115
|
-
@on_progress.call(error_progress)
|
|
116
|
-
end
|
|
103
|
+
set_json_attr(eval_span, "braintrust.output_json", {output: nil})
|
|
104
|
+
errors << "Task failed for input '#{case_context.input}': #{e.message}"
|
|
105
|
+
report_progress(eval_span, case_context, error: e.message)
|
|
117
106
|
next
|
|
118
107
|
end
|
|
119
108
|
|
|
109
|
+
# Flush spans so they're queryable via BTQL, then build trace
|
|
110
|
+
eval_context.tracer_provider&.force_flush
|
|
111
|
+
case_context.trace = build_trace(eval_span)
|
|
112
|
+
|
|
120
113
|
# Run scorers
|
|
121
|
-
case_scores = nil
|
|
122
114
|
begin
|
|
123
|
-
|
|
115
|
+
run_scorers(case_context)
|
|
124
116
|
rescue => e
|
|
125
117
|
# Error already recorded on score span, set eval span status
|
|
126
118
|
eval_span.status = OpenTelemetry::Trace::Status.error(e.message)
|
|
127
|
-
errors << "Scorers failed for input '#{
|
|
119
|
+
errors << "Scorers failed for input '#{case_context.input}': #{e.message}"
|
|
128
120
|
end
|
|
129
121
|
|
|
130
|
-
# Set
|
|
131
|
-
set_json_attr(eval_span, "braintrust.
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
set_json_attr(eval_span, "braintrust.expected", test_case.expected) if test_case.expected
|
|
135
|
-
|
|
136
|
-
# Set origin for cases from remote sources (already JSON-serialized)
|
|
137
|
-
eval_span.set_attribute("braintrust.origin", test_case.origin) if test_case.origin
|
|
138
|
-
|
|
139
|
-
if @on_progress
|
|
140
|
-
progress = {
|
|
141
|
-
"id" => eval_span.context.hex_span_id,
|
|
142
|
-
"data" => output,
|
|
143
|
-
"scores" => case_scores || {}
|
|
144
|
-
}
|
|
145
|
-
if test_case.origin
|
|
146
|
-
progress["origin"] = test_case.origin.is_a?(String) ? JSON.parse(test_case.origin) : test_case.origin
|
|
147
|
-
end
|
|
148
|
-
@on_progress.call(progress)
|
|
149
|
-
end
|
|
122
|
+
# Set output after task completes
|
|
123
|
+
set_json_attr(eval_span, "braintrust.output_json", {output: case_context.output})
|
|
124
|
+
|
|
125
|
+
report_progress(eval_span, case_context, data: case_context.output)
|
|
150
126
|
end
|
|
127
|
+
ensure
|
|
128
|
+
eval_span&.finish
|
|
151
129
|
end
|
|
152
130
|
|
|
153
131
|
# Run task with OpenTelemetry tracing
|
|
154
132
|
# Creates task span with input and output
|
|
155
|
-
# @param
|
|
133
|
+
# @param case_context [CaseContext] The per-case context
|
|
156
134
|
# @return [Object] Task output
|
|
157
|
-
def run_task(
|
|
135
|
+
def run_task(case_context)
|
|
158
136
|
tracer.in_span("task") do |task_span|
|
|
159
|
-
task_span.set_attribute("braintrust.parent",
|
|
137
|
+
task_span.set_attribute("braintrust.parent", eval_context.parent_span_attr) if eval_context.parent_span_attr
|
|
160
138
|
set_json_attr(task_span, "braintrust.span_attributes", build_span_attributes("task"))
|
|
161
|
-
set_json_attr(task_span, "braintrust.input_json",
|
|
139
|
+
set_json_attr(task_span, "braintrust.input_json", case_context.input)
|
|
162
140
|
|
|
163
141
|
begin
|
|
164
|
-
output = task.call(
|
|
142
|
+
output = eval_context.task.call(
|
|
143
|
+
input: case_context.input
|
|
144
|
+
)
|
|
165
145
|
set_json_attr(task_span, "braintrust.output_json", output)
|
|
166
146
|
output
|
|
167
147
|
rescue => e
|
|
@@ -173,70 +153,103 @@ module Braintrust
|
|
|
173
153
|
end
|
|
174
154
|
end
|
|
175
155
|
|
|
176
|
-
# Run scorers with OpenTelemetry tracing
|
|
177
|
-
# Creates
|
|
178
|
-
# @param
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
156
|
+
# Run scorers with OpenTelemetry tracing.
|
|
157
|
+
# Creates one span per scorer, each a direct child of the current (eval) span.
|
|
158
|
+
# @param case_context [CaseContext] The per-case context (output must be populated)
|
|
159
|
+
def run_scorers(case_context)
|
|
160
|
+
scorer_kwargs = {
|
|
161
|
+
input: case_context.input,
|
|
162
|
+
expected: case_context.expected,
|
|
163
|
+
output: case_context.output,
|
|
164
|
+
metadata: case_context.metadata || {},
|
|
165
|
+
trace: case_context.trace
|
|
166
|
+
}
|
|
167
|
+
scorer_input = {
|
|
168
|
+
input: case_context.input,
|
|
169
|
+
expected: case_context.expected,
|
|
170
|
+
output: case_context.output,
|
|
171
|
+
metadata: case_context.metadata || {}
|
|
172
|
+
}
|
|
173
|
+
|
|
174
|
+
scorer_error = nil
|
|
175
|
+
eval_context.scorers.each do |scorer|
|
|
176
|
+
collect_scores(run_scorer(scorer, scorer_kwargs, scorer_input))
|
|
177
|
+
rescue => e
|
|
178
|
+
scorer_error ||= e
|
|
179
|
+
end
|
|
199
180
|
|
|
200
|
-
|
|
201
|
-
|
|
181
|
+
raise scorer_error if scorer_error
|
|
182
|
+
end
|
|
202
183
|
|
|
203
|
-
|
|
204
|
-
|
|
184
|
+
# Run a single scorer inside its own span.
|
|
185
|
+
# @param scorer [Scorer] The scorer to run
|
|
186
|
+
# @param scorer_kwargs [Hash] Keyword arguments for the scorer
|
|
187
|
+
# @param scorer_input [Hash] Input to log on the span
|
|
188
|
+
# @return [Array<Hash>] Raw score results from the scorer
|
|
189
|
+
def run_scorer(scorer, scorer_kwargs, scorer_input)
|
|
190
|
+
tracer.in_span(scorer.name) do |score_span|
|
|
191
|
+
score_span.set_attribute("braintrust.parent", eval_context.parent_span_attr) if eval_context.parent_span_attr
|
|
192
|
+
set_json_attr(score_span, "braintrust.span_attributes", build_scorer_span_attributes(scorer.name))
|
|
193
|
+
set_json_attr(score_span, "braintrust.input_json", scorer_input)
|
|
194
|
+
|
|
195
|
+
score_results = scorer.call(**scorer_kwargs)
|
|
196
|
+
|
|
197
|
+
scorer_scores = {}
|
|
198
|
+
scorer_metadata = {}
|
|
199
|
+
score_results.each do |s|
|
|
200
|
+
scorer_scores[s[:name]] = s[:score]
|
|
201
|
+
scorer_metadata[s[:name]] = s[:metadata] if s[:metadata].is_a?(Hash)
|
|
202
|
+
end
|
|
203
|
+
|
|
204
|
+
set_json_attr(score_span, "braintrust.output_json", scorer_scores)
|
|
205
|
+
set_json_attr(score_span, "braintrust.scores", scorer_scores)
|
|
206
|
+
set_json_attr(score_span, "braintrust.metadata", scorer_metadata) unless scorer_metadata.empty?
|
|
205
207
|
|
|
206
|
-
|
|
208
|
+
score_results
|
|
209
|
+
rescue => e
|
|
210
|
+
record_span_error(score_span, e, "ScorerError")
|
|
211
|
+
raise
|
|
207
212
|
end
|
|
208
213
|
end
|
|
209
214
|
|
|
210
|
-
#
|
|
211
|
-
#
|
|
212
|
-
# @
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
end
|
|
215
|
+
# Build a lazy Trace for a case, backed by BTQL.
|
|
216
|
+
# Returns nil when state or experiment_id are unavailable (local-only mode).
|
|
217
|
+
# @param eval_span [OpenTelemetry::Trace::Span] The eval span for this case
|
|
218
|
+
# @return [Eval::Trace, nil]
|
|
219
|
+
def build_trace(eval_span)
|
|
220
|
+
return nil unless eval_context.state && eval_context.experiment_id
|
|
221
|
+
|
|
222
|
+
root_span_id = eval_span.context.hex_trace_id
|
|
223
|
+
object_type = "experiment"
|
|
224
|
+
object_id = eval_context.experiment_id
|
|
225
|
+
btql = API::Internal::BTQL.new(eval_context.state)
|
|
226
|
+
|
|
227
|
+
Eval::Trace.new(
|
|
228
|
+
spans: -> { btql.trace_spans(object_type: object_type, object_id: object_id, root_span_id: root_span_id) }
|
|
229
|
+
)
|
|
226
230
|
end
|
|
227
231
|
|
|
228
|
-
#
|
|
229
|
-
# @param
|
|
230
|
-
# @return [
|
|
231
|
-
def
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
232
|
+
# Build a CaseContext from a Case struct
|
|
233
|
+
# @param eval_case [Case] The eval case
|
|
234
|
+
# @return [CaseContext]
|
|
235
|
+
def build_case_context(eval_case)
|
|
236
|
+
CaseContext.new(
|
|
237
|
+
input: eval_case.input, expected: eval_case.expected,
|
|
238
|
+
metadata: eval_case.metadata, tags: eval_case.tags, origin: eval_case.origin
|
|
239
|
+
)
|
|
240
|
+
end
|
|
241
|
+
|
|
242
|
+
# Report progress for a case via on_progress callback.
|
|
243
|
+
# Rescues errors in the callback so a broken handler never crashes the eval.
|
|
244
|
+
def report_progress(eval_span, case_context, **fields)
|
|
245
|
+
return unless eval_context.on_progress
|
|
246
|
+
progress = {"id" => eval_span.context.hex_span_id}.merge(fields.transform_keys(&:to_s))
|
|
247
|
+
if case_context.origin
|
|
248
|
+
progress["origin"] = case_context.origin.is_a?(String) ? JSON.parse(case_context.origin) : case_context.origin
|
|
239
249
|
end
|
|
250
|
+
eval_context.on_progress.call(progress)
|
|
251
|
+
rescue => e
|
|
252
|
+
Braintrust.logger.warn("on_progress callback error: #{e.message}")
|
|
240
253
|
end
|
|
241
254
|
|
|
242
255
|
# Record error on span with exception event and error status
|
|
@@ -258,8 +271,18 @@ module Braintrust
|
|
|
258
271
|
# @return [Hash]
|
|
259
272
|
def build_span_attributes(type)
|
|
260
273
|
attrs = {type: type}
|
|
261
|
-
attrs[:name] = experiment_name if experiment_name
|
|
262
|
-
attrs[:generation] =
|
|
274
|
+
attrs[:name] = eval_context.experiment_name if eval_context.experiment_name
|
|
275
|
+
attrs[:generation] = eval_context.generation if eval_context.generation
|
|
276
|
+
attrs
|
|
277
|
+
end
|
|
278
|
+
|
|
279
|
+
# Build span_attributes for a scorer span.
|
|
280
|
+
# Each scorer gets its own span with type "score", purpose "scorer", and the scorer's name.
|
|
281
|
+
# @param scorer_name [String] The scorer name
|
|
282
|
+
# @return [Hash]
|
|
283
|
+
def build_scorer_span_attributes(scorer_name)
|
|
284
|
+
attrs = {type: "score", name: scorer_name, purpose: "scorer"}
|
|
285
|
+
attrs[:generation] = eval_context.generation if eval_context.generation
|
|
263
286
|
attrs
|
|
264
287
|
end
|
|
265
288
|
|
|
@@ -271,14 +294,11 @@ module Braintrust
|
|
|
271
294
|
span.set_attribute(key, JSON.dump(value))
|
|
272
295
|
end
|
|
273
296
|
|
|
274
|
-
# Collect
|
|
275
|
-
# @param
|
|
276
|
-
|
|
277
|
-
def collect_score(name, value)
|
|
278
|
-
return unless value.is_a?(Numeric)
|
|
279
|
-
|
|
297
|
+
# Collect score results into the summary accumulator (thread-safe).
|
|
298
|
+
# @param score_results [Array<Hash>] Score results from a scorer
|
|
299
|
+
def collect_scores(score_results)
|
|
280
300
|
@score_mutex.synchronize do
|
|
281
|
-
(@scores[name] ||= []) <<
|
|
301
|
+
score_results.each { |s| (@scores[s[:name]] ||= []) << s[:score] }
|
|
282
302
|
end
|
|
283
303
|
end
|
|
284
304
|
end
|
|
@@ -1,112 +1,40 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
+
require_relative "../scorer"
|
|
4
|
+
|
|
3
5
|
module Braintrust
|
|
4
6
|
module Eval
|
|
5
|
-
#
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
# Create a new scorer
|
|
12
|
-
# @param name_or_callable [String, Symbol, #call] Name or callable (if callable, name is auto-detected)
|
|
13
|
-
# @param callable [#call, nil] Callable if name was provided separately
|
|
14
|
-
# @param block [Proc, nil] Block if no callable provided
|
|
15
|
-
def initialize(name_or_callable = nil, callable = nil, &block)
|
|
16
|
-
# Determine name and callable from arguments
|
|
17
|
-
if name_or_callable.nil? && callable.nil? && block.nil?
|
|
18
|
-
raise ArgumentError, "Must provide callable or block"
|
|
19
|
-
end
|
|
7
|
+
# @deprecated Use {Braintrust::Scorer} instead.
|
|
8
|
+
module Scorer
|
|
9
|
+
# @deprecated Use {Braintrust::Scorer.new} instead.
|
|
10
|
+
def self.new(name_or_callable = nil, callable = nil, &block)
|
|
11
|
+
Log.warn_once(:eval_scorer_class, "Braintrust::Eval::Scorer is deprecated: use Braintrust::Scorer.new instead.")
|
|
20
12
|
|
|
21
|
-
# If first arg is a string/symbol, it's the name
|
|
22
13
|
if name_or_callable.is_a?(String) || name_or_callable.is_a?(Symbol)
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
raise ArgumentError, "Must provide callable or block" unless @callable
|
|
14
|
+
name = name_or_callable.to_s
|
|
15
|
+
block = callable.method(:call) if callable && !block
|
|
26
16
|
else
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
17
|
+
resolved = name_or_callable || callable
|
|
18
|
+
block = resolved.method(:call) if resolved && !block
|
|
19
|
+
name = nil
|
|
30
20
|
end
|
|
31
21
|
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
end
|
|
36
|
-
|
|
37
|
-
# Detect arity and wrap callable if needed
|
|
38
|
-
@wrapped_callable = wrap_callable(@callable)
|
|
22
|
+
scorer = Braintrust::Scorer.new(name, &block)
|
|
23
|
+
scorer.singleton_class.prepend(PositionalArgsRemapping)
|
|
24
|
+
scorer
|
|
39
25
|
end
|
|
40
26
|
|
|
41
|
-
#
|
|
42
|
-
#
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
private
|
|
52
|
-
|
|
53
|
-
# Detect the name from a callable object
|
|
54
|
-
# @param callable [#call] The callable
|
|
55
|
-
# @return [String] The detected name
|
|
56
|
-
def detect_name(callable)
|
|
57
|
-
# Method objects have .name
|
|
58
|
-
if callable.is_a?(Method)
|
|
59
|
-
return callable.name.to_s
|
|
60
|
-
end
|
|
61
|
-
|
|
62
|
-
# Objects with .name method
|
|
63
|
-
if callable.respond_to?(:name)
|
|
64
|
-
return callable.name.to_s
|
|
65
|
-
end
|
|
66
|
-
|
|
67
|
-
# Fallback
|
|
68
|
-
"scorer"
|
|
69
|
-
end
|
|
70
|
-
|
|
71
|
-
# Wrap the callable to always accept 4 parameters
|
|
72
|
-
# @param callable [#call] The callable to wrap
|
|
73
|
-
# @return [Proc] Wrapped callable that accepts 4 params
|
|
74
|
-
def wrap_callable(callable)
|
|
75
|
-
arity = callable_arity(callable)
|
|
76
|
-
|
|
77
|
-
case arity
|
|
78
|
-
when 3
|
|
79
|
-
# Callable takes 3 params - wrap to ignore metadata
|
|
80
|
-
->(input, expected, output, metadata) {
|
|
81
|
-
callable.call(input, expected, output)
|
|
82
|
-
}
|
|
83
|
-
when 4, -4, -1
|
|
84
|
-
# Callable takes 4 params (or variadic with 4+)
|
|
85
|
-
# -4 means optional 4th param
|
|
86
|
-
# -1 means variadic (*args)
|
|
87
|
-
callable
|
|
88
|
-
else
|
|
89
|
-
raise ArgumentError, "Scorer must accept 3 or 4 parameters (got arity #{arity})"
|
|
90
|
-
end
|
|
91
|
-
end
|
|
92
|
-
|
|
93
|
-
# Get the arity of a callable
|
|
94
|
-
# @param callable [#call] The callable
|
|
95
|
-
# @return [Integer] The arity
|
|
96
|
-
def callable_arity(callable)
|
|
97
|
-
if callable.respond_to?(:arity)
|
|
98
|
-
callable.arity
|
|
99
|
-
elsif callable.respond_to?(:method)
|
|
100
|
-
callable.method(:call).arity
|
|
101
|
-
else
|
|
102
|
-
# Assume 3 params if we can't detect
|
|
103
|
-
3
|
|
27
|
+
# @deprecated Maps positional #call(input, expected, output, metadata) to keyword args.
|
|
28
|
+
# Will be removed when the legacy Eval::Scorer API is removed.
|
|
29
|
+
module PositionalArgsRemapping
|
|
30
|
+
def call(*args, **kwargs)
|
|
31
|
+
if args.any?
|
|
32
|
+
Log.warn_once(:scorer_positional_call, "Calling a Scorer with positional args is deprecated: use keyword args (input:, expected:, output:, metadata:) instead.")
|
|
33
|
+
kwargs = {input: args[0], expected: args[1], output: args[2], metadata: args[3]}
|
|
34
|
+
end
|
|
35
|
+
super(**kwargs)
|
|
104
36
|
end
|
|
105
37
|
end
|
|
106
38
|
end
|
|
107
39
|
end
|
|
108
|
-
|
|
109
|
-
# Value object wrapping a remote scorer function UUID.
|
|
110
|
-
# Used by Eval.run to distinguish remote scorers from local callables.
|
|
111
|
-
ScorerId = Struct.new(:function_id, :version, keyword_init: true)
|
|
112
40
|
end
|