braintrust 0.2.1 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +163 -10
- data/lib/braintrust/api/functions.rb +3 -1
- data/lib/braintrust/api/internal/btql.rb +3 -33
- data/lib/braintrust/contrib/rails/server/application_controller.rb +34 -0
- data/lib/braintrust/contrib/rails/server/engine.rb +72 -0
- data/lib/braintrust/contrib/rails/server/eval_controller.rb +36 -0
- data/lib/braintrust/contrib/rails/server/generator.rb +43 -0
- data/lib/braintrust/contrib/rails/server/health_controller.rb +15 -0
- data/lib/braintrust/contrib/rails/server/list_controller.rb +16 -0
- data/lib/braintrust/contrib/rails/server/routes.rb +8 -0
- data/lib/braintrust/contrib/rails/server.rb +20 -0
- data/lib/braintrust/eval/context.rb +84 -21
- data/lib/braintrust/eval/evaluator.rb +16 -2
- data/lib/braintrust/eval/runner.rb +120 -75
- data/lib/braintrust/eval.rb +22 -2
- data/lib/braintrust/internal/retry.rb +41 -0
- data/lib/braintrust/prompt.rb +11 -5
- data/lib/braintrust/scorer.rb +55 -4
- data/lib/braintrust/server/handlers/eval.rb +8 -168
- data/lib/braintrust/server/handlers/list.rb +3 -41
- data/lib/braintrust/server/rack.rb +2 -0
- data/lib/braintrust/server/services/eval_service.rb +226 -0
- data/lib/braintrust/server/services/list_service.rb +64 -0
- data/lib/braintrust/trace/span_processor.rb +0 -5
- data/lib/braintrust/version.rb +1 -1
- metadata +26 -127
|
@@ -9,11 +9,24 @@ module Braintrust
|
|
|
9
9
|
class Context
|
|
10
10
|
attr_reader :task, :scorers, :cases, :experiment_id, :experiment_name,
|
|
11
11
|
:project_id, :project_name, :state, :tracer_provider,
|
|
12
|
-
:on_progress, :parent_span_attr, :generation
|
|
12
|
+
:on_progress, :parent_span_attr, :generation, :parameters
|
|
13
13
|
|
|
14
|
+
# @param task [Task] Normalized task wrapper
|
|
15
|
+
# @param scorers [Array<Scorer>] Normalized scorer wrappers
|
|
16
|
+
# @param cases [Cases] Normalized eval cases
|
|
17
|
+
# @param experiment_id [String, nil] Experiment ID for logging and trace linkage
|
|
18
|
+
# @param experiment_name [String, nil] Experiment name, included in span attributes
|
|
19
|
+
# @param project_id [String, nil] Project ID
|
|
20
|
+
# @param project_name [String, nil] Project name
|
|
21
|
+
# @param state [Braintrust::State, nil] Authenticated API state; nil for local-only evals
|
|
22
|
+
# @param tracer_provider [#tracer, nil] OpenTelemetry tracer provider
|
|
23
|
+
# @param on_progress [Proc, nil] Callback invoked after each case completes, receiving a progress Hash
|
|
24
|
+
# @param parent_span_attr [String, nil] Formatted parent span identifier ("type:id"), linking spans to a parent context
|
|
25
|
+
# @param generation [Integer, nil] Generation number from the parent span context, used to link spans in a trace hierarchy
|
|
26
|
+
# @param parameters [Hash, nil] Runtime parameters passed to task and scorers as a `parameters:` keyword argument
|
|
14
27
|
def initialize(task:, scorers:, cases:, experiment_id: nil, experiment_name: nil,
|
|
15
28
|
project_id: nil, project_name: nil, state: nil, tracer_provider: nil,
|
|
16
|
-
on_progress: nil, parent_span_attr: nil, generation: nil)
|
|
29
|
+
on_progress: nil, parent_span_attr: nil, generation: nil, parameters: nil)
|
|
17
30
|
@task = task
|
|
18
31
|
@scorers = scorers
|
|
19
32
|
@cases = cases
|
|
@@ -26,40 +39,83 @@ module Braintrust
|
|
|
26
39
|
@on_progress = on_progress
|
|
27
40
|
@parent_span_attr = parent_span_attr
|
|
28
41
|
@generation = generation
|
|
42
|
+
@parameters = parameters
|
|
29
43
|
end
|
|
30
44
|
|
|
31
45
|
# Build a Context from raw user inputs.
|
|
32
|
-
#
|
|
33
|
-
#
|
|
46
|
+
# Delegates to Factory for normalization.
|
|
47
|
+
# @param task [Task, Proc, #call] Task to evaluate; wrapped into a {Task} if needed
|
|
48
|
+
# @param scorers [Array<Scorer, Proc, String, Scorer::ID, #call>] Scorers; each is normalized into a {Scorer}
|
|
49
|
+
# @param cases [Cases, Array, Enumerable] Eval cases; wrapped into {Cases} if needed
|
|
50
|
+
# @param experiment_id [String, nil] Experiment ID for logging
|
|
51
|
+
# @param experiment_name [String, nil] Experiment name, included in span attributes
|
|
52
|
+
# @param project_id [String, nil] Project ID
|
|
53
|
+
# @param project_name [String, nil] Project name; required when resolving scorer slugs
|
|
54
|
+
# @param state [Braintrust::State, nil] Authenticated API state; nil for local-only evals
|
|
55
|
+
# @param tracer_provider [#tracer, nil] OpenTelemetry tracer provider; defaults to global provider
|
|
56
|
+
# @param on_progress [Proc, nil] Callback invoked after each case completes, receiving a progress Hash
|
|
57
|
+
# @param parent [Hash, nil] Parent span info with keys :object_type, :object_id, and optionally :generation
|
|
58
|
+
# @param parameters [Hash, nil] Runtime parameters passed to task and scorers as a `parameters:` keyword argument
|
|
59
|
+
# @return [Context]
|
|
34
60
|
def self.build(task:, scorers:, cases:, experiment_id: nil, experiment_name: nil,
|
|
35
61
|
project_id: nil, project_name: nil, state: nil, tracer_provider: nil,
|
|
36
|
-
on_progress: nil, parent: nil)
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
scorers:
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
experiment_name: experiment_name,
|
|
45
|
-
project_id: project_id,
|
|
46
|
-
project_name: project_name,
|
|
47
|
-
state: state,
|
|
48
|
-
tracer_provider: tracer_provider,
|
|
49
|
-
on_progress: on_progress,
|
|
50
|
-
parent_span_attr: factory.resolve_parent_span_attr(parent),
|
|
51
|
-
generation: parent&.dig(:generation)
|
|
62
|
+
on_progress: nil, parent: nil, parameters: nil)
|
|
63
|
+
Factory.new(
|
|
64
|
+
state: state, tracer_provider: tracer_provider,
|
|
65
|
+
project_id: project_id, project_name: project_name
|
|
66
|
+
).build(
|
|
67
|
+
task: task, scorers: scorers, cases: cases,
|
|
68
|
+
experiment_id: experiment_id, experiment_name: experiment_name,
|
|
69
|
+
on_progress: on_progress, parent: parent, parameters: parameters
|
|
52
70
|
)
|
|
53
71
|
end
|
|
54
72
|
|
|
55
73
|
# Encapsulates normalization of raw user inputs into typed wrappers.
|
|
56
74
|
class Factory
|
|
57
|
-
|
|
75
|
+
# @param state [Braintrust::State, nil] Authenticated API state; passed through to scorer resolution
|
|
76
|
+
# @param tracer_provider [#tracer, nil] OpenTelemetry tracer provider; passed through to remote scorers
|
|
77
|
+
# @param project_id [String, nil] Project ID; passed through to the built Context
|
|
78
|
+
# @param project_name [String, nil] Project name; required when resolving scorer slugs
|
|
79
|
+
def initialize(state: nil, tracer_provider: nil, project_id: nil, project_name: nil)
|
|
58
80
|
@state = state
|
|
59
81
|
@tracer_provider = tracer_provider
|
|
82
|
+
@project_id = project_id
|
|
60
83
|
@project_name = project_name
|
|
61
84
|
end
|
|
62
85
|
|
|
86
|
+
# Normalize raw inputs and construct a {Context}.
|
|
87
|
+
# @param task [Task, Proc, #call] Raw task
|
|
88
|
+
# @param scorers [Array] Raw scorers
|
|
89
|
+
# @param cases [Cases, Array, Enumerable] Raw eval cases
|
|
90
|
+
# @param experiment_id [String, nil]
|
|
91
|
+
# @param experiment_name [String, nil]
|
|
92
|
+
# @param on_progress [Proc, nil]
|
|
93
|
+
# @param parent [Hash, nil] Parent span info with keys :object_type, :object_id, and optionally :generation
|
|
94
|
+
# @return [Context]
|
|
95
|
+
def build(task:, scorers:, cases:, experiment_id: nil, experiment_name: nil,
|
|
96
|
+
on_progress: nil, parent: nil, parameters: nil)
|
|
97
|
+
Context.new(
|
|
98
|
+
task: normalize_task(task),
|
|
99
|
+
scorers: normalize_scorers(scorers),
|
|
100
|
+
cases: normalize_cases(cases),
|
|
101
|
+
experiment_id: experiment_id,
|
|
102
|
+
experiment_name: experiment_name,
|
|
103
|
+
project_id: @project_id,
|
|
104
|
+
project_name: @project_name,
|
|
105
|
+
state: @state,
|
|
106
|
+
tracer_provider: @tracer_provider || OpenTelemetry.tracer_provider,
|
|
107
|
+
on_progress: on_progress,
|
|
108
|
+
parent_span_attr: resolve_parent_span_attr(parent),
|
|
109
|
+
generation: parent&.dig(:generation),
|
|
110
|
+
parameters: parameters
|
|
111
|
+
)
|
|
112
|
+
end
|
|
113
|
+
|
|
114
|
+
private
|
|
115
|
+
|
|
116
|
+
# @param raw [Cases, Array, Enumerable, #each]
|
|
117
|
+
# @return [Cases]
|
|
118
|
+
# @raise [ArgumentError] if raw is not enumerable
|
|
63
119
|
def normalize_cases(raw)
|
|
64
120
|
case raw
|
|
65
121
|
when Cases
|
|
@@ -75,11 +131,15 @@ module Braintrust
|
|
|
75
131
|
end
|
|
76
132
|
end
|
|
77
133
|
|
|
134
|
+
# @param parent [Hash, nil]
|
|
135
|
+
# @return [String, nil] Formatted as "type:id", e.g. "experiment_id:abc-123"
|
|
78
136
|
def resolve_parent_span_attr(parent)
|
|
79
137
|
return nil unless parent
|
|
80
138
|
"#{parent[:object_type]}:#{parent[:object_id]}"
|
|
81
139
|
end
|
|
82
140
|
|
|
141
|
+
# @param raw [Task, Proc, #call]
|
|
142
|
+
# @return [Task]
|
|
83
143
|
def normalize_task(raw)
|
|
84
144
|
case raw
|
|
85
145
|
when Task
|
|
@@ -95,6 +155,9 @@ module Braintrust
|
|
|
95
155
|
end
|
|
96
156
|
end
|
|
97
157
|
|
|
158
|
+
# @param raw [Array<Scorer, Proc, String, Scorer::ID, #call>]
|
|
159
|
+
# @return [Array<Scorer>]
|
|
160
|
+
# @raise [ArgumentError] if a String slug is given without a project name
|
|
98
161
|
def normalize_scorers(raw)
|
|
99
162
|
raw.map do |scorer|
|
|
100
163
|
case scorer
|
|
@@ -27,6 +27,18 @@ module Braintrust
|
|
|
27
27
|
# Braintrust::Scorer.new("exact_match") { |expected:, output:| output == expected ? 1.0 : 0.0 }
|
|
28
28
|
# ]
|
|
29
29
|
# )
|
|
30
|
+
#
|
|
31
|
+
# @example Remote eval with parameters (for Playground UI)
|
|
32
|
+
# Braintrust::Eval::Evaluator.new(
|
|
33
|
+
# task: ->(input:, parameters:) {
|
|
34
|
+
# model = parameters["model"] || "gpt-4"
|
|
35
|
+
# # Use model to generate response...
|
|
36
|
+
# },
|
|
37
|
+
# scorers: [Braintrust::Scorer.new("exact") { |expected:, output:| output == expected ? 1.0 : 0.0 }],
|
|
38
|
+
# parameters: {
|
|
39
|
+
# "model" => {type: "string", default: "gpt-4", description: "Model to use"}
|
|
40
|
+
# }
|
|
41
|
+
# )
|
|
30
42
|
class Evaluator
|
|
31
43
|
attr_accessor :task, :scorers, :parameters
|
|
32
44
|
|
|
@@ -64,13 +76,15 @@ module Braintrust
|
|
|
64
76
|
def run(cases, on_progress: nil, quiet: false,
|
|
65
77
|
project: nil, experiment: nil, project_id: nil,
|
|
66
78
|
dataset: nil, scorers: nil, parent: nil,
|
|
67
|
-
state: nil, update: false, tracer_provider: nil
|
|
79
|
+
state: nil, update: false, tracer_provider: nil,
|
|
80
|
+
parameters: nil)
|
|
68
81
|
all_scorers = scorers ? self.scorers + scorers : self.scorers
|
|
69
82
|
Braintrust::Eval.run(
|
|
70
83
|
task: task, scorers: all_scorers, cases: cases, dataset: dataset,
|
|
71
84
|
project: project, experiment: experiment, project_id: project_id,
|
|
72
85
|
parent: parent, on_progress: on_progress, quiet: quiet,
|
|
73
|
-
state: state, update: update, tracer_provider: tracer_provider
|
|
86
|
+
state: state, update: update, tracer_provider: tracer_provider,
|
|
87
|
+
parameters: parameters
|
|
74
88
|
)
|
|
75
89
|
end
|
|
76
90
|
end
|
|
@@ -6,6 +6,7 @@ require_relative "summary"
|
|
|
6
6
|
require_relative "trace"
|
|
7
7
|
require_relative "../internal/thread_pool"
|
|
8
8
|
require_relative "../api/internal/btql"
|
|
9
|
+
require_relative "../internal/retry"
|
|
9
10
|
|
|
10
11
|
require "opentelemetry/sdk"
|
|
11
12
|
require "json"
|
|
@@ -24,8 +25,7 @@ module Braintrust
|
|
|
24
25
|
# @param eval_context [Context] Normalized eval context
|
|
25
26
|
def initialize(eval_context)
|
|
26
27
|
@eval_context = eval_context
|
|
27
|
-
|
|
28
|
-
@tracer = tracer_provider.tracer("braintrust-eval")
|
|
28
|
+
@tracer = eval_context.tracer_provider.tracer("braintrust-eval")
|
|
29
29
|
|
|
30
30
|
# Mutex for thread-safe score collection
|
|
31
31
|
@score_mutex = Mutex.new
|
|
@@ -79,66 +79,69 @@ module Braintrust
|
|
|
79
79
|
|
|
80
80
|
# Run a single test case with OpenTelemetry tracing
|
|
81
81
|
# Creates eval span (parent) with task and score as children
|
|
82
|
-
# @param
|
|
82
|
+
# @param kase [CaseContext] The per-case accumulator
|
|
83
83
|
# @param errors [Queue] Thread-safe error collection queue
|
|
84
|
-
def run_eval_case(
|
|
85
|
-
|
|
84
|
+
def run_eval_case(kase, errors)
|
|
85
|
+
# Each eval case starts its own trace — detach from any ambient span context
|
|
86
|
+
eval_span = tracer.start_root_span("eval")
|
|
87
|
+
OpenTelemetry::Trace.with_span(eval_span) do
|
|
88
|
+
# Set attributes known before task execution
|
|
86
89
|
eval_span.set_attribute("braintrust.parent", eval_context.parent_span_attr) if eval_context.parent_span_attr
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
eval_span
|
|
90
|
+
set_json_attr(eval_span, "braintrust.span_attributes", build_span_attributes("eval"))
|
|
91
|
+
set_json_attr(eval_span, "braintrust.input_json", {input: kase.input})
|
|
92
|
+
set_json_attr(eval_span, "braintrust.expected", kase.expected) if kase.expected
|
|
93
|
+
set_json_attr(eval_span, "braintrust.metadata", kase.metadata) if kase.metadata
|
|
94
|
+
eval_span.set_attribute("braintrust.tags", kase.tags) if kase.tags
|
|
95
|
+
eval_span.set_attribute("braintrust.origin", kase.origin) if kase.origin
|
|
90
96
|
|
|
91
97
|
# Run task
|
|
92
98
|
begin
|
|
93
|
-
|
|
99
|
+
kase.output = run_task(kase)
|
|
94
100
|
rescue => e
|
|
95
101
|
# Error already recorded on task span, set eval span status
|
|
96
102
|
eval_span.status = OpenTelemetry::Trace::Status.error(e.message)
|
|
97
|
-
|
|
98
|
-
|
|
103
|
+
set_json_attr(eval_span, "braintrust.output_json", {output: nil})
|
|
104
|
+
errors << "Task failed for input '#{kase.input}': #{e.message}"
|
|
105
|
+
report_progress(eval_span, kase, error: e.message)
|
|
99
106
|
next
|
|
100
107
|
end
|
|
101
108
|
|
|
102
109
|
# Flush spans so they're queryable via BTQL, then build trace
|
|
103
|
-
eval_context.tracer_provider
|
|
104
|
-
|
|
110
|
+
eval_context.tracer_provider.force_flush if eval_context.tracer_provider.respond_to?(:force_flush)
|
|
111
|
+
kase.trace = build_trace(eval_span)
|
|
105
112
|
|
|
106
113
|
# Run scorers
|
|
107
|
-
case_scores = nil
|
|
108
114
|
begin
|
|
109
|
-
|
|
115
|
+
run_scorers(kase)
|
|
110
116
|
rescue => e
|
|
111
117
|
# Error already recorded on score span, set eval span status
|
|
112
118
|
eval_span.status = OpenTelemetry::Trace::Status.error(e.message)
|
|
113
|
-
errors << "Scorers failed for input '#{
|
|
119
|
+
errors << "Scorers failed for input '#{kase.input}': #{e.message}"
|
|
114
120
|
end
|
|
115
121
|
|
|
116
|
-
# Set
|
|
117
|
-
set_json_attr(eval_span, "braintrust.
|
|
118
|
-
set_json_attr(eval_span, "braintrust.input_json", case_context.input)
|
|
119
|
-
set_json_attr(eval_span, "braintrust.output_json", case_context.output)
|
|
120
|
-
set_json_attr(eval_span, "braintrust.expected", case_context.expected) if case_context.expected
|
|
121
|
-
|
|
122
|
-
# Set origin for cases from remote sources (already JSON-serialized)
|
|
123
|
-
eval_span.set_attribute("braintrust.origin", case_context.origin) if case_context.origin
|
|
122
|
+
# Set output after task completes
|
|
123
|
+
set_json_attr(eval_span, "braintrust.output_json", {output: kase.output})
|
|
124
124
|
|
|
125
|
-
report_progress(eval_span,
|
|
125
|
+
report_progress(eval_span, kase, data: kase.output)
|
|
126
126
|
end
|
|
127
|
+
ensure
|
|
128
|
+
eval_span&.finish
|
|
127
129
|
end
|
|
128
130
|
|
|
129
131
|
# Run task with OpenTelemetry tracing
|
|
130
132
|
# Creates task span with input and output
|
|
131
|
-
# @param
|
|
133
|
+
# @param kase [CaseContext] The per-case context
|
|
132
134
|
# @return [Object] Task output
|
|
133
|
-
def run_task(
|
|
135
|
+
def run_task(kase)
|
|
134
136
|
tracer.in_span("task") do |task_span|
|
|
135
137
|
task_span.set_attribute("braintrust.parent", eval_context.parent_span_attr) if eval_context.parent_span_attr
|
|
136
138
|
set_json_attr(task_span, "braintrust.span_attributes", build_span_attributes("task"))
|
|
137
|
-
set_json_attr(task_span, "braintrust.input_json",
|
|
139
|
+
set_json_attr(task_span, "braintrust.input_json", kase.input)
|
|
138
140
|
|
|
139
141
|
begin
|
|
140
142
|
output = eval_context.task.call(
|
|
141
|
-
input:
|
|
143
|
+
input: kase.input,
|
|
144
|
+
parameters: eval_context.parameters || {}
|
|
142
145
|
)
|
|
143
146
|
set_json_attr(task_span, "braintrust.output_json", output)
|
|
144
147
|
output
|
|
@@ -151,43 +154,64 @@ module Braintrust
|
|
|
151
154
|
end
|
|
152
155
|
end
|
|
153
156
|
|
|
154
|
-
# Run scorers with OpenTelemetry tracing
|
|
155
|
-
# Creates
|
|
156
|
-
# @param
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
157
|
+
# Run scorers with OpenTelemetry tracing.
|
|
158
|
+
# Creates one span per scorer, each a direct child of the current (eval) span.
|
|
159
|
+
# @param kase [CaseContext] The per-case context (output must be populated)
|
|
160
|
+
def run_scorers(kase)
|
|
161
|
+
scorer_kwargs = {
|
|
162
|
+
input: kase.input,
|
|
163
|
+
expected: kase.expected,
|
|
164
|
+
output: kase.output,
|
|
165
|
+
metadata: kase.metadata || {},
|
|
166
|
+
trace: kase.trace,
|
|
167
|
+
parameters: eval_context.parameters || {}
|
|
168
|
+
}
|
|
169
|
+
scorer_input = {
|
|
170
|
+
input: kase.input,
|
|
171
|
+
expected: kase.expected,
|
|
172
|
+
output: kase.output,
|
|
173
|
+
metadata: kase.metadata || {},
|
|
174
|
+
parameters: eval_context.parameters || {}
|
|
175
|
+
}
|
|
176
|
+
|
|
177
|
+
scorer_error = nil
|
|
178
|
+
eval_context.scorers.each do |scorer|
|
|
179
|
+
collect_scores(run_scorer(scorer, scorer_kwargs, scorer_input))
|
|
180
|
+
rescue => e
|
|
181
|
+
scorer_error ||= e
|
|
182
|
+
end
|
|
183
|
+
|
|
184
|
+
raise scorer_error if scorer_error
|
|
185
|
+
end
|
|
186
|
+
|
|
187
|
+
# Run a single scorer inside its own span.
|
|
188
|
+
# @param scorer [Scorer] The scorer to run
|
|
189
|
+
# @param scorer_kwargs [Hash] Keyword arguments for the scorer
|
|
190
|
+
# @param scorer_input [Hash] Input to log on the span
|
|
191
|
+
# @return [Array<Hash>] Raw score results from the scorer
|
|
192
|
+
def run_scorer(scorer, scorer_kwargs, scorer_input)
|
|
193
|
+
tracer.in_span(scorer.name) do |score_span|
|
|
160
194
|
score_span.set_attribute("braintrust.parent", eval_context.parent_span_attr) if eval_context.parent_span_attr
|
|
161
|
-
set_json_attr(score_span, "braintrust.span_attributes",
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
expected: case_context.expected,
|
|
166
|
-
output: case_context.output,
|
|
167
|
-
metadata: case_context.metadata || {},
|
|
168
|
-
trace: case_context.trace
|
|
169
|
-
}
|
|
170
|
-
scores = {}
|
|
171
|
-
scorer_error = nil
|
|
172
|
-
eval_context.scorers.each do |scorer|
|
|
173
|
-
score_value = scorer.call(**scorer_kwargs)
|
|
174
|
-
scores[scorer.name] = score_value
|
|
175
|
-
|
|
176
|
-
# Collect raw score for summary (thread-safe)
|
|
177
|
-
collect_score(scorer.name, score_value)
|
|
178
|
-
rescue => e
|
|
179
|
-
# Record first error but continue processing other scorers
|
|
180
|
-
scorer_error ||= e
|
|
181
|
-
record_span_error(score_span, e, "ScorerError")
|
|
182
|
-
end
|
|
195
|
+
set_json_attr(score_span, "braintrust.span_attributes", build_scorer_span_attributes(scorer.name))
|
|
196
|
+
set_json_attr(score_span, "braintrust.input_json", scorer_input)
|
|
197
|
+
|
|
198
|
+
score_results = scorer.call(**scorer_kwargs)
|
|
183
199
|
|
|
184
|
-
|
|
185
|
-
|
|
200
|
+
scorer_scores = {}
|
|
201
|
+
scorer_metadata = {}
|
|
202
|
+
score_results.each do |s|
|
|
203
|
+
scorer_scores[s[:name]] = s[:score]
|
|
204
|
+
scorer_metadata[s[:name]] = s[:metadata] if s[:metadata].is_a?(Hash)
|
|
205
|
+
end
|
|
186
206
|
|
|
187
|
-
|
|
188
|
-
|
|
207
|
+
set_json_attr(score_span, "braintrust.output_json", scorer_scores)
|
|
208
|
+
set_json_attr(score_span, "braintrust.scores", scorer_scores)
|
|
209
|
+
set_json_attr(score_span, "braintrust.metadata", scorer_metadata) unless scorer_metadata.empty?
|
|
189
210
|
|
|
190
|
-
|
|
211
|
+
score_results
|
|
212
|
+
rescue => e
|
|
213
|
+
record_span_error(score_span, e, "ScorerError")
|
|
214
|
+
raise
|
|
191
215
|
end
|
|
192
216
|
end
|
|
193
217
|
|
|
@@ -203,9 +227,23 @@ module Braintrust
|
|
|
203
227
|
object_id = eval_context.experiment_id
|
|
204
228
|
btql = API::Internal::BTQL.new(eval_context.state)
|
|
205
229
|
|
|
206
|
-
Eval::Trace.new(
|
|
207
|
-
|
|
208
|
-
|
|
230
|
+
Eval::Trace.new(spans: -> { fetch_trace_spans(btql, object_type, object_id, root_span_id) })
|
|
231
|
+
end
|
|
232
|
+
|
|
233
|
+
# Fetch trace spans with retry to handle freshness and ingestion lag.
|
|
234
|
+
# @return [Array<Hash>] Parsed span data
|
|
235
|
+
def fetch_trace_spans(btql, object_type, object_id, root_span_id)
|
|
236
|
+
rows, _freshness = Internal::Retry.with_backoff(
|
|
237
|
+
max_retries: 7, base_delay: 1.0, max_delay: 8.0,
|
|
238
|
+
until: ->(result) {
|
|
239
|
+
r, f = result
|
|
240
|
+
f == "complete" && !r.empty?
|
|
241
|
+
}
|
|
242
|
+
) { btql.trace_spans(object_type: object_type, object_id: object_id, root_span_id: root_span_id) }
|
|
243
|
+
rows || []
|
|
244
|
+
rescue => e
|
|
245
|
+
Braintrust::Log.warn("[BTQL] Query failed: #{e.message}")
|
|
246
|
+
[]
|
|
209
247
|
end
|
|
210
248
|
|
|
211
249
|
# Build a CaseContext from a Case struct
|
|
@@ -220,11 +258,11 @@ module Braintrust
|
|
|
220
258
|
|
|
221
259
|
# Report progress for a case via on_progress callback.
|
|
222
260
|
# Rescues errors in the callback so a broken handler never crashes the eval.
|
|
223
|
-
def report_progress(eval_span,
|
|
261
|
+
def report_progress(eval_span, kase, **fields)
|
|
224
262
|
return unless eval_context.on_progress
|
|
225
263
|
progress = {"id" => eval_span.context.hex_span_id}.merge(fields.transform_keys(&:to_s))
|
|
226
|
-
if
|
|
227
|
-
progress["origin"] =
|
|
264
|
+
if kase.origin
|
|
265
|
+
progress["origin"] = kase.origin.is_a?(String) ? JSON.parse(kase.origin) : kase.origin
|
|
228
266
|
end
|
|
229
267
|
eval_context.on_progress.call(progress)
|
|
230
268
|
rescue => e
|
|
@@ -255,6 +293,16 @@ module Braintrust
|
|
|
255
293
|
attrs
|
|
256
294
|
end
|
|
257
295
|
|
|
296
|
+
# Build span_attributes for a scorer span.
|
|
297
|
+
# Each scorer gets its own span with type "score", purpose "scorer", and the scorer's name.
|
|
298
|
+
# @param scorer_name [String] The scorer name
|
|
299
|
+
# @return [Hash]
|
|
300
|
+
def build_scorer_span_attributes(scorer_name)
|
|
301
|
+
attrs = {type: "score", name: scorer_name, purpose: "scorer"}
|
|
302
|
+
attrs[:generation] = eval_context.generation if eval_context.generation
|
|
303
|
+
attrs
|
|
304
|
+
end
|
|
305
|
+
|
|
258
306
|
# Set a span attribute by JSON encoding the value
|
|
259
307
|
# @param span [OpenTelemetry::Trace::Span] The span
|
|
260
308
|
# @param key [String] The attribute key
|
|
@@ -263,14 +311,11 @@ module Braintrust
|
|
|
263
311
|
span.set_attribute(key, JSON.dump(value))
|
|
264
312
|
end
|
|
265
313
|
|
|
266
|
-
# Collect
|
|
267
|
-
# @param
|
|
268
|
-
|
|
269
|
-
def collect_score(name, value)
|
|
270
|
-
return unless value.is_a?(Numeric)
|
|
271
|
-
|
|
314
|
+
# Collect score results into the summary accumulator (thread-safe).
|
|
315
|
+
# @param score_results [Array<Hash>] Score results from a scorer
|
|
316
|
+
def collect_scores(score_results)
|
|
272
317
|
@score_mutex.synchronize do
|
|
273
|
-
(@scores[name] ||= []) <<
|
|
318
|
+
score_results.each { |s| (@scores[s[:name]] ||= []) << s[:score] }
|
|
274
319
|
end
|
|
275
320
|
end
|
|
276
321
|
end
|
data/lib/braintrust/eval.rb
CHANGED
|
@@ -105,6 +105,21 @@ module Braintrust
|
|
|
105
105
|
# scorers: [->(expected:, output:) { output == expected ? 1.0 : 0.0 }]
|
|
106
106
|
# )
|
|
107
107
|
#
|
|
108
|
+
# @example Using parameters for configurable tasks
|
|
109
|
+
# # Tasks and scorers that declare `parameters:` receive it automatically.
|
|
110
|
+
# # Those that don't are unaffected — KeywordFilter strips unknown kwargs.
|
|
111
|
+
# Braintrust::Eval.run(
|
|
112
|
+
# project: "my-project",
|
|
113
|
+
# experiment: "with-params",
|
|
114
|
+
# cases: [{input: "hello", expected: "HELLO!"}],
|
|
115
|
+
# task: ->(input:, parameters:) {
|
|
116
|
+
# suffix = parameters["suffix"] || ""
|
|
117
|
+
# input.upcase + suffix
|
|
118
|
+
# },
|
|
119
|
+
# scorers: [->(expected:, output:) { output == expected ? 1.0 : 0.0 }],
|
|
120
|
+
# parameters: {"suffix" => "!"}
|
|
121
|
+
# )
|
|
122
|
+
#
|
|
108
123
|
# @example Using metadata and tags
|
|
109
124
|
# Braintrust::Eval.run(
|
|
110
125
|
# project: "my-project",
|
|
@@ -158,11 +173,15 @@ module Braintrust
|
|
|
158
173
|
# @param quiet [Boolean] If true, suppress result output (default: false)
|
|
159
174
|
# @param state [State, nil] Braintrust state (defaults to global state)
|
|
160
175
|
# @param tracer_provider [TracerProvider, nil] OpenTelemetry tracer provider (defaults to global)
|
|
176
|
+
# @param project_id [String, nil] Project UUID (skips project creation when provided)
|
|
177
|
+
# @param parent [Hash, nil] Parent span context ({object_type:, object_id:, generation:})
|
|
178
|
+
# @param parameters [Hash, nil] Runtime parameters passed to task and scorers as a `parameters:` keyword argument
|
|
161
179
|
# @return [Result]
|
|
162
180
|
def run(task:, scorers:, project: nil, experiment: nil,
|
|
163
181
|
cases: nil, dataset: nil, on_progress: nil,
|
|
164
182
|
parallelism: 1, tags: nil, metadata: nil, update: false, quiet: false,
|
|
165
|
-
state: nil, tracer_provider: nil, project_id: nil, parent: nil
|
|
183
|
+
state: nil, tracer_provider: nil, project_id: nil, parent: nil,
|
|
184
|
+
parameters: nil)
|
|
166
185
|
# Validate required parameters
|
|
167
186
|
validate_params!(task: task, scorers: scorers, cases: cases, dataset: dataset)
|
|
168
187
|
|
|
@@ -205,7 +224,8 @@ module Braintrust
|
|
|
205
224
|
state: state,
|
|
206
225
|
tracer_provider: tracer_provider,
|
|
207
226
|
on_progress: on_progress,
|
|
208
|
-
parent: parent
|
|
227
|
+
parent: parent,
|
|
228
|
+
parameters: parameters
|
|
209
229
|
)
|
|
210
230
|
result = Runner.new(context).run(parallelism: parallelism)
|
|
211
231
|
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Braintrust
|
|
4
|
+
module Internal
|
|
5
|
+
module Retry
|
|
6
|
+
MAX_RETRIES = 7
|
|
7
|
+
BASE_DELAY = 1.0
|
|
8
|
+
MAX_DELAY = 8.0
|
|
9
|
+
|
|
10
|
+
# Retry a block with exponential backoff.
|
|
11
|
+
#
|
|
12
|
+
# The block is the task to attempt. Its return value is captured each attempt.
|
|
13
|
+
#
|
|
14
|
+
# @param max_retries [Integer] Maximum number of retries after the first attempt
|
|
15
|
+
# @param base_delay [Float] Initial delay in seconds (doubles each retry)
|
|
16
|
+
# @param max_delay [Float] Cap on delay between retries
|
|
17
|
+
# @param until [Proc, nil] Optional condition — receives block result, truthy stops retrying.
|
|
18
|
+
# When omitted, the block result's own truthiness decides.
|
|
19
|
+
# @return The last block result (whether retries were exhausted or condition was met)
|
|
20
|
+
#
|
|
21
|
+
# @example Simple: retry until truthy
|
|
22
|
+
# conn = Retry.with_backoff(max_retries: 5) { try_connect }
|
|
23
|
+
#
|
|
24
|
+
# @example With condition: retry until non-empty
|
|
25
|
+
# data = Retry.with_backoff(until: ->(r) { r.any? }) { api.fetch }
|
|
26
|
+
#
|
|
27
|
+
def self.with_backoff(max_retries: MAX_RETRIES, base_delay: BASE_DELAY, max_delay: MAX_DELAY, until: nil, &task)
|
|
28
|
+
check = binding.local_variable_get(:until)
|
|
29
|
+
result = task.call
|
|
30
|
+
retries = 0
|
|
31
|
+
while retries < max_retries && !(check ? check.call(result) : result)
|
|
32
|
+
retries += 1
|
|
33
|
+
delay = [base_delay * (2**(retries - 1)), max_delay].min
|
|
34
|
+
sleep(delay)
|
|
35
|
+
result = task.call
|
|
36
|
+
end
|
|
37
|
+
result
|
|
38
|
+
end
|
|
39
|
+
end
|
|
40
|
+
end
|
|
41
|
+
end
|
data/lib/braintrust/prompt.rb
CHANGED
|
@@ -11,23 +11,28 @@ module Braintrust
|
|
|
11
11
|
# params = prompt.build(text: "Article to summarize...")
|
|
12
12
|
# client.messages.create(**params)
|
|
13
13
|
class Prompt
|
|
14
|
-
attr_reader :id, :name, :slug, :project_id
|
|
14
|
+
attr_reader :id, :name, :slug, :project_id, :version
|
|
15
15
|
|
|
16
16
|
# Load a prompt from Braintrust
|
|
17
17
|
#
|
|
18
|
-
# @param project [String] Project name
|
|
18
|
+
# @param project [String, nil] Project name (provide either project or project_id)
|
|
19
|
+
# @param project_id [String, nil] Project ID (UUID, provide either project or project_id)
|
|
19
20
|
# @param slug [String] Prompt slug
|
|
20
21
|
# @param version [String, nil] Specific version (default: latest)
|
|
21
22
|
# @param defaults [Hash] Default variable values for build()
|
|
22
23
|
# @param api [API, nil] Braintrust API client (default: creates one using global state)
|
|
23
24
|
# @return [Prompt]
|
|
24
|
-
def self.load(
|
|
25
|
+
def self.load(slug:, project: nil, project_id: nil, version: nil, defaults: {}, api: nil)
|
|
26
|
+
raise ArgumentError, "Either project or project_id is required" unless project || project_id
|
|
27
|
+
|
|
25
28
|
api ||= API.new
|
|
26
29
|
|
|
27
30
|
# Find the function by project + slug
|
|
28
|
-
result = api.functions.list(project_name: project, slug: slug)
|
|
31
|
+
result = api.functions.list(project_name: project, project_id: project_id, slug: slug)
|
|
29
32
|
function = result.dig("objects")&.first
|
|
30
|
-
|
|
33
|
+
|
|
34
|
+
identifier = project ? "project '#{project}'" : "project_id '#{project_id}'"
|
|
35
|
+
raise Error, "Prompt '#{slug}' not found in #{identifier}" unless function
|
|
31
36
|
|
|
32
37
|
# Fetch full function data including prompt_data
|
|
33
38
|
full_data = api.functions.get(id: function["id"], version: version)
|
|
@@ -47,6 +52,7 @@ module Braintrust
|
|
|
47
52
|
@name = data["name"]
|
|
48
53
|
@slug = data["slug"]
|
|
49
54
|
@project_id = data["project_id"]
|
|
55
|
+
@version = data["_xact_id"]
|
|
50
56
|
end
|
|
51
57
|
|
|
52
58
|
# Get the raw prompt definition
|