braintrust 0.2.1 → 0.3.1

This diff shows the changes between two publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
@@ -9,11 +9,24 @@ module Braintrust
9
9
  class Context
10
10
  attr_reader :task, :scorers, :cases, :experiment_id, :experiment_name,
11
11
  :project_id, :project_name, :state, :tracer_provider,
12
- :on_progress, :parent_span_attr, :generation
12
+ :on_progress, :parent_span_attr, :generation, :parameters
13
13
 
14
+ # @param task [Task] Normalized task wrapper
15
+ # @param scorers [Array<Scorer>] Normalized scorer wrappers
16
+ # @param cases [Cases] Normalized eval cases
17
+ # @param experiment_id [String, nil] Experiment ID for logging and trace linkage
18
+ # @param experiment_name [String, nil] Experiment name, included in span attributes
19
+ # @param project_id [String, nil] Project ID
20
+ # @param project_name [String, nil] Project name
21
+ # @param state [Braintrust::State, nil] Authenticated API state; nil for local-only evals
22
+ # @param tracer_provider [#tracer, nil] OpenTelemetry tracer provider
23
+ # @param on_progress [Proc, nil] Callback invoked after each case completes, receiving a progress Hash
24
+ # @param parent_span_attr [String, nil] Formatted parent span identifier ("type:id"), linking spans to a parent context
25
+ # @param generation [Integer, nil] Generation number from the parent span context, used to link spans in a trace hierarchy
26
+ # @param parameters [Hash, nil] Runtime parameters passed to task and scorers as a `parameters:` keyword argument
14
27
  def initialize(task:, scorers:, cases:, experiment_id: nil, experiment_name: nil,
15
28
  project_id: nil, project_name: nil, state: nil, tracer_provider: nil,
16
- on_progress: nil, parent_span_attr: nil, generation: nil)
29
+ on_progress: nil, parent_span_attr: nil, generation: nil, parameters: nil)
17
30
  @task = task
18
31
  @scorers = scorers
19
32
  @cases = cases
@@ -26,40 +39,83 @@ module Braintrust
26
39
  @on_progress = on_progress
27
40
  @parent_span_attr = parent_span_attr
28
41
  @generation = generation
42
+ @parameters = parameters
29
43
  end
30
44
 
31
45
  # Build a Context from raw user inputs.
32
- # Factory normalizes task, scorers, and cases into typed wrappers.
33
- # Parent is resolved into parent_span_attr and generation.
46
+ # Delegates to Factory for normalization.
47
+ # @param task [Task, Proc, #call] Task to evaluate; wrapped into a {Task} if needed
48
+ # @param scorers [Array<Scorer, Proc, String, Scorer::ID, #call>] Scorers; each is normalized into a {Scorer}
49
+ # @param cases [Cases, Array, Enumerable] Eval cases; wrapped into {Cases} if needed
50
+ # @param experiment_id [String, nil] Experiment ID for logging
51
+ # @param experiment_name [String, nil] Experiment name, included in span attributes
52
+ # @param project_id [String, nil] Project ID
53
+ # @param project_name [String, nil] Project name; required when resolving scorer slugs
54
+ # @param state [Braintrust::State, nil] Authenticated API state; nil for local-only evals
55
+ # @param tracer_provider [#tracer, nil] OpenTelemetry tracer provider; defaults to global provider
56
+ # @param on_progress [Proc, nil] Callback invoked after each case completes, receiving a progress Hash
57
+ # @param parent [Hash, nil] Parent span info with keys :object_type, :object_id, and optionally :generation
58
+ # @param parameters [Hash, nil] Runtime parameters passed to task and scorers as a `parameters:` keyword argument
59
+ # @return [Context]
34
60
  def self.build(task:, scorers:, cases:, experiment_id: nil, experiment_name: nil,
35
61
  project_id: nil, project_name: nil, state: nil, tracer_provider: nil,
36
- on_progress: nil, parent: nil)
37
- factory = Factory.new(state: state, tracer_provider: tracer_provider, project_name: project_name)
38
-
39
- Context.new(
40
- task: factory.normalize_task(task),
41
- scorers: factory.normalize_scorers(scorers),
42
- cases: factory.normalize_cases(cases),
43
- experiment_id: experiment_id,
44
- experiment_name: experiment_name,
45
- project_id: project_id,
46
- project_name: project_name,
47
- state: state,
48
- tracer_provider: tracer_provider,
49
- on_progress: on_progress,
50
- parent_span_attr: factory.resolve_parent_span_attr(parent),
51
- generation: parent&.dig(:generation)
62
+ on_progress: nil, parent: nil, parameters: nil)
63
+ Factory.new(
64
+ state: state, tracer_provider: tracer_provider,
65
+ project_id: project_id, project_name: project_name
66
+ ).build(
67
+ task: task, scorers: scorers, cases: cases,
68
+ experiment_id: experiment_id, experiment_name: experiment_name,
69
+ on_progress: on_progress, parent: parent, parameters: parameters
52
70
  )
53
71
  end
54
72
 
55
73
  # Encapsulates normalization of raw user inputs into typed wrappers.
56
74
  class Factory
57
- def initialize(state: nil, tracer_provider: nil, project_name: nil)
75
+ # @param state [Braintrust::State, nil] Authenticated API state; passed through to scorer resolution
76
+ # @param tracer_provider [#tracer, nil] OpenTelemetry tracer provider; passed through to remote scorers
77
+ # @param project_id [String, nil] Project ID; passed through to the built Context
78
+ # @param project_name [String, nil] Project name; required when resolving scorer slugs
79
+ def initialize(state: nil, tracer_provider: nil, project_id: nil, project_name: nil)
58
80
  @state = state
59
81
  @tracer_provider = tracer_provider
82
+ @project_id = project_id
60
83
  @project_name = project_name
61
84
  end
62
85
 
86
+ # Normalize raw inputs and construct a {Context}.
87
+ # @param task [Task, Proc, #call] Raw task
88
+ # @param scorers [Array] Raw scorers
89
+ # @param cases [Cases, Array, Enumerable] Raw eval cases
90
+ # @param experiment_id [String, nil]
91
+ # @param experiment_name [String, nil]
92
+ # @param on_progress [Proc, nil]
93
+ # @param parent [Hash, nil] Parent span info with keys :object_type, :object_id, and optionally :generation
94
+ # @return [Context]
95
+ def build(task:, scorers:, cases:, experiment_id: nil, experiment_name: nil,
96
+ on_progress: nil, parent: nil, parameters: nil)
97
+ Context.new(
98
+ task: normalize_task(task),
99
+ scorers: normalize_scorers(scorers),
100
+ cases: normalize_cases(cases),
101
+ experiment_id: experiment_id,
102
+ experiment_name: experiment_name,
103
+ project_id: @project_id,
104
+ project_name: @project_name,
105
+ state: @state,
106
+ tracer_provider: @tracer_provider || OpenTelemetry.tracer_provider,
107
+ on_progress: on_progress,
108
+ parent_span_attr: resolve_parent_span_attr(parent),
109
+ generation: parent&.dig(:generation),
110
+ parameters: parameters
111
+ )
112
+ end
113
+
114
+ private
115
+
116
+ # @param raw [Cases, Array, Enumerable, #each]
117
+ # @return [Cases]
118
+ # @raise [ArgumentError] if raw is not enumerable
63
119
  def normalize_cases(raw)
64
120
  case raw
65
121
  when Cases
@@ -75,11 +131,15 @@ module Braintrust
75
131
  end
76
132
  end
77
133
 
134
+ # @param parent [Hash, nil]
135
+ # @return [String, nil] Formatted as "type:id", e.g. "experiment_id:abc-123"
78
136
  def resolve_parent_span_attr(parent)
79
137
  return nil unless parent
80
138
  "#{parent[:object_type]}:#{parent[:object_id]}"
81
139
  end
82
140
 
141
+ # @param raw [Task, Proc, #call]
142
+ # @return [Task]
83
143
  def normalize_task(raw)
84
144
  case raw
85
145
  when Task
@@ -95,6 +155,9 @@ module Braintrust
95
155
  end
96
156
  end
97
157
 
158
+ # @param raw [Array<Scorer, Proc, String, Scorer::ID, #call>]
159
+ # @return [Array<Scorer>]
160
+ # @raise [ArgumentError] if a String slug is given without a project name
98
161
  def normalize_scorers(raw)
99
162
  raw.map do |scorer|
100
163
  case scorer
@@ -27,6 +27,18 @@ module Braintrust
27
27
  # Braintrust::Scorer.new("exact_match") { |expected:, output:| output == expected ? 1.0 : 0.0 }
28
28
  # ]
29
29
  # )
30
+ #
31
+ # @example Remote eval with parameters (for Playground UI)
32
+ # Braintrust::Eval::Evaluator.new(
33
+ # task: ->(input:, parameters:) {
34
+ # model = parameters["model"] || "gpt-4"
35
+ # # Use model to generate response...
36
+ # },
37
+ # scorers: [Braintrust::Scorer.new("exact") { |expected:, output:| output == expected ? 1.0 : 0.0 }],
38
+ # parameters: {
39
+ # "model" => {type: "string", default: "gpt-4", description: "Model to use"}
40
+ # }
41
+ # )
30
42
  class Evaluator
31
43
  attr_accessor :task, :scorers, :parameters
32
44
 
@@ -64,13 +76,15 @@ module Braintrust
64
76
  def run(cases, on_progress: nil, quiet: false,
65
77
  project: nil, experiment: nil, project_id: nil,
66
78
  dataset: nil, scorers: nil, parent: nil,
67
- state: nil, update: false, tracer_provider: nil)
79
+ state: nil, update: false, tracer_provider: nil,
80
+ parameters: nil)
68
81
  all_scorers = scorers ? self.scorers + scorers : self.scorers
69
82
  Braintrust::Eval.run(
70
83
  task: task, scorers: all_scorers, cases: cases, dataset: dataset,
71
84
  project: project, experiment: experiment, project_id: project_id,
72
85
  parent: parent, on_progress: on_progress, quiet: quiet,
73
- state: state, update: update, tracer_provider: tracer_provider
86
+ state: state, update: update, tracer_provider: tracer_provider,
87
+ parameters: parameters
74
88
  )
75
89
  end
76
90
  end
@@ -6,6 +6,7 @@ require_relative "summary"
6
6
  require_relative "trace"
7
7
  require_relative "../internal/thread_pool"
8
8
  require_relative "../api/internal/btql"
9
+ require_relative "../internal/retry"
9
10
 
10
11
  require "opentelemetry/sdk"
11
12
  require "json"
@@ -24,8 +25,7 @@ module Braintrust
24
25
  # @param eval_context [Context] Normalized eval context
25
26
  def initialize(eval_context)
26
27
  @eval_context = eval_context
27
- tracer_provider = eval_context.tracer_provider || OpenTelemetry.tracer_provider
28
- @tracer = tracer_provider.tracer("braintrust-eval")
28
+ @tracer = eval_context.tracer_provider.tracer("braintrust-eval")
29
29
 
30
30
  # Mutex for thread-safe score collection
31
31
  @score_mutex = Mutex.new
@@ -79,66 +79,69 @@ module Braintrust
79
79
 
80
80
  # Run a single test case with OpenTelemetry tracing
81
81
  # Creates eval span (parent) with task and score as children
82
- # @param case_context [CaseContext] The per-case accumulator
82
+ # @param kase [CaseContext] The per-case accumulator
83
83
  # @param errors [Queue] Thread-safe error collection queue
84
- def run_eval_case(case_context, errors)
85
- tracer.in_span("eval") do |eval_span|
84
+ def run_eval_case(kase, errors)
85
+ # Each eval case starts its own trace — detach from any ambient span context
86
+ eval_span = tracer.start_root_span("eval")
87
+ OpenTelemetry::Trace.with_span(eval_span) do
88
+ # Set attributes known before task execution
86
89
  eval_span.set_attribute("braintrust.parent", eval_context.parent_span_attr) if eval_context.parent_span_attr
87
-
88
- # Set tags early so they're present even if task fails
89
- eval_span.set_attribute("braintrust.tags", case_context.tags) if case_context.tags
90
+ set_json_attr(eval_span, "braintrust.span_attributes", build_span_attributes("eval"))
91
+ set_json_attr(eval_span, "braintrust.input_json", {input: kase.input})
92
+ set_json_attr(eval_span, "braintrust.expected", kase.expected) if kase.expected
93
+ set_json_attr(eval_span, "braintrust.metadata", kase.metadata) if kase.metadata
94
+ eval_span.set_attribute("braintrust.tags", kase.tags) if kase.tags
95
+ eval_span.set_attribute("braintrust.origin", kase.origin) if kase.origin
90
96
 
91
97
  # Run task
92
98
  begin
93
- case_context.output = run_task(case_context)
99
+ kase.output = run_task(kase)
94
100
  rescue => e
95
101
  # Error already recorded on task span, set eval span status
96
102
  eval_span.status = OpenTelemetry::Trace::Status.error(e.message)
97
- errors << "Task failed for input '#{case_context.input}': #{e.message}"
98
- report_progress(eval_span, case_context, error: e.message)
103
+ set_json_attr(eval_span, "braintrust.output_json", {output: nil})
104
+ errors << "Task failed for input '#{kase.input}': #{e.message}"
105
+ report_progress(eval_span, kase, error: e.message)
99
106
  next
100
107
  end
101
108
 
102
109
  # Flush spans so they're queryable via BTQL, then build trace
103
- eval_context.tracer_provider&.force_flush
104
- case_context.trace = build_trace(eval_span)
110
+ eval_context.tracer_provider.force_flush if eval_context.tracer_provider.respond_to?(:force_flush)
111
+ kase.trace = build_trace(eval_span)
105
112
 
106
113
  # Run scorers
107
- case_scores = nil
108
114
  begin
109
- case_scores = run_scorers(case_context)
115
+ run_scorers(kase)
110
116
  rescue => e
111
117
  # Error already recorded on score span, set eval span status
112
118
  eval_span.status = OpenTelemetry::Trace::Status.error(e.message)
113
- errors << "Scorers failed for input '#{case_context.input}': #{e.message}"
119
+ errors << "Scorers failed for input '#{kase.input}': #{e.message}"
114
120
  end
115
121
 
116
- # Set eval span attributes (after task and scorers complete)
117
- set_json_attr(eval_span, "braintrust.span_attributes", build_span_attributes("eval"))
118
- set_json_attr(eval_span, "braintrust.input_json", case_context.input)
119
- set_json_attr(eval_span, "braintrust.output_json", case_context.output)
120
- set_json_attr(eval_span, "braintrust.expected", case_context.expected) if case_context.expected
121
-
122
- # Set origin for cases from remote sources (already JSON-serialized)
123
- eval_span.set_attribute("braintrust.origin", case_context.origin) if case_context.origin
122
+ # Set output after task completes
123
+ set_json_attr(eval_span, "braintrust.output_json", {output: kase.output})
124
124
 
125
- report_progress(eval_span, case_context, data: case_context.output, scores: case_scores || {})
125
+ report_progress(eval_span, kase, data: kase.output)
126
126
  end
127
+ ensure
128
+ eval_span&.finish
127
129
  end
128
130
 
129
131
  # Run task with OpenTelemetry tracing
130
132
  # Creates task span with input and output
131
- # @param case_context [CaseContext] The per-case context
133
+ # @param kase [CaseContext] The per-case context
132
134
  # @return [Object] Task output
133
- def run_task(case_context)
135
+ def run_task(kase)
134
136
  tracer.in_span("task") do |task_span|
135
137
  task_span.set_attribute("braintrust.parent", eval_context.parent_span_attr) if eval_context.parent_span_attr
136
138
  set_json_attr(task_span, "braintrust.span_attributes", build_span_attributes("task"))
137
- set_json_attr(task_span, "braintrust.input_json", case_context.input)
139
+ set_json_attr(task_span, "braintrust.input_json", kase.input)
138
140
 
139
141
  begin
140
142
  output = eval_context.task.call(
141
- input: case_context.input
143
+ input: kase.input,
144
+ parameters: eval_context.parameters || {}
142
145
  )
143
146
  set_json_attr(task_span, "braintrust.output_json", output)
144
147
  output
@@ -151,43 +154,64 @@ module Braintrust
151
154
  end
152
155
  end
153
156
 
154
- # Run scorers with OpenTelemetry tracing
155
- # Creates single score span for all scorers
156
- # @param case_context [CaseContext] The per-case context (output must be populated)
157
- # @return [Hash] Scores hash { scorer_name => score_value }
158
- def run_scorers(case_context)
159
- tracer.in_span("score") do |score_span|
157
+ # Run scorers with OpenTelemetry tracing.
158
+ # Creates one span per scorer, each a direct child of the current (eval) span.
159
+ # @param kase [CaseContext] The per-case context (output must be populated)
160
+ def run_scorers(kase)
161
+ scorer_kwargs = {
162
+ input: kase.input,
163
+ expected: kase.expected,
164
+ output: kase.output,
165
+ metadata: kase.metadata || {},
166
+ trace: kase.trace,
167
+ parameters: eval_context.parameters || {}
168
+ }
169
+ scorer_input = {
170
+ input: kase.input,
171
+ expected: kase.expected,
172
+ output: kase.output,
173
+ metadata: kase.metadata || {},
174
+ parameters: eval_context.parameters || {}
175
+ }
176
+
177
+ scorer_error = nil
178
+ eval_context.scorers.each do |scorer|
179
+ collect_scores(run_scorer(scorer, scorer_kwargs, scorer_input))
180
+ rescue => e
181
+ scorer_error ||= e
182
+ end
183
+
184
+ raise scorer_error if scorer_error
185
+ end
186
+
187
+ # Run a single scorer inside its own span.
188
+ # @param scorer [Scorer] The scorer to run
189
+ # @param scorer_kwargs [Hash] Keyword arguments for the scorer
190
+ # @param scorer_input [Hash] Input to log on the span
191
+ # @return [Array<Hash>] Raw score results from the scorer
192
+ def run_scorer(scorer, scorer_kwargs, scorer_input)
193
+ tracer.in_span(scorer.name) do |score_span|
160
194
  score_span.set_attribute("braintrust.parent", eval_context.parent_span_attr) if eval_context.parent_span_attr
161
- set_json_attr(score_span, "braintrust.span_attributes", build_span_attributes("score"))
162
-
163
- scorer_kwargs = {
164
- input: case_context.input,
165
- expected: case_context.expected,
166
- output: case_context.output,
167
- metadata: case_context.metadata || {},
168
- trace: case_context.trace
169
- }
170
- scores = {}
171
- scorer_error = nil
172
- eval_context.scorers.each do |scorer|
173
- score_value = scorer.call(**scorer_kwargs)
174
- scores[scorer.name] = score_value
175
-
176
- # Collect raw score for summary (thread-safe)
177
- collect_score(scorer.name, score_value)
178
- rescue => e
179
- # Record first error but continue processing other scorers
180
- scorer_error ||= e
181
- record_span_error(score_span, e, "ScorerError")
182
- end
195
+ set_json_attr(score_span, "braintrust.span_attributes", build_scorer_span_attributes(scorer.name))
196
+ set_json_attr(score_span, "braintrust.input_json", scorer_input)
197
+
198
+ score_results = scorer.call(**scorer_kwargs)
183
199
 
184
- # Always set scores attribute, even if some scorers failed
185
- set_json_attr(score_span, "braintrust.scores", scores)
200
+ scorer_scores = {}
201
+ scorer_metadata = {}
202
+ score_results.each do |s|
203
+ scorer_scores[s[:name]] = s[:score]
204
+ scorer_metadata[s[:name]] = s[:metadata] if s[:metadata].is_a?(Hash)
205
+ end
186
206
 
187
- # Raise after setting scores so we can see which scorers succeeded
188
- raise scorer_error if scorer_error
207
+ set_json_attr(score_span, "braintrust.output_json", scorer_scores)
208
+ set_json_attr(score_span, "braintrust.scores", scorer_scores)
209
+ set_json_attr(score_span, "braintrust.metadata", scorer_metadata) unless scorer_metadata.empty?
189
210
 
190
- scores
211
+ score_results
212
+ rescue => e
213
+ record_span_error(score_span, e, "ScorerError")
214
+ raise
191
215
  end
192
216
  end
193
217
 
@@ -203,9 +227,23 @@ module Braintrust
203
227
  object_id = eval_context.experiment_id
204
228
  btql = API::Internal::BTQL.new(eval_context.state)
205
229
 
206
- Eval::Trace.new(
207
- spans: -> { btql.trace_spans(object_type: object_type, object_id: object_id, root_span_id: root_span_id) }
208
- )
230
+ Eval::Trace.new(spans: -> { fetch_trace_spans(btql, object_type, object_id, root_span_id) })
231
+ end
232
+
233
+ # Fetch trace spans with retry to handle freshness and ingestion lag.
234
+ # @return [Array<Hash>] Parsed span data
235
+ def fetch_trace_spans(btql, object_type, object_id, root_span_id)
236
+ rows, _freshness = Internal::Retry.with_backoff(
237
+ max_retries: 7, base_delay: 1.0, max_delay: 8.0,
238
+ until: ->(result) {
239
+ r, f = result
240
+ f == "complete" && !r.empty?
241
+ }
242
+ ) { btql.trace_spans(object_type: object_type, object_id: object_id, root_span_id: root_span_id) }
243
+ rows || []
244
+ rescue => e
245
+ Braintrust::Log.warn("[BTQL] Query failed: #{e.message}")
246
+ []
209
247
  end
210
248
 
211
249
  # Build a CaseContext from a Case struct
@@ -220,11 +258,11 @@ module Braintrust
220
258
 
221
259
  # Report progress for a case via on_progress callback.
222
260
  # Rescues errors in the callback so a broken handler never crashes the eval.
223
- def report_progress(eval_span, case_context, **fields)
261
+ def report_progress(eval_span, kase, **fields)
224
262
  return unless eval_context.on_progress
225
263
  progress = {"id" => eval_span.context.hex_span_id}.merge(fields.transform_keys(&:to_s))
226
- if case_context.origin
227
- progress["origin"] = case_context.origin.is_a?(String) ? JSON.parse(case_context.origin) : case_context.origin
264
+ if kase.origin
265
+ progress["origin"] = kase.origin.is_a?(String) ? JSON.parse(kase.origin) : kase.origin
228
266
  end
229
267
  eval_context.on_progress.call(progress)
230
268
  rescue => e
@@ -255,6 +293,16 @@ module Braintrust
255
293
  attrs
256
294
  end
257
295
 
296
+ # Build span_attributes for a scorer span.
297
+ # Each scorer gets its own span with type "score", purpose "scorer", and the scorer's name.
298
+ # @param scorer_name [String] The scorer name
299
+ # @return [Hash]
300
+ def build_scorer_span_attributes(scorer_name)
301
+ attrs = {type: "score", name: scorer_name, purpose: "scorer"}
302
+ attrs[:generation] = eval_context.generation if eval_context.generation
303
+ attrs
304
+ end
305
+
258
306
  # Set a span attribute by JSON encoding the value
259
307
  # @param span [OpenTelemetry::Trace::Span] The span
260
308
  # @param key [String] The attribute key
@@ -263,14 +311,11 @@ module Braintrust
263
311
  span.set_attribute(key, JSON.dump(value))
264
312
  end
265
313
 
266
- # Collect a single score value for summary calculation
267
- # @param name [String] Scorer name
268
- # @param value [Object] Score value (only Numeric values are collected)
269
- def collect_score(name, value)
270
- return unless value.is_a?(Numeric)
271
-
314
+ # Collect score results into the summary accumulator (thread-safe).
315
+ # @param score_results [Array<Hash>] Score results from a scorer
316
+ def collect_scores(score_results)
272
317
  @score_mutex.synchronize do
273
- (@scores[name] ||= []) << value
318
+ score_results.each { |s| (@scores[s[:name]] ||= []) << s[:score] }
274
319
  end
275
320
  end
276
321
  end
@@ -105,6 +105,21 @@ module Braintrust
105
105
  # scorers: [->(expected:, output:) { output == expected ? 1.0 : 0.0 }]
106
106
  # )
107
107
  #
108
+ # @example Using parameters for configurable tasks
109
+ # # Tasks and scorers that declare `parameters:` receive it automatically.
110
+ # # Those that don't are unaffected — KeywordFilter strips unknown kwargs.
111
+ # Braintrust::Eval.run(
112
+ # project: "my-project",
113
+ # experiment: "with-params",
114
+ # cases: [{input: "hello", expected: "HELLO!"}],
115
+ # task: ->(input:, parameters:) {
116
+ # suffix = parameters["suffix"] || ""
117
+ # input.upcase + suffix
118
+ # },
119
+ # scorers: [->(expected:, output:) { output == expected ? 1.0 : 0.0 }],
120
+ # parameters: {"suffix" => "!"}
121
+ # )
122
+ #
108
123
  # @example Using metadata and tags
109
124
  # Braintrust::Eval.run(
110
125
  # project: "my-project",
@@ -158,11 +173,15 @@ module Braintrust
158
173
  # @param quiet [Boolean] If true, suppress result output (default: false)
159
174
  # @param state [State, nil] Braintrust state (defaults to global state)
160
175
  # @param tracer_provider [TracerProvider, nil] OpenTelemetry tracer provider (defaults to global)
176
+ # @param project_id [String, nil] Project UUID (skips project creation when provided)
177
+ # @param parent [Hash, nil] Parent span context ({object_type:, object_id:, generation:})
178
+ # @param parameters [Hash, nil] Runtime parameters passed to task and scorers as a `parameters:` keyword argument
161
179
  # @return [Result]
162
180
  def run(task:, scorers:, project: nil, experiment: nil,
163
181
  cases: nil, dataset: nil, on_progress: nil,
164
182
  parallelism: 1, tags: nil, metadata: nil, update: false, quiet: false,
165
- state: nil, tracer_provider: nil, project_id: nil, parent: nil)
183
+ state: nil, tracer_provider: nil, project_id: nil, parent: nil,
184
+ parameters: nil)
166
185
  # Validate required parameters
167
186
  validate_params!(task: task, scorers: scorers, cases: cases, dataset: dataset)
168
187
 
@@ -205,7 +224,8 @@ module Braintrust
205
224
  state: state,
206
225
  tracer_provider: tracer_provider,
207
226
  on_progress: on_progress,
208
- parent: parent
227
+ parent: parent,
228
+ parameters: parameters
209
229
  )
210
230
  result = Runner.new(context).run(parallelism: parallelism)
211
231
 
@@ -0,0 +1,41 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Braintrust
4
+ module Internal
5
+ module Retry
6
+ MAX_RETRIES = 7
7
+ BASE_DELAY = 1.0
8
+ MAX_DELAY = 8.0
9
+
10
+ # Retry a block with exponential backoff.
11
+ #
12
+ # The block is the task to attempt. Its return value is captured on each attempt.
13
+ #
14
+ # @param max_retries [Integer] Maximum number of retries after the first attempt
15
+ # @param base_delay [Float] Initial delay in seconds (doubles each retry)
16
+ # @param max_delay [Float] Cap on delay between retries
17
+ # @param until [Proc, nil] Optional condition — receives the block result; a truthy return value stops retrying.
18
+ # When omitted, the block result's own truthiness decides.
19
+ # @return [Object] The last block result (whether retries were exhausted or the condition was met)
20
+ #
21
+ # @example Simple: retry until truthy
22
+ # conn = Retry.with_backoff(max_retries: 5) { try_connect }
23
+ #
24
+ # @example With condition: retry until non-empty
25
+ # data = Retry.with_backoff(until: ->(r) { r.any? }) { api.fetch }
26
+ #
27
+ def self.with_backoff(max_retries: MAX_RETRIES, base_delay: BASE_DELAY, max_delay: MAX_DELAY, until: nil, &task)
28
+ check = binding.local_variable_get(:until)
29
+ result = task.call
30
+ retries = 0
31
+ while retries < max_retries && !(check ? check.call(result) : result)
32
+ retries += 1
33
+ delay = [base_delay * (2**(retries - 1)), max_delay].min
34
+ sleep(delay)
35
+ result = task.call
36
+ end
37
+ result
38
+ end
39
+ end
40
+ end
41
+ end
@@ -11,23 +11,28 @@ module Braintrust
11
11
  # params = prompt.build(text: "Article to summarize...")
12
12
  # client.messages.create(**params)
13
13
  class Prompt
14
- attr_reader :id, :name, :slug, :project_id
14
+ attr_reader :id, :name, :slug, :project_id, :version
15
15
 
16
16
  # Load a prompt from Braintrust
17
17
  #
18
- # @param project [String] Project name
18
+ # @param project [String, nil] Project name (provide either project or project_id)
19
+ # @param project_id [String, nil] Project ID (UUID, provide either project or project_id)
19
20
  # @param slug [String] Prompt slug
20
21
  # @param version [String, nil] Specific version (default: latest)
21
22
  # @param defaults [Hash] Default variable values for build()
22
23
  # @param api [API, nil] Braintrust API client (default: creates one using global state)
23
24
  # @return [Prompt]
24
- def self.load(project:, slug:, version: nil, defaults: {}, api: nil)
25
+ def self.load(slug:, project: nil, project_id: nil, version: nil, defaults: {}, api: nil)
26
+ raise ArgumentError, "Either project or project_id is required" unless project || project_id
27
+
25
28
  api ||= API.new
26
29
 
27
30
  # Find the function by project + slug
28
- result = api.functions.list(project_name: project, slug: slug)
31
+ result = api.functions.list(project_name: project, project_id: project_id, slug: slug)
29
32
  function = result.dig("objects")&.first
30
- raise Error, "Prompt '#{slug}' not found in project '#{project}'" unless function
33
+
34
+ identifier = project ? "project '#{project}'" : "project_id '#{project_id}'"
35
+ raise Error, "Prompt '#{slug}' not found in #{identifier}" unless function
31
36
 
32
37
  # Fetch full function data including prompt_data
33
38
  full_data = api.functions.get(id: function["id"], version: version)
@@ -47,6 +52,7 @@ module Braintrust
47
52
  @name = data["name"]
48
53
  @slug = data["slug"]
49
54
  @project_id = data["project_id"]
55
+ @version = data["_xact_id"]
50
56
  end
51
57
 
52
58
  # Get the raw prompt definition