braintrust 0.3.0 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: c07be3c454a924c5c97c2653136a2b9cdd1098409af16326b1db8676c5c8b0d2
4
- data.tar.gz: c1eb75eefdcacebc2c955ae23aa3196d276a76d6ab828cdfb817c7e9168325b3
3
+ metadata.gz: 27e146b06451b844b1e6416353b20f6bd572c3d1169a12a439745cb7280ce0ec
4
+ data.tar.gz: d726e3a146a2180bf2714846d56e65fa9d3ef1ce773adb116a8e6b1b79ba823c
5
5
  SHA512:
6
- metadata.gz: d02058bd5321ed16ea2f785aaeb24f4d4f105c5357c3c7ceb2a8a02c090b69c7187623b23e14d5026bb0cf236e64dddae7025509d7b2d6769bb50f110612120f
7
- data.tar.gz: 15627209b382c023c2640e1d2219b6d33b84cb7c67ba1a3b8e3ebbe1aa912d3df832583a1e37b3831699b67ea81f3b4242b67a606dfdd727827e648a6509fea7
6
+ metadata.gz: 69e5150452e9dde1491664af1137cc05a9a5b651dbb5fdee27ff8a09e0e11b51c283c163019566045e1771679ed6f2eece4dd1753aa06f899e3681e7c6b99d15
7
+ data.tar.gz: 28cc8c86bdc13db8d33ad0dc28325c0d858f37ba1b9f41212c52e514eed649b14596c66153bca58de251c4c6dd1ddcb170d24ae100a33f912f49349671821f7a
data/README.md CHANGED
@@ -21,6 +21,7 @@ This is the official Ruby SDK for [Braintrust](https://www.braintrust.dev), for
21
21
  - [Attachments](#attachments)
22
22
  - [Viewing traces](#viewing-traces)
23
23
  - [Evals](#evals)
24
+ - [Tasks](#tasks)
24
25
  - [Datasets](#datasets)
25
26
  - [Scorers](#scorers)
26
27
  - [Dev Server](#dev-server)
@@ -261,6 +262,48 @@ Braintrust::Eval.run(
261
262
 
262
263
  See [eval.rb](./examples/eval.rb) for a full example.
263
264
 
265
+ ### Tasks
266
+
267
+ Define the code being evaluated as a lambda or a class. Tasks receive `input:` as a keyword argument:
268
+
269
+ ```ruby
270
+ # Lambda
271
+ task = ->(input:) { classify(input) }
272
+
273
+ # Class-based (auto-derives name from class: "food_classifier")
274
+ class FoodClassifier
275
+ include Braintrust::Task
276
+
277
+ def call(input:)
278
+ classify(input)
279
+ end
280
+ end
281
+ ```
282
+
283
+ #### With parameters
284
+
285
+ Tasks can accept `parameters:` as input to drive their behavior:
286
+
287
+ ```ruby
288
+ task = ->(input:, parameters:) {
289
+ value = parameters["value"]
290
+ from_unit = parameters["from_unit"] || 'c'
291
+ to_unit = parameters["to_unit"] || 'f'
292
+
293
+ convert_temp(temperature: value, from_unit: from_unit, to_unit: to_unit)
294
+ }
295
+
296
+ Braintrust::Eval.run(
297
+ project: "my-project",
298
+ cases: [...],
299
+ task: task,
300
+ scorers: [...],
301
+ parameters: {"value" => 23.0}
302
+ )
303
+ ```
304
+
305
+ See [parameters.rb](./examples/eval/parameters.rb) for a full example.
306
+
264
307
  ### Datasets
265
308
 
266
309
  Use test cases from a Braintrust dataset:
@@ -390,6 +433,19 @@ Braintrust::Eval.run(
390
433
 
391
434
  See [trace_scoring.rb](./examples/eval/trace_scoring.rb) for a full example.
392
435
 
436
+ #### Scorer parameters
437
+
438
+ Scorers can also accept `parameters:` to use runtime configuration in their scoring logic. Like tasks, scorers that don't declare `parameters:` are unaffected:
439
+
440
+ ```ruby
441
+ Braintrust::Scorer.new("threshold_match") do |expected:, output:, parameters:|
442
+ threshold = parameters["threshold"] || 0.8
443
+ similarity(output, expected) >= threshold ? 1.0 : 0.0
444
+ end
445
+ ```
446
+
447
+ See [parameters.rb](./examples/eval/parameters.rb) for a full example.
448
+
393
449
  ### Dev Server
394
450
 
395
451
  Run evaluations from the Braintrust web UI against code in your own application.
@@ -25,13 +25,15 @@ module Braintrust
25
25
  # List functions with optional filters
26
26
  # GET /v1/function?project_name=X&...
27
27
  # @param project_name [String, nil] Filter by project name
28
+ # @param project_id [String, nil] Filter by project ID (UUID)
28
29
  # @param function_name [String, nil] Filter by function name
29
30
  # @param slug [String, nil] Filter by slug
30
31
  # @param limit [Integer, nil] Limit number of results
31
32
  # @return [Hash] Response with "objects" array
32
- def list(project_name: nil, function_name: nil, slug: nil, limit: nil)
33
+ def list(project_name: nil, project_id: nil, function_name: nil, slug: nil, limit: nil)
33
34
  params = {}
34
35
  params["project_name"] = project_name if project_name
36
+ params["project_id"] = project_id if project_id
35
37
  params["function_name"] = function_name if function_name
36
38
  params["slug"] = slug if slug
37
39
  params["limit"] = limit if limit
@@ -11,19 +11,6 @@ module Braintrust
11
11
  # Internal BTQL client for querying spans.
12
12
  # Not part of the public API — instantiated directly where needed.
13
13
  class BTQL
14
- # Maximum number of retries before returning partial results.
15
- # Covers both freshness lag (partially indexed) and ingestion lag
16
- # (spans not yet visible to BTQL after OTel flush).
17
- MAX_FRESHNESS_RETRIES = 7
18
-
19
- # Base delay (seconds) between retries (doubles each attempt, capped).
20
- FRESHNESS_BASE_DELAY = 1.0
21
-
22
- # Maximum delay (seconds) between retries. Caps exponential growth
23
- # so we keep polling at a reasonable rate in the later window.
24
- # Schedule: 1, 2, 4, 8, 8, 8, 8 = ~39s total worst-case.
25
- MAX_FRESHNESS_DELAY = 8.0
26
-
27
14
  def initialize(state)
28
15
  @state = state
29
16
  end
@@ -31,36 +18,19 @@ module Braintrust
31
18
  # Query spans belonging to a specific trace within an object.
32
19
  #
33
20
  # Builds a BTQL SQL query that matches the root_span_id and excludes scorer spans.
34
- # Retries with exponential backoff if the response indicates data is not yet fresh.
21
+ # Returns a single-shot result; callers are responsible for retry and error handling.
35
22
  #
36
23
  # @param object_type [String] e.g. "experiment"
37
24
  # @param object_id [String] Object UUID
38
25
  # @param root_span_id [String] Hex trace ID of the root span
39
- # @return [Array<Hash>] Parsed span data
26
+ # @return [Array(Array<Hash>, String)] [rows, freshness]
40
27
  def trace_spans(object_type:, object_id:, root_span_id:)
41
28
  query = build_trace_query(
42
29
  object_type: object_type,
43
30
  object_id: object_id,
44
31
  root_span_id: root_span_id
45
32
  )
46
- payload = {query: query, fmt: "jsonl"}
47
-
48
- retries = 0
49
- loop do
50
- rows, freshness = execute_query(payload)
51
- # Return when data is fresh AND non-empty, or we've exhausted retries.
52
- # We retry on empty even when "complete" because there is ingestion lag
53
- # between OTel flush and BTQL indexing — the server may report "complete"
54
- # before it knows about newly-flushed spans.
55
- return rows if (freshness == "complete" && !rows.empty?) || retries >= MAX_FRESHNESS_RETRIES
56
-
57
- retries += 1
58
- delay = [FRESHNESS_BASE_DELAY * (2**(retries - 1)), MAX_FRESHNESS_DELAY].min
59
- sleep(delay)
60
- end
61
- rescue => e
62
- Braintrust::Log.warn("[BTQL] Query failed: #{e.message}")
63
- []
33
+ execute_query(query: query, fmt: "jsonl")
64
34
  end
65
35
 
66
36
  private
@@ -9,11 +9,24 @@ module Braintrust
9
9
  class Context
10
10
  attr_reader :task, :scorers, :cases, :experiment_id, :experiment_name,
11
11
  :project_id, :project_name, :state, :tracer_provider,
12
- :on_progress, :parent_span_attr, :generation
12
+ :on_progress, :parent_span_attr, :generation, :parameters
13
13
 
14
+ # @param task [Task] Normalized task wrapper
15
+ # @param scorers [Array<Scorer>] Normalized scorer wrappers
16
+ # @param cases [Cases] Normalized eval cases
17
+ # @param experiment_id [String, nil] Experiment ID for logging and trace linkage
18
+ # @param experiment_name [String, nil] Experiment name, included in span attributes
19
+ # @param project_id [String, nil] Project ID
20
+ # @param project_name [String, nil] Project name
21
+ # @param state [Braintrust::State, nil] Authenticated API state; nil for local-only evals
22
+ # @param tracer_provider [#tracer, nil] OpenTelemetry tracer provider
23
+ # @param on_progress [Proc, nil] Callback invoked after each case completes, receiving a progress Hash
24
+ # @param parent_span_attr [String, nil] Formatted parent span identifier ("type:id"), linking spans to a parent context
25
+ # @param generation [Integer, nil] Generation number from the parent span context, used to link spans in a trace hierarchy
26
+ # @param parameters [Hash, nil] Runtime parameters passed to task and scorers as a `parameters:` keyword argument
14
27
  def initialize(task:, scorers:, cases:, experiment_id: nil, experiment_name: nil,
15
28
  project_id: nil, project_name: nil, state: nil, tracer_provider: nil,
16
- on_progress: nil, parent_span_attr: nil, generation: nil)
29
+ on_progress: nil, parent_span_attr: nil, generation: nil, parameters: nil)
17
30
  @task = task
18
31
  @scorers = scorers
19
32
  @cases = cases
@@ -26,40 +39,83 @@ module Braintrust
26
39
  @on_progress = on_progress
27
40
  @parent_span_attr = parent_span_attr
28
41
  @generation = generation
42
+ @parameters = parameters
29
43
  end
30
44
 
31
45
  # Build a Context from raw user inputs.
32
- # Factory normalizes task, scorers, and cases into typed wrappers.
33
- # Parent is resolved into parent_span_attr and generation.
46
+ # Delegates to Factory for normalization.
47
+ # @param task [Task, Proc, #call] Task to evaluate; wrapped into a {Task} if needed
48
+ # @param scorers [Array<Scorer, Proc, String, Scorer::ID, #call>] Scorers; each is normalized into a {Scorer}
49
+ # @param cases [Cases, Array, Enumerable] Eval cases; wrapped into {Cases} if needed
50
+ # @param experiment_id [String, nil] Experiment ID for logging
51
+ # @param experiment_name [String, nil] Experiment name, included in span attributes
52
+ # @param project_id [String, nil] Project ID
53
+ # @param project_name [String, nil] Project name; required when resolving scorer slugs
54
+ # @param state [Braintrust::State, nil] Authenticated API state; nil for local-only evals
55
+ # @param tracer_provider [#tracer, nil] OpenTelemetry tracer provider; defaults to global provider
56
+ # @param on_progress [Proc, nil] Callback invoked after each case completes, receiving a progress Hash
57
+ # @param parent [Hash, nil] Parent span info with keys :object_type, :object_id, and optionally :generation
58
+ # @param parameters [Hash, nil] Runtime parameters passed to task and scorers as a `parameters:` keyword argument
59
+ # @return [Context]
34
60
  def self.build(task:, scorers:, cases:, experiment_id: nil, experiment_name: nil,
35
61
  project_id: nil, project_name: nil, state: nil, tracer_provider: nil,
36
- on_progress: nil, parent: nil)
37
- factory = Factory.new(state: state, tracer_provider: tracer_provider, project_name: project_name)
38
-
39
- Context.new(
40
- task: factory.normalize_task(task),
41
- scorers: factory.normalize_scorers(scorers),
42
- cases: factory.normalize_cases(cases),
43
- experiment_id: experiment_id,
44
- experiment_name: experiment_name,
45
- project_id: project_id,
46
- project_name: project_name,
47
- state: state,
48
- tracer_provider: tracer_provider,
49
- on_progress: on_progress,
50
- parent_span_attr: factory.resolve_parent_span_attr(parent),
51
- generation: parent&.dig(:generation)
62
+ on_progress: nil, parent: nil, parameters: nil)
63
+ Factory.new(
64
+ state: state, tracer_provider: tracer_provider,
65
+ project_id: project_id, project_name: project_name
66
+ ).build(
67
+ task: task, scorers: scorers, cases: cases,
68
+ experiment_id: experiment_id, experiment_name: experiment_name,
69
+ on_progress: on_progress, parent: parent, parameters: parameters
52
70
  )
53
71
  end
54
72
 
55
73
  # Encapsulates normalization of raw user inputs into typed wrappers.
56
74
  class Factory
57
- def initialize(state: nil, tracer_provider: nil, project_name: nil)
75
+ # @param state [Braintrust::State, nil] Authenticated API state; passed through to scorer resolution
76
+ # @param tracer_provider [#tracer, nil] OpenTelemetry tracer provider; passed through to remote scorers
77
+ # @param project_id [String, nil] Project ID; passed through to the built Context
78
+ # @param project_name [String, nil] Project name; required when resolving scorer slugs
79
+ def initialize(state: nil, tracer_provider: nil, project_id: nil, project_name: nil)
58
80
  @state = state
59
81
  @tracer_provider = tracer_provider
82
+ @project_id = project_id
60
83
  @project_name = project_name
61
84
  end
62
85
 
86
+ # Normalize raw inputs and construct a {Context}.
87
+ # @param task [Task, Proc, #call] Raw task
88
+ # @param scorers [Array] Raw scorers
89
+ # @param cases [Cases, Array, Enumerable] Raw eval cases
90
+ # @param experiment_id [String, nil]
91
+ # @param experiment_name [String, nil]
92
+ # @param on_progress [Proc, nil]
93
+ # @param parent [Hash, nil] Parent span info with keys :object_type, :object_id, and optionally :generation
94
+ # @return [Context]
95
+ def build(task:, scorers:, cases:, experiment_id: nil, experiment_name: nil,
96
+ on_progress: nil, parent: nil, parameters: nil)
97
+ Context.new(
98
+ task: normalize_task(task),
99
+ scorers: normalize_scorers(scorers),
100
+ cases: normalize_cases(cases),
101
+ experiment_id: experiment_id,
102
+ experiment_name: experiment_name,
103
+ project_id: @project_id,
104
+ project_name: @project_name,
105
+ state: @state,
106
+ tracer_provider: @tracer_provider || OpenTelemetry.tracer_provider,
107
+ on_progress: on_progress,
108
+ parent_span_attr: resolve_parent_span_attr(parent),
109
+ generation: parent&.dig(:generation),
110
+ parameters: parameters
111
+ )
112
+ end
113
+
114
+ private
115
+
116
+ # @param raw [Cases, Array, Enumerable, #each]
117
+ # @return [Cases]
118
+ # @raise [ArgumentError] if raw is not enumerable
63
119
  def normalize_cases(raw)
64
120
  case raw
65
121
  when Cases
@@ -75,11 +131,15 @@ module Braintrust
75
131
  end
76
132
  end
77
133
 
134
+ # @param parent [Hash, nil]
135
+ # @return [String, nil] Formatted as "type:id", e.g. "experiment_id:abc-123"
78
136
  def resolve_parent_span_attr(parent)
79
137
  return nil unless parent
80
138
  "#{parent[:object_type]}:#{parent[:object_id]}"
81
139
  end
82
140
 
141
+ # @param raw [Task, Proc, #call]
142
+ # @return [Task]
83
143
  def normalize_task(raw)
84
144
  case raw
85
145
  when Task
@@ -95,6 +155,9 @@ module Braintrust
95
155
  end
96
156
  end
97
157
 
158
+ # @param raw [Array<Scorer, Proc, String, Scorer::ID, #call>]
159
+ # @return [Array<Scorer>]
160
+ # @raise [ArgumentError] if a String slug is given without a project name
98
161
  def normalize_scorers(raw)
99
162
  raw.map do |scorer|
100
163
  case scorer
@@ -27,6 +27,18 @@ module Braintrust
27
27
  # Braintrust::Scorer.new("exact_match") { |expected:, output:| output == expected ? 1.0 : 0.0 }
28
28
  # ]
29
29
  # )
30
+ #
31
+ # @example Remote eval with parameters (for Playground UI)
32
+ # Braintrust::Eval::Evaluator.new(
33
+ # task: ->(input:, parameters:) {
34
+ # model = parameters["model"] || "gpt-4"
35
+ # # Use model to generate response...
36
+ # },
37
+ # scorers: [Braintrust::Scorer.new("exact") { |expected:, output:| output == expected ? 1.0 : 0.0 }],
38
+ # parameters: {
39
+ # "model" => {type: "string", default: "gpt-4", description: "Model to use"}
40
+ # }
41
+ # )
30
42
  class Evaluator
31
43
  attr_accessor :task, :scorers, :parameters
32
44
 
@@ -64,13 +76,15 @@ module Braintrust
64
76
  def run(cases, on_progress: nil, quiet: false,
65
77
  project: nil, experiment: nil, project_id: nil,
66
78
  dataset: nil, scorers: nil, parent: nil,
67
- state: nil, update: false, tracer_provider: nil)
79
+ state: nil, update: false, tracer_provider: nil,
80
+ parameters: nil)
68
81
  all_scorers = scorers ? self.scorers + scorers : self.scorers
69
82
  Braintrust::Eval.run(
70
83
  task: task, scorers: all_scorers, cases: cases, dataset: dataset,
71
84
  project: project, experiment: experiment, project_id: project_id,
72
85
  parent: parent, on_progress: on_progress, quiet: quiet,
73
- state: state, update: update, tracer_provider: tracer_provider
86
+ state: state, update: update, tracer_provider: tracer_provider,
87
+ parameters: parameters
74
88
  )
75
89
  end
76
90
  end
@@ -6,6 +6,7 @@ require_relative "summary"
6
6
  require_relative "trace"
7
7
  require_relative "../internal/thread_pool"
8
8
  require_relative "../api/internal/btql"
9
+ require_relative "../internal/retry"
9
10
 
10
11
  require "opentelemetry/sdk"
11
12
  require "json"
@@ -24,8 +25,7 @@ module Braintrust
24
25
  # @param eval_context [Context] Normalized eval context
25
26
  def initialize(eval_context)
26
27
  @eval_context = eval_context
27
- tracer_provider = eval_context.tracer_provider || OpenTelemetry.tracer_provider
28
- @tracer = tracer_provider.tracer("braintrust-eval")
28
+ @tracer = eval_context.tracer_provider.tracer("braintrust-eval")
29
29
 
30
30
  # Mutex for thread-safe score collection
31
31
  @score_mutex = Mutex.new
@@ -79,50 +79,50 @@ module Braintrust
79
79
 
80
80
  # Run a single test case with OpenTelemetry tracing
81
81
  # Creates eval span (parent) with task and score as children
82
- # @param case_context [CaseContext] The per-case accumulator
82
+ # @param kase [CaseContext] The per-case accumulator
83
83
  # @param errors [Queue] Thread-safe error collection queue
84
- def run_eval_case(case_context, errors)
84
+ def run_eval_case(kase, errors)
85
85
  # Each eval case starts its own trace — detach from any ambient span context
86
86
  eval_span = tracer.start_root_span("eval")
87
87
  OpenTelemetry::Trace.with_span(eval_span) do
88
88
  # Set attributes known before task execution
89
89
  eval_span.set_attribute("braintrust.parent", eval_context.parent_span_attr) if eval_context.parent_span_attr
90
90
  set_json_attr(eval_span, "braintrust.span_attributes", build_span_attributes("eval"))
91
- set_json_attr(eval_span, "braintrust.input_json", {input: case_context.input})
92
- set_json_attr(eval_span, "braintrust.expected", case_context.expected) if case_context.expected
93
- set_json_attr(eval_span, "braintrust.metadata", case_context.metadata) if case_context.metadata
94
- eval_span.set_attribute("braintrust.tags", case_context.tags) if case_context.tags
95
- eval_span.set_attribute("braintrust.origin", case_context.origin) if case_context.origin
91
+ set_json_attr(eval_span, "braintrust.input_json", {input: kase.input})
92
+ set_json_attr(eval_span, "braintrust.expected", kase.expected) if kase.expected
93
+ set_json_attr(eval_span, "braintrust.metadata", kase.metadata) if kase.metadata
94
+ eval_span.set_attribute("braintrust.tags", kase.tags) if kase.tags
95
+ eval_span.set_attribute("braintrust.origin", kase.origin) if kase.origin
96
96
 
97
97
  # Run task
98
98
  begin
99
- case_context.output = run_task(case_context)
99
+ kase.output = run_task(kase)
100
100
  rescue => e
101
101
  # Error already recorded on task span, set eval span status
102
102
  eval_span.status = OpenTelemetry::Trace::Status.error(e.message)
103
103
  set_json_attr(eval_span, "braintrust.output_json", {output: nil})
104
- errors << "Task failed for input '#{case_context.input}': #{e.message}"
105
- report_progress(eval_span, case_context, error: e.message)
104
+ errors << "Task failed for input '#{kase.input}': #{e.message}"
105
+ report_progress(eval_span, kase, error: e.message)
106
106
  next
107
107
  end
108
108
 
109
109
  # Flush spans so they're queryable via BTQL, then build trace
110
- eval_context.tracer_provider&.force_flush
111
- case_context.trace = build_trace(eval_span)
110
+ eval_context.tracer_provider.force_flush if eval_context.tracer_provider.respond_to?(:force_flush)
111
+ kase.trace = build_trace(eval_span)
112
112
 
113
113
  # Run scorers
114
114
  begin
115
- run_scorers(case_context)
115
+ run_scorers(kase)
116
116
  rescue => e
117
117
  # Error already recorded on score span, set eval span status
118
118
  eval_span.status = OpenTelemetry::Trace::Status.error(e.message)
119
- errors << "Scorers failed for input '#{case_context.input}': #{e.message}"
119
+ errors << "Scorers failed for input '#{kase.input}': #{e.message}"
120
120
  end
121
121
 
122
122
  # Set output after task completes
123
- set_json_attr(eval_span, "braintrust.output_json", {output: case_context.output})
123
+ set_json_attr(eval_span, "braintrust.output_json", {output: kase.output})
124
124
 
125
- report_progress(eval_span, case_context, data: case_context.output)
125
+ report_progress(eval_span, kase, data: kase.output)
126
126
  end
127
127
  ensure
128
128
  eval_span&.finish
@@ -130,17 +130,18 @@ module Braintrust
130
130
 
131
131
  # Run task with OpenTelemetry tracing
132
132
  # Creates task span with input and output
133
- # @param case_context [CaseContext] The per-case context
133
+ # @param kase [CaseContext] The per-case context
134
134
  # @return [Object] Task output
135
- def run_task(case_context)
135
+ def run_task(kase)
136
136
  tracer.in_span("task") do |task_span|
137
137
  task_span.set_attribute("braintrust.parent", eval_context.parent_span_attr) if eval_context.parent_span_attr
138
138
  set_json_attr(task_span, "braintrust.span_attributes", build_span_attributes("task"))
139
- set_json_attr(task_span, "braintrust.input_json", case_context.input)
139
+ set_json_attr(task_span, "braintrust.input_json", kase.input)
140
140
 
141
141
  begin
142
142
  output = eval_context.task.call(
143
- input: case_context.input
143
+ input: kase.input,
144
+ parameters: eval_context.parameters || {}
144
145
  )
145
146
  set_json_attr(task_span, "braintrust.output_json", output)
146
147
  output
@@ -155,20 +156,22 @@ module Braintrust
155
156
 
156
157
  # Run scorers with OpenTelemetry tracing.
157
158
  # Creates one span per scorer, each a direct child of the current (eval) span.
158
- # @param case_context [CaseContext] The per-case context (output must be populated)
159
- def run_scorers(case_context)
159
+ # @param kase [CaseContext] The per-case context (output must be populated)
160
+ def run_scorers(kase)
160
161
  scorer_kwargs = {
161
- input: case_context.input,
162
- expected: case_context.expected,
163
- output: case_context.output,
164
- metadata: case_context.metadata || {},
165
- trace: case_context.trace
162
+ input: kase.input,
163
+ expected: kase.expected,
164
+ output: kase.output,
165
+ metadata: kase.metadata || {},
166
+ trace: kase.trace,
167
+ parameters: eval_context.parameters || {}
166
168
  }
167
169
  scorer_input = {
168
- input: case_context.input,
169
- expected: case_context.expected,
170
- output: case_context.output,
171
- metadata: case_context.metadata || {}
170
+ input: kase.input,
171
+ expected: kase.expected,
172
+ output: kase.output,
173
+ metadata: kase.metadata || {},
174
+ parameters: eval_context.parameters || {}
172
175
  }
173
176
 
174
177
  scorer_error = nil
@@ -224,9 +227,23 @@ module Braintrust
224
227
  object_id = eval_context.experiment_id
225
228
  btql = API::Internal::BTQL.new(eval_context.state)
226
229
 
227
- Eval::Trace.new(
228
- spans: -> { btql.trace_spans(object_type: object_type, object_id: object_id, root_span_id: root_span_id) }
229
- )
230
+ Eval::Trace.new(spans: -> { fetch_trace_spans(btql, object_type, object_id, root_span_id) })
231
+ end
232
+
233
+ # Fetch trace spans with retry to handle freshness and ingestion lag.
234
+ # @return [Array<Hash>] Parsed span data
235
+ def fetch_trace_spans(btql, object_type, object_id, root_span_id)
236
+ rows, _freshness = Internal::Retry.with_backoff(
237
+ max_retries: 7, base_delay: 1.0, max_delay: 8.0,
238
+ until: ->(result) {
239
+ r, f = result
240
+ f == "complete" && !r.empty?
241
+ }
242
+ ) { btql.trace_spans(object_type: object_type, object_id: object_id, root_span_id: root_span_id) }
243
+ rows || []
244
+ rescue => e
245
+ Braintrust::Log.warn("[BTQL] Query failed: #{e.message}")
246
+ []
230
247
  end
231
248
 
232
249
  # Build a CaseContext from a Case struct
@@ -241,11 +258,11 @@ module Braintrust
241
258
 
242
259
  # Report progress for a case via on_progress callback.
243
260
  # Rescues errors in the callback so a broken handler never crashes the eval.
244
- def report_progress(eval_span, case_context, **fields)
261
+ def report_progress(eval_span, kase, **fields)
245
262
  return unless eval_context.on_progress
246
263
  progress = {"id" => eval_span.context.hex_span_id}.merge(fields.transform_keys(&:to_s))
247
- if case_context.origin
248
- progress["origin"] = case_context.origin.is_a?(String) ? JSON.parse(case_context.origin) : case_context.origin
264
+ if kase.origin
265
+ progress["origin"] = kase.origin.is_a?(String) ? JSON.parse(kase.origin) : kase.origin
249
266
  end
250
267
  eval_context.on_progress.call(progress)
251
268
  rescue => e
@@ -105,6 +105,21 @@ module Braintrust
105
105
  # scorers: [->(expected:, output:) { output == expected ? 1.0 : 0.0 }]
106
106
  # )
107
107
  #
108
+ # @example Using parameters for configurable tasks
109
+ # # Tasks and scorers that declare `parameters:` receive it automatically.
110
+ # # Those that don't are unaffected — KeywordFilter strips unknown kwargs.
111
+ # Braintrust::Eval.run(
112
+ # project: "my-project",
113
+ # experiment: "with-params",
114
+ # cases: [{input: "hello", expected: "HELLO!"}],
115
+ # task: ->(input:, parameters:) {
116
+ # suffix = parameters["suffix"] || ""
117
+ # input.upcase + suffix
118
+ # },
119
+ # scorers: [->(expected:, output:) { output == expected ? 1.0 : 0.0 }],
120
+ # parameters: {"suffix" => "!"}
121
+ # )
122
+ #
108
123
  # @example Using metadata and tags
109
124
  # Braintrust::Eval.run(
110
125
  # project: "my-project",
@@ -158,11 +173,15 @@ module Braintrust
158
173
  # @param quiet [Boolean] If true, suppress result output (default: false)
159
174
  # @param state [State, nil] Braintrust state (defaults to global state)
160
175
  # @param tracer_provider [TracerProvider, nil] OpenTelemetry tracer provider (defaults to global)
176
+ # @param project_id [String, nil] Project UUID (skips project creation when provided)
177
+ # @param parent [Hash, nil] Parent span context ({object_type:, object_id:, generation:})
178
+ # @param parameters [Hash, nil] Runtime parameters passed to task and scorers as a `parameters:` keyword argument
161
179
  # @return [Result]
162
180
  def run(task:, scorers:, project: nil, experiment: nil,
163
181
  cases: nil, dataset: nil, on_progress: nil,
164
182
  parallelism: 1, tags: nil, metadata: nil, update: false, quiet: false,
165
- state: nil, tracer_provider: nil, project_id: nil, parent: nil)
183
+ state: nil, tracer_provider: nil, project_id: nil, parent: nil,
184
+ parameters: nil)
166
185
  # Validate required parameters
167
186
  validate_params!(task: task, scorers: scorers, cases: cases, dataset: dataset)
168
187
 
@@ -205,7 +224,8 @@ module Braintrust
205
224
  state: state,
206
225
  tracer_provider: tracer_provider,
207
226
  on_progress: on_progress,
208
- parent: parent
227
+ parent: parent,
228
+ parameters: parameters
209
229
  )
210
230
  result = Runner.new(context).run(parallelism: parallelism)
211
231
 
@@ -0,0 +1,41 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Braintrust
4
+ module Internal
5
+ module Retry
6
+ MAX_RETRIES = 7
7
+ BASE_DELAY = 1.0
8
+ MAX_DELAY = 8.0
9
+
10
+ # Retry a block with exponential backoff.
11
+ #
12
+ # The block is the task to attempt. Its return value is captured each attempt.
13
+ #
14
+ # @param max_retries [Integer] Maximum number of retries after the first attempt
15
+ # @param base_delay [Float] Initial delay in seconds (doubles each retry)
16
+ # @param max_delay [Float] Cap on delay between retries
17
+ # @param until [Proc, nil] Optional condition — receives block result, truthy stops retrying.
18
+ # When omitted, the block result's own truthiness decides.
19
+ # @return [Object] The last block result (whether retries were exhausted or the condition was met)
20
+ #
21
+ # @example Simple: retry until truthy
22
+ # conn = Retry.with_backoff(max_retries: 5) { try_connect }
23
+ #
24
+ # @example With condition: retry until non-empty
25
+ # data = Retry.with_backoff(until: ->(r) { r.any? }) { api.fetch }
26
+ #
27
+ def self.with_backoff(max_retries: MAX_RETRIES, base_delay: BASE_DELAY, max_delay: MAX_DELAY, until: nil, &task)
28
+ check = binding.local_variable_get(:until)
29
+ result = task.call
30
+ retries = 0
31
+ while retries < max_retries && !(check ? check.call(result) : result)
32
+ retries += 1
33
+ delay = [base_delay * (2**(retries - 1)), max_delay].min
34
+ sleep(delay)
35
+ result = task.call
36
+ end
37
+ result
38
+ end
39
+ end
40
+ end
41
+ end
@@ -11,23 +11,28 @@ module Braintrust
11
11
  # params = prompt.build(text: "Article to summarize...")
12
12
  # client.messages.create(**params)
13
13
  class Prompt
14
- attr_reader :id, :name, :slug, :project_id
14
+ attr_reader :id, :name, :slug, :project_id, :version
15
15
 
16
16
  # Load a prompt from Braintrust
17
17
  #
18
- # @param project [String] Project name
18
+ # @param project [String, nil] Project name (provide either project or project_id)
19
+ # @param project_id [String, nil] Project ID (UUID, provide either project or project_id)
19
20
  # @param slug [String] Prompt slug
20
21
  # @param version [String, nil] Specific version (default: latest)
21
22
  # @param defaults [Hash] Default variable values for build()
22
23
  # @param api [API, nil] Braintrust API client (default: creates one using global state)
23
24
  # @return [Prompt]
24
- def self.load(project:, slug:, version: nil, defaults: {}, api: nil)
25
+ def self.load(slug:, project: nil, project_id: nil, version: nil, defaults: {}, api: nil)
26
+ raise ArgumentError, "Either project or project_id is required" unless project || project_id
27
+
25
28
  api ||= API.new
26
29
 
27
30
  # Find the function by project + slug
28
- result = api.functions.list(project_name: project, slug: slug)
31
+ result = api.functions.list(project_name: project, project_id: project_id, slug: slug)
29
32
  function = result.dig("objects")&.first
30
- raise Error, "Prompt '#{slug}' not found in project '#{project}'" unless function
33
+
34
+ identifier = project ? "project '#{project}'" : "project_id '#{project_id}'"
35
+ raise Error, "Prompt '#{slug}' not found in #{identifier}" unless function
31
36
 
32
37
  # Fetch full function data including prompt_data
33
38
  full_data = api.functions.get(id: function["id"], version: version)
@@ -47,6 +52,7 @@ module Braintrust
47
52
  @name = data["name"]
48
53
  @slug = data["slug"]
49
54
  @project_id = data["project_id"]
55
+ @version = data["_xact_id"]
50
56
  end
51
57
 
52
58
  # Get the raw prompt definition
@@ -40,7 +40,8 @@ module Braintrust
40
40
  experiment_name: body["experiment_name"],
41
41
  remote_scorer_ids: resolve_remote_scorers(body["scores"]),
42
42
  parent: resolve_parent(body["parent"]),
43
- project_id: body["project_id"]
43
+ project_id: body["project_id"],
44
+ parameters: resolve_parameters(body["parameters"], evaluator)
44
45
  }
45
46
  end
46
47
 
@@ -57,6 +58,7 @@ module Braintrust
57
58
  remote_scorer_ids = validated[:remote_scorer_ids]
58
59
  parent = validated[:parent]
59
60
  project_id = validated[:project_id]
61
+ parameters = validated[:parameters]
60
62
 
61
63
  state = build_state(auth)
62
64
 
@@ -89,6 +91,7 @@ module Braintrust
89
91
  }
90
92
  run_opts[:parent] = parent if parent
91
93
  run_opts[:scorers] = remote_scorer_ids if remote_scorer_ids
94
+ run_opts[:parameters] = parameters if parameters && !parameters.empty?
92
95
  run_opts[:dataset] = dataset if dataset
93
96
 
94
97
  if state
@@ -161,6 +164,15 @@ module Braintrust
161
164
  @evaluators
162
165
  end
163
166
 
167
+ # Merge request parameters with evaluator's parameter defaults.
168
+ # Request values override defaults. Returns a string-keyed Hash.
169
+ def resolve_parameters(raw_params, evaluator)
170
+ defaults = (evaluator.parameters || {}).to_h { |name, spec|
171
+ [name.to_s, spec.is_a?(Hash) ? (spec[:default] || spec["default"]) : nil]
172
+ }.compact
173
+ defaults.merge(raw_params || {})
174
+ end
175
+
164
176
  # Resolve data source from the data field.
165
177
  # Returns [cases, dataset] where exactly one is non-nil.
166
178
  def resolve_data_source(data)
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Braintrust
4
- VERSION = "0.3.0"
4
+ VERSION = "0.3.1"
5
5
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: braintrust
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0
4
+ version: 0.3.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Braintrust
@@ -9,6 +9,20 @@ bindir: exe
9
9
  cert_chain: []
10
10
  date: 1980-01-02 00:00:00.000000000 Z
11
11
  dependencies:
12
+ - !ruby/object:Gem::Dependency
13
+ name: logger
14
+ requirement: !ruby/object:Gem::Requirement
15
+ requirements:
16
+ - - ">="
17
+ - !ruby/object:Gem::Version
18
+ version: '1.0'
19
+ type: :runtime
20
+ prerelease: false
21
+ version_requirements: !ruby/object:Gem::Requirement
22
+ requirements:
23
+ - - ">="
24
+ - !ruby/object:Gem::Version
25
+ version: '1.0'
12
26
  - !ruby/object:Gem::Dependency
13
27
  name: opentelemetry-sdk
14
28
  requirement: !ruby/object:Gem::Requirement
@@ -51,132 +65,6 @@ dependencies:
51
65
  - - "~>"
52
66
  - !ruby/object:Gem::Version
53
67
  version: 3.3.1
54
- - !ruby/object:Gem::Dependency
55
- name: minitest
56
- requirement: !ruby/object:Gem::Requirement
57
- requirements:
58
- - - "~>"
59
- - !ruby/object:Gem::Version
60
- version: '5.0'
61
- type: :development
62
- prerelease: false
63
- version_requirements: !ruby/object:Gem::Requirement
64
- requirements:
65
- - - "~>"
66
- - !ruby/object:Gem::Version
67
- version: '5.0'
68
- - !ruby/object:Gem::Dependency
69
- name: rake
70
- requirement: !ruby/object:Gem::Requirement
71
- requirements:
72
- - - "~>"
73
- - !ruby/object:Gem::Version
74
- version: '13.0'
75
- type: :development
76
- prerelease: false
77
- version_requirements: !ruby/object:Gem::Requirement
78
- requirements:
79
- - - "~>"
80
- - !ruby/object:Gem::Version
81
- version: '13.0'
82
- - !ruby/object:Gem::Dependency
83
- name: standard
84
- requirement: !ruby/object:Gem::Requirement
85
- requirements:
86
- - - "~>"
87
- - !ruby/object:Gem::Version
88
- version: '1.0'
89
- type: :development
90
- prerelease: false
91
- version_requirements: !ruby/object:Gem::Requirement
92
- requirements:
93
- - - "~>"
94
- - !ruby/object:Gem::Version
95
- version: '1.0'
96
- - !ruby/object:Gem::Dependency
97
- name: simplecov
98
- requirement: !ruby/object:Gem::Requirement
99
- requirements:
100
- - - "~>"
101
- - !ruby/object:Gem::Version
102
- version: '0.22'
103
- type: :development
104
- prerelease: false
105
- version_requirements: !ruby/object:Gem::Requirement
106
- requirements:
107
- - - "~>"
108
- - !ruby/object:Gem::Version
109
- version: '0.22'
110
- - !ruby/object:Gem::Dependency
111
- name: vcr
112
- requirement: !ruby/object:Gem::Requirement
113
- requirements:
114
- - - "~>"
115
- - !ruby/object:Gem::Version
116
- version: '6.0'
117
- type: :development
118
- prerelease: false
119
- version_requirements: !ruby/object:Gem::Requirement
120
- requirements:
121
- - - "~>"
122
- - !ruby/object:Gem::Version
123
- version: '6.0'
124
- - !ruby/object:Gem::Dependency
125
- name: webmock
126
- requirement: !ruby/object:Gem::Requirement
127
- requirements:
128
- - - "~>"
129
- - !ruby/object:Gem::Version
130
- version: '3.0'
131
- type: :development
132
- prerelease: false
133
- version_requirements: !ruby/object:Gem::Requirement
134
- requirements:
135
- - - "~>"
136
- - !ruby/object:Gem::Version
137
- version: '3.0'
138
- - !ruby/object:Gem::Dependency
139
- name: appraisal
140
- requirement: !ruby/object:Gem::Requirement
141
- requirements:
142
- - - "~>"
143
- - !ruby/object:Gem::Version
144
- version: '2.5'
145
- type: :development
146
- prerelease: false
147
- version_requirements: !ruby/object:Gem::Requirement
148
- requirements:
149
- - - "~>"
150
- - !ruby/object:Gem::Version
151
- version: '2.5'
152
- - !ruby/object:Gem::Dependency
153
- name: yard
154
- requirement: !ruby/object:Gem::Requirement
155
- requirements:
156
- - - "~>"
157
- - !ruby/object:Gem::Version
158
- version: '0.9'
159
- type: :development
160
- prerelease: false
161
- version_requirements: !ruby/object:Gem::Requirement
162
- requirements:
163
- - - "~>"
164
- - !ruby/object:Gem::Version
165
- version: '0.9'
166
- - !ruby/object:Gem::Dependency
167
- name: kramdown
168
- requirement: !ruby/object:Gem::Requirement
169
- requirements:
170
- - - "~>"
171
- - !ruby/object:Gem::Version
172
- version: '2.0'
173
- type: :development
174
- prerelease: false
175
- version_requirements: !ruby/object:Gem::Requirement
176
- requirements:
177
- - - "~>"
178
- - !ruby/object:Gem::Version
179
- version: '2.0'
180
68
  description: 'Braintrust Ruby SDK for evals, tracing and more. '
181
69
  email:
182
70
  - info@braintrust.dev
@@ -258,6 +146,7 @@ files:
258
146
  - lib/braintrust/internal/env.rb
259
147
  - lib/braintrust/internal/http.rb
260
148
  - lib/braintrust/internal/origin.rb
149
+ - lib/braintrust/internal/retry.rb
261
150
  - lib/braintrust/internal/template.rb
262
151
  - lib/braintrust/internal/thread_pool.rb
263
152
  - lib/braintrust/internal/time.rb