braintrust 0.1.4 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +71 -2
- data/lib/braintrust/api/datasets.rb +10 -0
- data/lib/braintrust/api/internal/experiments.rb +1 -1
- data/lib/braintrust/dataset.rb +10 -6
- data/lib/braintrust/eval/evaluator.rb +72 -0
- data/lib/braintrust/eval/functions.rb +44 -10
- data/lib/braintrust/eval/runner.rb +55 -13
- data/lib/braintrust/eval/scorer.rb +4 -0
- data/lib/braintrust/eval.rb +97 -50
- data/lib/braintrust/server/auth/clerk_token.rb +68 -0
- data/lib/braintrust/server/auth/no_auth.rb +14 -0
- data/lib/braintrust/server/handlers/eval.rb +217 -0
- data/lib/braintrust/server/handlers/health.rb +16 -0
- data/lib/braintrust/server/handlers/list.rb +74 -0
- data/lib/braintrust/server/middleware/auth.rb +29 -0
- data/lib/braintrust/server/middleware/cors.rb +87 -0
- data/lib/braintrust/server/rack/app.rb +38 -0
- data/lib/braintrust/server/rack.rb +36 -0
- data/lib/braintrust/server/router.rb +37 -0
- data/lib/braintrust/server/sse.rb +52 -0
- data/lib/braintrust/server.rb +8 -0
- data/lib/braintrust/trace/span_exporter.rb +36 -0
- data/lib/braintrust/trace.rb +3 -4
- data/lib/braintrust/version.rb +1 -1
- metadata +15 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: d67e6d0faeb24297af8a5f43ac1bd1ceacff1f37df2610244ae5f81e34c4ae5f
|
|
4
|
+
data.tar.gz: 489ec68fee424aa8aa1880b73b58f1f26529493d8898cd0ae5876d3b919fcb7c
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: cd876122ad92c5439ff45e975fd84418bfcc7d72d6f9398e48b1ac4c60f09fb96c2b85b46ee1c8de6a75291c0b7d2754ee2fa069f77f8a2f8a4c069132c59d94
|
|
7
|
+
data.tar.gz: 45d3f80f69ac9725d93aa0db24815da093bfd992b5418f8551c8d25e8caef9299f270a92fa922a4bc4bf3190d9f823a35c7203f9a74bd58daee31869b987f103
|
data/README.md
CHANGED
|
@@ -23,6 +23,7 @@ This is the official Ruby SDK for [Braintrust](https://www.braintrust.dev), for
|
|
|
23
23
|
- [Evals](#evals)
|
|
24
24
|
- [Datasets](#datasets)
|
|
25
25
|
- [Scorers](#scorers)
|
|
26
|
+
- [Dev Server](#dev-server)
|
|
26
27
|
- [Documentation](#documentation)
|
|
27
28
|
- [Troubleshooting](#troubleshooting)
|
|
28
29
|
- [Contributing](#contributing)
|
|
@@ -148,8 +149,8 @@ Braintrust.init(
|
|
|
148
149
|
|
|
149
150
|
The SDK automatically instruments these LLM libraries:
|
|
150
151
|
|
|
151
|
-
| Provider | Gem | Versions | Integration Name | Examples
|
|
152
|
-
| --------- | ------------- | -------- | ---------------- |
|
|
152
|
+
| Provider | Gem | Versions | Integration Name | Examples |
|
|
153
|
+
| --------- | ------------- | -------- | ---------------- | ----------------------------------------- |
|
|
153
154
|
| Anthropic | `anthropic` | >= 0.3.0 | `:anthropic` | [Link](./examples/contrib/anthropic.rb) |
|
|
154
155
|
| OpenAI | `openai` | >= 0.1.0 | `:openai` | [Link](./examples/contrib/openai.rb) |
|
|
155
156
|
| | `ruby-openai` | >= 7.0.0 | `:ruby_openai` | [Link](./examples/contrib/ruby-openai.rb) |
|
|
@@ -318,6 +319,74 @@ Braintrust::Eval.run(
|
|
|
318
319
|
|
|
319
320
|
See examples: [eval.rb](./examples/eval.rb), [dataset.rb](./examples/eval/dataset.rb), [remote_functions.rb](./examples/eval/remote_functions.rb)
|
|
320
321
|
|
|
322
|
+
### Dev Server
|
|
323
|
+
|
|
324
|
+
Run evaluations from the Braintrust web UI against code in your own application. Define evaluators, pass them to the dev server, and start serving:
|
|
325
|
+
|
|
326
|
+
```ruby
|
|
327
|
+
# eval_server.ru
|
|
328
|
+
require "braintrust/eval"
|
|
329
|
+
require "braintrust/server"
|
|
330
|
+
|
|
331
|
+
# Define evaluators — these can reference your application code (models, services, etc.)
|
|
332
|
+
food_classifier = Braintrust::Eval::Evaluator.new(
|
|
333
|
+
task: ->(input) { FoodClassifier.classify(input) },
|
|
334
|
+
scorers: [
|
|
335
|
+
Braintrust::Eval.scorer("exact_match") { |input, expected, output| output == expected ? 1.0 : 0.0 }
|
|
336
|
+
]
|
|
337
|
+
)
|
|
338
|
+
|
|
339
|
+
# Initialize Braintrust (requires BRAINTRUST_API_KEY)
|
|
340
|
+
Braintrust.init(blocking_login: true)
|
|
341
|
+
|
|
342
|
+
# Start the server
|
|
343
|
+
run Braintrust::Server::Rack.app(
|
|
344
|
+
evaluators: {
|
|
345
|
+
"food-classifier" => food_classifier
|
|
346
|
+
}
|
|
347
|
+
)
|
|
348
|
+
```
|
|
349
|
+
|
|
350
|
+
```bash
|
|
351
|
+
bundle exec rackup eval_server.ru -p 8300 -o 0.0.0.0
|
|
352
|
+
```
|
|
353
|
+
|
|
354
|
+
**Custom evaluators**
|
|
355
|
+
|
|
356
|
+
Evaluators can also be defined as subclasses:
|
|
357
|
+
|
|
358
|
+
```ruby
|
|
359
|
+
class FoodClassifier < Braintrust::Eval::Evaluator
|
|
360
|
+
def task
|
|
361
|
+
->(input) { classify(input) }
|
|
362
|
+
end
|
|
363
|
+
|
|
364
|
+
def scorers
|
|
365
|
+
[Braintrust::Eval.scorer("exact_match") { |i, e, o| o == e ? 1.0 : 0.0 }]
|
|
366
|
+
end
|
|
367
|
+
end
|
|
368
|
+
```
|
|
369
|
+
|
|
370
|
+
**Supported web servers**
|
|
371
|
+
|
|
372
|
+
The dev server requires the `rack` gem and a Rack-compatible web server.
|
|
373
|
+
|
|
374
|
+
| Server | Version Supported | Notes |
|
|
375
|
+
| ---------------------------------------------- | ----------------- | ------------------------------------ |
|
|
376
|
+
| [Puma](https://puma.io/) | 6.x | |
|
|
377
|
+
| [Falcon](https://socketry.github.io/falcon/) | 0.x | |
|
|
378
|
+
| [Passenger](https://www.phusionpassenger.com/) | 6.x | |
|
|
379
|
+
| [WEBrick](https://github.com/ruby/webrick) | Not supported | Does not support server-sent events. |
|
|
380
|
+
|
|
381
|
+
Add your chosen server to your Gemfile:
|
|
382
|
+
|
|
383
|
+
```ruby
|
|
384
|
+
gem "rack"
|
|
385
|
+
gem "puma" # recommended
|
|
386
|
+
```
|
|
387
|
+
|
|
388
|
+
See example: [server/eval.ru](./examples/server/eval.ru)
|
|
389
|
+
|
|
321
390
|
## Documentation
|
|
322
391
|
|
|
323
392
|
- [Braintrust Documentation](https://www.braintrust.dev/docs)
|
|
@@ -82,6 +82,14 @@ module Braintrust
|
|
|
82
82
|
http_post_json("/v1/dataset/#{id}/insert", {events: events})
|
|
83
83
|
end
|
|
84
84
|
|
|
85
|
+
# Delete a dataset by ID
|
|
86
|
+
# DELETE /v1/dataset/{id}
|
|
87
|
+
# @param id [String] Dataset UUID
|
|
88
|
+
# @return [Hash] Delete response
|
|
89
|
+
def delete(id:)
|
|
90
|
+
http_request(:delete, "/v1/dataset/#{id}")
|
|
91
|
+
end
|
|
92
|
+
|
|
85
93
|
# Generate a permalink URL to view a dataset in the Braintrust UI
|
|
86
94
|
# @param id [String] Dataset UUID
|
|
87
95
|
# @return [String] Permalink URL
|
|
@@ -150,6 +158,8 @@ module Braintrust
|
|
|
150
158
|
req["Content-Type"] = "application/json"
|
|
151
159
|
req.body = JSON.dump(payload) if payload
|
|
152
160
|
req
|
|
161
|
+
when :delete
|
|
162
|
+
Net::HTTP::Delete.new(uri)
|
|
153
163
|
else
|
|
154
164
|
raise ArgumentError, "Unsupported HTTP method: #{method}"
|
|
155
165
|
end
|
|
@@ -29,9 +29,9 @@ module Braintrust
|
|
|
29
29
|
|
|
30
30
|
payload = {
|
|
31
31
|
project_id: project_id,
|
|
32
|
-
name: name,
|
|
33
32
|
ensure_new: ensure_new
|
|
34
33
|
}
|
|
34
|
+
payload[:name] = name if name
|
|
35
35
|
payload[:tags] = tags if tags
|
|
36
36
|
payload[:metadata] = metadata if metadata
|
|
37
37
|
payload[:dataset_id] = dataset_id if dataset_id
|
data/lib/braintrust/dataset.rb
CHANGED
|
@@ -12,9 +12,9 @@ module Braintrust
|
|
|
12
12
|
# dataset = Braintrust::Dataset.new(name: "my-dataset", project: "my-project")
|
|
13
13
|
# dataset.each { |record| puts record[:input] }
|
|
14
14
|
#
|
|
15
|
-
# @example With explicit
|
|
16
|
-
#
|
|
17
|
-
# dataset = Braintrust::Dataset.new(name: "my-dataset", project: "my-project",
|
|
15
|
+
# @example With explicit state
|
|
16
|
+
# state = Braintrust.init(api_key: "...")
|
|
17
|
+
# dataset = Braintrust::Dataset.new(name: "my-dataset", project: "my-project", state: state)
|
|
18
18
|
#
|
|
19
19
|
# @example Eager loading for small datasets
|
|
20
20
|
# records = dataset.fetch_all(limit: 100)
|
|
@@ -38,13 +38,13 @@ module Braintrust
|
|
|
38
38
|
# @param id [String, nil] Dataset UUID (required if name not provided)
|
|
39
39
|
# @param project [String, nil] Project name (required if using name)
|
|
40
40
|
# @param version [String, nil] Optional version to pin to
|
|
41
|
-
# @param
|
|
42
|
-
def initialize(name: nil, id: nil, project: nil, version: nil,
|
|
41
|
+
# @param state [State, nil] Braintrust state (defaults to global state)
|
|
42
|
+
def initialize(name: nil, id: nil, project: nil, version: nil, state: nil)
|
|
43
43
|
@name = name
|
|
44
44
|
@provided_id = id
|
|
45
45
|
@project = project
|
|
46
46
|
@version = version
|
|
47
|
-
@api =
|
|
47
|
+
@api = API.new(state: state)
|
|
48
48
|
@resolved_id = nil
|
|
49
49
|
@metadata = nil
|
|
50
50
|
|
|
@@ -182,4 +182,8 @@ module Braintrust
|
|
|
182
182
|
)
|
|
183
183
|
end
|
|
184
184
|
end
|
|
185
|
+
|
|
186
|
+
# Value object wrapping a dataset UUID for resolution by ID.
|
|
187
|
+
# Used by Eval.run to distinguish dataset-by-ID from dataset-by-name.
|
|
188
|
+
DatasetId = Struct.new(:id, keyword_init: true)
|
|
185
189
|
end
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Braintrust
|
|
4
|
+
module Eval
|
|
5
|
+
# Base class for evaluators. Subclass and override #task and #scorers,
|
|
6
|
+
# or instantiate directly with keyword arguments.
|
|
7
|
+
#
|
|
8
|
+
# @example Subclass pattern
|
|
9
|
+
# class FoodClassifier < Braintrust::Eval::Evaluator
|
|
10
|
+
# def task
|
|
11
|
+
# ->(input) { classify(input) }
|
|
12
|
+
# end
|
|
13
|
+
#
|
|
14
|
+
# def scorers
|
|
15
|
+
# [Braintrust::Eval.scorer("exact_match") { |i, e, o| o == e ? 1.0 : 0.0 }]
|
|
16
|
+
# end
|
|
17
|
+
# end
|
|
18
|
+
#
|
|
19
|
+
# @example Inline pattern
|
|
20
|
+
# Braintrust::Eval::Evaluator.new(
|
|
21
|
+
# task: ->(input) { input.upcase },
|
|
22
|
+
# scorers: [my_scorer]
|
|
23
|
+
# )
|
|
24
|
+
class Evaluator
|
|
25
|
+
attr_accessor :task, :scorers, :parameters
|
|
26
|
+
|
|
27
|
+
def initialize(task: nil, scorers: [], parameters: {})
|
|
28
|
+
@task = task
|
|
29
|
+
@scorers = scorers
|
|
30
|
+
@parameters = parameters
|
|
31
|
+
end
|
|
32
|
+
|
|
33
|
+
# Validate that the evaluator has required fields set.
|
|
34
|
+
# @raise [ArgumentError] if validation fails
|
|
35
|
+
def validate!
|
|
36
|
+
raise ArgumentError, "task is required" unless task
|
|
37
|
+
unless task.respond_to?(:call)
|
|
38
|
+
raise ArgumentError, "task must be callable (respond to :call)"
|
|
39
|
+
end
|
|
40
|
+
end
|
|
41
|
+
|
|
42
|
+
# Run this evaluator against the given cases.
|
|
43
|
+
# Delegates to Braintrust::Eval.run with the evaluator's task and scorers.
|
|
44
|
+
#
|
|
45
|
+
# @param cases [Array] The test cases
|
|
46
|
+
# @param on_progress [#call, nil] Optional callback fired after each test case
|
|
47
|
+
# @param quiet [Boolean] If true, suppress result output (default: false)
|
|
48
|
+
# @param project [String, nil] Project name
|
|
49
|
+
# @param experiment [String, nil] Experiment name
|
|
50
|
+
# @param project_id [String, nil] Project UUID (skips project creation)
|
|
51
|
+
# @param dataset [String, Hash, Dataset, DatasetId, nil] Dataset to fetch
|
|
52
|
+
# @param scorers [Array, nil] Additional scorers (merged with evaluator's own)
|
|
53
|
+
# @param parent [Hash, nil] Parent span context
|
|
54
|
+
# @param state [State, nil] Braintrust state
|
|
55
|
+
# @param update [Boolean] If true, allow reusing existing experiment (default: false)
|
|
56
|
+
# @param tracer_provider [TracerProvider, nil] OpenTelemetry tracer provider (defaults to global)
|
|
57
|
+
# @return [Result]
|
|
58
|
+
def run(cases, on_progress: nil, quiet: false,
|
|
59
|
+
project: nil, experiment: nil, project_id: nil,
|
|
60
|
+
dataset: nil, scorers: nil, parent: nil,
|
|
61
|
+
state: nil, update: false, tracer_provider: nil)
|
|
62
|
+
all_scorers = scorers ? self.scorers + scorers : self.scorers
|
|
63
|
+
Braintrust::Eval.run(
|
|
64
|
+
task: task, scorers: all_scorers, cases: cases, dataset: dataset,
|
|
65
|
+
project: project, experiment: experiment, project_id: project_id,
|
|
66
|
+
parent: parent, on_progress: on_progress, quiet: quiet,
|
|
67
|
+
state: state, update: update, tracer_provider: tracer_provider
|
|
68
|
+
)
|
|
69
|
+
end
|
|
70
|
+
end
|
|
71
|
+
end
|
|
72
|
+
end
|
|
@@ -56,6 +56,27 @@ module Braintrust
|
|
|
56
56
|
end
|
|
57
57
|
end
|
|
58
58
|
|
|
59
|
+
# Create a scorer that invokes a remote function by ID
|
|
60
|
+
# @param id [String] Function UUID
|
|
61
|
+
# @param version [String, nil] Optional version to pin to
|
|
62
|
+
# @param state [State, nil] Braintrust state (defaults to global)
|
|
63
|
+
# @param tracer_provider [TracerProvider, nil] OpenTelemetry tracer provider
|
|
64
|
+
# @return [Scorer] Scorer object that invokes remote function
|
|
65
|
+
def scorer_by_id(id:, state: nil, version: nil, tracer_provider: nil)
|
|
66
|
+
state ||= Braintrust.current_state
|
|
67
|
+
api = API.new(state: state)
|
|
68
|
+
api.login
|
|
69
|
+
|
|
70
|
+
function_metadata = api.functions.get(id: id, version: version)
|
|
71
|
+
function_id = function_metadata["id"]
|
|
72
|
+
function_name = function_metadata["name"] || id
|
|
73
|
+
|
|
74
|
+
tracer_provider ||= OpenTelemetry.tracer_provider
|
|
75
|
+
tracer = tracer_provider.tracer("braintrust.functions")
|
|
76
|
+
|
|
77
|
+
build_scorer(function_id: function_id, function_name: function_name, api: api, tracer: tracer)
|
|
78
|
+
end
|
|
79
|
+
|
|
59
80
|
# Create a scorer that invokes a remote function
|
|
60
81
|
# @param project [String] Project name
|
|
61
82
|
# @param slug [String] Function slug
|
|
@@ -76,10 +97,21 @@ module Braintrust
|
|
|
76
97
|
tracer_provider ||= OpenTelemetry.tracer_provider
|
|
77
98
|
tracer = tracer_provider.tracer("braintrust.functions")
|
|
78
99
|
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
100
|
+
build_scorer(function_id: function_id, function_name: function_name, api: api, tracer: tracer)
|
|
101
|
+
end
|
|
102
|
+
|
|
103
|
+
private
|
|
104
|
+
|
|
105
|
+
# Build a Scorer that invokes a remote function
|
|
106
|
+
# Shared implementation used by both scorer and scorer_by_id
|
|
107
|
+
# @param function_id [String] Function UUID
|
|
108
|
+
# @param function_name [String] Function display name
|
|
109
|
+
# @param api [API] Braintrust API client
|
|
110
|
+
# @param tracer [OpenTelemetry::Trace::Tracer] Tracer instance
|
|
111
|
+
# @return [Scorer]
|
|
112
|
+
def build_scorer(function_id:, function_name:, api:, tracer:)
|
|
113
|
+
Scorer.new(function_name) do |input, expected, output, metadata|
|
|
114
|
+
tracer.in_span("function: #{function_name}") do |span|
|
|
83
115
|
scorer_input = {
|
|
84
116
|
input: input,
|
|
85
117
|
expected: expected,
|
|
@@ -91,14 +123,17 @@ module Braintrust
|
|
|
91
123
|
span.set_attribute("braintrust.input_json", JSON.dump(scorer_input))
|
|
92
124
|
span.set_attribute("braintrust.function.name", function_name)
|
|
93
125
|
span.set_attribute("braintrust.function.id", function_id)
|
|
94
|
-
span.set_attribute("braintrust.function.slug", slug)
|
|
95
126
|
|
|
96
127
|
begin
|
|
97
|
-
# Invoke the function via API
|
|
98
|
-
# The remote scorer receives all scorer arguments
|
|
99
128
|
result = api.functions.invoke(id: function_id, input: scorer_input)
|
|
100
129
|
|
|
101
130
|
score = case result
|
|
131
|
+
when Numeric
|
|
132
|
+
result.to_f
|
|
133
|
+
when true
|
|
134
|
+
1.0
|
|
135
|
+
when false
|
|
136
|
+
0.0
|
|
102
137
|
when Hash
|
|
103
138
|
if result.key?("score")
|
|
104
139
|
result["score"].to_f
|
|
@@ -107,6 +142,8 @@ module Braintrust
|
|
|
107
142
|
end
|
|
108
143
|
when String
|
|
109
144
|
result.to_f
|
|
145
|
+
when nil
|
|
146
|
+
nil
|
|
110
147
|
else
|
|
111
148
|
raise Error, "Unsupported result type: #{result.class}"
|
|
112
149
|
end
|
|
@@ -114,7 +151,6 @@ module Braintrust
|
|
|
114
151
|
span.set_attribute("braintrust.output_json", JSON.dump(score))
|
|
115
152
|
score
|
|
116
153
|
rescue => e
|
|
117
|
-
# Record exception and set error status
|
|
118
154
|
span.record_exception(e)
|
|
119
155
|
span.status = OpenTelemetry::Trace::Status.error(e.message)
|
|
120
156
|
raise
|
|
@@ -123,8 +159,6 @@ module Braintrust
|
|
|
123
159
|
end
|
|
124
160
|
end
|
|
125
161
|
|
|
126
|
-
private
|
|
127
|
-
|
|
128
162
|
# Resolve function ID from project name and slug
|
|
129
163
|
# @param api [API] API client
|
|
130
164
|
# @param project [String] Project name
|
|
@@ -17,18 +17,21 @@ module Braintrust
|
|
|
17
17
|
# Maximum parallelism allowed (mirrors Internal::ThreadPool::MAX_PARALLELISM)
|
|
18
18
|
MAX_PARALLELISM = Internal::ThreadPool::MAX_PARALLELISM
|
|
19
19
|
|
|
20
|
-
def initialize(
|
|
21
|
-
|
|
20
|
+
def initialize(task:, scorers:, experiment_id: nil, experiment_name: nil,
|
|
21
|
+
project_id: nil, project_name: nil, state: nil, tracer_provider: nil,
|
|
22
|
+
on_progress: nil, parent: nil)
|
|
22
23
|
@experiment_id = experiment_id
|
|
23
24
|
@experiment_name = experiment_name
|
|
24
25
|
@project_id = project_id
|
|
25
26
|
@project_name = project_name
|
|
26
27
|
@task = task
|
|
27
28
|
@scorers = normalize_scorers(scorers)
|
|
28
|
-
@
|
|
29
|
+
@state = state
|
|
29
30
|
@tracer_provider = tracer_provider || OpenTelemetry.tracer_provider
|
|
30
31
|
@tracer = @tracer_provider.tracer("braintrust-eval")
|
|
31
|
-
@parent_attr = "
|
|
32
|
+
@parent_attr = parent ? "#{parent[:object_type]}:#{parent[:object_id]}" : nil
|
|
33
|
+
@generation = parent&.dig(:generation)
|
|
34
|
+
@on_progress = on_progress
|
|
32
35
|
|
|
33
36
|
# Mutex for thread-safe score collection
|
|
34
37
|
@score_mutex = Mutex.new
|
|
@@ -60,8 +63,10 @@ module Braintrust
|
|
|
60
63
|
# Calculate duration
|
|
61
64
|
duration = Time.now - start_time
|
|
62
65
|
|
|
63
|
-
# Generate permalink
|
|
64
|
-
permalink = @
|
|
66
|
+
# Generate permalink (only when state and experiment are available)
|
|
67
|
+
permalink = if @state && experiment_id
|
|
68
|
+
@state.object_permalink(object_type: "experiment", object_id: experiment_id)
|
|
69
|
+
end
|
|
65
70
|
|
|
66
71
|
Result.new(
|
|
67
72
|
experiment_id: experiment_id,
|
|
@@ -86,7 +91,7 @@ module Braintrust
|
|
|
86
91
|
# @param errors [Queue] Thread-safe error collection queue
|
|
87
92
|
def run_case(test_case, errors)
|
|
88
93
|
tracer.in_span("eval") do |eval_span|
|
|
89
|
-
eval_span.set_attribute("braintrust.parent", parent_attr)
|
|
94
|
+
eval_span.set_attribute("braintrust.parent", parent_attr) if parent_attr
|
|
90
95
|
|
|
91
96
|
# Set tags early so they're present even if task fails
|
|
92
97
|
eval_span.set_attribute("braintrust.tags", test_case.tags) if test_case.tags
|
|
@@ -99,12 +104,23 @@ module Braintrust
|
|
|
99
104
|
# Error already recorded on task span, set eval span status
|
|
100
105
|
eval_span.status = OpenTelemetry::Trace::Status.error(e.message)
|
|
101
106
|
errors << "Task failed for input '#{test_case.input}': #{e.message}"
|
|
107
|
+
if @on_progress
|
|
108
|
+
error_progress = {
|
|
109
|
+
"id" => eval_span.context.hex_span_id,
|
|
110
|
+
"error" => e.message
|
|
111
|
+
}
|
|
112
|
+
if test_case.origin
|
|
113
|
+
error_progress["origin"] = test_case.origin.is_a?(String) ? JSON.parse(test_case.origin) : test_case.origin
|
|
114
|
+
end
|
|
115
|
+
@on_progress.call(error_progress)
|
|
116
|
+
end
|
|
102
117
|
next
|
|
103
118
|
end
|
|
104
119
|
|
|
105
120
|
# Run scorers
|
|
121
|
+
case_scores = nil
|
|
106
122
|
begin
|
|
107
|
-
run_scorers(test_case, output)
|
|
123
|
+
case_scores = run_scorers(test_case, output)
|
|
108
124
|
rescue => e
|
|
109
125
|
# Error already recorded on score span, set eval span status
|
|
110
126
|
eval_span.status = OpenTelemetry::Trace::Status.error(e.message)
|
|
@@ -112,13 +128,25 @@ module Braintrust
|
|
|
112
128
|
end
|
|
113
129
|
|
|
114
130
|
# Set eval span attributes (after task and scorers complete)
|
|
115
|
-
set_json_attr(eval_span, "braintrust.span_attributes",
|
|
131
|
+
set_json_attr(eval_span, "braintrust.span_attributes", build_span_attributes("eval"))
|
|
116
132
|
set_json_attr(eval_span, "braintrust.input_json", test_case.input)
|
|
117
133
|
set_json_attr(eval_span, "braintrust.output_json", output)
|
|
118
134
|
set_json_attr(eval_span, "braintrust.expected", test_case.expected) if test_case.expected
|
|
119
135
|
|
|
120
136
|
# Set origin for cases from remote sources (already JSON-serialized)
|
|
121
137
|
eval_span.set_attribute("braintrust.origin", test_case.origin) if test_case.origin
|
|
138
|
+
|
|
139
|
+
if @on_progress
|
|
140
|
+
progress = {
|
|
141
|
+
"id" => eval_span.context.hex_span_id,
|
|
142
|
+
"data" => output,
|
|
143
|
+
"scores" => case_scores || {}
|
|
144
|
+
}
|
|
145
|
+
if test_case.origin
|
|
146
|
+
progress["origin"] = test_case.origin.is_a?(String) ? JSON.parse(test_case.origin) : test_case.origin
|
|
147
|
+
end
|
|
148
|
+
@on_progress.call(progress)
|
|
149
|
+
end
|
|
122
150
|
end
|
|
123
151
|
end
|
|
124
152
|
|
|
@@ -128,8 +156,8 @@ module Braintrust
|
|
|
128
156
|
# @return [Object] Task output
|
|
129
157
|
def run_task(test_case)
|
|
130
158
|
tracer.in_span("task") do |task_span|
|
|
131
|
-
task_span.set_attribute("braintrust.parent", parent_attr)
|
|
132
|
-
set_json_attr(task_span, "braintrust.span_attributes",
|
|
159
|
+
task_span.set_attribute("braintrust.parent", parent_attr) if parent_attr
|
|
160
|
+
set_json_attr(task_span, "braintrust.span_attributes", build_span_attributes("task"))
|
|
133
161
|
set_json_attr(task_span, "braintrust.input_json", test_case.input)
|
|
134
162
|
|
|
135
163
|
begin
|
|
@@ -149,10 +177,11 @@ module Braintrust
|
|
|
149
177
|
# Creates single score span for all scorers
|
|
150
178
|
# @param test_case [Case] The test case
|
|
151
179
|
# @param output [Object] Task output
|
|
180
|
+
# @return [Hash] Scores hash { scorer_name => score_value }
|
|
152
181
|
def run_scorers(test_case, output)
|
|
153
182
|
tracer.in_span("score") do |score_span|
|
|
154
|
-
score_span.set_attribute("braintrust.parent", parent_attr)
|
|
155
|
-
set_json_attr(score_span, "braintrust.span_attributes",
|
|
183
|
+
score_span.set_attribute("braintrust.parent", parent_attr) if parent_attr
|
|
184
|
+
set_json_attr(score_span, "braintrust.span_attributes", build_span_attributes("score"))
|
|
156
185
|
|
|
157
186
|
scores = {}
|
|
158
187
|
scorer_error = nil
|
|
@@ -173,6 +202,8 @@ module Braintrust
|
|
|
173
202
|
|
|
174
203
|
# Raise after setting scores so we can see which scorers succeeded
|
|
175
204
|
raise scorer_error if scorer_error
|
|
205
|
+
|
|
206
|
+
scores
|
|
176
207
|
end
|
|
177
208
|
end
|
|
178
209
|
|
|
@@ -221,6 +252,17 @@ module Braintrust
|
|
|
221
252
|
span.status = OpenTelemetry::Trace::Status.error(error.message)
|
|
222
253
|
end
|
|
223
254
|
|
|
255
|
+
# Build span_attributes hash with type, and optionally name and generation.
|
|
256
|
+
# Matches Java SDK behavior of including these on every span.
|
|
257
|
+
# @param type [String] Span type ("eval", "task", or "score")
|
|
258
|
+
# @return [Hash]
|
|
259
|
+
def build_span_attributes(type)
|
|
260
|
+
attrs = {type: type}
|
|
261
|
+
attrs[:name] = experiment_name if experiment_name
|
|
262
|
+
attrs[:generation] = @generation if @generation
|
|
263
|
+
attrs
|
|
264
|
+
end
|
|
265
|
+
|
|
224
266
|
# Set a span attribute by JSON encoding the value
|
|
225
267
|
# @param span [OpenTelemetry::Trace::Span] The span
|
|
226
268
|
# @param key [String] The attribute key
|
|
@@ -105,4 +105,8 @@ module Braintrust
|
|
|
105
105
|
end
|
|
106
106
|
end
|
|
107
107
|
end
|
|
108
|
+
|
|
109
|
+
# Value object wrapping a remote scorer function UUID.
|
|
110
|
+
# Used by Eval.run to distinguish remote scorers from local callables.
|
|
111
|
+
ScorerId = Struct.new(:function_id, :version, keyword_init: true)
|
|
108
112
|
end
|