braintrust 0.1.4 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +109 -13
- data/lib/braintrust/api/datasets.rb +10 -0
- data/lib/braintrust/api/internal/btql.rb +124 -0
- data/lib/braintrust/api/internal/experiments.rb +20 -1
- data/lib/braintrust/api/internal/projects.rb +19 -0
- data/lib/braintrust/dataset.rb +13 -6
- data/lib/braintrust/eval/context.rb +131 -0
- data/lib/braintrust/eval/evaluator.rb +78 -0
- data/lib/braintrust/eval/functions.rb +10 -132
- data/lib/braintrust/eval/runner.rb +119 -85
- data/lib/braintrust/eval/scorer.rb +24 -92
- data/lib/braintrust/eval/trace.rb +129 -0
- data/lib/braintrust/eval.rb +131 -156
- data/lib/braintrust/functions.rb +168 -0
- data/lib/braintrust/internal/callable.rb +83 -0
- data/lib/braintrust/logger.rb +9 -0
- data/lib/braintrust/scorer.rb +122 -0
- data/lib/braintrust/server/auth/clerk_token.rb +68 -0
- data/lib/braintrust/server/auth/no_auth.rb +14 -0
- data/lib/braintrust/server/handlers/eval.rb +217 -0
- data/lib/braintrust/server/handlers/health.rb +16 -0
- data/lib/braintrust/server/handlers/list.rb +74 -0
- data/lib/braintrust/server/middleware/auth.rb +29 -0
- data/lib/braintrust/server/middleware/cors.rb +87 -0
- data/lib/braintrust/server/rack/app.rb +38 -0
- data/lib/braintrust/server/rack.rb +36 -0
- data/lib/braintrust/server/router.rb +37 -0
- data/lib/braintrust/server/sse.rb +52 -0
- data/lib/braintrust/server.rb +8 -0
- data/lib/braintrust/task.rb +108 -0
- data/lib/braintrust/trace/span_exporter.rb +36 -0
- data/lib/braintrust/trace.rb +3 -4
- data/lib/braintrust/version.rb +1 -1
- metadata +22 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 747b190f21c7de342f85390f8a51b17628e23fa2436776989a3ebe637bf9d596
|
|
4
|
+
data.tar.gz: 1e6c0c59c9ce56d499a04d8424506c56e2c2ad359506a6d5175c7173dc4ab238
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 3f652583ec04f5b874e3417db4cc0dff7f43341eeffe686466b8caad5614ed336e8580ac7533ef100726f09cdb264900e0f454edd11328e611513ffc8f77d3cb
|
|
7
|
+
data.tar.gz: 3316d0cb4ccc77e2d0c0ae48c033b6f5c026237d85c85e75e139434023f713820c3790a98090dac636ae6d44127692279404bcd3ab88b0d50a3de3127d38e3a6
|
data/README.md
CHANGED
|
@@ -23,6 +23,7 @@ This is the official Ruby SDK for [Braintrust](https://www.braintrust.dev), for
|
|
|
23
23
|
- [Evals](#evals)
|
|
24
24
|
- [Datasets](#datasets)
|
|
25
25
|
- [Scorers](#scorers)
|
|
26
|
+
- [Dev Server](#dev-server)
|
|
26
27
|
- [Documentation](#documentation)
|
|
27
28
|
- [Troubleshooting](#troubleshooting)
|
|
28
29
|
- [Contributing](#contributing)
|
|
@@ -148,8 +149,8 @@ Braintrust.init(
|
|
|
148
149
|
|
|
149
150
|
The SDK automatically instruments these LLM libraries:
|
|
150
151
|
|
|
151
|
-
| Provider | Gem | Versions | Integration Name | Examples
|
|
152
|
-
| --------- | ------------- | -------- | ---------------- |
|
|
152
|
+
| Provider | Gem | Versions | Integration Name | Examples |
|
|
153
|
+
| --------- | ------------- | -------- | ---------------- | ----------------------------------------- |
|
|
153
154
|
| Anthropic | `anthropic` | >= 0.3.0 | `:anthropic` | [Link](./examples/contrib/anthropic.rb) |
|
|
154
155
|
| OpenAI | `openai` | >= 0.1.0 | `:openai` | [Link](./examples/contrib/openai.rb) |
|
|
155
156
|
| | `ruby-openai` | >= 7.0.0 | `:ruby_openai` | [Link](./examples/contrib/ruby-openai.rb) |
|
|
@@ -251,9 +252,9 @@ Braintrust::Eval.run(
|
|
|
251
252
|
{input: "apple", expected: "fruit"},
|
|
252
253
|
{input: "carrot", expected: "vegetable"}
|
|
253
254
|
],
|
|
254
|
-
task: ->(input) { classify(input) },
|
|
255
|
+
task: ->(input:) { classify(input) },
|
|
255
256
|
scorers: [
|
|
256
|
-
->(
|
|
257
|
+
->(expected:, output:) { output == expected ? 1.0 : 0.0 }
|
|
257
258
|
]
|
|
258
259
|
)
|
|
259
260
|
```
|
|
@@ -266,7 +267,7 @@ Use test cases from a Braintrust dataset:
|
|
|
266
267
|
Braintrust::Eval.run(
|
|
267
268
|
project: "my-project",
|
|
268
269
|
dataset: "my-dataset",
|
|
269
|
-
task: ->(input) { classify(input) },
|
|
270
|
+
task: ->(input:) { classify(input) },
|
|
270
271
|
scorers: [...]
|
|
271
272
|
)
|
|
272
273
|
```
|
|
@@ -281,7 +282,7 @@ Braintrust::Eval.run(
|
|
|
281
282
|
{input: "apple", expected: "fruit", tags: ["produce"], metadata: {difficulty: "easy"}},
|
|
282
283
|
{input: "salmon", expected: "protein", tags: ["seafood"], metadata: {difficulty: "medium"}}
|
|
283
284
|
],
|
|
284
|
-
task: ->(input) { classify(input) },
|
|
285
|
+
task: ->(input:) { classify(input) },
|
|
285
286
|
scorers: [...]
|
|
286
287
|
)
|
|
287
288
|
```
|
|
@@ -294,29 +295,124 @@ Use scoring functions defined in Braintrust:
|
|
|
294
295
|
Braintrust::Eval.run(
|
|
295
296
|
project: "my-project",
|
|
296
297
|
cases: [...],
|
|
297
|
-
task: ->(input) { ... },
|
|
298
|
+
task: ->(input:) { ... },
|
|
299
|
+
scorers: ["accuracy-scorer"]
|
|
300
|
+
)
|
|
301
|
+
```
|
|
302
|
+
|
|
303
|
+
Or define scorers inline with `Scorer.new`:
|
|
304
|
+
|
|
305
|
+
```ruby
|
|
306
|
+
Braintrust::Eval.run(
|
|
307
|
+
project: "my-project",
|
|
308
|
+
cases: [...],
|
|
309
|
+
task: ->(input:) { ... },
|
|
298
310
|
scorers: [
|
|
299
|
-
Braintrust::
|
|
311
|
+
Braintrust::Scorer.new("exact_match") do |expected:, output:|
|
|
312
|
+
output == expected ? 1.0 : 0.0
|
|
313
|
+
end
|
|
300
314
|
]
|
|
301
315
|
)
|
|
302
316
|
```
|
|
303
317
|
|
|
304
|
-
|
|
318
|
+
#### Trace scoring
|
|
319
|
+
|
|
320
|
+
Scorers can access the full evaluation trace (all spans generated by the task) by declaring a `trace:` keyword parameter. This is useful for inspecting intermediate LLM calls, validating tool usage, or checking the message thread:
|
|
305
321
|
|
|
306
322
|
```ruby
|
|
307
323
|
Braintrust::Eval.run(
|
|
308
324
|
project: "my-project",
|
|
309
|
-
cases: [
|
|
310
|
-
task:
|
|
325
|
+
cases: [{input: "What is 2+2?", expected: "4"}],
|
|
326
|
+
task: Braintrust::Task.new { |input:| my_llm_pipeline(input) },
|
|
311
327
|
scorers: [
|
|
312
|
-
|
|
328
|
+
# Access the full trace to inspect LLM spans
|
|
329
|
+
Braintrust::Scorer.new("uses_system_prompt") do |output:, trace:|
|
|
330
|
+
messages = trace.thread # reconstructed message thread from LLM spans
|
|
331
|
+
messages.any? { |m| m["role"] == "system" } ? 1.0 : 0.0
|
|
332
|
+
end,
|
|
333
|
+
|
|
334
|
+
# Filter spans by type
|
|
335
|
+
Braintrust::Scorer.new("single_llm_call") do |output:, trace:|
|
|
336
|
+
trace.spans(span_type: "llm").length == 1 ? 1.0 : 0.0
|
|
337
|
+
end,
|
|
338
|
+
|
|
339
|
+
# Scorers without trace: still work — the parameter is filtered out automatically
|
|
340
|
+
Braintrust::Scorer.new("exact_match") do |output:, expected:|
|
|
313
341
|
output == expected ? 1.0 : 0.0
|
|
314
342
|
end
|
|
315
343
|
]
|
|
316
344
|
)
|
|
317
345
|
```
|
|
318
346
|
|
|
319
|
-
See examples: [eval.rb](./examples/eval.rb), [dataset.rb](./examples/eval/dataset.rb), [remote_functions.rb](./examples/eval/remote_functions.rb)
|
|
347
|
+
See examples: [eval.rb](./examples/eval.rb), [dataset.rb](./examples/eval/dataset.rb), [remote_functions.rb](./examples/eval/remote_functions.rb), [trace_scoring.rb](./examples/eval/trace_scoring.rb)
|
|
348
|
+
|
|
349
|
+
### Dev Server
|
|
350
|
+
|
|
351
|
+
Run evaluations from the Braintrust web UI against code in your own application. Define evaluators, pass them to the dev server, and start serving:
|
|
352
|
+
|
|
353
|
+
```ruby
|
|
354
|
+
# eval_server.ru
|
|
355
|
+
require "braintrust/eval"
|
|
356
|
+
require "braintrust/server"
|
|
357
|
+
|
|
358
|
+
# Define evaluators — these can reference your application code (models, services, etc.)
|
|
359
|
+
food_classifier = Braintrust::Eval::Evaluator.new(
|
|
360
|
+
task: ->(input:) { FoodClassifier.classify(input) },
|
|
361
|
+
scorers: [
|
|
362
|
+
Braintrust::Scorer.new("exact_match") { |expected:, output:| output == expected ? 1.0 : 0.0 }
|
|
363
|
+
]
|
|
364
|
+
)
|
|
365
|
+
|
|
366
|
+
# Initialize Braintrust (requires BRAINTRUST_API_KEY)
|
|
367
|
+
Braintrust.init(blocking_login: true)
|
|
368
|
+
|
|
369
|
+
# Start the server
|
|
370
|
+
run Braintrust::Server::Rack.app(
|
|
371
|
+
evaluators: {
|
|
372
|
+
"food-classifier" => food_classifier
|
|
373
|
+
}
|
|
374
|
+
)
|
|
375
|
+
```
|
|
376
|
+
|
|
377
|
+
```bash
|
|
378
|
+
bundle exec rackup eval_server.ru -p 8300 -o 0.0.0.0
|
|
379
|
+
```
|
|
380
|
+
|
|
381
|
+
**Custom evaluators**
|
|
382
|
+
|
|
383
|
+
Evaluators can also be defined as subclasses:
|
|
384
|
+
|
|
385
|
+
```ruby
|
|
386
|
+
class FoodClassifier < Braintrust::Eval::Evaluator
|
|
387
|
+
def task
|
|
388
|
+
->(input:) { classify(input) }
|
|
389
|
+
end
|
|
390
|
+
|
|
391
|
+
def scorers
|
|
392
|
+
[Braintrust::Scorer.new("exact_match") { |expected:, output:| output == expected ? 1.0 : 0.0 }]
|
|
393
|
+
end
|
|
394
|
+
end
|
|
395
|
+
```
|
|
396
|
+
|
|
397
|
+
**Supported web servers**
|
|
398
|
+
|
|
399
|
+
The dev server requires the `rack` gem and a Rack-compatible web server.
|
|
400
|
+
|
|
401
|
+
| Server | Version Supported | Notes |
|
|
402
|
+
| ---------------------------------------------- | ----------------- | ------------------------------------ |
|
|
403
|
+
| [Puma](https://puma.io/) | 6.x | |
|
|
404
|
+
| [Falcon](https://socketry.github.io/falcon/) | 0.x | |
|
|
405
|
+
| [Passenger](https://www.phusionpassenger.com/) | 6.x | |
|
|
406
|
+
| [WEBrick](https://github.com/ruby/webrick) | Not supported | Does not support server-sent events. |
|
|
407
|
+
|
|
408
|
+
Add your chosen server to your Gemfile:
|
|
409
|
+
|
|
410
|
+
```ruby
|
|
411
|
+
gem "rack"
|
|
412
|
+
gem "puma" # recommended
|
|
413
|
+
```
|
|
414
|
+
|
|
415
|
+
See example: [server/eval.ru](./examples/server/eval.ru)
|
|
320
416
|
|
|
321
417
|
## Documentation
|
|
322
418
|
|
|
data/lib/braintrust/api/datasets.rb
CHANGED
|
@@ -82,6 +82,14 @@ module Braintrust
|
|
|
82
82
|
http_post_json("/v1/dataset/#{id}/insert", {events: events})
|
|
83
83
|
end
|
|
84
84
|
|
|
85
|
+
# Delete a dataset by ID
|
|
86
|
+
# DELETE /v1/dataset/{id}
|
|
87
|
+
# @param id [String] Dataset UUID
|
|
88
|
+
# @return [Hash] Delete response
|
|
89
|
+
def delete(id:)
|
|
90
|
+
http_request(:delete, "/v1/dataset/#{id}")
|
|
91
|
+
end
|
|
92
|
+
|
|
85
93
|
# Generate a permalink URL to view a dataset in the Braintrust UI
|
|
86
94
|
# @param id [String] Dataset UUID
|
|
87
95
|
# @return [String] Permalink URL
|
|
@@ -150,6 +158,8 @@ module Braintrust
|
|
|
150
158
|
req["Content-Type"] = "application/json"
|
|
151
159
|
req.body = JSON.dump(payload) if payload
|
|
152
160
|
req
|
|
161
|
+
when :delete
|
|
162
|
+
Net::HTTP::Delete.new(uri)
|
|
153
163
|
else
|
|
154
164
|
raise ArgumentError, "Unsupported HTTP method: #{method}"
|
|
155
165
|
end
|
|
data/lib/braintrust/api/internal/btql.rb
ADDED
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "net/http"
|
|
4
|
+
require "json"
|
|
5
|
+
require "uri"
|
|
6
|
+
require_relative "../../internal/http"
|
|
7
|
+
|
|
8
|
+
module Braintrust
|
|
9
|
+
class API
|
|
10
|
+
module Internal
|
|
11
|
+
# Internal BTQL client for querying spans.
|
|
12
|
+
# Not part of the public API — instantiated directly where needed.
|
|
13
|
+
class BTQL
|
|
14
|
+
# Maximum number of retries before returning partial results.
|
|
15
|
+
# Covers both freshness lag (partially indexed) and ingestion lag
|
|
16
|
+
# (spans not yet visible to BTQL after OTel flush).
|
|
17
|
+
MAX_FRESHNESS_RETRIES = 7
|
|
18
|
+
|
|
19
|
+
# Base delay (seconds) between retries (doubles each attempt, capped).
|
|
20
|
+
FRESHNESS_BASE_DELAY = 1.0
|
|
21
|
+
|
|
22
|
+
# Maximum delay (seconds) between retries. Caps exponential growth
|
|
23
|
+
# so we keep polling at a reasonable rate in the later window.
|
|
24
|
+
# Schedule: 1, 2, 4, 8, 8, 8, 8 = ~39s total worst-case.
|
|
25
|
+
MAX_FRESHNESS_DELAY = 8.0
|
|
26
|
+
|
|
27
|
+
def initialize(state)
|
|
28
|
+
@state = state
|
|
29
|
+
end
|
|
30
|
+
|
|
31
|
+
# Query spans belonging to a specific trace within an object.
|
|
32
|
+
#
|
|
33
|
+
# Builds a BTQL SQL query that matches the root_span_id and excludes scorer spans.
|
|
34
|
+
# Retries with exponential backoff if the response indicates data is not yet fresh.
|
|
35
|
+
#
|
|
36
|
+
# @param object_type [String] e.g. "experiment"
|
|
37
|
+
# @param object_id [String] Object UUID
|
|
38
|
+
# @param root_span_id [String] Hex trace ID of the root span
|
|
39
|
+
# @return [Array<Hash>] Parsed span data
|
|
40
|
+
def trace_spans(object_type:, object_id:, root_span_id:)
|
|
41
|
+
query = build_trace_query(
|
|
42
|
+
object_type: object_type,
|
|
43
|
+
object_id: object_id,
|
|
44
|
+
root_span_id: root_span_id
|
|
45
|
+
)
|
|
46
|
+
payload = {query: query, fmt: "jsonl"}
|
|
47
|
+
|
|
48
|
+
retries = 0
|
|
49
|
+
loop do
|
|
50
|
+
rows, freshness = execute_query(payload)
|
|
51
|
+
# Return when data is fresh AND non-empty, or we've exhausted retries.
|
|
52
|
+
# We retry on empty even when "complete" because there is ingestion lag
|
|
53
|
+
# between OTel flush and BTQL indexing — the server may report "complete"
|
|
54
|
+
# before it knows about newly-flushed spans.
|
|
55
|
+
return rows if (freshness == "complete" && !rows.empty?) || retries >= MAX_FRESHNESS_RETRIES
|
|
56
|
+
|
|
57
|
+
retries += 1
|
|
58
|
+
delay = [FRESHNESS_BASE_DELAY * (2**(retries - 1)), MAX_FRESHNESS_DELAY].min
|
|
59
|
+
sleep(delay)
|
|
60
|
+
end
|
|
61
|
+
rescue => e
|
|
62
|
+
Braintrust::Log.warn("[BTQL] Query failed: #{e.message}")
|
|
63
|
+
[]
|
|
64
|
+
end
|
|
65
|
+
|
|
66
|
+
private
|
|
67
|
+
|
|
68
|
+
# Build a BTQL SQL query string for fetching trace spans.
|
|
69
|
+
#
|
|
70
|
+
# Selects all spans for a given root_span_id, excluding scorer spans
|
|
71
|
+
# (span_attributes.type = 'score').
|
|
72
|
+
#
|
|
73
|
+
# @param object_type [String] e.g. "experiment"
|
|
74
|
+
# @param object_id [String] Object UUID
|
|
75
|
+
# @param root_span_id [String] Hex trace ID
|
|
76
|
+
# @return [String] BTQL SQL query
|
|
77
|
+
def build_trace_query(object_type:, object_id:, root_span_id:)
|
|
78
|
+
escaped_root = root_span_id.gsub("'", "''")
|
|
79
|
+
escaped_id = object_id.gsub("'", "''")
|
|
80
|
+
|
|
81
|
+
"SELECT * FROM #{object_type}('#{escaped_id}') " \
|
|
82
|
+
"WHERE root_span_id = '#{escaped_root}' " \
|
|
83
|
+
"AND span_attributes.type != 'score' " \
|
|
84
|
+
"LIMIT 1000"
|
|
85
|
+
end
|
|
86
|
+
|
|
87
|
+
# Execute a BTQL query and parse the JSONL response.
|
|
88
|
+
#
|
|
89
|
+
# @param payload [Hash] BTQL request payload
|
|
90
|
+
# @return [Array(Array<Hash>, String)] [parsed_rows, freshness_state]
|
|
91
|
+
def execute_query(payload)
|
|
92
|
+
uri = URI("#{@state.api_url}/btql")
|
|
93
|
+
|
|
94
|
+
request = Net::HTTP::Post.new(uri)
|
|
95
|
+
request["Content-Type"] = "application/json"
|
|
96
|
+
request["Authorization"] = "Bearer #{@state.api_key}"
|
|
97
|
+
request["Accept"] = "application/x-jsonlines"
|
|
98
|
+
request.body = JSON.dump(payload)
|
|
99
|
+
|
|
100
|
+
response = Braintrust::Internal::Http.with_redirects(uri, request)
|
|
101
|
+
|
|
102
|
+
unless response.is_a?(Net::HTTPSuccess)
|
|
103
|
+
raise Braintrust::Error, "HTTP #{response.code} for POST #{uri}: #{response.body}"
|
|
104
|
+
end
|
|
105
|
+
|
|
106
|
+
freshness = response["x-bt-freshness-state"] || "complete"
|
|
107
|
+
[parse_jsonl(response.body), freshness]
|
|
108
|
+
end
|
|
109
|
+
|
|
110
|
+
# Parse a JSONL response body into an array of hashes.
|
|
111
|
+
#
|
|
112
|
+
# @param body [String] JSONL response body
|
|
113
|
+
# @return [Array<Hash>]
|
|
114
|
+
def parse_jsonl(body)
|
|
115
|
+
body.each_line.filter_map do |line|
|
|
116
|
+
line = line.strip
|
|
117
|
+
next if line.empty?
|
|
118
|
+
JSON.parse(line)
|
|
119
|
+
end
|
|
120
|
+
end
|
|
121
|
+
end
|
|
122
|
+
end
|
|
123
|
+
end
|
|
124
|
+
end
|
|
data/lib/braintrust/api/internal/experiments.rb
CHANGED
|
@@ -29,9 +29,9 @@ module Braintrust
|
|
|
29
29
|
|
|
30
30
|
payload = {
|
|
31
31
|
project_id: project_id,
|
|
32
|
-
name: name,
|
|
33
32
|
ensure_new: ensure_new
|
|
34
33
|
}
|
|
34
|
+
payload[:name] = name if name
|
|
35
35
|
payload[:tags] = tags if tags
|
|
36
36
|
payload[:metadata] = metadata if metadata
|
|
37
37
|
payload[:dataset_id] = dataset_id if dataset_id
|
|
@@ -50,6 +50,25 @@ module Braintrust
|
|
|
50
50
|
|
|
51
51
|
JSON.parse(response.body)
|
|
52
52
|
end
|
|
53
|
+
|
|
54
|
+
# Delete an experiment
|
|
55
|
+
# DELETE /v1/experiment/:id
|
|
56
|
+
# @param id [String] Experiment ID
|
|
57
|
+
# @return [Hash] Deleted experiment data
|
|
58
|
+
def delete(id:)
|
|
59
|
+
uri = URI("#{@state.api_url}/v1/experiment/#{id}")
|
|
60
|
+
|
|
61
|
+
request = Net::HTTP::Delete.new(uri)
|
|
62
|
+
request["Authorization"] = "Bearer #{@state.api_key}"
|
|
63
|
+
|
|
64
|
+
response = Braintrust::Internal::Http.with_redirects(uri, request)
|
|
65
|
+
|
|
66
|
+
unless response.is_a?(Net::HTTPSuccess)
|
|
67
|
+
raise Error, "HTTP #{response.code} for DELETE #{uri}: #{response.body}"
|
|
68
|
+
end
|
|
69
|
+
|
|
70
|
+
JSON.parse(response.body)
|
|
71
|
+
end
|
|
53
72
|
end
|
|
54
73
|
end
|
|
55
74
|
end
|
|
data/lib/braintrust/api/internal/projects.rb
CHANGED
|
@@ -35,6 +35,25 @@ module Braintrust
|
|
|
35
35
|
|
|
36
36
|
JSON.parse(response.body)
|
|
37
37
|
end
|
|
38
|
+
|
|
39
|
+
# Delete a project
|
|
40
|
+
# DELETE /v1/project/:id
|
|
41
|
+
# @param id [String] Project UUID
|
|
42
|
+
# @return [Hash] Deleted project data
|
|
43
|
+
def delete(id:)
|
|
44
|
+
uri = URI("#{@state.api_url}/v1/project/#{id}")
|
|
45
|
+
|
|
46
|
+
request = Net::HTTP::Delete.new(uri)
|
|
47
|
+
request["Authorization"] = "Bearer #{@state.api_key}"
|
|
48
|
+
|
|
49
|
+
response = Braintrust::Internal::Http.with_redirects(uri, request)
|
|
50
|
+
|
|
51
|
+
unless response.is_a?(Net::HTTPSuccess)
|
|
52
|
+
raise Error, "HTTP #{response.code} for DELETE #{uri}: #{response.body}"
|
|
53
|
+
end
|
|
54
|
+
|
|
55
|
+
JSON.parse(response.body)
|
|
56
|
+
end
|
|
38
57
|
end
|
|
39
58
|
end
|
|
40
59
|
end
|
data/lib/braintrust/dataset.rb
CHANGED
|
@@ -12,9 +12,9 @@ module Braintrust
|
|
|
12
12
|
# dataset = Braintrust::Dataset.new(name: "my-dataset", project: "my-project")
|
|
13
13
|
# dataset.each { |record| puts record[:input] }
|
|
14
14
|
#
|
|
15
|
-
# @example With explicit
|
|
16
|
-
#
|
|
17
|
-
# dataset = Braintrust::Dataset.new(name: "my-dataset", project: "my-project",
|
|
15
|
+
# @example With explicit state
|
|
16
|
+
# state = Braintrust.init(api_key: "...")
|
|
17
|
+
# dataset = Braintrust::Dataset.new(name: "my-dataset", project: "my-project", state: state)
|
|
18
18
|
#
|
|
19
19
|
# @example Eager loading for small datasets
|
|
20
20
|
# records = dataset.fetch_all(limit: 100)
|
|
@@ -38,13 +38,13 @@ module Braintrust
|
|
|
38
38
|
# @param id [String, nil] Dataset UUID (required if name not provided)
|
|
39
39
|
# @param project [String, nil] Project name (required if using name)
|
|
40
40
|
# @param version [String, nil] Optional version to pin to
|
|
41
|
-
# @param
|
|
42
|
-
def initialize(name: nil, id: nil, project: nil, version: nil,
|
|
41
|
+
# @param state [State, nil] Braintrust state (defaults to global state)
|
|
42
|
+
def initialize(name: nil, id: nil, project: nil, version: nil, state: nil)
|
|
43
43
|
@name = name
|
|
44
44
|
@provided_id = id
|
|
45
45
|
@project = project
|
|
46
46
|
@version = version
|
|
47
|
-
@api =
|
|
47
|
+
@api = API.new(state: state)
|
|
48
48
|
@resolved_id = nil
|
|
49
49
|
@metadata = nil
|
|
50
50
|
|
|
@@ -181,5 +181,12 @@ module Braintrust
|
|
|
181
181
|
created: raw["created"]
|
|
182
182
|
)
|
|
183
183
|
end
|
|
184
|
+
|
|
185
|
+
# Value object wrapping a dataset UUID for resolution by ID.
|
|
186
|
+
# Used by Eval.run to distinguish dataset-by-ID from dataset-by-name.
|
|
187
|
+
ID = Struct.new(:id, keyword_init: true)
|
|
184
188
|
end
|
|
189
|
+
|
|
190
|
+
# @deprecated Use {Braintrust::Dataset::ID} instead.
|
|
191
|
+
DatasetId = Dataset::ID
|
|
185
192
|
end
|
|
data/lib/braintrust/eval/context.rb
ADDED
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "cases"
|
|
4
|
+
|
|
5
|
+
module Braintrust
|
|
6
|
+
module Eval
|
|
7
|
+
# Holds all normalized, ready-to-execute eval components.
|
|
8
|
+
# Use Context.build to construct from raw user inputs.
|
|
9
|
+
class Context
|
|
10
|
+
attr_reader :task, :scorers, :cases, :experiment_id, :experiment_name,
|
|
11
|
+
:project_id, :project_name, :state, :tracer_provider,
|
|
12
|
+
:on_progress, :parent_span_attr, :generation
|
|
13
|
+
|
|
14
|
+
def initialize(task:, scorers:, cases:, experiment_id: nil, experiment_name: nil,
|
|
15
|
+
project_id: nil, project_name: nil, state: nil, tracer_provider: nil,
|
|
16
|
+
on_progress: nil, parent_span_attr: nil, generation: nil)
|
|
17
|
+
@task = task
|
|
18
|
+
@scorers = scorers
|
|
19
|
+
@cases = cases
|
|
20
|
+
@experiment_id = experiment_id
|
|
21
|
+
@experiment_name = experiment_name
|
|
22
|
+
@project_id = project_id
|
|
23
|
+
@project_name = project_name
|
|
24
|
+
@state = state
|
|
25
|
+
@tracer_provider = tracer_provider
|
|
26
|
+
@on_progress = on_progress
|
|
27
|
+
@parent_span_attr = parent_span_attr
|
|
28
|
+
@generation = generation
|
|
29
|
+
end
|
|
30
|
+
|
|
31
|
+
# Build a Context from raw user inputs.
|
|
32
|
+
# Factory normalizes task, scorers, and cases into typed wrappers.
|
|
33
|
+
# Parent is resolved into parent_span_attr and generation.
|
|
34
|
+
def self.build(task:, scorers:, cases:, experiment_id: nil, experiment_name: nil,
|
|
35
|
+
project_id: nil, project_name: nil, state: nil, tracer_provider: nil,
|
|
36
|
+
on_progress: nil, parent: nil)
|
|
37
|
+
factory = Factory.new(state: state, tracer_provider: tracer_provider, project_name: project_name)
|
|
38
|
+
|
|
39
|
+
Context.new(
|
|
40
|
+
task: factory.normalize_task(task),
|
|
41
|
+
scorers: factory.normalize_scorers(scorers),
|
|
42
|
+
cases: factory.normalize_cases(cases),
|
|
43
|
+
experiment_id: experiment_id,
|
|
44
|
+
experiment_name: experiment_name,
|
|
45
|
+
project_id: project_id,
|
|
46
|
+
project_name: project_name,
|
|
47
|
+
state: state,
|
|
48
|
+
tracer_provider: tracer_provider,
|
|
49
|
+
on_progress: on_progress,
|
|
50
|
+
parent_span_attr: factory.resolve_parent_span_attr(parent),
|
|
51
|
+
generation: parent&.dig(:generation)
|
|
52
|
+
)
|
|
53
|
+
end
|
|
54
|
+
|
|
55
|
+
# Encapsulates normalization of raw user inputs into typed wrappers.
|
|
56
|
+
class Factory
|
|
57
|
+
def initialize(state: nil, tracer_provider: nil, project_name: nil)
|
|
58
|
+
@state = state
|
|
59
|
+
@tracer_provider = tracer_provider
|
|
60
|
+
@project_name = project_name
|
|
61
|
+
end
|
|
62
|
+
|
|
63
|
+
def normalize_cases(raw)
|
|
64
|
+
case raw
|
|
65
|
+
when Cases
|
|
66
|
+
raw
|
|
67
|
+
when Array, Enumerable
|
|
68
|
+
Cases.new(raw)
|
|
69
|
+
else
|
|
70
|
+
if raw.respond_to?(:each)
|
|
71
|
+
Cases.new(raw)
|
|
72
|
+
else
|
|
73
|
+
raise ArgumentError, "cases must be Array or Enumerable"
|
|
74
|
+
end
|
|
75
|
+
end
|
|
76
|
+
end
|
|
77
|
+
|
|
78
|
+
def resolve_parent_span_attr(parent)
|
|
79
|
+
return nil unless parent
|
|
80
|
+
"#{parent[:object_type]}:#{parent[:object_id]}"
|
|
81
|
+
end
|
|
82
|
+
|
|
83
|
+
def normalize_task(raw)
|
|
84
|
+
case raw
|
|
85
|
+
when Task
|
|
86
|
+
raw
|
|
87
|
+
when Proc
|
|
88
|
+
# Pass Proc/Lambda directly to preserve keyword arg info.
|
|
89
|
+
# Legacy positional lambdas (arity 1) are auto-wrapped by Task#wrap_block.
|
|
90
|
+
Task.new(&raw)
|
|
91
|
+
else
|
|
92
|
+
# Callable class: wrap via method(:call) to preserve keyword arg info
|
|
93
|
+
name = raw.respond_to?(:name) ? raw.name : nil
|
|
94
|
+
Task.new(name, &raw.method(:call))
|
|
95
|
+
end
|
|
96
|
+
end
|
|
97
|
+
|
|
98
|
+
def normalize_scorers(raw)
|
|
99
|
+
raw.map do |scorer|
|
|
100
|
+
case scorer
|
|
101
|
+
when String
|
|
102
|
+
raise ArgumentError, "project is required to resolve scorer slug '#{scorer}'" unless @project_name
|
|
103
|
+
Braintrust::Functions.scorer(
|
|
104
|
+
project: @project_name,
|
|
105
|
+
slug: scorer,
|
|
106
|
+
state: @state,
|
|
107
|
+
tracer_provider: @tracer_provider
|
|
108
|
+
)
|
|
109
|
+
when Braintrust::Scorer::ID
|
|
110
|
+
Braintrust::Functions.scorer(
|
|
111
|
+
id: scorer.function_id,
|
|
112
|
+
version: scorer.version,
|
|
113
|
+
state: @state,
|
|
114
|
+
tracer_provider: @tracer_provider
|
|
115
|
+
)
|
|
116
|
+
when Braintrust::Scorer
|
|
117
|
+
scorer
|
|
118
|
+
when Proc
|
|
119
|
+
# Pass Proc/Lambda directly to preserve keyword arg info
|
|
120
|
+
# (method(:call) loses parameter metadata)
|
|
121
|
+
Braintrust::Scorer.new(&scorer)
|
|
122
|
+
else
|
|
123
|
+
name = scorer.respond_to?(:name) ? scorer.name : nil
|
|
124
|
+
Braintrust::Scorer.new(name, &scorer.method(:call))
|
|
125
|
+
end
|
|
126
|
+
end
|
|
127
|
+
end
|
|
128
|
+
end
|
|
129
|
+
end
|
|
130
|
+
end
|
|
131
|
+
end
|
|
data/lib/braintrust/eval/evaluator.rb
ADDED
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Braintrust
|
|
4
|
+
module Eval
|
|
5
|
+
# Base class for evaluators. Subclass and override #task and #scorers,
|
|
6
|
+
# or instantiate directly with keyword arguments.
|
|
7
|
+
#
|
|
8
|
+
# Evaluators are used with the dev server, which reports scorer names
|
|
9
|
+
# to the Braintrust UI. Always use named scorers (via Scorer.new or
|
|
10
|
+
# subclass) so they display meaningfully.
|
|
11
|
+
#
|
|
12
|
+
# @example Subclass pattern
|
|
13
|
+
# class FoodClassifier < Braintrust::Eval::Evaluator
|
|
14
|
+
# def task
|
|
15
|
+
# ->(input:) { classify(input) }
|
|
16
|
+
# end
|
|
17
|
+
#
|
|
18
|
+
# def scorers
|
|
19
|
+
# [Braintrust::Scorer.new("exact_match") { |expected:, output:| output == expected ? 1.0 : 0.0 }]
|
|
20
|
+
# end
|
|
21
|
+
# end
|
|
22
|
+
#
|
|
23
|
+
# @example Inline pattern
|
|
24
|
+
# Braintrust::Eval::Evaluator.new(
|
|
25
|
+
# task: ->(input:) { input.upcase },
|
|
26
|
+
# scorers: [
|
|
27
|
+
# Braintrust::Scorer.new("exact_match") { |expected:, output:| output == expected ? 1.0 : 0.0 }
|
|
28
|
+
# ]
|
|
29
|
+
# )
|
|
30
|
+
class Evaluator
|
|
31
|
+
attr_accessor :task, :scorers, :parameters
|
|
32
|
+
|
|
33
|
+
def initialize(task: nil, scorers: [], parameters: {})
|
|
34
|
+
@task = task
|
|
35
|
+
@scorers = scorers
|
|
36
|
+
@parameters = parameters
|
|
37
|
+
end
|
|
38
|
+
|
|
39
|
+
# Validate that the evaluator has required fields set.
|
|
40
|
+
# @raise [ArgumentError] if validation fails
|
|
41
|
+
def validate!
|
|
42
|
+
raise ArgumentError, "task is required" unless task
|
|
43
|
+
unless task.respond_to?(:call)
|
|
44
|
+
raise ArgumentError, "task must be callable (respond to :call)"
|
|
45
|
+
end
|
|
46
|
+
end
|
|
47
|
+
|
|
48
|
+
# Run this evaluator against the given cases.
|
|
49
|
+
# Delegates to Braintrust::Eval.run with the evaluator's task and scorers.
|
|
50
|
+
#
|
|
51
|
+
# @param cases [Array] The test cases
|
|
52
|
+
# @param on_progress [#call, nil] Optional callback fired after each test case
|
|
53
|
+
# @param quiet [Boolean] If true, suppress result output (default: false)
|
|
54
|
+
# @param project [String, nil] Project name
|
|
55
|
+
# @param experiment [String, nil] Experiment name
|
|
56
|
+
# @param project_id [String, nil] Project UUID (skips project creation)
|
|
57
|
+
# @param dataset [String, Hash, Dataset, Dataset::ID, nil] Dataset to fetch
|
|
58
|
+
# @param scorers [Array, nil] Additional scorers (merged with evaluator's own)
|
|
59
|
+
# @param parent [Hash, nil] Parent span context
|
|
60
|
+
# @param state [State, nil] Braintrust state
|
|
61
|
+
# @param update [Boolean] If true, allow reusing existing experiment (default: false)
|
|
62
|
+
# @param tracer_provider [TracerProvider, nil] OpenTelemetry tracer provider (defaults to global)
|
|
63
|
+
# @return [Result]
|
|
64
|
+
def run(cases, on_progress: nil, quiet: false,
|
|
65
|
+
project: nil, experiment: nil, project_id: nil,
|
|
66
|
+
dataset: nil, scorers: nil, parent: nil,
|
|
67
|
+
state: nil, update: false, tracer_provider: nil)
|
|
68
|
+
all_scorers = scorers ? self.scorers + scorers : self.scorers
|
|
69
|
+
Braintrust::Eval.run(
|
|
70
|
+
task: task, scorers: all_scorers, cases: cases, dataset: dataset,
|
|
71
|
+
project: project, experiment: experiment, project_id: project_id,
|
|
72
|
+
parent: parent, on_progress: on_progress, quiet: quiet,
|
|
73
|
+
state: state, update: update, tracer_provider: tracer_provider
|
|
74
|
+
)
|
|
75
|
+
end
|
|
76
|
+
end
|
|
77
|
+
end
|
|
78
|
+
end
|