braintrust 0.1.2 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +34 -4
- data/lib/braintrust/api/datasets.rb +1 -1
- data/lib/braintrust/api/internal/experiments.rb +54 -0
- data/lib/braintrust/api/internal/projects.rb +42 -0
- data/lib/braintrust/api.rb +17 -0
- data/lib/braintrust/contrib/anthropic/instrumentation/beta_messages.rb +2 -2
- data/lib/braintrust/contrib/anthropic/instrumentation/messages.rb +2 -2
- data/lib/braintrust/dataset.rb +185 -0
- data/lib/braintrust/eval/case.rb +3 -1
- data/lib/braintrust/eval/runner.rb +7 -4
- data/lib/braintrust/eval.rb +41 -88
- data/lib/braintrust/internal/origin.rb +28 -0
- data/lib/braintrust/state.rb +10 -0
- data/lib/braintrust/version.rb +1 -1
- data/lib/braintrust.rb +1 -1
- metadata +5 -2
- data/lib/braintrust/internal/experiments.rb +0 -129
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 83ff9b69dc144333dba85a5f68e5a40d482ca99b3cae4bb55abe24ef2d05c296
|
|
4
|
+
data.tar.gz: 1a52913de27b3536c7881203f91d3d6050d53e66afeb90900d3c8a04b180951d
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: fb7da28ba278c6a1cff5bd143e28808c723f1bf1507a6fe73d55b76f81d17e74ffc62f5c5dde030a1a5101797f3399592d13d525d46ad39b6b96047f7e47a3d6
|
|
7
|
+
data.tar.gz: d49db21d70faba9e3e9b59a61b88d55897cd5420ef636ae18c7b68a4c50d1428be2fbe0c0efc0b26f3a159bdbfb629025f0cc1aa8489187ecf5b586d57c8e1d2
|
data/README.md
CHANGED
|
@@ -22,7 +22,7 @@ This is the official Ruby SDK for [Braintrust](https://www.braintrust.dev), for
|
|
|
22
22
|
- [Viewing traces](#viewing-traces)
|
|
23
23
|
- [Evals](#evals)
|
|
24
24
|
- [Datasets](#datasets)
|
|
25
|
-
- [
|
|
25
|
+
- [Scorers](#scorers)
|
|
26
26
|
- [Documentation](#documentation)
|
|
27
27
|
- [Troubleshooting](#troubleshooting)
|
|
28
28
|
- [Contributing](#contributing)
|
|
@@ -260,7 +260,7 @@ Braintrust::Eval.run(
|
|
|
260
260
|
|
|
261
261
|
### Datasets
|
|
262
262
|
|
|
263
|
-
|
|
263
|
+
Use test cases from a Braintrust dataset:
|
|
264
264
|
|
|
265
265
|
```ruby
|
|
266
266
|
Braintrust::Eval.run(
|
|
@@ -271,7 +271,22 @@ Braintrust::Eval.run(
|
|
|
271
271
|
)
|
|
272
272
|
```
|
|
273
273
|
|
|
274
|
-
|
|
274
|
+
Or define test cases inline with metadata and tags:
|
|
275
|
+
|
|
276
|
+
```ruby
|
|
277
|
+
Braintrust::Eval.run(
|
|
278
|
+
project: "my-project",
|
|
279
|
+
experiment: "classifier-v1",
|
|
280
|
+
cases: [
|
|
281
|
+
{input: "apple", expected: "fruit", tags: ["produce"], metadata: {difficulty: "easy"}},
|
|
282
|
+
{input: "salmon", expected: "protein", tags: ["seafood"], metadata: {difficulty: "medium"}}
|
|
283
|
+
],
|
|
284
|
+
task: ->(input) { classify(input) },
|
|
285
|
+
scorers: [...]
|
|
286
|
+
)
|
|
287
|
+
```
|
|
288
|
+
|
|
289
|
+
### Scorers
|
|
275
290
|
|
|
276
291
|
Use scoring functions defined in Braintrust:
|
|
277
292
|
|
|
@@ -281,7 +296,22 @@ Braintrust::Eval.run(
|
|
|
281
296
|
cases: [...],
|
|
282
297
|
task: ->(input) { ... },
|
|
283
298
|
scorers: [
|
|
284
|
-
Braintrust::
|
|
299
|
+
Braintrust::Eval::Functions.scorer(project: "my-project", slug: "accuracy-scorer")
|
|
300
|
+
]
|
|
301
|
+
)
|
|
302
|
+
```
|
|
303
|
+
|
|
304
|
+
Or define scorers inline with `Eval.scorer`:
|
|
305
|
+
|
|
306
|
+
```ruby
|
|
307
|
+
Braintrust::Eval.run(
|
|
308
|
+
project: "my-project",
|
|
309
|
+
cases: [...],
|
|
310
|
+
task: ->(input) { ... },
|
|
311
|
+
scorers: [
|
|
312
|
+
Braintrust::Eval.scorer("exact_match") do |input, expected, output|
|
|
313
|
+
output == expected ? 1.0 : 0.0
|
|
314
|
+
end
|
|
285
315
|
]
|
|
286
316
|
)
|
|
287
317
|
```
|
|
@@ -85,7 +85,7 @@ module Braintrust
|
|
|
85
85
|
# @param id [String] Dataset UUID
|
|
86
86
|
# @return [String] Permalink URL
|
|
87
87
|
def permalink(id:)
|
|
88
|
-
|
|
88
|
+
@state.object_permalink(object_type: "dataset", object_id: id)
|
|
89
89
|
end
|
|
90
90
|
|
|
91
91
|
# Fetch records from dataset using BTQL
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "net/http"
|
|
4
|
+
require "json"
|
|
5
|
+
require "uri"
|
|
6
|
+
|
|
7
|
+
module Braintrust
|
|
8
|
+
class API
|
|
9
|
+
module Internal
|
|
10
|
+
# Internal Experiments API
|
|
11
|
+
# Not part of the public API - use through Eval.run
|
|
12
|
+
class Experiments
|
|
13
|
+
def initialize(state)
|
|
14
|
+
@state = state
|
|
15
|
+
end
|
|
16
|
+
|
|
17
|
+
# Create an experiment
|
|
18
|
+
# POST /v1/experiment
|
|
19
|
+
# @param name [String] Experiment name
|
|
20
|
+
# @param project_id [String] Project ID
|
|
21
|
+
# @param ensure_new [Boolean] If true (default), fail if exists; if false, return existing
|
|
22
|
+
# @param tags [Array<String>, nil] Optional tags
|
|
23
|
+
# @param metadata [Hash, nil] Optional metadata
|
|
24
|
+
# @return [Hash] Experiment data with "id", "name", "project_id", etc.
|
|
25
|
+
def create(name:, project_id:, ensure_new: true, tags: nil, metadata: nil)
|
|
26
|
+
uri = URI("#{@state.api_url}/v1/experiment")
|
|
27
|
+
|
|
28
|
+
payload = {
|
|
29
|
+
project_id: project_id,
|
|
30
|
+
name: name,
|
|
31
|
+
ensure_new: ensure_new
|
|
32
|
+
}
|
|
33
|
+
payload[:tags] = tags if tags
|
|
34
|
+
payload[:metadata] = metadata if metadata
|
|
35
|
+
|
|
36
|
+
request = Net::HTTP::Post.new(uri)
|
|
37
|
+
request["Content-Type"] = "application/json"
|
|
38
|
+
request["Authorization"] = "Bearer #{@state.api_key}"
|
|
39
|
+
request.body = JSON.dump(payload)
|
|
40
|
+
|
|
41
|
+
http = Net::HTTP.new(uri.host, uri.port)
|
|
42
|
+
http.use_ssl = (uri.scheme == "https")
|
|
43
|
+
response = http.request(request)
|
|
44
|
+
|
|
45
|
+
unless response.is_a?(Net::HTTPSuccess)
|
|
46
|
+
raise Error, "HTTP #{response.code} for POST #{uri}: #{response.body}"
|
|
47
|
+
end
|
|
48
|
+
|
|
49
|
+
JSON.parse(response.body)
|
|
50
|
+
end
|
|
51
|
+
end
|
|
52
|
+
end
|
|
53
|
+
end
|
|
54
|
+
end
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "net/http"
|
|
4
|
+
require "json"
|
|
5
|
+
require "uri"
|
|
6
|
+
|
|
7
|
+
module Braintrust
|
|
8
|
+
class API
|
|
9
|
+
module Internal
|
|
10
|
+
# Internal Projects API
|
|
11
|
+
# Not part of the public API - use through Eval.run
|
|
12
|
+
class Projects
|
|
13
|
+
def initialize(state)
|
|
14
|
+
@state = state
|
|
15
|
+
end
|
|
16
|
+
|
|
17
|
+
# Create or get a project by name (idempotent)
|
|
18
|
+
# POST /v1/project
|
|
19
|
+
# @param name [String] Project name
|
|
20
|
+
# @return [Hash] Project data with "id", "name", "org_id", etc.
|
|
21
|
+
def create(name:)
|
|
22
|
+
uri = URI("#{@state.api_url}/v1/project")
|
|
23
|
+
|
|
24
|
+
request = Net::HTTP::Post.new(uri)
|
|
25
|
+
request["Content-Type"] = "application/json"
|
|
26
|
+
request["Authorization"] = "Bearer #{@state.api_key}"
|
|
27
|
+
request.body = JSON.dump({name: name})
|
|
28
|
+
|
|
29
|
+
http = Net::HTTP.new(uri.host, uri.port)
|
|
30
|
+
http.use_ssl = (uri.scheme == "https")
|
|
31
|
+
response = http.request(request)
|
|
32
|
+
|
|
33
|
+
unless response.is_a?(Net::HTTPSuccess)
|
|
34
|
+
raise Error, "HTTP #{response.code} for POST #{uri}: #{response.body}"
|
|
35
|
+
end
|
|
36
|
+
|
|
37
|
+
JSON.parse(response.body)
|
|
38
|
+
end
|
|
39
|
+
end
|
|
40
|
+
end
|
|
41
|
+
end
|
|
42
|
+
end
|
data/lib/braintrust/api.rb
CHANGED
|
@@ -25,5 +25,22 @@ module Braintrust
|
|
|
25
25
|
def functions
|
|
26
26
|
@functions ||= API::Functions.new(self)
|
|
27
27
|
end
|
|
28
|
+
|
|
29
|
+
# Login to Braintrust API (idempotent)
|
|
30
|
+
# @return [self]
|
|
31
|
+
def login
|
|
32
|
+
@state.login
|
|
33
|
+
self
|
|
34
|
+
end
|
|
35
|
+
|
|
36
|
+
# Generate a permalink URL to view an object in the Braintrust UI
|
|
37
|
+
# This is for the /object endpoint (experiments, datasets, etc.)
|
|
38
|
+
# For trace span permalinks, use Trace.permalink instead.
|
|
39
|
+
# @param object_type [String] Type of object (e.g., "experiment", "dataset")
|
|
40
|
+
# @param object_id [String] Object UUID
|
|
41
|
+
# @return [String] Permalink URL
|
|
42
|
+
def object_permalink(object_type:, object_id:)
|
|
43
|
+
@state.object_permalink(object_type: object_type, object_id: object_id)
|
|
44
|
+
end
|
|
28
45
|
end
|
|
29
46
|
end
|
|
@@ -169,8 +169,8 @@ module Braintrust
|
|
|
169
169
|
input_messages = []
|
|
170
170
|
|
|
171
171
|
begin
|
|
172
|
-
if params[:
|
|
173
|
-
system_content = params[:
|
|
172
|
+
if params[:system_]
|
|
173
|
+
system_content = params[:system_]
|
|
174
174
|
if system_content.is_a?(Array)
|
|
175
175
|
system_text = system_content.map { |blk|
|
|
176
176
|
blk.is_a?(Hash) ? blk[:text] : blk
|
|
@@ -98,8 +98,8 @@ module Braintrust
|
|
|
98
98
|
def set_input(span, params)
|
|
99
99
|
input_messages = []
|
|
100
100
|
|
|
101
|
-
if params[:
|
|
102
|
-
system_content = params[:
|
|
101
|
+
if params[:system_]
|
|
102
|
+
system_content = params[:system_]
|
|
103
103
|
if system_content.is_a?(Array)
|
|
104
104
|
system_text = system_content.map { |blk|
|
|
105
105
|
blk.is_a?(Hash) ? blk[:text] : blk
|
|
@@ -0,0 +1,185 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "api"
|
|
4
|
+
require_relative "internal/origin"
|
|
5
|
+
|
|
6
|
+
module Braintrust
|
|
7
|
+
# High-level interface for working with Braintrust datasets.
|
|
8
|
+
# Provides both eager loading and lazy enumeration for efficient access to dataset records.
|
|
9
|
+
#
|
|
10
|
+
# @example Basic usage (uses global state)
|
|
11
|
+
# Braintrust.init(api_key: "...")
|
|
12
|
+
# dataset = Braintrust::Dataset.new(name: "my-dataset", project: "my-project")
|
|
13
|
+
# dataset.each { |record| puts record[:input] }
|
|
14
|
+
#
|
|
15
|
+
# @example With explicit API client
|
|
16
|
+
# api = Braintrust::API.new(state: my_state)
|
|
17
|
+
# dataset = Braintrust::Dataset.new(name: "my-dataset", project: "my-project", api: api)
|
|
18
|
+
#
|
|
19
|
+
# @example Eager loading for small datasets
|
|
20
|
+
# records = dataset.fetch_all(limit: 100)
|
|
21
|
+
#
|
|
22
|
+
# @example Using Enumerable methods
|
|
23
|
+
# dataset.take(10)
|
|
24
|
+
# dataset.select { |r| r[:tags]&.include?("important") }
|
|
25
|
+
#
|
|
26
|
+
# @example With version pinning
|
|
27
|
+
# dataset = Braintrust::Dataset.new(name: "my-dataset", project: "my-project", version: "1.0")
|
|
28
|
+
class Dataset
|
|
29
|
+
include Enumerable
|
|
30
|
+
|
|
31
|
+
# Default number of records to fetch per API page
|
|
32
|
+
DEFAULT_PAGE_SIZE = 1000
|
|
33
|
+
|
|
34
|
+
attr_reader :name, :project, :version
|
|
35
|
+
|
|
36
|
+
# Initialize a dataset reference
|
|
37
|
+
# @param name [String, nil] Dataset name (required if id not provided)
|
|
38
|
+
# @param id [String, nil] Dataset UUID (required if name not provided)
|
|
39
|
+
# @param project [String, nil] Project name (required if using name)
|
|
40
|
+
# @param version [String, nil] Optional version to pin to
|
|
41
|
+
# @param api [API, nil] Braintrust API client (defaults to API.new using global state)
|
|
42
|
+
def initialize(name: nil, id: nil, project: nil, version: nil, api: nil)
|
|
43
|
+
@name = name
|
|
44
|
+
@provided_id = id
|
|
45
|
+
@project = project
|
|
46
|
+
@version = version
|
|
47
|
+
@api = api || API.new
|
|
48
|
+
@resolved_id = nil
|
|
49
|
+
@metadata = nil
|
|
50
|
+
|
|
51
|
+
validate_params!
|
|
52
|
+
end
|
|
53
|
+
|
|
54
|
+
# Get the dataset ID, resolving from name if necessary
|
|
55
|
+
# @return [String] Dataset UUID
|
|
56
|
+
def id
|
|
57
|
+
return @provided_id if @provided_id
|
|
58
|
+
resolve_name! unless @resolved_id
|
|
59
|
+
@resolved_id
|
|
60
|
+
end
|
|
61
|
+
|
|
62
|
+
# Get the dataset metadata from the API
|
|
63
|
+
# Makes an API call if metadata hasn't been fetched yet.
|
|
64
|
+
# Note: When initialized with name, metadata is fetched during name resolution.
|
|
65
|
+
# When initialized with ID, this triggers a separate get_by_id call.
|
|
66
|
+
# @return [Hash] Dataset metadata including name, description, created, etc.
|
|
67
|
+
def metadata
|
|
68
|
+
fetch_metadata! unless @metadata
|
|
69
|
+
@metadata
|
|
70
|
+
end
|
|
71
|
+
|
|
72
|
+
# Fetch all records eagerly into an array
|
|
73
|
+
# @param limit [Integer, nil] Maximum records to return (nil for all)
|
|
74
|
+
# @return [Array<Hash>] Array of records with :input, :expected, :tags, :metadata, :origin
|
|
75
|
+
def fetch_all(limit: nil)
|
|
76
|
+
records = []
|
|
77
|
+
each_record(limit: limit) { |record| records << record }
|
|
78
|
+
records
|
|
79
|
+
end
|
|
80
|
+
|
|
81
|
+
# Iterate over records lazily (implements Enumerable)
|
|
82
|
+
# Fetches pages on demand for memory efficiency with large datasets.
|
|
83
|
+
# @yield [Hash] Each record with :input, :expected, :tags, :metadata, :origin
|
|
84
|
+
def each(&block)
|
|
85
|
+
return enum_for(:each) unless block_given?
|
|
86
|
+
each_record(&block)
|
|
87
|
+
end
|
|
88
|
+
|
|
89
|
+
private
|
|
90
|
+
|
|
91
|
+
def validate_params!
|
|
92
|
+
if @provided_id.nil? && @name.nil?
|
|
93
|
+
raise ArgumentError, "must specify either :name or :id"
|
|
94
|
+
end
|
|
95
|
+
|
|
96
|
+
if @name && @project.nil?
|
|
97
|
+
raise ArgumentError, ":project is required when using :name"
|
|
98
|
+
end
|
|
99
|
+
end
|
|
100
|
+
|
|
101
|
+
# Resolve dataset name to ID (also fetches metadata as side effect)
|
|
102
|
+
def resolve_name!
|
|
103
|
+
@metadata = @api.datasets.get(project_name: @project, name: @name)
|
|
104
|
+
@resolved_id = @metadata["id"]
|
|
105
|
+
end
|
|
106
|
+
|
|
107
|
+
# Fetch metadata explicitly (for when ID was provided directly)
|
|
108
|
+
def fetch_metadata!
|
|
109
|
+
if @provided_id
|
|
110
|
+
@metadata = @api.datasets.get_by_id(id: @provided_id)
|
|
111
|
+
else
|
|
112
|
+
resolve_name! unless @metadata
|
|
113
|
+
end
|
|
114
|
+
end
|
|
115
|
+
|
|
116
|
+
# Core iteration with pagination
|
|
117
|
+
# @param limit [Integer, nil] Maximum records to return
|
|
118
|
+
def each_record(limit: nil, &block)
|
|
119
|
+
dataset_id = id # Resolve once
|
|
120
|
+
cursor = nil
|
|
121
|
+
count = 0
|
|
122
|
+
|
|
123
|
+
loop do
|
|
124
|
+
page_limit = if limit
|
|
125
|
+
[DEFAULT_PAGE_SIZE, limit - count].min
|
|
126
|
+
else
|
|
127
|
+
DEFAULT_PAGE_SIZE
|
|
128
|
+
end
|
|
129
|
+
|
|
130
|
+
result = @api.datasets.fetch(
|
|
131
|
+
id: dataset_id,
|
|
132
|
+
limit: page_limit,
|
|
133
|
+
cursor: cursor,
|
|
134
|
+
version: @version
|
|
135
|
+
)
|
|
136
|
+
|
|
137
|
+
result[:records].each do |raw_record|
|
|
138
|
+
record = build_record(raw_record, dataset_id)
|
|
139
|
+
block.call(record)
|
|
140
|
+
count += 1
|
|
141
|
+
break if limit && count >= limit
|
|
142
|
+
end
|
|
143
|
+
|
|
144
|
+
# Stop if we've hit the limit or no more pages
|
|
145
|
+
break if limit && count >= limit
|
|
146
|
+
|
|
147
|
+
cursor = result[:cursor]
|
|
148
|
+
break unless cursor
|
|
149
|
+
end
|
|
150
|
+
end
|
|
151
|
+
|
|
152
|
+
# Build a normalized record hash from raw API response
|
|
153
|
+
# @param raw [Hash] Raw record from API
|
|
154
|
+
# @param dataset_id [String] Dataset ID for origin
|
|
155
|
+
# @return [Hash] Normalized record with origin
|
|
156
|
+
def build_record(raw, dataset_id)
|
|
157
|
+
record = {}
|
|
158
|
+
record[:input] = raw["input"] if raw.key?("input")
|
|
159
|
+
record[:expected] = raw["expected"] if raw.key?("expected")
|
|
160
|
+
record[:tags] = raw["tags"] if raw.key?("tags")
|
|
161
|
+
record[:metadata] = raw["metadata"] if raw.key?("metadata")
|
|
162
|
+
|
|
163
|
+
origin = build_origin(raw, dataset_id)
|
|
164
|
+
record[:origin] = origin if origin
|
|
165
|
+
|
|
166
|
+
record
|
|
167
|
+
end
|
|
168
|
+
|
|
169
|
+
# Build origin JSON for tracing/linking
|
|
170
|
+
# @param raw [Hash] Raw record from API
|
|
171
|
+
# @param dataset_id [String] Dataset ID (fallback if not in record)
|
|
172
|
+
# @return [String, nil] JSON-serialized origin, or nil if record lacks required fields
|
|
173
|
+
def build_origin(raw, dataset_id)
|
|
174
|
+
return nil unless raw["id"] && raw["_xact_id"]
|
|
175
|
+
|
|
176
|
+
Internal::Origin.to_json(
|
|
177
|
+
object_type: "dataset",
|
|
178
|
+
object_id: raw["dataset_id"] || dataset_id,
|
|
179
|
+
id: raw["id"],
|
|
180
|
+
xact_id: raw["_xact_id"],
|
|
181
|
+
created: raw["created"]
|
|
182
|
+
)
|
|
183
|
+
end
|
|
184
|
+
end
|
|
185
|
+
end
|
data/lib/braintrust/eval/case.rb
CHANGED
|
@@ -7,6 +7,8 @@ module Braintrust
|
|
|
7
7
|
# @attr expected [Object, nil] The expected output (optional)
|
|
8
8
|
# @attr tags [Array<String>, nil] Optional tags for filtering/grouping
|
|
9
9
|
# @attr metadata [Hash, nil] Optional metadata for the case
|
|
10
|
-
|
|
10
|
+
# @attr origin [String, nil] JSON-serialized origin pointer for cases from remote sources (e.g., datasets).
|
|
11
|
+
# Contains: object_type, object_id, id, _xact_id, created
|
|
12
|
+
Case = Struct.new(:input, :expected, :tags, :metadata, :origin, keyword_init: true)
|
|
11
13
|
end
|
|
12
14
|
end
|
|
@@ -18,14 +18,14 @@ module Braintrust
|
|
|
18
18
|
MAX_PARALLELISM = Internal::ThreadPool::MAX_PARALLELISM
|
|
19
19
|
|
|
20
20
|
def initialize(experiment_id:, experiment_name:, project_id:, project_name:,
|
|
21
|
-
task:, scorers:,
|
|
21
|
+
task:, scorers:, api:, tracer_provider: nil)
|
|
22
22
|
@experiment_id = experiment_id
|
|
23
23
|
@experiment_name = experiment_name
|
|
24
24
|
@project_id = project_id
|
|
25
25
|
@project_name = project_name
|
|
26
26
|
@task = task
|
|
27
27
|
@scorers = normalize_scorers(scorers)
|
|
28
|
-
@
|
|
28
|
+
@api = api
|
|
29
29
|
@tracer_provider = tracer_provider || OpenTelemetry.tracer_provider
|
|
30
30
|
@tracer = @tracer_provider.tracer("braintrust-eval")
|
|
31
31
|
@parent_attr = "experiment_id:#{experiment_id}"
|
|
@@ -61,7 +61,7 @@ module Braintrust
|
|
|
61
61
|
duration = Time.now - start_time
|
|
62
62
|
|
|
63
63
|
# Generate permalink
|
|
64
|
-
permalink =
|
|
64
|
+
permalink = @api.object_permalink(object_type: "experiment", object_id: experiment_id)
|
|
65
65
|
|
|
66
66
|
Result.new(
|
|
67
67
|
experiment_id: experiment_id,
|
|
@@ -78,7 +78,7 @@ module Braintrust
|
|
|
78
78
|
private
|
|
79
79
|
|
|
80
80
|
attr_reader :experiment_id, :experiment_name, :project_id, :project_name,
|
|
81
|
-
:task, :scorers, :
|
|
81
|
+
:task, :scorers, :tracer, :parent_attr
|
|
82
82
|
|
|
83
83
|
# Run a single test case with OpenTelemetry tracing
|
|
84
84
|
# Creates eval span (parent) with task and score as children
|
|
@@ -116,6 +116,9 @@ module Braintrust
|
|
|
116
116
|
set_json_attr(eval_span, "braintrust.input_json", test_case.input)
|
|
117
117
|
set_json_attr(eval_span, "braintrust.output_json", output)
|
|
118
118
|
set_json_attr(eval_span, "braintrust.expected", test_case.expected) if test_case.expected
|
|
119
|
+
|
|
120
|
+
# Set origin for cases from remote sources (already JSON-serialized)
|
|
121
|
+
eval_span.set_attribute("braintrust.origin", test_case.origin) if test_case.origin
|
|
119
122
|
end
|
|
120
123
|
end
|
|
121
124
|
|
data/lib/braintrust/eval.rb
CHANGED
|
@@ -2,7 +2,9 @@
|
|
|
2
2
|
|
|
3
3
|
require_relative "eval/scorer"
|
|
4
4
|
require_relative "eval/runner"
|
|
5
|
-
require_relative "internal/
|
|
5
|
+
require_relative "api/internal/projects"
|
|
6
|
+
require_relative "api/internal/experiments"
|
|
7
|
+
require_relative "dataset"
|
|
6
8
|
|
|
7
9
|
require "opentelemetry/sdk"
|
|
8
10
|
require "json"
|
|
@@ -199,39 +201,45 @@ module Braintrust
|
|
|
199
201
|
# @param metadata [Hash] Optional experiment metadata
|
|
200
202
|
# @param update [Boolean] If true, allow reusing existing experiment (default: false)
|
|
201
203
|
# @param quiet [Boolean] If true, suppress result output (default: false)
|
|
202
|
-
# @param
|
|
204
|
+
# @param api [API, nil] Braintrust API client (defaults to API.new using global state)
|
|
203
205
|
# @param tracer_provider [TracerProvider, nil] OpenTelemetry tracer provider (defaults to global)
|
|
204
206
|
# @return [Result]
|
|
205
207
|
def run(project:, experiment:, task:, scorers:,
|
|
206
208
|
cases: nil, dataset: nil,
|
|
207
209
|
parallelism: 1, tags: nil, metadata: nil, update: false, quiet: false,
|
|
208
|
-
|
|
210
|
+
api: nil, tracer_provider: nil)
|
|
209
211
|
# Validate required parameters
|
|
210
212
|
validate_params!(project: project, experiment: experiment,
|
|
211
213
|
cases: cases, dataset: dataset, task: task, scorers: scorers)
|
|
212
214
|
|
|
213
|
-
# Get
|
|
214
|
-
|
|
215
|
-
raise Error, "No state available" unless state
|
|
215
|
+
# Get API from parameter or create from global state
|
|
216
|
+
api ||= API.new
|
|
216
217
|
|
|
217
|
-
# Ensure
|
|
218
|
+
# Ensure logged in (to populate org_name, etc.)
|
|
218
219
|
# login is idempotent and returns early if already logged in
|
|
219
|
-
|
|
220
|
+
api.login
|
|
220
221
|
|
|
221
222
|
# Resolve dataset to cases if dataset parameter provided
|
|
222
223
|
if dataset
|
|
223
|
-
cases = resolve_dataset(dataset, project,
|
|
224
|
+
cases = resolve_dataset(dataset, project, api)
|
|
224
225
|
end
|
|
225
226
|
|
|
226
|
-
# Register project and experiment via API
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
227
|
+
# Register project and experiment via internal API
|
|
228
|
+
projects_api = API::Internal::Projects.new(api.state)
|
|
229
|
+
experiments_api = API::Internal::Experiments.new(api.state)
|
|
230
|
+
|
|
231
|
+
project_result = projects_api.create(name: project)
|
|
232
|
+
experiment_result = experiments_api.create(
|
|
233
|
+
name: experiment,
|
|
234
|
+
project_id: project_result["id"],
|
|
235
|
+
ensure_new: !update,
|
|
236
|
+
tags: tags,
|
|
237
|
+
metadata: metadata
|
|
230
238
|
)
|
|
231
239
|
|
|
232
|
-
experiment_id =
|
|
233
|
-
project_id =
|
|
234
|
-
project_name =
|
|
240
|
+
experiment_id = experiment_result["id"]
|
|
241
|
+
project_id = project_result["id"]
|
|
242
|
+
project_name = project_result["name"]
|
|
235
243
|
|
|
236
244
|
# Instantiate Runner and run evaluation
|
|
237
245
|
runner = Runner.new(
|
|
@@ -241,7 +249,7 @@ module Braintrust
|
|
|
241
249
|
project_name: project_name,
|
|
242
250
|
task: task,
|
|
243
251
|
scorers: scorers,
|
|
244
|
-
|
|
252
|
+
api: api,
|
|
245
253
|
tracer_provider: tracer_provider
|
|
246
254
|
)
|
|
247
255
|
result = runner.run(cases, parallelism: parallelism)
|
|
@@ -285,84 +293,29 @@ module Braintrust
|
|
|
285
293
|
end
|
|
286
294
|
|
|
287
295
|
# Resolve dataset parameter to an array of case records
|
|
288
|
-
# @param dataset [String, Hash] Dataset specifier
|
|
289
|
-
# @param project [String] Project name (used as default if not specified
|
|
290
|
-
# @param
|
|
296
|
+
# @param dataset [String, Hash, Dataset] Dataset specifier or instance
|
|
297
|
+
# @param project [String] Project name (used as default if not specified)
|
|
298
|
+
# @param api [API] Braintrust API client
|
|
291
299
|
# @return [Array<Hash>] Array of case records
|
|
292
|
-
def resolve_dataset(dataset, project,
|
|
293
|
-
|
|
300
|
+
def resolve_dataset(dataset, project, api)
|
|
301
|
+
limit = nil
|
|
294
302
|
|
|
295
|
-
|
|
296
|
-
|
|
303
|
+
dataset_obj = case dataset
|
|
304
|
+
when Dataset
|
|
305
|
+
dataset
|
|
297
306
|
when String
|
|
298
|
-
|
|
299
|
-
{name: dataset, project: project}
|
|
307
|
+
Dataset.new(name: dataset, project: project, api: api)
|
|
300
308
|
when Hash
|
|
301
|
-
|
|
302
|
-
|
|
309
|
+
opts = dataset.dup
|
|
310
|
+
limit = opts.delete(:limit)
|
|
311
|
+
opts[:project] ||= project
|
|
312
|
+
opts[:api] = api
|
|
313
|
+
Dataset.new(**opts)
|
|
303
314
|
else
|
|
304
|
-
raise ArgumentError, "dataset must be String or
|
|
315
|
+
raise ArgumentError, "dataset must be String, Hash, or Dataset, got #{dataset.class}"
|
|
305
316
|
end
|
|
306
317
|
|
|
307
|
-
|
|
308
|
-
dataset_opts[:project] ||= project
|
|
309
|
-
|
|
310
|
-
# Create API client
|
|
311
|
-
api = API.new(state: state)
|
|
312
|
-
|
|
313
|
-
# Resolve dataset ID
|
|
314
|
-
dataset_id = if dataset_opts[:id]
|
|
315
|
-
# ID provided directly
|
|
316
|
-
dataset_opts[:id]
|
|
317
|
-
elsif dataset_opts[:name]
|
|
318
|
-
# Fetch by name + project
|
|
319
|
-
metadata = api.datasets.get(
|
|
320
|
-
project_name: dataset_opts[:project],
|
|
321
|
-
name: dataset_opts[:name]
|
|
322
|
-
)
|
|
323
|
-
metadata["id"]
|
|
324
|
-
else
|
|
325
|
-
raise ArgumentError, "dataset hash must specify either :name or :id"
|
|
326
|
-
end
|
|
327
|
-
|
|
328
|
-
# Fetch records with pagination
|
|
329
|
-
limit_per_page = 1000
|
|
330
|
-
max_records = dataset_opts[:limit]
|
|
331
|
-
version = dataset_opts[:version]
|
|
332
|
-
records = []
|
|
333
|
-
cursor = nil
|
|
334
|
-
|
|
335
|
-
loop do
|
|
336
|
-
result = api.datasets.fetch(
|
|
337
|
-
id: dataset_id,
|
|
338
|
-
limit: limit_per_page,
|
|
339
|
-
cursor: cursor,
|
|
340
|
-
version: version
|
|
341
|
-
)
|
|
342
|
-
|
|
343
|
-
records.concat(result[:records])
|
|
344
|
-
|
|
345
|
-
# Check if we've hit the user-specified limit
|
|
346
|
-
if max_records && records.length >= max_records
|
|
347
|
-
records = records.take(max_records)
|
|
348
|
-
break
|
|
349
|
-
end
|
|
350
|
-
|
|
351
|
-
# Check if there's more data
|
|
352
|
-
cursor = result[:cursor]
|
|
353
|
-
break unless cursor
|
|
354
|
-
end
|
|
355
|
-
|
|
356
|
-
# Filter records to only include Case-compatible fields
|
|
357
|
-
# Case accepts: input, expected, tags, metadata
|
|
358
|
-
records.map do |record|
|
|
359
|
-
filtered = {}
|
|
360
|
-
filtered[:input] = record["input"] if record.key?("input")
|
|
361
|
-
filtered[:expected] = record["expected"] if record.key?("expected")
|
|
362
|
-
filtered[:tags] = record["tags"] if record.key?("tags")
|
|
363
|
-
filtered[:metadata] = record["metadata"] if record.key?("metadata")
|
|
364
|
-
filtered
|
|
365
|
-
end
|
|
318
|
+
dataset_obj.fetch_all(limit: limit)
|
|
366
319
|
end
|
|
367
320
|
end
|
|
368
321
|
end
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "json"
|
|
4
|
+
|
|
5
|
+
module Braintrust
|
|
6
|
+
module Internal
|
|
7
|
+
# Origin provides serialization for source object pointers in Braintrust.
|
|
8
|
+
# Used internally to link spans back to their source records (e.g., dataset rows).
|
|
9
|
+
module Origin
|
|
10
|
+
# Serialize an origin pointer to JSON
|
|
11
|
+
# @param object_type [String] Type of source object (e.g., "dataset", "playground_logs")
|
|
12
|
+
# @param object_id [String] ID of the source object
|
|
13
|
+
# @param id [String] ID of the specific record within the source
|
|
14
|
+
# @param xact_id [String] Transaction ID
|
|
15
|
+
# @param created [String, nil] Creation timestamp
|
|
16
|
+
# @return [String] JSON-serialized origin
|
|
17
|
+
def self.to_json(object_type:, object_id:, id:, xact_id:, created:)
|
|
18
|
+
JSON.dump({
|
|
19
|
+
object_type: object_type,
|
|
20
|
+
object_id: object_id,
|
|
21
|
+
id: id,
|
|
22
|
+
_xact_id: xact_id,
|
|
23
|
+
created: created
|
|
24
|
+
})
|
|
25
|
+
end
|
|
26
|
+
end
|
|
27
|
+
end
|
|
28
|
+
end
|
data/lib/braintrust/state.rb
CHANGED
|
@@ -139,6 +139,16 @@ module Braintrust
|
|
|
139
139
|
end
|
|
140
140
|
end
|
|
141
141
|
|
|
142
|
+
# Generate a permalink URL to view an object in the Braintrust UI
|
|
143
|
+
# This is for the /object endpoint (experiments, datasets, etc.)
|
|
144
|
+
# For trace span permalinks, use Trace.permalink instead.
|
|
145
|
+
# @param object_type [String] Type of object (e.g., "experiment", "dataset")
|
|
146
|
+
# @param object_id [String] Object UUID
|
|
147
|
+
# @return [String] Permalink URL
|
|
148
|
+
def object_permalink(object_type:, object_id:)
|
|
149
|
+
"#{@app_url}/app/#{@org_name}/object?object_type=#{object_type}&object_id=#{object_id}"
|
|
150
|
+
end
|
|
151
|
+
|
|
142
152
|
# Login to Braintrust API in a background thread with retry logic
|
|
143
153
|
# Retries indefinitely with exponential backoff until success
|
|
144
154
|
# Idempotent: returns early if already logged in
|
data/lib/braintrust/version.rb
CHANGED
data/lib/braintrust.rb
CHANGED
|
@@ -6,7 +6,7 @@ require_relative "braintrust/state"
|
|
|
6
6
|
require_relative "braintrust/trace"
|
|
7
7
|
require_relative "braintrust/api"
|
|
8
8
|
require_relative "braintrust/prompt"
|
|
9
|
-
require_relative "braintrust/
|
|
9
|
+
require_relative "braintrust/dataset"
|
|
10
10
|
require_relative "braintrust/internal/env"
|
|
11
11
|
require_relative "braintrust/eval"
|
|
12
12
|
require_relative "braintrust/contrib"
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: braintrust
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.1.
|
|
4
|
+
version: 0.1.3
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Braintrust
|
|
@@ -193,6 +193,8 @@ files:
|
|
|
193
193
|
- lib/braintrust/api/datasets.rb
|
|
194
194
|
- lib/braintrust/api/functions.rb
|
|
195
195
|
- lib/braintrust/api/internal/auth.rb
|
|
196
|
+
- lib/braintrust/api/internal/experiments.rb
|
|
197
|
+
- lib/braintrust/api/internal/projects.rb
|
|
196
198
|
- lib/braintrust/config.rb
|
|
197
199
|
- lib/braintrust/contrib.rb
|
|
198
200
|
- lib/braintrust/contrib/anthropic/deprecated.rb
|
|
@@ -228,6 +230,7 @@ files:
|
|
|
228
230
|
- lib/braintrust/contrib/setup.rb
|
|
229
231
|
- lib/braintrust/contrib/support/openai.rb
|
|
230
232
|
- lib/braintrust/contrib/support/otel.rb
|
|
233
|
+
- lib/braintrust/dataset.rb
|
|
231
234
|
- lib/braintrust/eval.rb
|
|
232
235
|
- lib/braintrust/eval/case.rb
|
|
233
236
|
- lib/braintrust/eval/cases.rb
|
|
@@ -239,7 +242,7 @@ files:
|
|
|
239
242
|
- lib/braintrust/eval/summary.rb
|
|
240
243
|
- lib/braintrust/internal/encoding.rb
|
|
241
244
|
- lib/braintrust/internal/env.rb
|
|
242
|
-
- lib/braintrust/internal/
|
|
245
|
+
- lib/braintrust/internal/origin.rb
|
|
243
246
|
- lib/braintrust/internal/template.rb
|
|
244
247
|
- lib/braintrust/internal/thread_pool.rb
|
|
245
248
|
- lib/braintrust/internal/time.rb
|
|
@@ -1,129 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require "net/http"
|
|
4
|
-
require "json"
|
|
5
|
-
require "uri"
|
|
6
|
-
require_relative "../logger"
|
|
7
|
-
|
|
8
|
-
module Braintrust
|
|
9
|
-
module Internal
|
|
10
|
-
# Experiments module provides internal API methods for registering projects and experiments
|
|
11
|
-
# Methods are marked private to prevent direct user access - use through Eval.run
|
|
12
|
-
module Experiments
|
|
13
|
-
# Public convenience method to register/get both project and experiment
|
|
14
|
-
# @param experiment_name [String] The experiment name
|
|
15
|
-
# @param project_name [String] The project name
|
|
16
|
-
# @param state [State] Braintrust state with API key and URL
|
|
17
|
-
# @param tags [Array<String>, nil] Optional experiment tags
|
|
18
|
-
# @param metadata [Hash, nil] Optional experiment metadata
|
|
19
|
-
# @param update [Boolean] If true, allow reusing existing experiment (default: false)
|
|
20
|
-
# @return [Hash] Hash with :experiment_id, :experiment_name, :project_id, :project_name
|
|
21
|
-
def self.get_or_create(experiment_name, project_name, state:,
|
|
22
|
-
tags: nil, metadata: nil, update: false)
|
|
23
|
-
# Register/get project first
|
|
24
|
-
project = register_project(project_name, state)
|
|
25
|
-
|
|
26
|
-
# Then register/get experiment
|
|
27
|
-
experiment = register_experiment(
|
|
28
|
-
experiment_name,
|
|
29
|
-
project["id"],
|
|
30
|
-
state,
|
|
31
|
-
tags: tags,
|
|
32
|
-
metadata: metadata,
|
|
33
|
-
update: update
|
|
34
|
-
)
|
|
35
|
-
|
|
36
|
-
{
|
|
37
|
-
experiment_id: experiment["id"],
|
|
38
|
-
experiment_name: experiment["name"],
|
|
39
|
-
project_id: project["id"],
|
|
40
|
-
project_name: project["name"]
|
|
41
|
-
}
|
|
42
|
-
end
|
|
43
|
-
|
|
44
|
-
# Register or get a project by name
|
|
45
|
-
# POST /v1/project with {name: "project-name"}
|
|
46
|
-
# Returns existing project if already exists
|
|
47
|
-
# @param name [String] Project name
|
|
48
|
-
# @param state [State] Braintrust state
|
|
49
|
-
# @return [Hash] Project data with "id", "name", "org_id", etc.
|
|
50
|
-
# @raise [Braintrust::Error] if API call fails
|
|
51
|
-
def self.register_project(name, state)
|
|
52
|
-
Log.debug("Registering project: #{name}")
|
|
53
|
-
|
|
54
|
-
uri = URI("#{state.api_url}/v1/project")
|
|
55
|
-
request = Net::HTTP::Post.new(uri)
|
|
56
|
-
request["Content-Type"] = "application/json"
|
|
57
|
-
request["Authorization"] = "Bearer #{state.api_key}"
|
|
58
|
-
request.body = JSON.dump({name: name})
|
|
59
|
-
|
|
60
|
-
http = Net::HTTP.new(uri.hostname, uri.port)
|
|
61
|
-
http.use_ssl = true if uri.scheme == "https"
|
|
62
|
-
|
|
63
|
-
response = http.start do |http_session|
|
|
64
|
-
http_session.request(request)
|
|
65
|
-
end
|
|
66
|
-
|
|
67
|
-
Log.debug("Register project response: [#{response.code}]")
|
|
68
|
-
|
|
69
|
-
# Handle response codes
|
|
70
|
-
unless response.is_a?(Net::HTTPSuccess)
|
|
71
|
-
raise Error, "Failed to register project '#{name}': [#{response.code}] #{response.body}"
|
|
72
|
-
end
|
|
73
|
-
|
|
74
|
-
project = JSON.parse(response.body)
|
|
75
|
-
Log.debug("Project registered: #{project["id"]} (#{project["name"]})")
|
|
76
|
-
project
|
|
77
|
-
end
|
|
78
|
-
private_class_method :register_project
|
|
79
|
-
|
|
80
|
-
# Register or get an experiment by name
|
|
81
|
-
# POST /v1/experiment with {project_id:, name:, ensure_new:, tags:[], metadata:{}}
|
|
82
|
-
# @param name [String] Experiment name
|
|
83
|
-
# @param project_id [String] Project ID
|
|
84
|
-
# @param state [State] Braintrust state
|
|
85
|
-
# @param tags [Array<String>, nil] Optional tags
|
|
86
|
-
# @param metadata [Hash, nil] Optional metadata
|
|
87
|
-
# @param update [Boolean] If true, allow reusing existing experiment (ensure_new: false)
|
|
88
|
-
# @return [Hash] Experiment data with "id", "name", "project_id", etc.
|
|
89
|
-
# @raise [Braintrust::Error] if API call fails
|
|
90
|
-
def self.register_experiment(name, project_id, state, tags: nil, metadata: nil, update: false)
|
|
91
|
-
Log.debug("Registering experiment: #{name} (project: #{project_id}, update: #{update})")
|
|
92
|
-
|
|
93
|
-
uri = URI("#{state.api_url}/v1/experiment")
|
|
94
|
-
request = Net::HTTP::Post.new(uri)
|
|
95
|
-
request["Content-Type"] = "application/json"
|
|
96
|
-
request["Authorization"] = "Bearer #{state.api_key}"
|
|
97
|
-
|
|
98
|
-
payload = {
|
|
99
|
-
project_id: project_id,
|
|
100
|
-
name: name,
|
|
101
|
-
ensure_new: !update # When update=true, allow reusing existing experiment
|
|
102
|
-
}
|
|
103
|
-
payload[:tags] = tags if tags
|
|
104
|
-
payload[:metadata] = metadata if metadata
|
|
105
|
-
|
|
106
|
-
request.body = JSON.dump(payload)
|
|
107
|
-
|
|
108
|
-
http = Net::HTTP.new(uri.hostname, uri.port)
|
|
109
|
-
http.use_ssl = true if uri.scheme == "https"
|
|
110
|
-
|
|
111
|
-
response = http.start do |http_session|
|
|
112
|
-
http_session.request(request)
|
|
113
|
-
end
|
|
114
|
-
|
|
115
|
-
Log.debug("Register experiment response: [#{response.code}]")
|
|
116
|
-
|
|
117
|
-
# Handle response codes
|
|
118
|
-
unless response.is_a?(Net::HTTPSuccess)
|
|
119
|
-
raise Error, "Failed to register experiment '#{name}': [#{response.code}] #{response.body}"
|
|
120
|
-
end
|
|
121
|
-
|
|
122
|
-
experiment = JSON.parse(response.body)
|
|
123
|
-
Log.debug("Experiment registered: #{experiment["id"]} (#{experiment["name"]})")
|
|
124
|
-
experiment
|
|
125
|
-
end
|
|
126
|
-
private_class_method :register_experiment
|
|
127
|
-
end
|
|
128
|
-
end
|
|
129
|
-
end
|