braintrust 0.1.3 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +71 -2
- data/lib/braintrust/api/datasets.rb +13 -3
- data/lib/braintrust/api/functions.rb +2 -3
- data/lib/braintrust/api/internal/auth.rb +2 -6
- data/lib/braintrust/api/internal/experiments.rb +7 -5
- data/lib/braintrust/api/internal/projects.rb +2 -3
- data/lib/braintrust/dataset.rb +10 -6
- data/lib/braintrust/eval/evaluator.rb +72 -0
- data/lib/braintrust/eval/functions.rb +56 -13
- data/lib/braintrust/eval/runner.rb +55 -13
- data/lib/braintrust/eval/scorer.rb +4 -0
- data/lib/braintrust/eval.rb +108 -45
- data/lib/braintrust/internal/http.rb +97 -0
- data/lib/braintrust/server/auth/clerk_token.rb +68 -0
- data/lib/braintrust/server/auth/no_auth.rb +14 -0
- data/lib/braintrust/server/handlers/eval.rb +217 -0
- data/lib/braintrust/server/handlers/health.rb +16 -0
- data/lib/braintrust/server/handlers/list.rb +74 -0
- data/lib/braintrust/server/middleware/auth.rb +29 -0
- data/lib/braintrust/server/middleware/cors.rb +87 -0
- data/lib/braintrust/server/rack/app.rb +38 -0
- data/lib/braintrust/server/rack.rb +36 -0
- data/lib/braintrust/server/router.rb +37 -0
- data/lib/braintrust/server/sse.rb +52 -0
- data/lib/braintrust/server.rb +8 -0
- data/lib/braintrust/trace/attachment.rb +3 -1
- data/lib/braintrust/trace/span_exporter.rb +36 -0
- data/lib/braintrust/trace.rb +3 -4
- data/lib/braintrust/version.rb +1 -1
- metadata +16 -1
data/lib/braintrust/eval.rb
CHANGED
|
@@ -1,7 +1,9 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
require_relative "eval/scorer"
|
|
4
|
+
require_relative "eval/evaluator"
|
|
4
5
|
require_relative "eval/runner"
|
|
6
|
+
require_relative "eval/functions"
|
|
5
7
|
require_relative "api/internal/projects"
|
|
6
8
|
require_relative "api/internal/experiments"
|
|
7
9
|
require_relative "dataset"
|
|
@@ -186,14 +188,17 @@ module Braintrust
|
|
|
186
188
|
end
|
|
187
189
|
|
|
188
190
|
# Run an evaluation
|
|
189
|
-
# @param project [String] The project name
|
|
190
|
-
# @param experiment [String] The experiment name
|
|
191
|
+
# @param project [String, nil] The project name (triggers full API mode: creates project + experiment)
|
|
192
|
+
# @param experiment [String, nil] The experiment name
|
|
191
193
|
# @param cases [Array, Enumerable, nil] The test cases (mutually exclusive with dataset)
|
|
192
194
|
# @param dataset [String, Hash, nil] Dataset to fetch (mutually exclusive with cases)
|
|
193
195
|
# - String: dataset name (fetches from same project)
|
|
194
196
|
# - Hash: {name:, id:, project:, version:, limit:}
|
|
195
197
|
# @param task [#call] The task to evaluate (must be callable)
|
|
196
198
|
# @param scorers [Array<Scorer, #call>] The scorers to use (Scorer objects or callables)
|
|
199
|
+
# @param on_progress [#call, nil] Optional callback fired after each test case.
|
|
200
|
+
# Receives a Hash: {"data" => output, "scores" => {name => value}} on success,
|
|
201
|
+
# or {"error" => message} on failure.
|
|
197
202
|
# @param parallelism [Integer] Number of parallel workers (default: 1).
|
|
198
203
|
# When parallelism > 1, test cases are executed concurrently using a thread pool.
|
|
199
204
|
# The task and scorers MUST be thread-safe when using parallelism > 1.
|
|
@@ -201,45 +206,45 @@ module Braintrust
|
|
|
201
206
|
# @param metadata [Hash] Optional experiment metadata
|
|
202
207
|
# @param update [Boolean] If true, allow reusing existing experiment (default: false)
|
|
203
208
|
# @param quiet [Boolean] If true, suppress result output (default: false)
|
|
204
|
-
# @param
|
|
209
|
+
# @param state [State, nil] Braintrust state (defaults to global state)
|
|
205
210
|
# @param tracer_provider [TracerProvider, nil] OpenTelemetry tracer provider (defaults to global)
|
|
206
211
|
# @return [Result]
|
|
207
|
-
def run(
|
|
208
|
-
cases: nil, dataset: nil,
|
|
212
|
+
def run(task:, scorers:, project: nil, experiment: nil,
|
|
213
|
+
cases: nil, dataset: nil, on_progress: nil,
|
|
209
214
|
parallelism: 1, tags: nil, metadata: nil, update: false, quiet: false,
|
|
210
|
-
|
|
215
|
+
state: nil, tracer_provider: nil, project_id: nil, parent: nil)
|
|
211
216
|
# Validate required parameters
|
|
212
|
-
validate_params!(
|
|
213
|
-
cases: cases, dataset: dataset, task: task, scorers: scorers)
|
|
217
|
+
validate_params!(task: task, scorers: scorers, cases: cases, dataset: dataset)
|
|
214
218
|
|
|
215
|
-
#
|
|
216
|
-
|
|
219
|
+
# Resolve any ScorerId entries to real Scorer objects
|
|
220
|
+
scorers = resolve_scorers(scorers, state: state, tracer_provider: tracer_provider)
|
|
217
221
|
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
api.login
|
|
222
|
+
experiment_id = nil
|
|
223
|
+
project_name = project
|
|
221
224
|
|
|
222
|
-
#
|
|
223
|
-
if
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
# Register project and experiment via internal API
|
|
228
|
-
projects_api = API::Internal::Projects.new(api.state)
|
|
229
|
-
experiments_api = API::Internal::Experiments.new(api.state)
|
|
225
|
+
# Full API mode: project name or project_id provided, resolve via API
|
|
226
|
+
if project || project_id
|
|
227
|
+
state ||= Braintrust.current_state
|
|
228
|
+
state.login
|
|
230
229
|
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
ensure_new: !update,
|
|
236
|
-
tags: tags,
|
|
237
|
-
metadata: metadata
|
|
238
|
-
)
|
|
230
|
+
if dataset
|
|
231
|
+
resolved = resolve_dataset(dataset, project, state)
|
|
232
|
+
cases = resolved[:cases]
|
|
233
|
+
end
|
|
239
234
|
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
235
|
+
# Skip experiment creation for remote evals (parent present).
|
|
236
|
+
# The OTLP backend creates experiments from ingested spans.
|
|
237
|
+
unless parent
|
|
238
|
+
project_id, project_name = resolve_project(state, project, project_id)
|
|
239
|
+
experiment_id = create_experiment(
|
|
240
|
+
state, experiment, project_id,
|
|
241
|
+
update: update, tags: tags, metadata: metadata,
|
|
242
|
+
dataset_id: resolved&.dig(:dataset_id),
|
|
243
|
+
dataset_version: resolved&.dig(:dataset_version)
|
|
244
|
+
)
|
|
245
|
+
parent = {object_type: "experiment_id", object_id: experiment_id}
|
|
246
|
+
end
|
|
247
|
+
end
|
|
243
248
|
|
|
244
249
|
# Instantiate Runner and run evaluation
|
|
245
250
|
runner = Runner.new(
|
|
@@ -249,8 +254,10 @@ module Braintrust
|
|
|
249
254
|
project_name: project_name,
|
|
250
255
|
task: task,
|
|
251
256
|
scorers: scorers,
|
|
252
|
-
|
|
253
|
-
tracer_provider: tracer_provider
|
|
257
|
+
state: state,
|
|
258
|
+
tracer_provider: tracer_provider,
|
|
259
|
+
on_progress: on_progress,
|
|
260
|
+
parent: parent
|
|
254
261
|
)
|
|
255
262
|
result = runner.run(cases, parallelism: parallelism)
|
|
256
263
|
|
|
@@ -268,11 +275,29 @@ module Braintrust
|
|
|
268
275
|
puts result.to_pretty
|
|
269
276
|
end
|
|
270
277
|
|
|
278
|
+
# Resolve scorers array: ScorerId entries become real Scorer objects, others pass through
|
|
279
|
+
# @param scorers [Array] Scorers (Scorer, callable, or ScorerId)
|
|
280
|
+
# @param state [State, nil] Braintrust state (required for ScorerId resolution)
|
|
281
|
+
# @param tracer_provider [TracerProvider, nil] OpenTelemetry tracer provider
|
|
282
|
+
# @return [Array<Scorer, #call>] Resolved scorers
|
|
283
|
+
def resolve_scorers(scorers, state: nil, tracer_provider: nil)
  # Swap each ScorerId placeholder for the scorer returned by
  # Functions.scorer_by_id; every other entry is passed through untouched.
  scorers.map do |entry|
    next entry unless entry.is_a?(ScorerId)

    Functions.scorer_by_id(
      id: entry.function_id,
      version: entry.version,
      state: state,
      tracer_provider: tracer_provider
    )
  end
end
|
|
297
|
+
|
|
271
298
|
# Validate required parameters
|
|
272
299
|
# @raise [ArgumentError] if validation fails
|
|
273
|
-
def validate_params!(
|
|
274
|
-
raise ArgumentError, "project is required" unless project
|
|
275
|
-
raise ArgumentError, "experiment is required" unless experiment
|
|
300
|
+
def validate_params!(task:, scorers:, cases:, dataset:)
|
|
276
301
|
raise ArgumentError, "task is required" unless task
|
|
277
302
|
raise ArgumentError, "scorers is required" unless scorers
|
|
278
303
|
|
|
@@ -292,30 +317,68 @@ module Braintrust
|
|
|
292
317
|
end
|
|
293
318
|
end
|
|
294
319
|
|
|
295
|
-
# Resolve
|
|
320
|
+
# Resolve project by name or ID. Creates if needed.
|
|
321
|
+
# @return [Array(String, String)] [project_id, project_name]
|
|
322
|
+
def resolve_project(state, project, project_id)
  # A caller-supplied ID is trusted as-is; the name is passed back unchanged
  # (it may be nil when only the ID was given).
  return [project_id, project] if project_id

  # Otherwise register the project by name through the internal API and
  # report the id/name pair it returns.
  created = API::Internal::Projects.new(state).create(name: project)
  [created["id"], created["name"]]
end
|
|
330
|
+
|
|
331
|
+
# Create an experiment in the given project.
|
|
332
|
+
# @return [String] experiment_id
|
|
333
|
+
def create_experiment(state, name, project_id,
  update: false, tags: nil, metadata: nil,
  dataset_id: nil, dataset_version: nil)
  # `update: true` means an existing experiment may be reused, which the
  # API expresses as the inverse flag `ensure_new`.
  attributes = {
    name: name,
    project_id: project_id,
    ensure_new: !update,
    tags: tags,
    metadata: metadata,
    dataset_id: dataset_id,
    dataset_version: dataset_version
  }

  experiments_api = API::Internal::Experiments.new(state)
  experiments_api.create(**attributes)["id"]
end
|
|
347
|
+
|
|
348
|
+
# Resolve dataset parameter to cases with metadata for experiment linking
|
|
296
349
|
# @param dataset [String, Hash, Dataset] Dataset specifier or instance
|
|
297
350
|
# @param project [String] Project name (used as default if not specified)
|
|
298
|
-
# @param
|
|
299
|
-
# @return [
|
|
300
|
-
def resolve_dataset(dataset, project,
|
|
351
|
+
# @param state [State] Braintrust state
|
|
352
|
+
# @return [Hash] Hash with :cases, :dataset_id, and :dataset_version
|
|
353
|
+
# Turn a dataset specifier into its test cases plus the identifiers needed to
# link the experiment to that dataset.
#
# @param dataset [Dataset, DatasetId, String, Hash] dataset instance, ID wrapper,
#   name, or option Hash (a :limit key is consumed here, the rest goes to Dataset.new)
# @param project [String, nil] default project for String/Hash specifiers
# @param state [State] Braintrust state handed to any Dataset built here
# @return [Hash] {cases:, dataset_id:, dataset_version:}
# @raise [ArgumentError] if dataset is none of the supported types
def resolve_dataset(dataset, project, state)
  limit = nil

  dataset_obj = case dataset
  when Dataset
    dataset
  when DatasetId
    Dataset.new(id: dataset.id, state: state)
  when String
    Dataset.new(name: dataset, project: project, state: state)
  when Hash
    # Work on a copy so the caller's Hash is not mutated.
    opts = dataset.dup
    limit = opts.delete(:limit)
    opts[:project] ||= project
    opts[:state] = state
    Dataset.new(**opts)
  else
    raise ArgumentError, "dataset must be String, Hash, Dataset, or DatasetId, got #{dataset.class}"
  end

  cases = dataset_obj.fetch_all(limit: limit)

  # Use pinned version if available, otherwise compute from max(_xact_id).
  # The IDs are compared numerically: a plain String#max is lexicographic and
  # would rank "9" above "10". The winning element is returned unchanged.
  version = dataset_obj.version
  version ||= cases
    .filter_map { |c| c[:origin] && JSON.parse(c[:origin])["_xact_id"] }
    .max_by { |xact_id| xact_id.to_i }

  {cases: cases, dataset_id: dataset_obj.id, dataset_version: version}
end
|
|
320
383
|
end
|
|
321
384
|
end
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "net/http"
|
|
4
|
+
require "uri"
|
|
5
|
+
require "zlib"
|
|
6
|
+
require "stringio"
|
|
7
|
+
require_relative "../logger"
|
|
8
|
+
|
|
9
|
+
module Braintrust
  module Internal
    # HTTP utilities for redirect following and response decompression.
    # Drop-in enhancement for raw Net::HTTP request calls throughout the SDK.
    module Http
      DEFAULT_MAX_REDIRECTS = 5

      # Execute an HTTP request, following redirects as needed.
      #
      # 307/308 responses repeat the request that was just sent (method, body and
      # Content-Type); every other redirect status is followed with a body-less GET.
      # The Authorization header of the initial request is forwarded only to
      # targets on the initial request's host.
      #
      # @param uri [URI] The request URI
      # @param request [Net::HTTPRequest] The prepared request object
      # @param max_redirects [Integer] Maximum number of redirects to follow
      # @return [Net::HTTPResponse] The final response
      # @raise [Braintrust::Error] On too many redirects or missing Location header
      def self.with_redirects(uri, request, max_redirects: DEFAULT_MAX_REDIRECTS)
        # Remember where the credentials were originally addressed; comparing
        # against the previous hop instead would re-attach them on the second
        # hop of a cross-host redirect chain.
        origin_uri = uri
        original_request = request

        response = perform_request(uri, request)
        redirects = 0

        while response.is_a?(Net::HTTPRedirection)
          redirects += 1
          raise Error, "Too many redirects (max #{max_redirects})" if redirects > max_redirects

          location = response["location"]
          raise Error, "Redirect response #{response.code} without Location header" unless location

          # Relative Location values are resolved against the URI just requested.
          redirect_uri = URI(location)
          redirect_uri = uri + redirect_uri unless redirect_uri.host

          Log.debug("[HTTP] Following #{response.code} redirect to #{redirect_uri}")

          request = build_redirect_request(
            response, redirect_uri, original_request, origin_uri,
            current_request: request
          )
          uri = redirect_uri
          response = perform_request(uri, request)
        end

        response
      end

      # Decompress an HTTP response body in place based on Content-Encoding.
      # No-op if the response has no recognized encoding or no body to decode.
      #
      # @param response [Net::HTTPResponse] The response to decompress
      # @return [void]
      def self.decompress_response!(response)
        encoding = response["content-encoding"]&.downcase
        case encoding
        when "gzip", "x-gzip"
          body = response.body
          # Nothing to inflate (e.g. a body-less response that still carries the header).
          return if body.nil? || body.empty?

          gz = Zlib::GzipReader.new(StringIO.new(body))
          begin
            body.replace(gz.read)
          ensure
            # Release the reader even when the payload is not valid gzip.
            gz.close
          end
          response.delete("content-encoding")
        end
      end

      # Send a single request over a fresh connection, using TLS for https URIs.
      def self.perform_request(uri, request)
        http = Net::HTTP.new(uri.host, uri.port)
        http.use_ssl = (uri.scheme == "https")
        http.request(request)
      end
      private_class_method :perform_request

      # Build the request for the next hop of a redirect chain.
      #
      # @param response [Net::HTTPResponse] The redirect response being followed
      # @param redirect_uri [URI] Absolute target of the redirect
      # @param original_request [Net::HTTPRequest] The caller's request (source of Authorization)
      # @param original_uri [URI] URI the caller's request was addressed to
      # @param current_request [Net::HTTPRequest] The request that produced +response+;
      #   defaults to +original_request+ for single-hop callers
      # @return [Net::HTTPRequest]
      def self.build_redirect_request(response, redirect_uri, original_request, original_uri,
        current_request: original_request)
        if response.code == "307" || response.code == "308"
          # Repeat what was just sent. After an earlier 301/302/303 that is a GET,
          # so the original body must not be replayed.
          request = current_request.class.new(redirect_uri)
          request.body = current_request.body
          content_type = current_request["Content-Type"]
          request["Content-Type"] = content_type if content_type
        else
          # 301, 302, 303: follow with GET, no body
          request = Net::HTTP::Get.new(redirect_uri)
        end

        # Strip Authorization when redirecting to a different host (e.g. S3)
        if forward_authorization?(original_uri, redirect_uri)
          auth = original_request["Authorization"]
          request["Authorization"] = auth if auth
        end

        request
      end
      private_class_method :build_redirect_request

      # Credentials follow a redirect only when the target is the host they were
      # originally sent to and the hop does not drop from https to plain http.
      def self.forward_authorization?(origin_uri, target_uri)
        return false unless origin_uri.host.to_s.casecmp?(target_uri.host.to_s)

        !(origin_uri.scheme == "https" && target_uri.scheme != "https")
      end
      private_class_method :forward_authorization?
    end
  end
end
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "net/http"
|
|
4
|
+
require "json"
|
|
5
|
+
|
|
6
|
+
module Braintrust
  module Server
    module Auth
      # Validates Clerk JWT session tokens via the Braintrust app endpoint.
      # The browser forwards the Clerk session token which is validated by
      # POST /api/apikey/login on the app server.
      class ClerkToken
        DEFAULT_APP_URL = "https://www.braintrust.dev"
        RACK_AUTH_HEADER = "HTTP_AUTHORIZATION"
        RACK_ORG_NAME_HEADER = "HTTP_X_BT_ORG_NAME"
        # The auth scheme name is case-insensitive, so "bearer"/"BEARER" are accepted too.
        BEARER_PATTERN = /\ABearer (.+)\z/i
        LOGIN_PATH = "/api/apikey/login"

        # @param app_url [String, nil] Base URL of the app server (defaults to DEFAULT_APP_URL)
        def initialize(app_url: nil)
          @app_url = app_url || DEFAULT_APP_URL
        end

        # Authenticate a Rack request from its bearer token.
        #
        # @param env [Hash] Rack environment
        # @return [Hash, nil] String-keyed auth context ("api_key", "org_id",
        #   "org_name", "app_url", "api_url"), or nil when the token is missing
        #   or could not be validated
        def authenticate(env)
          token = extract_bearer_token(env)
          return nil unless token

          login_response = validate_token(token)
          return nil unless login_response

          # An explicit org header from the client wins over the login response.
          org_name = env[RACK_ORG_NAME_HEADER]

          {
            "api_key" => token,
            "org_id" => login_response["org_id"],
            "org_name" => org_name || login_response["org_name"],
            "app_url" => @app_url,
            "api_url" => login_response["api_url"] || @app_url
          }
        end

        private

        # @return [String, nil] The token following "Bearer ", or nil if the
        #   header is absent or uses another scheme
        def extract_bearer_token(env)
          header = env[RACK_AUTH_HEADER]
          return nil unless header
          header[BEARER_PATTERN, 1]
        end

        # Login endpoint URI; a trailing slash on the configured app URL is
        # dropped so the path is not joined as "//api/...".
        def login_uri
          URI("#{@app_url.chomp("/")}#{LOGIN_PATH}")
        end

        # POST the token to the login endpoint.
        #
        # Fails closed: any transport or parse error, a non-200 status, or a
        # response that is not a JSON object yields nil (unauthenticated).
        #
        # @return [Hash, nil] Parsed login response
        def validate_token(token)
          uri = login_uri
          http = Net::HTTP.new(uri.host, uri.port)
          http.use_ssl = uri.scheme == "https"

          request = Net::HTTP::Post.new(uri)
          request["Content-Type"] = "application/json"
          request.body = JSON.dump({token: token})

          response = http.request(request)
          return nil unless response.code == "200"

          parsed = JSON.parse(response.body)
          # authenticate indexes the result by String key; anything but a Hash
          # (array, null, scalar) is treated as a failed validation.
          parsed.is_a?(Hash) ? parsed : nil
        rescue StandardError
          nil
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,217 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "json"
|
|
4
|
+
|
|
5
|
+
module Braintrust
  module Server
    module Handlers
      # POST /eval — adapter that maps HTTP request to Evaluator#run and streams SSE results.
      # Handles auth passthrough, datasets, remote scorers, project_id, and parent.
      class Eval
        # Upper bound on cached State objects (one per distinct auth context).
        STATE_CACHE_MAX_ENTRIES = 64
        # Keys of the request's "data" object that name a data source.
        DATA_SOURCE_KEYS = %w[data dataset_name dataset_id].freeze

        # @param evaluators [Hash] Evaluators keyed by the name clients send in "name"
        def initialize(evaluators)
          @evaluators = evaluators
          # Created eagerly: lazily creating the mutex inside build_state would
          # itself race when two requests arrive at once.
          @state_mutex = Mutex.new
          @state_cache = {}
        end

        # Rack entry point.
        #
        # @param env [Hash] Rack environment
        # @return [Array] Rack response triple; a JSON error for invalid requests,
        #   otherwise a text/event-stream body that runs the evaluation when streamed
        def call(env)
          body = parse_body(env)
          return error_response(400, "Invalid JSON body") unless body

          name = body["name"]
          return error_response(400, "Missing required field: name") unless name

          evaluator = @evaluators[name]
          return error_response(404, "Evaluator '#{name}' not found") unless evaluator

          data = body["data"]
          return error_response(400, "Missing required field: data") unless data
          return error_response(400, "Field 'data' must be an object") unless data.is_a?(Hash)

          # Validate exactly one data source
          data_sources = DATA_SOURCE_KEYS.count { |k| data.key?(k) }
          return error_response(400, "Exactly one data source required") if data_sources != 1
          if data.key?("data") && !data["data"].is_a?(Array)
            return error_response(400, "Field 'data.data' must be an array")
          end

          experiment_name = body["experiment_name"]

          # Resolve data source
          cases, dataset = resolve_data_source(data)

          # Resolve remote scorers from request
          remote_scorer_ids = resolve_remote_scorers(body["scores"])

          # Resolve parent span context
          parent = resolve_parent(body["parent"])

          # Build state from auth context (if present)
          state = build_state(env)

          # The protocol-rack adapter (used by Falcon and any server built on
          # protocol-http) buffers `each`-based bodies through an Enumerable path.
          # Detect it via the "protocol.http.request" env key it injects, and use
          # SSEStreamBody (call-only) so it dispatches through the Streaming path.
          body_class = env.key?("protocol.http.request") ? SSEStreamBody : SSEBody

          sse_body = body_class.new do |sse|
            run_opts = {
              on_progress: progress_callback(sse, name),
              quiet: true
            }
            run_opts[:parent] = parent if parent
            run_opts[:scorers] = remote_scorer_ids if remote_scorer_ids
            run_opts[:dataset] = dataset if dataset

            # Only pass project/experiment params when state is available
            if state
              run_opts[:state] = state
              run_opts[:experiment] = experiment_name if experiment_name
              run_opts[:project_id] = body["project_id"] if body["project_id"]
            end

            result = evaluator.run(cases, **run_opts)

            # Flush buffered OTLP spans before sending completion events.
            # The BatchSpanProcessor exports every ~5s; fast evals can finish
            # before a single export fires, causing the UI to see no results.
            Braintrust::Trace.flush_spans

            # Build summary from result scores
            averaged_scores = {}
            result.scorer_stats.each do |scorer_name, stats|
              averaged_scores[scorer_name] = stats.score_mean
            end

            sse.event("summary", JSON.dump({
              "scores" => averaged_scores,
              "experiment_name" => experiment_name,
              "experiment_id" => result.experiment_id,
              "project_id" => result.project_id
            }))

            sse.event("done", "")
          end

          [200, {"content-type" => "text/event-stream", "cache-control" => "no-cache", "connection" => "keep-alive"}, sse_body]
        end

        private

        # Build the on_progress callback that turns runner progress into
        # remote eval protocol events on the SSE stream.
        #
        # Runner provides: id, data/error, scores (optional), origin (optional).
        # Protocol requires: id, object_type, origin, name, format, output_type, event, data.
        def progress_callback(sse, name)
          ->(progress_data) {
            base = {
              "object_type" => "task",
              "name" => name,
              "format" => "code",
              "output_type" => "completion"
            }
            base["id"] = progress_data["id"] if progress_data["id"]
            base["origin"] = progress_data["origin"] if progress_data["origin"]

            if progress_data.key?("error")
              sse.event("progress", JSON.dump(base.merge("event" => "error", "data" => progress_data["error"])))
            else
              # The delta payload is itself a JSON document carried as a string.
              sse.event("progress", JSON.dump(base.merge("event" => "json_delta", "data" => JSON.dump(progress_data["data"]))))
            end

            # Signal per-cell completion so the UI exits "Streaming..." state
            # and updates the progress bar immediately.
            sse.event("progress", JSON.dump(base.merge("event" => "done", "data" => "")))
          }
        end

        # Resolve data source from the data field.
        # Returns [cases, dataset] where exactly one is non-nil.
        def resolve_data_source(data)
          if data.key?("data")
            cases = data["data"].map do |d|
              {input: d["input"], expected: d["expected"]}
            end
            [cases, nil]
          elsif data.key?("dataset_id")
            [nil, Braintrust::DatasetId.new(id: data["dataset_id"])]
          elsif data.key?("dataset_name")
            dataset_opts = {name: data["dataset_name"]}
            dataset_opts[:project] = data["project_name"] if data["project_name"]
            [nil, dataset_opts]
          else
            [nil, nil]
          end
        end

        # Map request scores array to ScorerId structs.
        # The UI sends function_id as a nested object: {"function_id": "uuid"}.
        def resolve_remote_scorers(scores)
          return nil if scores.nil? || scores.empty?
          scores.map do |s|
            func_id = s["function_id"]
            func_id = func_id["function_id"] if func_id.is_a?(Hash)
            Braintrust::ScorerId.new(
              function_id: func_id,
              version: s["version"]
            )
          end
        end

        # Map request parent to symbol-keyed Hash.
        # Hardcode playground_id to match Java SDK behavior.
        # Also extracts generation from propagated_event for span_attributes.
        def resolve_parent(parent)
          return nil unless parent.is_a?(Hash)
          object_id = parent["object_id"]
          return nil unless object_id

          generation = parent.dig("propagated_event", "span_attributes", "generation")

          result = {object_type: "playground_id", object_id: object_id}
          result[:generation] = generation if generation
          result
        end

        # Build State from auth context set by Auth middleware.
        # Returns nil when no auth context is present (e.g. NoAuth strategy).
        # Uses an LRU cache (max STATE_CACHE_MAX_ENTRIES) keyed by
        # [api_key, app_url, org_name].
        def build_state(env)
          auth = env["braintrust.auth"]
          return nil unless auth.is_a?(Hash)

          cache_key = [auth["api_key"], auth["app_url"], auth["org_name"]]

          @state_mutex.synchronize do
            # Hashes iterate in insertion order, so deleting and re-inserting a
            # hit moves it to the "most recently used" end.
            if (cached = @state_cache.delete(cache_key))
              @state_cache[cache_key] = cached
              next cached
            end

            state = Braintrust::State.new(
              api_key: auth["api_key"],
              org_id: auth["org_id"],
              org_name: auth["org_name"],
              app_url: auth["app_url"],
              api_url: auth["api_url"],
              enable_tracing: false
            )

            # Evict the least recently used entry (front of the Hash) if full.
            @state_cache.shift if @state_cache.size >= STATE_CACHE_MAX_ENTRIES

            @state_cache[cache_key] = state
          end
        end

        # Read and parse the request body.
        #
        # @return [Hash, nil] The parsed JSON object; nil when the body is
        #   missing, empty, malformed, or not a JSON object
        def parse_body(env)
          body = env["rack.input"]&.read
          return nil if body.nil? || body.empty?

          parsed = JSON.parse(body)
          # call indexes the payload by String key, so only objects are usable.
          parsed.is_a?(Hash) ? parsed : nil
        rescue JSON::ParserError
          nil
        end

        # Rack response carrying {"error" => message} as JSON.
        def error_response(status, message)
          [status, {"content-type" => "application/json"},
            [JSON.dump({"error" => message})]]
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "json"
|
|
4
|
+
|
|
5
|
+
module Braintrust
  module Server
    module Handlers
      # GET / — liveness probe that always answers 200 with {"status":"ok"}.
      class Health
        # @param _env [Hash] Rack environment (unused)
        # @return [Array] Rack response triple
        def call(_env)
          headers = {"content-type" => "application/json"}
          payload = JSON.generate("status" => "ok")
          [200, headers, [payload]]
        end
      end
    end
  end
end
|