braintrust 0.2.1 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +107 -10
- data/lib/braintrust/contrib/rails/server/application_controller.rb +34 -0
- data/lib/braintrust/contrib/rails/server/engine.rb +72 -0
- data/lib/braintrust/contrib/rails/server/eval_controller.rb +36 -0
- data/lib/braintrust/contrib/rails/server/generator.rb +43 -0
- data/lib/braintrust/contrib/rails/server/health_controller.rb +15 -0
- data/lib/braintrust/contrib/rails/server/list_controller.rb +16 -0
- data/lib/braintrust/contrib/rails/server/routes.rb +8 -0
- data/lib/braintrust/contrib/rails/server.rb +20 -0
- data/lib/braintrust/eval/runner.rb +80 -52
- data/lib/braintrust/scorer.rb +55 -4
- data/lib/braintrust/server/handlers/eval.rb +8 -168
- data/lib/braintrust/server/handlers/list.rb +3 -41
- data/lib/braintrust/server/rack.rb +2 -0
- data/lib/braintrust/server/services/eval_service.rb +214 -0
- data/lib/braintrust/server/services/list_service.rb +64 -0
- data/lib/braintrust/trace/span_processor.rb +0 -5
- data/lib/braintrust/version.rb +1 -1
- metadata +11 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: c07be3c454a924c5c97c2653136a2b9cdd1098409af16326b1db8676c5c8b0d2
|
|
4
|
+
data.tar.gz: c1eb75eefdcacebc2c955ae23aa3196d276a76d6ab828cdfb817c7e9168325b3
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: d02058bd5321ed16ea2f785aaeb24f4d4f105c5357c3c7ceb2a8a02c090b69c7187623b23e14d5026bb0cf236e64dddae7025509d7b2d6769bb50f110612120f
|
|
7
|
+
data.tar.gz: 15627209b382c023c2640e1d2219b6d33b84cb7c67ba1a3b8e3ebbe1aa912d3df832583a1e37b3831699b67ea81f3b4242b67a606dfdd727827e648a6509fea7
|
data/README.md
CHANGED
|
@@ -259,6 +259,8 @@ Braintrust::Eval.run(
|
|
|
259
259
|
)
|
|
260
260
|
```
|
|
261
261
|
|
|
262
|
+
See [eval.rb](./examples/eval.rb) for a full example.
|
|
263
|
+
|
|
262
264
|
### Datasets
|
|
263
265
|
|
|
264
266
|
Use test cases from a Braintrust dataset:
|
|
@@ -287,6 +289,8 @@ Braintrust::Eval.run(
|
|
|
287
289
|
)
|
|
288
290
|
```
|
|
289
291
|
|
|
292
|
+
See [dataset.rb](./examples/eval/dataset.rb) for a full example.
|
|
293
|
+
|
|
290
294
|
### Scorers
|
|
291
295
|
|
|
292
296
|
Use scoring functions defined in Braintrust:
|
|
@@ -315,6 +319,46 @@ Braintrust::Eval.run(
|
|
|
315
319
|
)
|
|
316
320
|
```
|
|
317
321
|
|
|
322
|
+
See [remote_functions.rb](./examples/eval/remote_functions.rb) for a full example.
|
|
323
|
+
|
|
324
|
+
#### Scorer metadata
|
|
325
|
+
|
|
326
|
+
Scorers can return a Hash with `:score` and `:metadata` to attach structured context to the score. The metadata is logged on the scorer's span and visible in the Braintrust UI for debugging and filtering:
|
|
327
|
+
|
|
328
|
+
```ruby
|
|
329
|
+
Braintrust::Scorer.new("translation") do |expected:, output:|
|
|
330
|
+
common_words = output.downcase.split & expected.downcase.split
|
|
331
|
+
overlap = common_words.size.to_f / expected.split.size
|
|
332
|
+
{
|
|
333
|
+
score: overlap,
|
|
334
|
+
metadata: {word_overlap: common_words.size, missing_words: expected.downcase.split - output.downcase.split}
|
|
335
|
+
}
|
|
336
|
+
end
|
|
337
|
+
```
|
|
338
|
+
|
|
339
|
+
See [scorer_metadata.rb](./examples/eval/scorer_metadata.rb) for a full example.
|
|
340
|
+
|
|
341
|
+
#### Multiple scores from one scorer
|
|
342
|
+
|
|
343
|
+
When several scores can be computed together (e.g. in one LLM call), you can return an `Array` of score `Hash`es instead of a single value. Each metric appears as a separate score column in the Braintrust UI:
|
|
344
|
+
|
|
345
|
+
```ruby
|
|
346
|
+
Braintrust::Scorer.new("summary_quality") do |output:, expected:|
|
|
347
|
+
words = output.downcase.split
|
|
348
|
+
key_terms = expected[:key_terms]
|
|
349
|
+
covered = key_terms.count { |t| words.include?(t) }
|
|
350
|
+
|
|
351
|
+
[
|
|
352
|
+
{name: "coverage", score: covered.to_f / key_terms.size, metadata: {missing: key_terms - words}},
|
|
353
|
+
{name: "conciseness", score: words.size <= expected[:max_words] ? 1.0 : 0.0}
|
|
354
|
+
]
|
|
355
|
+
end
|
|
356
|
+
```
|
|
357
|
+
|
|
358
|
+
`name` and `score` are required, `metadata` is optional.
|
|
359
|
+
|
|
360
|
+
See [multi_score.rb](./examples/eval/multi_score.rb) for a full example.
|
|
361
|
+
|
|
318
362
|
#### Trace scoring
|
|
319
363
|
|
|
320
364
|
Scorers can access the full evaluation trace (all spans generated by the task) by declaring a `trace:` keyword parameter. This is useful for inspecting intermediate LLM calls, validating tool usage, or checking the message thread:
|
|
@@ -344,11 +388,15 @@ Braintrust::Eval.run(
|
|
|
344
388
|
)
|
|
345
389
|
```
|
|
346
390
|
|
|
347
|
-
See
|
|
391
|
+
See [trace_scoring.rb](./examples/eval/trace_scoring.rb) for a full example.
|
|
348
392
|
|
|
349
393
|
### Dev Server
|
|
350
394
|
|
|
351
|
-
Run evaluations from the Braintrust web UI against code in your own application.
|
|
395
|
+
Run evaluations from the Braintrust web UI against code in your own application.
|
|
396
|
+
|
|
397
|
+
#### Run as a Rack app
|
|
398
|
+
|
|
399
|
+
Define evaluators, pass them to the dev server, and start serving:
|
|
352
400
|
|
|
353
401
|
```ruby
|
|
354
402
|
# eval_server.ru
|
|
@@ -374,10 +422,21 @@ run Braintrust::Server::Rack.app(
|
|
|
374
422
|
)
|
|
375
423
|
```
|
|
376
424
|
|
|
425
|
+
Add your Rack server to your Gemfile:
|
|
426
|
+
|
|
427
|
+
```ruby
|
|
428
|
+
gem "rack"
|
|
429
|
+
gem "puma" # recommended
|
|
430
|
+
```
|
|
431
|
+
|
|
432
|
+
Then start the server:
|
|
433
|
+
|
|
377
434
|
```bash
|
|
378
435
|
bundle exec rackup eval_server.ru -p 8300 -o 0.0.0.0
|
|
379
436
|
```
|
|
380
437
|
|
|
438
|
+
See example: [server/eval.ru](./examples/server/eval.ru)
|
|
439
|
+
|
|
381
440
|
**Custom evaluators**
|
|
382
441
|
|
|
383
442
|
Evaluators can also be defined as subclasses:
|
|
@@ -394,6 +453,51 @@ class FoodClassifier < Braintrust::Eval::Evaluator
|
|
|
394
453
|
end
|
|
395
454
|
```
|
|
396
455
|
|
|
456
|
+
#### Run as a Rails engine
|
|
457
|
+
|
|
458
|
+
Use the Rails engine when your evaluators live inside an existing Rails app and you want to mount the Braintrust eval server into that application.
|
|
459
|
+
|
|
460
|
+
Define each evaluator in its own file, for example under `app/evaluators/`:
|
|
461
|
+
|
|
462
|
+
```ruby
|
|
463
|
+
# app/evaluators/food_classifier.rb
|
|
464
|
+
class FoodClassifier < Braintrust::Eval::Evaluator
|
|
465
|
+
def task
|
|
466
|
+
->(input:) { classify(input) }
|
|
467
|
+
end
|
|
468
|
+
|
|
469
|
+
def scorers
|
|
470
|
+
[Braintrust::Scorer.new("exact_match") { |expected:, output:| output == expected ? 1.0 : 0.0 }]
|
|
471
|
+
end
|
|
472
|
+
end
|
|
473
|
+
```
|
|
474
|
+
|
|
475
|
+
Then generate the Braintrust initializer:
|
|
476
|
+
|
|
477
|
+
```bash
|
|
478
|
+
bin/rails generate braintrust:server
|
|
479
|
+
```
|
|
480
|
+
|
|
481
|
+
```ruby
|
|
482
|
+
# config/routes.rb
|
|
483
|
+
Rails.application.routes.draw do
|
|
484
|
+
mount Braintrust::Contrib::Rails::Server::Engine, at: "/braintrust"
|
|
485
|
+
end
|
|
486
|
+
```
|
|
487
|
+
|
|
488
|
+
The generator writes `config/initializers/braintrust_server.rb`, where you can review or customize the slug-to-evaluator mapping it discovers from `app/evaluators/**/*.rb` and `evaluators/**/*.rb`.
|
|
489
|
+
|
|
490
|
+
See example: [contrib/rails/eval.rb](./examples/contrib/rails/eval.rb)
|
|
491
|
+
|
|
492
|
+
**Developing locally**
|
|
493
|
+
|
|
494
|
+
If you want to skip authentication on incoming eval requests while developing locally:
|
|
495
|
+
|
|
496
|
+
- **For Rack**: Pass `auth: :none` to `Braintrust::Server::Rack.app(...)`
|
|
497
|
+
- **For Rails**: Set `config.auth = :none` in `config/initializers/braintrust_server.rb`
|
|
498
|
+
|
|
499
|
+
*NOTE: Setting `:none` only disables authentication on incoming requests to your server; executing evals still requires a `BRAINTRUST_API_KEY` to fetch resources.*
|
|
500
|
+
|
|
397
501
|
**Supported web servers**
|
|
398
502
|
|
|
399
503
|
The dev server requires the `rack` gem and a Rack-compatible web server.
|
|
@@ -405,14 +509,7 @@ The dev server requires the `rack` gem and a Rack-compatible web server.
|
|
|
405
509
|
| [Passenger](https://www.phusionpassenger.com/) | 6.x | |
|
|
406
510
|
| [WEBrick](https://github.com/ruby/webrick) | Not supported | Does not support server-sent events. |
|
|
407
511
|
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
```ruby
|
|
411
|
-
gem "rack"
|
|
412
|
-
gem "puma" # recommended
|
|
413
|
-
```
|
|
414
|
-
|
|
415
|
-
See example: [server/eval.ru](./examples/server/eval.ru)
|
|
512
|
+
See examples: [server/eval.ru](./examples/server/eval.ru), [contrib/rails/eval.rb](./examples/contrib/rails/eval.rb)
|
|
416
513
|
|
|
417
514
|
## Documentation
|
|
418
515
|
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Braintrust
|
|
4
|
+
module Contrib
|
|
5
|
+
module Rails
|
|
6
|
+
module Server
|
|
7
|
+
class ApplicationController < ActionController::API
|
|
8
|
+
before_action :authenticate!
|
|
9
|
+
|
|
10
|
+
private
|
|
11
|
+
|
|
12
|
+
def authenticate!
|
|
13
|
+
auth_result = Engine.auth_strategy.authenticate(request.env)
|
|
14
|
+
unless auth_result
|
|
15
|
+
render json: {"error" => "Unauthorized"}, status: :unauthorized
|
|
16
|
+
return
|
|
17
|
+
end
|
|
18
|
+
|
|
19
|
+
request.env["braintrust.auth"] = auth_result
|
|
20
|
+
@braintrust_auth = auth_result
|
|
21
|
+
end
|
|
22
|
+
|
|
23
|
+
def parse_json_body
|
|
24
|
+
body = request.body.read
|
|
25
|
+
return nil if body.nil? || body.empty?
|
|
26
|
+
JSON.parse(body)
|
|
27
|
+
rescue JSON::ParserError
|
|
28
|
+
nil
|
|
29
|
+
end
|
|
30
|
+
end
|
|
31
|
+
end
|
|
32
|
+
end
|
|
33
|
+
end
|
|
34
|
+
end
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Braintrust
|
|
4
|
+
module Contrib
|
|
5
|
+
module Rails
|
|
6
|
+
module Server
|
|
7
|
+
class Engine < ::Rails::Engine
|
|
8
|
+
isolate_namespace Braintrust::Contrib::Rails::Server
|
|
9
|
+
|
|
10
|
+
config.evaluators = {}
|
|
11
|
+
config.auth = :clerk_token
|
|
12
|
+
|
|
13
|
+
# Register the engine's routes file so Rails loads it during initialization.
|
|
14
|
+
paths["config/routes.rb"] << File.expand_path("routes.rb", __dir__)
|
|
15
|
+
|
|
16
|
+
initializer "braintrust.server.cors" do |app|
|
|
17
|
+
app.middleware.use Braintrust::Server::Middleware::Cors
|
|
18
|
+
end
|
|
19
|
+
|
|
20
|
+
# Class-level helpers that read from engine config.
|
|
21
|
+
|
|
22
|
+
def self.evaluators
|
|
23
|
+
config.evaluators
|
|
24
|
+
end
|
|
25
|
+
|
|
26
|
+
def self.auth_strategy
|
|
27
|
+
resolve_auth(config.auth)
|
|
28
|
+
end
|
|
29
|
+
|
|
30
|
+
def self.list_service
|
|
31
|
+
Braintrust::Server::Services::List.new(-> { config.evaluators })
|
|
32
|
+
end
|
|
33
|
+
|
|
34
|
+
# Long-lived so the state cache persists across requests.
|
|
35
|
+
def self.eval_service
|
|
36
|
+
@eval_service ||= Braintrust::Server::Services::Eval.new(-> { config.evaluators })
|
|
37
|
+
end
|
|
38
|
+
|
|
39
|
+
# Support the explicit `|config|` style used by this integration while
|
|
40
|
+
# still delegating zero-arity DSL blocks to Rails' native implementation.
|
|
41
|
+
def self.configure(&block)
|
|
42
|
+
return super if block&.arity == 0
|
|
43
|
+
yield config if block
|
|
44
|
+
end
|
|
45
|
+
|
|
46
|
+
def self.resolve_auth(auth)
|
|
47
|
+
case auth
|
|
48
|
+
when :none
|
|
49
|
+
Braintrust::Server::Auth::NoAuth.new
|
|
50
|
+
when :clerk_token
|
|
51
|
+
Braintrust::Server::Auth::ClerkToken.new
|
|
52
|
+
when Symbol, String
|
|
53
|
+
raise ArgumentError, "Unknown auth strategy #{auth.inspect}. Expected :none, :clerk_token, or an auth object."
|
|
54
|
+
else
|
|
55
|
+
auth
|
|
56
|
+
end
|
|
57
|
+
end
|
|
58
|
+
private_class_method :resolve_auth
|
|
59
|
+
|
|
60
|
+
generators do
|
|
61
|
+
require "braintrust/contrib/rails/server/generator"
|
|
62
|
+
end
|
|
63
|
+
end
|
|
64
|
+
end
|
|
65
|
+
end
|
|
66
|
+
end
|
|
67
|
+
end
|
|
68
|
+
|
|
69
|
+
require_relative "application_controller"
|
|
70
|
+
require_relative "health_controller"
|
|
71
|
+
require_relative "list_controller"
|
|
72
|
+
require_relative "eval_controller"
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Braintrust
|
|
4
|
+
module Contrib
|
|
5
|
+
module Rails
|
|
6
|
+
module Server
|
|
7
|
+
class EvalController < ApplicationController
|
|
8
|
+
include ActionController::Live
|
|
9
|
+
|
|
10
|
+
def create
|
|
11
|
+
body = parse_json_body
|
|
12
|
+
unless body
|
|
13
|
+
render json: {"error" => "Invalid JSON body"}, status: :bad_request
|
|
14
|
+
return
|
|
15
|
+
end
|
|
16
|
+
|
|
17
|
+
result = Engine.eval_service.validate(body)
|
|
18
|
+
if result[:error]
|
|
19
|
+
render json: {"error" => result[:error]}, status: result[:status]
|
|
20
|
+
return
|
|
21
|
+
end
|
|
22
|
+
|
|
23
|
+
response.headers["Content-Type"] = "text/event-stream"
|
|
24
|
+
response.headers["Cache-Control"] = "no-cache"
|
|
25
|
+
response.headers["Connection"] = "keep-alive"
|
|
26
|
+
|
|
27
|
+
sse = Braintrust::Server::SSEWriter.new { |chunk| response.stream.write(chunk) }
|
|
28
|
+
Engine.eval_service.stream(result, auth: @braintrust_auth, sse: sse)
|
|
29
|
+
ensure
|
|
30
|
+
response.stream.close
|
|
31
|
+
end
|
|
32
|
+
end
|
|
33
|
+
end
|
|
34
|
+
end
|
|
35
|
+
end
|
|
36
|
+
end
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "rails/generators"
|
|
4
|
+
|
|
5
|
+
module Braintrust
|
|
6
|
+
module Contrib
|
|
7
|
+
module Rails
|
|
8
|
+
module Server
|
|
9
|
+
module Generators
|
|
10
|
+
class ServerGenerator < ::Rails::Generators::Base
|
|
11
|
+
namespace "braintrust:server"
|
|
12
|
+
source_root File.expand_path("templates", __dir__)
|
|
13
|
+
|
|
14
|
+
def create_initializer
|
|
15
|
+
@evaluators = discovered_evaluators
|
|
16
|
+
template "initializer.rb.tt", "config/initializers/braintrust_server.rb"
|
|
17
|
+
end
|
|
18
|
+
|
|
19
|
+
private
|
|
20
|
+
|
|
21
|
+
def discovered_evaluators
|
|
22
|
+
evaluator_roots.flat_map do |root|
|
|
23
|
+
Dir[File.join(destination_root, root, "**/*.rb")].sort.map do |file|
|
|
24
|
+
relative_path = file.delete_prefix("#{File.join(destination_root, root)}/").sub(/\.rb\z/, "")
|
|
25
|
+
{
|
|
26
|
+
class_name: relative_path.split("/").map(&:camelize).join("::"),
|
|
27
|
+
slug: relative_path.tr("/", "-").tr("_", "-")
|
|
28
|
+
}
|
|
29
|
+
end
|
|
30
|
+
end
|
|
31
|
+
end
|
|
32
|
+
|
|
33
|
+
def evaluator_roots
|
|
34
|
+
%w[app/evaluators evaluators].select do |root|
|
|
35
|
+
Dir.exist?(File.join(destination_root, root))
|
|
36
|
+
end
|
|
37
|
+
end
|
|
38
|
+
end
|
|
39
|
+
end
|
|
40
|
+
end
|
|
41
|
+
end
|
|
42
|
+
end
|
|
43
|
+
end
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Braintrust
|
|
4
|
+
module Contrib
|
|
5
|
+
module Rails
|
|
6
|
+
module Server
|
|
7
|
+
class ListController < ApplicationController
|
|
8
|
+
def show
|
|
9
|
+
result = Engine.list_service.call
|
|
10
|
+
render json: result
|
|
11
|
+
end
|
|
12
|
+
end
|
|
13
|
+
end
|
|
14
|
+
end
|
|
15
|
+
end
|
|
16
|
+
end
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
begin
|
|
4
|
+
require "action_controller"
|
|
5
|
+
require "rails/engine"
|
|
6
|
+
rescue LoadError
|
|
7
|
+
raise LoadError,
|
|
8
|
+
"Rails (actionpack + railties) is required for the Braintrust Rails server engine. " \
|
|
9
|
+
"Add `gem 'rails'` or `gem 'actionpack'` and `gem 'railties'` to your Gemfile."
|
|
10
|
+
end
|
|
11
|
+
|
|
12
|
+
require "json"
|
|
13
|
+
require_relative "../../eval"
|
|
14
|
+
require_relative "../../server/sse"
|
|
15
|
+
require_relative "../../server/auth/no_auth"
|
|
16
|
+
require_relative "../../server/auth/clerk_token"
|
|
17
|
+
require_relative "../../server/middleware/cors"
|
|
18
|
+
require_relative "../../server/services/list_service"
|
|
19
|
+
require_relative "../../server/services/eval_service"
|
|
20
|
+
require_relative "server/engine"
|
|
@@ -82,11 +82,17 @@ module Braintrust
|
|
|
82
82
|
# @param case_context [CaseContext] The per-case accumulator
|
|
83
83
|
# @param errors [Queue] Thread-safe error collection queue
|
|
84
84
|
def run_eval_case(case_context, errors)
|
|
85
|
-
|
|
85
|
+
# Each eval case starts its own trace — detach from any ambient span context
|
|
86
|
+
eval_span = tracer.start_root_span("eval")
|
|
87
|
+
OpenTelemetry::Trace.with_span(eval_span) do
|
|
88
|
+
# Set attributes known before task execution
|
|
86
89
|
eval_span.set_attribute("braintrust.parent", eval_context.parent_span_attr) if eval_context.parent_span_attr
|
|
87
|
-
|
|
88
|
-
|
|
90
|
+
set_json_attr(eval_span, "braintrust.span_attributes", build_span_attributes("eval"))
|
|
91
|
+
set_json_attr(eval_span, "braintrust.input_json", {input: case_context.input})
|
|
92
|
+
set_json_attr(eval_span, "braintrust.expected", case_context.expected) if case_context.expected
|
|
93
|
+
set_json_attr(eval_span, "braintrust.metadata", case_context.metadata) if case_context.metadata
|
|
89
94
|
eval_span.set_attribute("braintrust.tags", case_context.tags) if case_context.tags
|
|
95
|
+
eval_span.set_attribute("braintrust.origin", case_context.origin) if case_context.origin
|
|
90
96
|
|
|
91
97
|
# Run task
|
|
92
98
|
begin
|
|
@@ -94,6 +100,7 @@ module Braintrust
|
|
|
94
100
|
rescue => e
|
|
95
101
|
# Error already recorded on task span, set eval span status
|
|
96
102
|
eval_span.status = OpenTelemetry::Trace::Status.error(e.message)
|
|
103
|
+
set_json_attr(eval_span, "braintrust.output_json", {output: nil})
|
|
97
104
|
errors << "Task failed for input '#{case_context.input}': #{e.message}"
|
|
98
105
|
report_progress(eval_span, case_context, error: e.message)
|
|
99
106
|
next
|
|
@@ -104,26 +111,21 @@ module Braintrust
|
|
|
104
111
|
case_context.trace = build_trace(eval_span)
|
|
105
112
|
|
|
106
113
|
# Run scorers
|
|
107
|
-
case_scores = nil
|
|
108
114
|
begin
|
|
109
|
-
|
|
115
|
+
run_scorers(case_context)
|
|
110
116
|
rescue => e
|
|
111
117
|
# Error already recorded on score span, set eval span status
|
|
112
118
|
eval_span.status = OpenTelemetry::Trace::Status.error(e.message)
|
|
113
119
|
errors << "Scorers failed for input '#{case_context.input}': #{e.message}"
|
|
114
120
|
end
|
|
115
121
|
|
|
116
|
-
# Set
|
|
117
|
-
set_json_attr(eval_span, "braintrust.
|
|
118
|
-
set_json_attr(eval_span, "braintrust.input_json", case_context.input)
|
|
119
|
-
set_json_attr(eval_span, "braintrust.output_json", case_context.output)
|
|
120
|
-
set_json_attr(eval_span, "braintrust.expected", case_context.expected) if case_context.expected
|
|
122
|
+
# Set output after task completes
|
|
123
|
+
set_json_attr(eval_span, "braintrust.output_json", {output: case_context.output})
|
|
121
124
|
|
|
122
|
-
|
|
123
|
-
eval_span.set_attribute("braintrust.origin", case_context.origin) if case_context.origin
|
|
124
|
-
|
|
125
|
-
report_progress(eval_span, case_context, data: case_context.output, scores: case_scores || {})
|
|
125
|
+
report_progress(eval_span, case_context, data: case_context.output)
|
|
126
126
|
end
|
|
127
|
+
ensure
|
|
128
|
+
eval_span&.finish
|
|
127
129
|
end
|
|
128
130
|
|
|
129
131
|
# Run task with OpenTelemetry tracing
|
|
@@ -151,43 +153,62 @@ module Braintrust
|
|
|
151
153
|
end
|
|
152
154
|
end
|
|
153
155
|
|
|
154
|
-
# Run scorers with OpenTelemetry tracing
|
|
155
|
-
# Creates
|
|
156
|
+
# Run scorers with OpenTelemetry tracing.
|
|
157
|
+
# Creates one span per scorer, each a direct child of the current (eval) span.
|
|
156
158
|
# @param case_context [CaseContext] The per-case context (output must be populated)
|
|
157
|
-
# @return [Hash] Scores hash { scorer_name => score_value }
|
|
158
159
|
def run_scorers(case_context)
|
|
159
|
-
|
|
160
|
+
scorer_kwargs = {
|
|
161
|
+
input: case_context.input,
|
|
162
|
+
expected: case_context.expected,
|
|
163
|
+
output: case_context.output,
|
|
164
|
+
metadata: case_context.metadata || {},
|
|
165
|
+
trace: case_context.trace
|
|
166
|
+
}
|
|
167
|
+
scorer_input = {
|
|
168
|
+
input: case_context.input,
|
|
169
|
+
expected: case_context.expected,
|
|
170
|
+
output: case_context.output,
|
|
171
|
+
metadata: case_context.metadata || {}
|
|
172
|
+
}
|
|
173
|
+
|
|
174
|
+
scorer_error = nil
|
|
175
|
+
eval_context.scorers.each do |scorer|
|
|
176
|
+
collect_scores(run_scorer(scorer, scorer_kwargs, scorer_input))
|
|
177
|
+
rescue => e
|
|
178
|
+
scorer_error ||= e
|
|
179
|
+
end
|
|
180
|
+
|
|
181
|
+
raise scorer_error if scorer_error
|
|
182
|
+
end
|
|
183
|
+
|
|
184
|
+
# Run a single scorer inside its own span.
|
|
185
|
+
# @param scorer [Scorer] The scorer to run
|
|
186
|
+
# @param scorer_kwargs [Hash] Keyword arguments for the scorer
|
|
187
|
+
# @param scorer_input [Hash] Input to log on the span
|
|
188
|
+
# @return [Array<Hash>] Raw score results from the scorer
|
|
189
|
+
def run_scorer(scorer, scorer_kwargs, scorer_input)
|
|
190
|
+
tracer.in_span(scorer.name) do |score_span|
|
|
160
191
|
score_span.set_attribute("braintrust.parent", eval_context.parent_span_attr) if eval_context.parent_span_attr
|
|
161
|
-
set_json_attr(score_span, "braintrust.span_attributes",
|
|
162
|
-
|
|
163
|
-
scorer_kwargs = {
|
|
164
|
-
input: case_context.input,
|
|
165
|
-
expected: case_context.expected,
|
|
166
|
-
output: case_context.output,
|
|
167
|
-
metadata: case_context.metadata || {},
|
|
168
|
-
trace: case_context.trace
|
|
169
|
-
}
|
|
170
|
-
scores = {}
|
|
171
|
-
scorer_error = nil
|
|
172
|
-
eval_context.scorers.each do |scorer|
|
|
173
|
-
score_value = scorer.call(**scorer_kwargs)
|
|
174
|
-
scores[scorer.name] = score_value
|
|
175
|
-
|
|
176
|
-
# Collect raw score for summary (thread-safe)
|
|
177
|
-
collect_score(scorer.name, score_value)
|
|
178
|
-
rescue => e
|
|
179
|
-
# Record first error but continue processing other scorers
|
|
180
|
-
scorer_error ||= e
|
|
181
|
-
record_span_error(score_span, e, "ScorerError")
|
|
182
|
-
end
|
|
192
|
+
set_json_attr(score_span, "braintrust.span_attributes", build_scorer_span_attributes(scorer.name))
|
|
193
|
+
set_json_attr(score_span, "braintrust.input_json", scorer_input)
|
|
183
194
|
|
|
184
|
-
|
|
185
|
-
set_json_attr(score_span, "braintrust.scores", scores)
|
|
195
|
+
score_results = scorer.call(**scorer_kwargs)
|
|
186
196
|
|
|
187
|
-
|
|
188
|
-
|
|
197
|
+
scorer_scores = {}
|
|
198
|
+
scorer_metadata = {}
|
|
199
|
+
score_results.each do |s|
|
|
200
|
+
scorer_scores[s[:name]] = s[:score]
|
|
201
|
+
scorer_metadata[s[:name]] = s[:metadata] if s[:metadata].is_a?(Hash)
|
|
202
|
+
end
|
|
203
|
+
|
|
204
|
+
set_json_attr(score_span, "braintrust.output_json", scorer_scores)
|
|
205
|
+
set_json_attr(score_span, "braintrust.scores", scorer_scores)
|
|
206
|
+
set_json_attr(score_span, "braintrust.metadata", scorer_metadata) unless scorer_metadata.empty?
|
|
189
207
|
|
|
190
|
-
|
|
208
|
+
score_results
|
|
209
|
+
rescue => e
|
|
210
|
+
record_span_error(score_span, e, "ScorerError")
|
|
211
|
+
raise
|
|
191
212
|
end
|
|
192
213
|
end
|
|
193
214
|
|
|
@@ -255,6 +276,16 @@ module Braintrust
|
|
|
255
276
|
attrs
|
|
256
277
|
end
|
|
257
278
|
|
|
279
|
+
# Build span_attributes for a scorer span.
|
|
280
|
+
# Each scorer gets its own span with type "score", purpose "scorer", and the scorer's name.
|
|
281
|
+
# @param scorer_name [String] The scorer name
|
|
282
|
+
# @return [Hash]
|
|
283
|
+
def build_scorer_span_attributes(scorer_name)
|
|
284
|
+
attrs = {type: "score", name: scorer_name, purpose: "scorer"}
|
|
285
|
+
attrs[:generation] = eval_context.generation if eval_context.generation
|
|
286
|
+
attrs
|
|
287
|
+
end
|
|
288
|
+
|
|
258
289
|
# Set a span attribute by JSON encoding the value
|
|
259
290
|
# @param span [OpenTelemetry::Trace::Span] The span
|
|
260
291
|
# @param key [String] The attribute key
|
|
@@ -263,14 +294,11 @@ module Braintrust
|
|
|
263
294
|
span.set_attribute(key, JSON.dump(value))
|
|
264
295
|
end
|
|
265
296
|
|
|
266
|
-
# Collect
|
|
267
|
-
# @param
|
|
268
|
-
|
|
269
|
-
def collect_score(name, value)
|
|
270
|
-
return unless value.is_a?(Numeric)
|
|
271
|
-
|
|
297
|
+
# Collect score results into the summary accumulator (thread-safe).
|
|
298
|
+
# @param score_results [Array<Hash>] Score results from a scorer
|
|
299
|
+
def collect_scores(score_results)
|
|
272
300
|
@score_mutex.synchronize do
|
|
273
|
-
(@scores[name] ||= []) <<
|
|
301
|
+
score_results.each { |s| (@scores[s[:name]] ||= []) << s[:score] }
|
|
274
302
|
end
|
|
275
303
|
end
|
|
276
304
|
end
|
data/lib/braintrust/scorer.rb
CHANGED
|
@@ -40,12 +40,52 @@ module Braintrust
|
|
|
40
40
|
Block.new(name: name || DEFAULT_NAME, &block)
|
|
41
41
|
end
|
|
42
42
|
|
|
43
|
-
# Included into classes that +include Scorer+. Prepends KeywordFilter
|
|
44
|
-
# so #call receives only
|
|
43
|
+
# Included into classes that +include Scorer+. Prepends KeywordFilter and
|
|
44
|
+
# ResultNormalizer so #call receives only declared kwargs and always returns
|
|
45
|
+
# Array<Hash>. Also provides a default #name and #call_parameters.
|
|
45
46
|
module Callable
|
|
47
|
+
# Normalizes the raw return value of #call into Array<Hash>.
|
|
48
|
+
# Nested inside Callable because it depends on #name which Callable provides.
|
|
49
|
+
module ResultNormalizer
|
|
50
|
+
# @return [Array<Hash>] normalized score hashes with :score, :metadata, :name keys
|
|
51
|
+
def call(**kwargs)
|
|
52
|
+
normalize_score_result(super)
|
|
53
|
+
end
|
|
54
|
+
|
|
55
|
+
private
|
|
56
|
+
|
|
57
|
+
# @param result [Numeric, Hash, Array<Hash>] raw return value from #call
|
|
58
|
+
# @return [Array<Hash>] one or more score hashes with :score, :metadata, :name keys
|
|
59
|
+
# @raise [ArgumentError] if any score value is not Numeric
|
|
60
|
+
def normalize_score_result(result)
|
|
61
|
+
case result
|
|
62
|
+
when Array then result.map { |item| normalize_score_item(item) }
|
|
63
|
+
when Hash then [normalize_score_item(result)]
|
|
64
|
+
else
|
|
65
|
+
raise ArgumentError, "#{name}: score must be Numeric, got #{result.inspect}" unless result.is_a?(Numeric)
|
|
66
|
+
[{score: result, metadata: nil, name: name}]
|
|
67
|
+
end
|
|
68
|
+
end
|
|
69
|
+
|
|
70
|
+
# Fills in missing :name from the scorer and validates :score.
|
|
71
|
+
# @param item [Hash] a score hash with at least a :score key
|
|
72
|
+
# @return [Hash] the same hash with :name set
|
|
73
|
+
# @raise [ArgumentError] if :score is not Numeric
|
|
74
|
+
def normalize_score_item(item)
|
|
75
|
+
item[:name] ||= name
|
|
76
|
+
raise ArgumentError, "#{item[:name]}: score must be Numeric, got #{item[:score].inspect}" unless item[:score].is_a?(Numeric)
|
|
77
|
+
item
|
|
78
|
+
end
|
|
79
|
+
end
|
|
80
|
+
|
|
81
|
+
# Infrastructure modules prepended onto every scorer class.
|
|
82
|
+
# Used both to set up the ancestor chain and to skip past them in
|
|
83
|
+
# #call_parameters so KeywordFilter sees the real call signature.
|
|
84
|
+
PREPENDED = [Internal::Callable::KeywordFilter, ResultNormalizer].freeze
|
|
85
|
+
|
|
46
86
|
# @param base [Class] the class including Callable
|
|
47
87
|
def self.included(base)
|
|
48
|
-
base.prepend(
|
|
88
|
+
PREPENDED.each { |mod| base.prepend(mod) }
|
|
49
89
|
end
|
|
50
90
|
|
|
51
91
|
# Default name derived from the class name (e.g. FuzzyMatch -> "fuzzy_match").
|
|
@@ -55,6 +95,17 @@ module Braintrust
|
|
|
55
95
|
return Scorer::DEFAULT_NAME unless klass
|
|
56
96
|
klass.gsub(/([a-z])([A-Z])/, '\1_\2').downcase
|
|
57
97
|
end
|
|
98
|
+
|
|
99
|
+
# Provides KeywordFilter with the actual call signature of the subclass.
|
|
100
|
+
# Walks past PREPENDED modules in the ancestor chain so that user-defined
|
|
101
|
+
# #call keyword params are correctly introspected.
|
|
102
|
+
# Block overrides this to point directly at @block.parameters.
|
|
103
|
+
# @return [Array<Array>] parameter list
|
|
104
|
+
def call_parameters
|
|
105
|
+
meth = method(:call)
|
|
106
|
+
meth = meth.super_method while meth.super_method && PREPENDED.include?(meth.owner)
|
|
107
|
+
meth.parameters
|
|
108
|
+
end
|
|
58
109
|
end
|
|
59
110
|
|
|
60
111
|
# Block-based scorer. Stores a Proc and delegates #call to it.
|
|
@@ -75,7 +126,7 @@ module Braintrust
|
|
|
75
126
|
end
|
|
76
127
|
|
|
77
128
|
# @param kwargs [Hash] keyword arguments (filtered by KeywordFilter)
|
|
78
|
-
# @return [
|
|
129
|
+
# @return [Array<Hash>] normalized score results
|
|
79
130
|
def call(**kwargs)
|
|
80
131
|
@block.call(**kwargs)
|
|
81
132
|
end
|
|
@@ -10,38 +10,15 @@ module Braintrust
|
|
|
10
10
|
class Eval
|
|
11
11
|
def initialize(evaluators)
|
|
12
12
|
@evaluators = evaluators
|
|
13
|
+
@service = Services::Eval.new(evaluators)
|
|
13
14
|
end
|
|
14
15
|
|
|
15
16
|
def call(env)
|
|
16
17
|
body = parse_body(env)
|
|
17
18
|
return error_response(400, "Invalid JSON body") unless body
|
|
18
19
|
|
|
19
|
-
|
|
20
|
-
return error_response(
|
|
21
|
-
|
|
22
|
-
evaluator = @evaluators[name]
|
|
23
|
-
return error_response(404, "Evaluator '#{name}' not found") unless evaluator
|
|
24
|
-
|
|
25
|
-
data = body["data"]
|
|
26
|
-
return error_response(400, "Missing required field: data") unless data
|
|
27
|
-
|
|
28
|
-
# Validate exactly one data source
|
|
29
|
-
data_sources = ["data", "dataset_name", "dataset_id"].count { |k| data.key?(k) }
|
|
30
|
-
return error_response(400, "Exactly one data source required") if data_sources != 1
|
|
31
|
-
|
|
32
|
-
experiment_name = body["experiment_name"]
|
|
33
|
-
|
|
34
|
-
# Resolve data source
|
|
35
|
-
cases, dataset = resolve_data_source(data)
|
|
36
|
-
|
|
37
|
-
# Resolve remote scorers from request
|
|
38
|
-
remote_scorer_ids = resolve_remote_scorers(body["scores"])
|
|
39
|
-
|
|
40
|
-
# Resolve parent span context
|
|
41
|
-
parent = resolve_parent(body["parent"])
|
|
42
|
-
|
|
43
|
-
# Build state from auth context (if present)
|
|
44
|
-
state = build_state(env)
|
|
20
|
+
result = @service.validate(body)
|
|
21
|
+
return error_response(result[:status], result[:error]) if result[:error]
|
|
45
22
|
|
|
46
23
|
# The protocol-rack adapter (used by Falcon and any server built on
|
|
47
24
|
# protocol-http) buffers `each`-based bodies through an Enumerable path.
|
|
@@ -50,64 +27,7 @@ module Braintrust
|
|
|
50
27
|
body_class = env.key?("protocol.http.request") ? SSEStreamBody : SSEBody
|
|
51
28
|
|
|
52
29
|
sse_body = body_class.new do |sse|
|
|
53
|
-
|
|
54
|
-
run_opts = {
|
|
55
|
-
on_progress: ->(progress_data) {
|
|
56
|
-
# Build remote eval protocol events from generic progress data.
|
|
57
|
-
# Runner provides: id, data/error, scores (optional), origin (optional).
|
|
58
|
-
# Protocol requires: id, object_type, origin, name, format, output_type, event, data.
|
|
59
|
-
base = {
|
|
60
|
-
"object_type" => "task",
|
|
61
|
-
"name" => name,
|
|
62
|
-
"format" => "code",
|
|
63
|
-
"output_type" => "completion"
|
|
64
|
-
}
|
|
65
|
-
base["id"] = progress_data["id"] if progress_data["id"]
|
|
66
|
-
base["origin"] = progress_data["origin"] if progress_data["origin"]
|
|
67
|
-
|
|
68
|
-
if progress_data.key?("error")
|
|
69
|
-
sse.event("progress", JSON.dump(base.merge("event" => "error", "data" => progress_data["error"])))
|
|
70
|
-
else
|
|
71
|
-
sse.event("progress", JSON.dump(base.merge("event" => "json_delta", "data" => JSON.dump(progress_data["data"]))))
|
|
72
|
-
end
|
|
73
|
-
|
|
74
|
-
# Signal per-cell completion so the UI exits "Streaming..." state
|
|
75
|
-
# and updates the progress bar immediately.
|
|
76
|
-
sse.event("progress", JSON.dump(base.merge("event" => "done", "data" => "")))
|
|
77
|
-
},
|
|
78
|
-
quiet: true
|
|
79
|
-
}
|
|
80
|
-
run_opts[:parent] = parent if parent
|
|
81
|
-
run_opts[:scorers] = remote_scorer_ids if remote_scorer_ids
|
|
82
|
-
run_opts[:dataset] = dataset if dataset
|
|
83
|
-
|
|
84
|
-
if state
|
|
85
|
-
run_opts[:state] = state
|
|
86
|
-
run_opts[:experiment] = experiment_name if experiment_name
|
|
87
|
-
run_opts[:project_id] = body["project_id"] if body["project_id"]
|
|
88
|
-
end
|
|
89
|
-
|
|
90
|
-
result = evaluator.run(cases, **run_opts)
|
|
91
|
-
|
|
92
|
-
# Flush buffered OTLP spans before sending completion events.
|
|
93
|
-
# The BatchSpanProcessor exports every ~5s; fast evals can finish
|
|
94
|
-
# before a single export fires, causing the UI to see no results.
|
|
95
|
-
Braintrust::Trace.flush_spans
|
|
96
|
-
|
|
97
|
-
# Build summary from result scores
|
|
98
|
-
averaged_scores = {}
|
|
99
|
-
result.scorer_stats.each do |scorer_name, stats|
|
|
100
|
-
averaged_scores[scorer_name] = stats.score_mean
|
|
101
|
-
end
|
|
102
|
-
|
|
103
|
-
sse.event("summary", JSON.dump({
|
|
104
|
-
"scores" => averaged_scores,
|
|
105
|
-
"experiment_name" => experiment_name,
|
|
106
|
-
"experiment_id" => result.experiment_id,
|
|
107
|
-
"project_id" => result.project_id
|
|
108
|
-
}))
|
|
109
|
-
|
|
110
|
-
sse.event("done", "")
|
|
30
|
+
@service.stream(result, auth: env["braintrust.auth"], sse: sse)
|
|
111
31
|
end
|
|
112
32
|
|
|
113
33
|
[200, {"content-type" => "text/event-stream", "cache-control" => "no-cache", "connection" => "keep-alive"}, sse_body]
|
|
@@ -115,90 +35,6 @@ module Braintrust
|
|
|
115
35
|
|
|
116
36
|
private
|
|
117
37
|
|
|
118
|
-
# Resolve data source from the data field.
|
|
119
|
-
# Returns [cases, dataset] where exactly one is non-nil.
|
|
120
|
-
def resolve_data_source(data)
|
|
121
|
-
if data.key?("data")
|
|
122
|
-
cases = data["data"].map do |d|
|
|
123
|
-
{input: d["input"], expected: d["expected"]}
|
|
124
|
-
end
|
|
125
|
-
[cases, nil]
|
|
126
|
-
elsif data.key?("dataset_id")
|
|
127
|
-
[nil, Braintrust::Dataset::ID.new(id: data["dataset_id"])]
|
|
128
|
-
elsif data.key?("dataset_name")
|
|
129
|
-
dataset_opts = {name: data["dataset_name"]}
|
|
130
|
-
dataset_opts[:project] = data["project_name"] if data["project_name"]
|
|
131
|
-
[nil, dataset_opts]
|
|
132
|
-
else
|
|
133
|
-
[nil, nil]
|
|
134
|
-
end
|
|
135
|
-
end
|
|
136
|
-
|
|
137
|
-
# Map request scores array to Scorer::ID structs.
|
|
138
|
-
# The UI sends function_id as a nested object: {"function_id": "uuid"}.
|
|
139
|
-
def resolve_remote_scorers(scores)
|
|
140
|
-
return nil if scores.nil? || scores.empty?
|
|
141
|
-
scores.map do |s|
|
|
142
|
-
func_id = s["function_id"]
|
|
143
|
-
func_id = func_id["function_id"] if func_id.is_a?(Hash)
|
|
144
|
-
Braintrust::Scorer::ID.new(
|
|
145
|
-
function_id: func_id,
|
|
146
|
-
version: s["version"]
|
|
147
|
-
)
|
|
148
|
-
end
|
|
149
|
-
end
|
|
150
|
-
|
|
151
|
-
# Map request parent to symbol-keyed Hash.
|
|
152
|
-
# Hardcode playground_id to match Java SDK behavior.
|
|
153
|
-
# Also extracts generation from propagated_event for span_attributes.
|
|
154
|
-
def resolve_parent(parent)
|
|
155
|
-
return nil unless parent.is_a?(Hash)
|
|
156
|
-
object_id = parent["object_id"]
|
|
157
|
-
return nil unless object_id
|
|
158
|
-
|
|
159
|
-
generation = parent.dig("propagated_event", "span_attributes", "generation")
|
|
160
|
-
|
|
161
|
-
result = {object_type: "playground_id", object_id: object_id}
|
|
162
|
-
result[:generation] = generation if generation
|
|
163
|
-
result
|
|
164
|
-
end
|
|
165
|
-
|
|
166
|
-
# Build State from auth context set by Auth middleware.
|
|
167
|
-
# Returns nil when no auth context is present (e.g. NoAuth strategy).
|
|
168
|
-
# Uses an LRU-style cache (max 64 entries) keyed by [api_key, app_url, org_name].
|
|
169
|
-
def build_state(env)
|
|
170
|
-
auth = env["braintrust.auth"]
|
|
171
|
-
return nil unless auth.is_a?(Hash)
|
|
172
|
-
|
|
173
|
-
cache_key = [auth["api_key"], auth["app_url"], auth["org_name"]]
|
|
174
|
-
|
|
175
|
-
@state_mutex ||= Mutex.new
|
|
176
|
-
@state_cache ||= {}
|
|
177
|
-
|
|
178
|
-
@state_mutex.synchronize do
|
|
179
|
-
cached = @state_cache[cache_key]
|
|
180
|
-
return cached if cached
|
|
181
|
-
|
|
182
|
-
state = Braintrust::State.new(
|
|
183
|
-
api_key: auth["api_key"],
|
|
184
|
-
org_id: auth["org_id"],
|
|
185
|
-
org_name: auth["org_name"],
|
|
186
|
-
app_url: auth["app_url"],
|
|
187
|
-
api_url: auth["api_url"],
|
|
188
|
-
enable_tracing: false
|
|
189
|
-
)
|
|
190
|
-
|
|
191
|
-
# Evict oldest entry if cache is full
|
|
192
|
-
if @state_cache.size >= 64
|
|
193
|
-
oldest_key = @state_cache.keys.first
|
|
194
|
-
@state_cache.delete(oldest_key)
|
|
195
|
-
end
|
|
196
|
-
|
|
197
|
-
@state_cache[cache_key] = state
|
|
198
|
-
state
|
|
199
|
-
end
|
|
200
|
-
end
|
|
201
|
-
|
|
202
38
|
def parse_body(env)
|
|
203
39
|
body = env["rack.input"]&.read
|
|
204
40
|
return nil if body.nil? || body.empty?
|
|
@@ -211,6 +47,10 @@ module Braintrust
|
|
|
211
47
|
[status, {"content-type" => "application/json"},
|
|
212
48
|
[JSON.dump({"error" => message})]]
|
|
213
49
|
end
|
|
50
|
+
|
|
51
|
+
def build_state(env)
|
|
52
|
+
@service.build_state(env["braintrust.auth"])
|
|
53
|
+
end
|
|
214
54
|
end
|
|
215
55
|
end
|
|
216
56
|
end
|
|
@@ -23,50 +23,12 @@ module Braintrust
|
|
|
23
23
|
class List
|
|
24
24
|
def initialize(evaluators)
|
|
25
25
|
@evaluators = evaluators
|
|
26
|
+
@service = Services::List.new(evaluators)
|
|
26
27
|
end
|
|
27
28
|
|
|
28
29
|
def call(_env)
|
|
29
|
-
result = {}
|
|
30
|
-
|
|
31
|
-
scores = (evaluator.scorers || []).each_with_index.map do |scorer, i|
|
|
32
|
-
scorer_name = scorer.respond_to?(:name) ? scorer.name : "score_#{i}"
|
|
33
|
-
{"name" => scorer_name}
|
|
34
|
-
end
|
|
35
|
-
entry = {"scores" => scores}
|
|
36
|
-
params = serialize_parameters(evaluator.parameters)
|
|
37
|
-
entry["parameters"] = params if params
|
|
38
|
-
result[name] = entry
|
|
39
|
-
end
|
|
40
|
-
|
|
41
|
-
[200, {"content-type" => "application/json"},
|
|
42
|
-
[JSON.dump(result)]]
|
|
43
|
-
end
|
|
44
|
-
|
|
45
|
-
private
|
|
46
|
-
|
|
47
|
-
# Convert user-defined parameters to the dev server protocol format.
|
|
48
|
-
# Wraps in a staticParameters container with "data" typed entries.
|
|
49
|
-
def serialize_parameters(parameters)
|
|
50
|
-
return nil unless parameters && !parameters.empty?
|
|
51
|
-
|
|
52
|
-
schema = {}
|
|
53
|
-
parameters.each do |name, spec|
|
|
54
|
-
spec = spec.transform_keys(&:to_s) if spec.is_a?(Hash)
|
|
55
|
-
if spec.is_a?(Hash)
|
|
56
|
-
schema[name.to_s] = {
|
|
57
|
-
"type" => "data",
|
|
58
|
-
"schema" => {"type" => spec["type"] || "string"},
|
|
59
|
-
"default" => spec["default"],
|
|
60
|
-
"description" => spec["description"]
|
|
61
|
-
}
|
|
62
|
-
end
|
|
63
|
-
end
|
|
64
|
-
|
|
65
|
-
{
|
|
66
|
-
"type" => "braintrust.staticParameters",
|
|
67
|
-
"schema" => schema,
|
|
68
|
-
"source" => nil
|
|
69
|
-
}
|
|
30
|
+
result = @service.call
|
|
31
|
+
[200, {"content-type" => "application/json"}, [JSON.dump(result)]]
|
|
70
32
|
end
|
|
71
33
|
end
|
|
72
34
|
end
|
|
@@ -15,6 +15,8 @@ require_relative "auth/no_auth"
|
|
|
15
15
|
require_relative "auth/clerk_token"
|
|
16
16
|
require_relative "middleware/cors"
|
|
17
17
|
require_relative "middleware/auth"
|
|
18
|
+
require_relative "services/list_service"
|
|
19
|
+
require_relative "services/eval_service"
|
|
18
20
|
require_relative "handlers/health"
|
|
19
21
|
require_relative "handlers/list"
|
|
20
22
|
require_relative "handlers/eval"
|
|
@@ -0,0 +1,214 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "json"
|
|
4
|
+
|
|
5
|
+
module Braintrust
|
|
6
|
+
module Server
|
|
7
|
+
module Services
|
|
8
|
+
# Framework-agnostic service for running evaluations and streaming SSE results.
|
|
9
|
+
# Must be long-lived (not per-request) to preserve the @state_cache across requests.
|
|
10
|
+
class Eval
|
|
11
|
+
def initialize(evaluators)
|
|
12
|
+
@evaluators = evaluators
|
|
13
|
+
@state_mutex = Mutex.new
|
|
14
|
+
@state_cache = {}
|
|
15
|
+
end
|
|
16
|
+
|
|
17
|
+
# Validates request body. Returns:
|
|
18
|
+
# {error: String, status: Integer} on failure
|
|
19
|
+
# {evaluator:, name:, cases:, dataset:, ...} on success
|
|
20
|
+
def validate(body)
|
|
21
|
+
name = body["name"]
|
|
22
|
+
return {error: "Missing required field: name", status: 400} unless name
|
|
23
|
+
|
|
24
|
+
evaluator = current_evaluators[name]
|
|
25
|
+
return {error: "Evaluator '#{name}' not found", status: 404} unless evaluator
|
|
26
|
+
|
|
27
|
+
data = body["data"]
|
|
28
|
+
return {error: "Missing required field: data", status: 400} unless data
|
|
29
|
+
|
|
30
|
+
data_sources = ["data", "dataset_name", "dataset_id"].count { |k| data.key?(k) }
|
|
31
|
+
return {error: "Exactly one data source required", status: 400} if data_sources != 1
|
|
32
|
+
|
|
33
|
+
cases, dataset = resolve_data_source(data)
|
|
34
|
+
|
|
35
|
+
{
|
|
36
|
+
evaluator: evaluator,
|
|
37
|
+
name: name,
|
|
38
|
+
cases: cases,
|
|
39
|
+
dataset: dataset,
|
|
40
|
+
experiment_name: body["experiment_name"],
|
|
41
|
+
remote_scorer_ids: resolve_remote_scorers(body["scores"]),
|
|
42
|
+
parent: resolve_parent(body["parent"]),
|
|
43
|
+
project_id: body["project_id"]
|
|
44
|
+
}
|
|
45
|
+
end
|
|
46
|
+
|
|
47
|
+
# Runs the validated eval and streams SSE events via the sse writer.
|
|
48
|
+
# +validated+ is the hash returned by #validate.
|
|
49
|
+
# +auth+ is the auth context hash (or nil/true for no-auth).
|
|
50
|
+
# +sse+ is an SSEWriter instance.
|
|
51
|
+
def stream(validated, auth:, sse:)
|
|
52
|
+
name = validated[:name]
|
|
53
|
+
evaluator = validated[:evaluator]
|
|
54
|
+
cases = validated[:cases]
|
|
55
|
+
dataset = validated[:dataset]
|
|
56
|
+
experiment_name = validated[:experiment_name]
|
|
57
|
+
remote_scorer_ids = validated[:remote_scorer_ids]
|
|
58
|
+
parent = validated[:parent]
|
|
59
|
+
project_id = validated[:project_id]
|
|
60
|
+
|
|
61
|
+
state = build_state(auth)
|
|
62
|
+
|
|
63
|
+
# Only pass project/experiment params when state is available
|
|
64
|
+
run_opts = {
|
|
65
|
+
on_progress: ->(progress_data) {
|
|
66
|
+
# Build remote eval protocol events from generic progress data.
|
|
67
|
+
# Runner provides: id, data/error, scores (optional), origin (optional).
|
|
68
|
+
# Protocol requires: id, object_type, origin, name, format, output_type, event, data.
|
|
69
|
+
base = {
|
|
70
|
+
"object_type" => "task",
|
|
71
|
+
"name" => name,
|
|
72
|
+
"format" => "code",
|
|
73
|
+
"output_type" => "completion"
|
|
74
|
+
}
|
|
75
|
+
base["id"] = progress_data["id"] if progress_data["id"]
|
|
76
|
+
base["origin"] = progress_data["origin"] if progress_data["origin"]
|
|
77
|
+
|
|
78
|
+
if progress_data.key?("error")
|
|
79
|
+
sse.event("progress", JSON.dump(base.merge("event" => "error", "data" => progress_data["error"])))
|
|
80
|
+
else
|
|
81
|
+
sse.event("progress", JSON.dump(base.merge("event" => "json_delta", "data" => JSON.dump(progress_data["data"]))))
|
|
82
|
+
end
|
|
83
|
+
|
|
84
|
+
# Signal per-cell completion so the UI exits "Streaming..." state
|
|
85
|
+
# and updates the progress bar immediately.
|
|
86
|
+
sse.event("progress", JSON.dump(base.merge("event" => "done", "data" => "")))
|
|
87
|
+
},
|
|
88
|
+
quiet: true
|
|
89
|
+
}
|
|
90
|
+
run_opts[:parent] = parent if parent
|
|
91
|
+
run_opts[:scorers] = remote_scorer_ids if remote_scorer_ids
|
|
92
|
+
run_opts[:dataset] = dataset if dataset
|
|
93
|
+
|
|
94
|
+
if state
|
|
95
|
+
run_opts[:state] = state
|
|
96
|
+
run_opts[:experiment] = experiment_name if experiment_name
|
|
97
|
+
run_opts[:project_id] = project_id if project_id
|
|
98
|
+
end
|
|
99
|
+
|
|
100
|
+
result = evaluator.run(cases, **run_opts)
|
|
101
|
+
|
|
102
|
+
# Flush buffered OTLP spans before sending completion events.
|
|
103
|
+
# The BatchSpanProcessor exports every ~5s; fast evals can finish
|
|
104
|
+
# before a single export fires, causing the UI to see no results.
|
|
105
|
+
Braintrust::Trace.flush_spans
|
|
106
|
+
|
|
107
|
+
# Build summary from result scores
|
|
108
|
+
averaged_scores = {}
|
|
109
|
+
result.scorer_stats.each do |scorer_name, stats|
|
|
110
|
+
averaged_scores[scorer_name] = stats.score_mean
|
|
111
|
+
end
|
|
112
|
+
|
|
113
|
+
sse.event("summary", JSON.dump({
|
|
114
|
+
"scores" => averaged_scores,
|
|
115
|
+
"experiment_name" => experiment_name,
|
|
116
|
+
"experiment_id" => result.experiment_id,
|
|
117
|
+
"project_id" => result.project_id
|
|
118
|
+
}))
|
|
119
|
+
|
|
120
|
+
sse.event("done", "")
|
|
121
|
+
end
|
|
122
|
+
|
|
123
|
+
# Build State from auth context hash.
|
|
124
|
+
# Returns nil when auth is not a Hash (e.g. NoAuth returns true).
|
|
125
|
+
# Uses an LRU-style cache (max 64 entries) keyed by [api_key, app_url, org_name].
|
|
126
|
+
def build_state(auth)
|
|
127
|
+
return nil unless auth.is_a?(Hash)
|
|
128
|
+
|
|
129
|
+
cache_key = [auth["api_key"], auth["app_url"], auth["org_name"]]
|
|
130
|
+
|
|
131
|
+
@state_mutex ||= Mutex.new
|
|
132
|
+
@state_cache ||= {}
|
|
133
|
+
|
|
134
|
+
@state_mutex.synchronize do
|
|
135
|
+
cached = @state_cache[cache_key]
|
|
136
|
+
return cached if cached
|
|
137
|
+
|
|
138
|
+
state = Braintrust::State.new(
|
|
139
|
+
api_key: auth["api_key"],
|
|
140
|
+
org_id: auth["org_id"],
|
|
141
|
+
org_name: auth["org_name"],
|
|
142
|
+
app_url: auth["app_url"],
|
|
143
|
+
api_url: auth["api_url"],
|
|
144
|
+
enable_tracing: false
|
|
145
|
+
)
|
|
146
|
+
|
|
147
|
+
if @state_cache.size >= 64
|
|
148
|
+
oldest_key = @state_cache.keys.first
|
|
149
|
+
@state_cache.delete(oldest_key)
|
|
150
|
+
end
|
|
151
|
+
|
|
152
|
+
@state_cache[cache_key] = state
|
|
153
|
+
state
|
|
154
|
+
end
|
|
155
|
+
end
|
|
156
|
+
|
|
157
|
+
private
|
|
158
|
+
|
|
159
|
+
def current_evaluators
|
|
160
|
+
return @evaluators.call if @evaluators.respond_to?(:call)
|
|
161
|
+
@evaluators
|
|
162
|
+
end
|
|
163
|
+
|
|
164
|
+
# Resolve data source from the data field.
|
|
165
|
+
# Returns [cases, dataset] where exactly one is non-nil.
|
|
166
|
+
def resolve_data_source(data)
|
|
167
|
+
if data.key?("data")
|
|
168
|
+
cases = data["data"].map do |d|
|
|
169
|
+
{input: d["input"], expected: d["expected"]}
|
|
170
|
+
end
|
|
171
|
+
[cases, nil]
|
|
172
|
+
elsif data.key?("dataset_id")
|
|
173
|
+
[nil, Braintrust::Dataset::ID.new(id: data["dataset_id"])]
|
|
174
|
+
elsif data.key?("dataset_name")
|
|
175
|
+
dataset_opts = {name: data["dataset_name"]}
|
|
176
|
+
dataset_opts[:project] = data["project_name"] if data["project_name"]
|
|
177
|
+
[nil, dataset_opts]
|
|
178
|
+
else
|
|
179
|
+
[nil, nil]
|
|
180
|
+
end
|
|
181
|
+
end
|
|
182
|
+
|
|
183
|
+
# Map request scores array to Scorer::ID structs.
|
|
184
|
+
# The UI sends function_id as a nested object: {"function_id": "uuid"}.
|
|
185
|
+
def resolve_remote_scorers(scores)
|
|
186
|
+
return nil if scores.nil? || scores.empty?
|
|
187
|
+
scores.map do |s|
|
|
188
|
+
func_id = s["function_id"]
|
|
189
|
+
func_id = func_id["function_id"] if func_id.is_a?(Hash)
|
|
190
|
+
Braintrust::Scorer::ID.new(
|
|
191
|
+
function_id: func_id,
|
|
192
|
+
version: s["version"]
|
|
193
|
+
)
|
|
194
|
+
end
|
|
195
|
+
end
|
|
196
|
+
|
|
197
|
+
# Map request parent to symbol-keyed Hash.
|
|
198
|
+
# Hardcode playground_id to match Java SDK behavior.
|
|
199
|
+
# Also extracts generation from propagated_event for span_attributes.
|
|
200
|
+
def resolve_parent(parent)
|
|
201
|
+
return nil unless parent.is_a?(Hash)
|
|
202
|
+
object_id = parent["object_id"]
|
|
203
|
+
return nil unless object_id
|
|
204
|
+
|
|
205
|
+
generation = parent.dig("propagated_event", "span_attributes", "generation")
|
|
206
|
+
|
|
207
|
+
result = {object_type: "playground_id", object_id: object_id}
|
|
208
|
+
result[:generation] = generation if generation
|
|
209
|
+
result
|
|
210
|
+
end
|
|
211
|
+
end
|
|
212
|
+
end
|
|
213
|
+
end
|
|
214
|
+
end
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "json"
|
|
4
|
+
|
|
5
|
+
module Braintrust
|
|
6
|
+
module Server
|
|
7
|
+
module Services
|
|
8
|
+
# Framework-agnostic service for listing evaluators.
|
|
9
|
+
# Returns a plain Hash (not a Rack triplet) suitable for JSON.dump.
|
|
10
|
+
class List
|
|
11
|
+
def initialize(evaluators)
|
|
12
|
+
@evaluators = evaluators
|
|
13
|
+
end
|
|
14
|
+
|
|
15
|
+
def call
|
|
16
|
+
result = {}
|
|
17
|
+
current_evaluators.each do |name, evaluator|
|
|
18
|
+
scores = (evaluator.scorers || []).each_with_index.map do |scorer, i|
|
|
19
|
+
scorer_name = scorer.respond_to?(:name) ? scorer.name : "score_#{i}"
|
|
20
|
+
{"name" => scorer_name}
|
|
21
|
+
end
|
|
22
|
+
entry = {"scores" => scores}
|
|
23
|
+
params = serialize_parameters(evaluator.parameters)
|
|
24
|
+
entry["parameters"] = params if params
|
|
25
|
+
result[name] = entry
|
|
26
|
+
end
|
|
27
|
+
result
|
|
28
|
+
end
|
|
29
|
+
|
|
30
|
+
private
|
|
31
|
+
|
|
32
|
+
def current_evaluators
|
|
33
|
+
return @evaluators.call if @evaluators.respond_to?(:call)
|
|
34
|
+
@evaluators
|
|
35
|
+
end
|
|
36
|
+
|
|
37
|
+
# Convert user-defined parameters to the dev server protocol format.
|
|
38
|
+
# Wraps in a staticParameters container with "data" typed entries.
|
|
39
|
+
def serialize_parameters(parameters)
|
|
40
|
+
return nil unless parameters && !parameters.empty?
|
|
41
|
+
|
|
42
|
+
schema = {}
|
|
43
|
+
parameters.each do |name, spec|
|
|
44
|
+
spec = spec.transform_keys(&:to_s) if spec.is_a?(Hash)
|
|
45
|
+
if spec.is_a?(Hash)
|
|
46
|
+
schema[name.to_s] = {
|
|
47
|
+
"type" => "data",
|
|
48
|
+
"schema" => {"type" => spec["type"] || "string"},
|
|
49
|
+
"default" => spec["default"],
|
|
50
|
+
"description" => spec["description"]
|
|
51
|
+
}
|
|
52
|
+
end
|
|
53
|
+
end
|
|
54
|
+
|
|
55
|
+
{
|
|
56
|
+
"type" => "braintrust.staticParameters",
|
|
57
|
+
"schema" => schema,
|
|
58
|
+
"source" => nil
|
|
59
|
+
}
|
|
60
|
+
end
|
|
61
|
+
end
|
|
62
|
+
end
|
|
63
|
+
end
|
|
64
|
+
end
|
|
@@ -80,11 +80,6 @@ module Braintrust
|
|
|
80
80
|
# Determine if a span should be forwarded to the wrapped processor
|
|
81
81
|
# based on configured filters
|
|
82
82
|
def should_forward_span?(span)
|
|
83
|
-
# Always keep root spans (spans with no parent)
|
|
84
|
-
# Check if parent_span_id is the invalid/zero span ID
|
|
85
|
-
is_root = span.parent_span_id == OpenTelemetry::Trace::INVALID_SPAN_ID
|
|
86
|
-
return true if is_root
|
|
87
|
-
|
|
88
83
|
# If no filters, keep everything
|
|
89
84
|
return true if @filters.empty?
|
|
90
85
|
|
data/lib/braintrust/version.rb
CHANGED
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: braintrust
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.2.1
|
|
4
|
+
version: 0.3.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Braintrust
|
|
@@ -215,6 +215,14 @@ files:
|
|
|
215
215
|
- lib/braintrust/contrib/openai/patcher.rb
|
|
216
216
|
- lib/braintrust/contrib/patcher.rb
|
|
217
217
|
- lib/braintrust/contrib/rails/railtie.rb
|
|
218
|
+
- lib/braintrust/contrib/rails/server.rb
|
|
219
|
+
- lib/braintrust/contrib/rails/server/application_controller.rb
|
|
220
|
+
- lib/braintrust/contrib/rails/server/engine.rb
|
|
221
|
+
- lib/braintrust/contrib/rails/server/eval_controller.rb
|
|
222
|
+
- lib/braintrust/contrib/rails/server/generator.rb
|
|
223
|
+
- lib/braintrust/contrib/rails/server/health_controller.rb
|
|
224
|
+
- lib/braintrust/contrib/rails/server/list_controller.rb
|
|
225
|
+
- lib/braintrust/contrib/rails/server/routes.rb
|
|
218
226
|
- lib/braintrust/contrib/registry.rb
|
|
219
227
|
- lib/braintrust/contrib/ruby_llm/deprecated.rb
|
|
220
228
|
- lib/braintrust/contrib/ruby_llm/instrumentation/chat.rb
|
|
@@ -267,6 +275,8 @@ files:
|
|
|
267
275
|
- lib/braintrust/server/rack.rb
|
|
268
276
|
- lib/braintrust/server/rack/app.rb
|
|
269
277
|
- lib/braintrust/server/router.rb
|
|
278
|
+
- lib/braintrust/server/services/eval_service.rb
|
|
279
|
+
- lib/braintrust/server/services/list_service.rb
|
|
270
280
|
- lib/braintrust/server/sse.rb
|
|
271
281
|
- lib/braintrust/setup.rb
|
|
272
282
|
- lib/braintrust/state.rb
|