braintrust 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +148 -24
- data/lib/braintrust/api/internal/btql.rb +124 -0
- data/lib/braintrust/api/internal/experiments.rb +19 -0
- data/lib/braintrust/api/internal/projects.rb +19 -0
- data/lib/braintrust/contrib/rails/server/application_controller.rb +34 -0
- data/lib/braintrust/contrib/rails/server/engine.rb +72 -0
- data/lib/braintrust/contrib/rails/server/eval_controller.rb +36 -0
- data/lib/braintrust/contrib/rails/server/generator.rb +43 -0
- data/lib/braintrust/contrib/rails/server/health_controller.rb +15 -0
- data/lib/braintrust/contrib/rails/server/list_controller.rb +16 -0
- data/lib/braintrust/contrib/rails/server/routes.rb +8 -0
- data/lib/braintrust/contrib/rails/server.rb +20 -0
- data/lib/braintrust/dataset.rb +6 -3
- data/lib/braintrust/eval/context.rb +131 -0
- data/lib/braintrust/eval/evaluator.rb +11 -5
- data/lib/braintrust/eval/functions.rb +10 -166
- data/lib/braintrust/eval/runner.rb +165 -145
- data/lib/braintrust/eval/scorer.rb +24 -96
- data/lib/braintrust/eval/trace.rb +129 -0
- data/lib/braintrust/eval.rb +60 -132
- data/lib/braintrust/functions.rb +168 -0
- data/lib/braintrust/internal/callable.rb +83 -0
- data/lib/braintrust/logger.rb +9 -0
- data/lib/braintrust/scorer.rb +173 -0
- data/lib/braintrust/server/handlers/eval.rb +8 -168
- data/lib/braintrust/server/handlers/list.rb +3 -41
- data/lib/braintrust/server/rack.rb +2 -0
- data/lib/braintrust/server/services/eval_service.rb +214 -0
- data/lib/braintrust/server/services/list_service.rb +64 -0
- data/lib/braintrust/task.rb +108 -0
- data/lib/braintrust/trace/span_processor.rb +0 -5
- data/lib/braintrust/version.rb +1 -1
- metadata +18 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: c07be3c454a924c5c97c2653136a2b9cdd1098409af16326b1db8676c5c8b0d2
|
|
4
|
+
data.tar.gz: c1eb75eefdcacebc2c955ae23aa3196d276a76d6ab828cdfb817c7e9168325b3
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: d02058bd5321ed16ea2f785aaeb24f4d4f105c5357c3c7ceb2a8a02c090b69c7187623b23e14d5026bb0cf236e64dddae7025509d7b2d6769bb50f110612120f
|
|
7
|
+
data.tar.gz: 15627209b382c023c2640e1d2219b6d33b84cb7c67ba1a3b8e3ebbe1aa912d3df832583a1e37b3831699b67ea81f3b4242b67a606dfdd727827e648a6509fea7
|
data/README.md
CHANGED
|
@@ -252,13 +252,15 @@ Braintrust::Eval.run(
|
|
|
252
252
|
{input: "apple", expected: "fruit"},
|
|
253
253
|
{input: "carrot", expected: "vegetable"}
|
|
254
254
|
],
|
|
255
|
-
task: ->(input) { classify(input) },
|
|
255
|
+
task: ->(input:) { classify(input) },
|
|
256
256
|
scorers: [
|
|
257
|
-
->(
|
|
257
|
+
->(expected:, output:) { output == expected ? 1.0 : 0.0 }
|
|
258
258
|
]
|
|
259
259
|
)
|
|
260
260
|
```
|
|
261
261
|
|
|
262
|
+
See [eval.rb](./examples/eval.rb) for a full example.
|
|
263
|
+
|
|
262
264
|
### Datasets
|
|
263
265
|
|
|
264
266
|
Use test cases from a Braintrust dataset:
|
|
@@ -267,7 +269,7 @@ Use test cases from a Braintrust dataset:
|
|
|
267
269
|
Braintrust::Eval.run(
|
|
268
270
|
project: "my-project",
|
|
269
271
|
dataset: "my-dataset",
|
|
270
|
-
task: ->(input) { classify(input) },
|
|
272
|
+
task: ->(input:) { classify(input) },
|
|
271
273
|
scorers: [...]
|
|
272
274
|
)
|
|
273
275
|
```
|
|
@@ -282,11 +284,13 @@ Braintrust::Eval.run(
|
|
|
282
284
|
{input: "apple", expected: "fruit", tags: ["produce"], metadata: {difficulty: "easy"}},
|
|
283
285
|
{input: "salmon", expected: "protein", tags: ["seafood"], metadata: {difficulty: "medium"}}
|
|
284
286
|
],
|
|
285
|
-
task: ->(input) { classify(input) },
|
|
287
|
+
task: ->(input:) { classify(input) },
|
|
286
288
|
scorers: [...]
|
|
287
289
|
)
|
|
288
290
|
```
|
|
289
291
|
|
|
292
|
+
See [dataset.rb](./examples/eval/dataset.rb) for a full example.
|
|
293
|
+
|
|
290
294
|
### Scorers
|
|
291
295
|
|
|
292
296
|
Use scoring functions defined in Braintrust:
|
|
@@ -295,33 +299,104 @@ Use scoring functions defined in Braintrust:
|
|
|
295
299
|
Braintrust::Eval.run(
|
|
296
300
|
project: "my-project",
|
|
297
301
|
cases: [...],
|
|
298
|
-
task: ->(input) { ... },
|
|
302
|
+
task: ->(input:) { ... },
|
|
303
|
+
scorers: ["accuracy-scorer"]
|
|
304
|
+
)
|
|
305
|
+
```
|
|
306
|
+
|
|
307
|
+
Or define scorers inline with `Scorer.new`:
|
|
308
|
+
|
|
309
|
+
```ruby
|
|
310
|
+
Braintrust::Eval.run(
|
|
311
|
+
project: "my-project",
|
|
312
|
+
cases: [...],
|
|
313
|
+
task: ->(input:) { ... },
|
|
299
314
|
scorers: [
|
|
300
|
-
Braintrust::
|
|
315
|
+
Braintrust::Scorer.new("exact_match") do |expected:, output:|
|
|
316
|
+
output == expected ? 1.0 : 0.0
|
|
317
|
+
end
|
|
301
318
|
]
|
|
302
319
|
)
|
|
303
320
|
```
|
|
304
321
|
|
|
305
|
-
|
|
322
|
+
See [remote_functions.rb](./examples/eval/remote_functions.rb) for a full example.
|
|
323
|
+
|
|
324
|
+
#### Scorer metadata
|
|
325
|
+
|
|
326
|
+
Scorers can return a Hash with `:score` and `:metadata` to attach structured context to the score. The metadata is logged on the scorer's span and visible in the Braintrust UI for debugging and filtering:
|
|
327
|
+
|
|
328
|
+
```ruby
|
|
329
|
+
Braintrust::Scorer.new("translation") do |expected:, output:|
|
|
330
|
+
common_words = output.downcase.split & expected.downcase.split
|
|
331
|
+
overlap = common_words.size.to_f / expected.split.size
|
|
332
|
+
{
|
|
333
|
+
score: overlap,
|
|
334
|
+
metadata: {word_overlap: common_words.size, missing_words: expected.downcase.split - output.downcase.split}
|
|
335
|
+
}
|
|
336
|
+
end
|
|
337
|
+
```
|
|
338
|
+
|
|
339
|
+
See [scorer_metadata.rb](./examples/eval/scorer_metadata.rb) for a full example.
|
|
340
|
+
|
|
341
|
+
#### Multiple scores from one scorer
|
|
342
|
+
|
|
343
|
+
When several scores can be computed together (e.g. in one LLM call), you can return an `Array` of score `Hash`es instead of a single value. Each metric appears as a separate score column in the Braintrust UI:
|
|
344
|
+
|
|
345
|
+
```ruby
|
|
346
|
+
Braintrust::Scorer.new("summary_quality") do |output:, expected:|
|
|
347
|
+
words = output.downcase.split
|
|
348
|
+
key_terms = expected[:key_terms]
|
|
349
|
+
covered = key_terms.count { |t| words.include?(t) }
|
|
350
|
+
|
|
351
|
+
[
|
|
352
|
+
{name: "coverage", score: covered.to_f / key_terms.size, metadata: {missing: key_terms - words}},
|
|
353
|
+
{name: "conciseness", score: words.size <= expected[:max_words] ? 1.0 : 0.0}
|
|
354
|
+
]
|
|
355
|
+
end
|
|
356
|
+
```
|
|
357
|
+
|
|
358
|
+
`name` and `score` are required; `metadata` is optional.
|
|
359
|
+
|
|
360
|
+
See [multi_score.rb](./examples/eval/multi_score.rb) for a full example.
|
|
361
|
+
|
|
362
|
+
#### Trace scoring
|
|
363
|
+
|
|
364
|
+
Scorers can access the full evaluation trace (all spans generated by the task) by declaring a `trace:` keyword parameter. This is useful for inspecting intermediate LLM calls, validating tool usage, or checking the message thread:
|
|
306
365
|
|
|
307
366
|
```ruby
|
|
308
367
|
Braintrust::Eval.run(
|
|
309
368
|
project: "my-project",
|
|
310
|
-
cases: [
|
|
311
|
-
task:
|
|
369
|
+
cases: [{input: "What is 2+2?", expected: "4"}],
|
|
370
|
+
task: Braintrust::Task.new { |input:| my_llm_pipeline(input) },
|
|
312
371
|
scorers: [
|
|
313
|
-
|
|
372
|
+
# Access the full trace to inspect LLM spans
|
|
373
|
+
Braintrust::Scorer.new("uses_system_prompt") do |output:, trace:|
|
|
374
|
+
messages = trace.thread # reconstructed message thread from LLM spans
|
|
375
|
+
messages.any? { |m| m["role"] == "system" } ? 1.0 : 0.0
|
|
376
|
+
end,
|
|
377
|
+
|
|
378
|
+
# Filter spans by type
|
|
379
|
+
Braintrust::Scorer.new("single_llm_call") do |output:, trace:|
|
|
380
|
+
trace.spans(span_type: "llm").length == 1 ? 1.0 : 0.0
|
|
381
|
+
end,
|
|
382
|
+
|
|
383
|
+
# Scorers that do not declare trace: still work — the parameter is filtered out automatically
|
|
384
|
+
Braintrust::Scorer.new("exact_match") do |output:, expected:|
|
|
314
385
|
output == expected ? 1.0 : 0.0
|
|
315
386
|
end
|
|
316
387
|
]
|
|
317
388
|
)
|
|
318
389
|
```
|
|
319
390
|
|
|
320
|
-
See
|
|
391
|
+
See [trace_scoring.rb](./examples/eval/trace_scoring.rb) for a full example.
|
|
321
392
|
|
|
322
393
|
### Dev Server
|
|
323
394
|
|
|
324
|
-
Run evaluations from the Braintrust web UI against code in your own application.
|
|
395
|
+
Run evaluations from the Braintrust web UI against code in your own application.
|
|
396
|
+
|
|
397
|
+
#### Run as a Rack app
|
|
398
|
+
|
|
399
|
+
Define evaluators, pass them to the dev server, and start serving:
|
|
325
400
|
|
|
326
401
|
```ruby
|
|
327
402
|
# eval_server.ru
|
|
@@ -330,9 +405,9 @@ require "braintrust/server"
|
|
|
330
405
|
|
|
331
406
|
# Define evaluators — these can reference your application code (models, services, etc.)
|
|
332
407
|
food_classifier = Braintrust::Eval::Evaluator.new(
|
|
333
|
-
task: ->(input) { FoodClassifier.classify(input) },
|
|
408
|
+
task: ->(input:) { FoodClassifier.classify(input) },
|
|
334
409
|
scorers: [
|
|
335
|
-
Braintrust::
|
|
410
|
+
Braintrust::Scorer.new("exact_match") { |expected:, output:| output == expected ? 1.0 : 0.0 }
|
|
336
411
|
]
|
|
337
412
|
)
|
|
338
413
|
|
|
@@ -347,10 +422,21 @@ run Braintrust::Server::Rack.app(
|
|
|
347
422
|
)
|
|
348
423
|
```
|
|
349
424
|
|
|
425
|
+
Add your Rack server to your Gemfile:
|
|
426
|
+
|
|
427
|
+
```ruby
|
|
428
|
+
gem "rack"
|
|
429
|
+
gem "puma" # recommended
|
|
430
|
+
```
|
|
431
|
+
|
|
432
|
+
Then start the server:
|
|
433
|
+
|
|
350
434
|
```bash
|
|
351
435
|
bundle exec rackup eval_server.ru -p 8300 -o 0.0.0.0
|
|
352
436
|
```
|
|
353
437
|
|
|
438
|
+
See example: [server/eval.ru](./examples/server/eval.ru)
|
|
439
|
+
|
|
354
440
|
**Custom evaluators**
|
|
355
441
|
|
|
356
442
|
Evaluators can also be defined as subclasses:
|
|
@@ -358,15 +444,60 @@ Evaluators can also be defined as subclasses:
|
|
|
358
444
|
```ruby
|
|
359
445
|
class FoodClassifier < Braintrust::Eval::Evaluator
|
|
360
446
|
def task
|
|
361
|
-
->(input) { classify(input) }
|
|
447
|
+
->(input:) { classify(input) }
|
|
448
|
+
end
|
|
449
|
+
|
|
450
|
+
def scorers
|
|
451
|
+
[Braintrust::Scorer.new("exact_match") { |expected:, output:| output == expected ? 1.0 : 0.0 }]
|
|
452
|
+
end
|
|
453
|
+
end
|
|
454
|
+
```
|
|
455
|
+
|
|
456
|
+
#### Run as a Rails engine
|
|
457
|
+
|
|
458
|
+
Use the Rails engine when your evaluators live inside an existing Rails app and you want to mount the Braintrust eval server into that application.
|
|
459
|
+
|
|
460
|
+
Define each evaluator in its own file, for example under `app/evaluators/`:
|
|
461
|
+
|
|
462
|
+
```ruby
|
|
463
|
+
# app/evaluators/food_classifier.rb
|
|
464
|
+
class FoodClassifier < Braintrust::Eval::Evaluator
|
|
465
|
+
def task
|
|
466
|
+
->(input:) { classify(input) }
|
|
362
467
|
end
|
|
363
468
|
|
|
364
469
|
def scorers
|
|
365
|
-
[Braintrust::
|
|
470
|
+
[Braintrust::Scorer.new("exact_match") { |expected:, output:| output == expected ? 1.0 : 0.0 }]
|
|
366
471
|
end
|
|
367
472
|
end
|
|
368
473
|
```
|
|
369
474
|
|
|
475
|
+
Then generate the Braintrust initializer:
|
|
476
|
+
|
|
477
|
+
```bash
|
|
478
|
+
bin/rails generate braintrust:server
|
|
479
|
+
```
|
|
480
|
+
|
|
481
|
+
```ruby
|
|
482
|
+
# config/routes.rb
|
|
483
|
+
Rails.application.routes.draw do
|
|
484
|
+
mount Braintrust::Contrib::Rails::Engine, at: "/braintrust"
|
|
485
|
+
end
|
|
486
|
+
```
|
|
487
|
+
|
|
488
|
+
The generator writes `config/initializers/braintrust_server.rb`, where you can review or customize the slug-to-evaluator mapping it discovers from `app/evaluators/**/*.rb` and `evaluators/**/*.rb`.
|
|
489
|
+
|
|
490
|
+
See example: [contrib/rails/eval.rb](./examples/contrib/rails/eval.rb)
|
|
491
|
+
|
|
492
|
+
**Developing locally**
|
|
493
|
+
|
|
494
|
+
If you want to skip authentication on incoming eval requests while developing locally:
|
|
495
|
+
|
|
496
|
+
- **For Rack**: Pass `auth: :none` to `Braintrust::Server::Rack.app(...)`
|
|
497
|
+
- **For Rails**: Set `config.auth = :none` in `config/initializers/braintrust_server.rb`
|
|
498
|
+
|
|
499
|
+
*NOTE: Setting `:none` only disables authentication on incoming requests to your server; executing evals still requires a `BRAINTRUST_API_KEY` to fetch resources.*
|
|
500
|
+
|
|
370
501
|
**Supported web servers**
|
|
371
502
|
|
|
372
503
|
The dev server requires the `rack` gem and a Rack-compatible web server.
|
|
@@ -378,14 +509,7 @@ The dev server requires the `rack` gem and a Rack-compatible web server.
|
|
|
378
509
|
| [Passenger](https://www.phusionpassenger.com/) | 6.x | |
|
|
379
510
|
| [WEBrick](https://github.com/ruby/webrick) | Not supported | Does not support server-sent events. |
|
|
380
511
|
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
```ruby
|
|
384
|
-
gem "rack"
|
|
385
|
-
gem "puma" # recommended
|
|
386
|
-
```
|
|
387
|
-
|
|
388
|
-
See example: [server/eval.ru](./examples/server/eval.ru)
|
|
512
|
+
See examples: [server/eval.ru](./examples/server/eval.ru), [contrib/rails/eval.rb](./examples/contrib/rails/eval.rb)
|
|
389
513
|
|
|
390
514
|
## Documentation
|
|
391
515
|
|
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "net/http"
|
|
4
|
+
require "json"
|
|
5
|
+
require "uri"
|
|
6
|
+
require_relative "../../internal/http"
|
|
7
|
+
|
|
8
|
+
module Braintrust
|
|
9
|
+
class API
|
|
10
|
+
module Internal
|
|
11
|
+
# Internal BTQL client for querying spans.
|
|
12
|
+
# Not part of the public API — instantiated directly where needed.
|
|
13
|
+
class BTQL
|
|
14
|
+
# Maximum number of retries before returning partial results.
|
|
15
|
+
# Covers both freshness lag (partially indexed) and ingestion lag
|
|
16
|
+
# (spans not yet visible to BTQL after OTel flush).
|
|
17
|
+
MAX_FRESHNESS_RETRIES = 7
|
|
18
|
+
|
|
19
|
+
# Base delay (seconds) between retries (doubles each attempt, capped).
|
|
20
|
+
FRESHNESS_BASE_DELAY = 1.0
|
|
21
|
+
|
|
22
|
+
# Maximum delay (seconds) between retries. Caps exponential growth
|
|
23
|
+
# so we keep polling at a reasonable rate in the later window.
|
|
24
|
+
# Schedule: 1, 2, 4, 8, 8, 8, 8 = ~39s total worst-case.
|
|
25
|
+
MAX_FRESHNESS_DELAY = 8.0
|
|
26
|
+
|
|
27
|
+
def initialize(state)
|
|
28
|
+
@state = state
|
|
29
|
+
end
|
|
30
|
+
|
|
31
|
+
# Query spans belonging to a specific trace within an object.
|
|
32
|
+
#
|
|
33
|
+
# Builds a BTQL SQL query that matches the root_span_id and excludes scorer spans.
|
|
34
|
+
# Retries with exponential backoff if the response indicates data is not yet fresh.
|
|
35
|
+
#
|
|
36
|
+
# @param object_type [String] e.g. "experiment"
|
|
37
|
+
# @param object_id [String] Object UUID
|
|
38
|
+
# @param root_span_id [String] Hex trace ID of the root span
|
|
39
|
+
# @return [Array<Hash>] Parsed span data
|
|
40
|
+
def trace_spans(object_type:, object_id:, root_span_id:)
|
|
41
|
+
query = build_trace_query(
|
|
42
|
+
object_type: object_type,
|
|
43
|
+
object_id: object_id,
|
|
44
|
+
root_span_id: root_span_id
|
|
45
|
+
)
|
|
46
|
+
payload = {query: query, fmt: "jsonl"}
|
|
47
|
+
|
|
48
|
+
retries = 0
|
|
49
|
+
loop do
|
|
50
|
+
rows, freshness = execute_query(payload)
|
|
51
|
+
# Return when data is fresh AND non-empty, or we've exhausted retries.
|
|
52
|
+
# We retry on empty even when "complete" because there is ingestion lag
|
|
53
|
+
# between OTel flush and BTQL indexing — the server may report "complete"
|
|
54
|
+
# before it knows about newly-flushed spans.
|
|
55
|
+
return rows if (freshness == "complete" && !rows.empty?) || retries >= MAX_FRESHNESS_RETRIES
|
|
56
|
+
|
|
57
|
+
retries += 1
|
|
58
|
+
delay = [FRESHNESS_BASE_DELAY * (2**(retries - 1)), MAX_FRESHNESS_DELAY].min
|
|
59
|
+
sleep(delay)
|
|
60
|
+
end
|
|
61
|
+
rescue => e
|
|
62
|
+
Braintrust::Log.warn("[BTQL] Query failed: #{e.message}")
|
|
63
|
+
[]
|
|
64
|
+
end
|
|
65
|
+
|
|
66
|
+
private
|
|
67
|
+
|
|
68
|
+
# Build a BTQL SQL query string for fetching trace spans.
|
|
69
|
+
#
|
|
70
|
+
# Selects all spans for a given root_span_id, excluding scorer spans
|
|
71
|
+
# (span_attributes.type = 'score').
|
|
72
|
+
#
|
|
73
|
+
# @param object_type [String] e.g. "experiment"
|
|
74
|
+
# @param object_id [String] Object UUID
|
|
75
|
+
# @param root_span_id [String] Hex trace ID
|
|
76
|
+
# @return [String] BTQL SQL query
|
|
77
|
+
def build_trace_query(object_type:, object_id:, root_span_id:)
|
|
78
|
+
escaped_root = root_span_id.gsub("'", "''")
|
|
79
|
+
escaped_id = object_id.gsub("'", "''")
|
|
80
|
+
|
|
81
|
+
"SELECT * FROM #{object_type}('#{escaped_id}') " \
|
|
82
|
+
"WHERE root_span_id = '#{escaped_root}' " \
|
|
83
|
+
"AND span_attributes.type != 'score' " \
|
|
84
|
+
"LIMIT 1000"
|
|
85
|
+
end
|
|
86
|
+
|
|
87
|
+
# Execute a BTQL query and parse the JSONL response.
|
|
88
|
+
#
|
|
89
|
+
# @param payload [Hash] BTQL request payload
|
|
90
|
+
# @return [Array(Array<Hash>, String)] [parsed_rows, freshness_state]
|
|
91
|
+
def execute_query(payload)
|
|
92
|
+
uri = URI("#{@state.api_url}/btql")
|
|
93
|
+
|
|
94
|
+
request = Net::HTTP::Post.new(uri)
|
|
95
|
+
request["Content-Type"] = "application/json"
|
|
96
|
+
request["Authorization"] = "Bearer #{@state.api_key}"
|
|
97
|
+
request["Accept"] = "application/x-jsonlines"
|
|
98
|
+
request.body = JSON.dump(payload)
|
|
99
|
+
|
|
100
|
+
response = Braintrust::Internal::Http.with_redirects(uri, request)
|
|
101
|
+
|
|
102
|
+
unless response.is_a?(Net::HTTPSuccess)
|
|
103
|
+
raise Braintrust::Error, "HTTP #{response.code} for POST #{uri}: #{response.body}"
|
|
104
|
+
end
|
|
105
|
+
|
|
106
|
+
freshness = response["x-bt-freshness-state"] || "complete"
|
|
107
|
+
[parse_jsonl(response.body), freshness]
|
|
108
|
+
end
|
|
109
|
+
|
|
110
|
+
# Parse a JSONL response body into an array of hashes.
|
|
111
|
+
#
|
|
112
|
+
# @param body [String] JSONL response body
|
|
113
|
+
# @return [Array<Hash>]
|
|
114
|
+
def parse_jsonl(body)
|
|
115
|
+
body.each_line.filter_map do |line|
|
|
116
|
+
line = line.strip
|
|
117
|
+
next if line.empty?
|
|
118
|
+
JSON.parse(line)
|
|
119
|
+
end
|
|
120
|
+
end
|
|
121
|
+
end
|
|
122
|
+
end
|
|
123
|
+
end
|
|
124
|
+
end
|
|
@@ -50,6 +50,25 @@ module Braintrust
|
|
|
50
50
|
|
|
51
51
|
JSON.parse(response.body)
|
|
52
52
|
end
|
|
53
|
+
|
|
54
|
+
# Delete an experiment
|
|
55
|
+
# DELETE /v1/experiment/:id
|
|
56
|
+
# @param id [String] Experiment ID
|
|
57
|
+
# @return [Hash] Deleted experiment data
|
|
58
|
+
def delete(id:)
|
|
59
|
+
uri = URI("#{@state.api_url}/v1/experiment/#{id}")
|
|
60
|
+
|
|
61
|
+
request = Net::HTTP::Delete.new(uri)
|
|
62
|
+
request["Authorization"] = "Bearer #{@state.api_key}"
|
|
63
|
+
|
|
64
|
+
response = Braintrust::Internal::Http.with_redirects(uri, request)
|
|
65
|
+
|
|
66
|
+
unless response.is_a?(Net::HTTPSuccess)
|
|
67
|
+
raise Error, "HTTP #{response.code} for DELETE #{uri}: #{response.body}"
|
|
68
|
+
end
|
|
69
|
+
|
|
70
|
+
JSON.parse(response.body)
|
|
71
|
+
end
|
|
53
72
|
end
|
|
54
73
|
end
|
|
55
74
|
end
|
|
@@ -35,6 +35,25 @@ module Braintrust
|
|
|
35
35
|
|
|
36
36
|
JSON.parse(response.body)
|
|
37
37
|
end
|
|
38
|
+
|
|
39
|
+
# Delete a project
|
|
40
|
+
# DELETE /v1/project/:id
|
|
41
|
+
# @param id [String] Project UUID
|
|
42
|
+
# @return [Hash] Deleted project data
|
|
43
|
+
def delete(id:)
|
|
44
|
+
uri = URI("#{@state.api_url}/v1/project/#{id}")
|
|
45
|
+
|
|
46
|
+
request = Net::HTTP::Delete.new(uri)
|
|
47
|
+
request["Authorization"] = "Bearer #{@state.api_key}"
|
|
48
|
+
|
|
49
|
+
response = Braintrust::Internal::Http.with_redirects(uri, request)
|
|
50
|
+
|
|
51
|
+
unless response.is_a?(Net::HTTPSuccess)
|
|
52
|
+
raise Error, "HTTP #{response.code} for DELETE #{uri}: #{response.body}"
|
|
53
|
+
end
|
|
54
|
+
|
|
55
|
+
JSON.parse(response.body)
|
|
56
|
+
end
|
|
38
57
|
end
|
|
39
58
|
end
|
|
40
59
|
end
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Braintrust
|
|
4
|
+
module Contrib
|
|
5
|
+
module Rails
|
|
6
|
+
module Server
|
|
7
|
+
class ApplicationController < ActionController::API
|
|
8
|
+
before_action :authenticate!
|
|
9
|
+
|
|
10
|
+
private
|
|
11
|
+
|
|
12
|
+
def authenticate!
|
|
13
|
+
auth_result = Engine.auth_strategy.authenticate(request.env)
|
|
14
|
+
unless auth_result
|
|
15
|
+
render json: {"error" => "Unauthorized"}, status: :unauthorized
|
|
16
|
+
return
|
|
17
|
+
end
|
|
18
|
+
|
|
19
|
+
request.env["braintrust.auth"] = auth_result
|
|
20
|
+
@braintrust_auth = auth_result
|
|
21
|
+
end
|
|
22
|
+
|
|
23
|
+
def parse_json_body
|
|
24
|
+
body = request.body.read
|
|
25
|
+
return nil if body.nil? || body.empty?
|
|
26
|
+
JSON.parse(body)
|
|
27
|
+
rescue JSON::ParserError
|
|
28
|
+
nil
|
|
29
|
+
end
|
|
30
|
+
end
|
|
31
|
+
end
|
|
32
|
+
end
|
|
33
|
+
end
|
|
34
|
+
end
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Braintrust
|
|
4
|
+
module Contrib
|
|
5
|
+
module Rails
|
|
6
|
+
module Server
|
|
7
|
+
class Engine < ::Rails::Engine
|
|
8
|
+
isolate_namespace Braintrust::Contrib::Rails::Server
|
|
9
|
+
|
|
10
|
+
config.evaluators = {}
|
|
11
|
+
config.auth = :clerk_token
|
|
12
|
+
|
|
13
|
+
# Register the engine's routes file so Rails loads it during initialization.
|
|
14
|
+
paths["config/routes.rb"] << File.expand_path("routes.rb", __dir__)
|
|
15
|
+
|
|
16
|
+
initializer "braintrust.server.cors" do |app|
|
|
17
|
+
app.middleware.use Braintrust::Server::Middleware::Cors
|
|
18
|
+
end
|
|
19
|
+
|
|
20
|
+
# Class-level helpers that read from engine config.
|
|
21
|
+
|
|
22
|
+
def self.evaluators
|
|
23
|
+
config.evaluators
|
|
24
|
+
end
|
|
25
|
+
|
|
26
|
+
def self.auth_strategy
|
|
27
|
+
resolve_auth(config.auth)
|
|
28
|
+
end
|
|
29
|
+
|
|
30
|
+
def self.list_service
|
|
31
|
+
Braintrust::Server::Services::List.new(-> { config.evaluators })
|
|
32
|
+
end
|
|
33
|
+
|
|
34
|
+
# Long-lived so the state cache persists across requests.
|
|
35
|
+
def self.eval_service
|
|
36
|
+
@eval_service ||= Braintrust::Server::Services::Eval.new(-> { config.evaluators })
|
|
37
|
+
end
|
|
38
|
+
|
|
39
|
+
# Support the explicit `|config|` style used by this integration while
|
|
40
|
+
# still delegating zero-arity DSL blocks to Rails' native implementation.
|
|
41
|
+
def self.configure(&block)
|
|
42
|
+
return super if block&.arity == 0
|
|
43
|
+
yield config if block
|
|
44
|
+
end
|
|
45
|
+
|
|
46
|
+
def self.resolve_auth(auth)
|
|
47
|
+
case auth
|
|
48
|
+
when :none
|
|
49
|
+
Braintrust::Server::Auth::NoAuth.new
|
|
50
|
+
when :clerk_token
|
|
51
|
+
Braintrust::Server::Auth::ClerkToken.new
|
|
52
|
+
when Symbol, String
|
|
53
|
+
raise ArgumentError, "Unknown auth strategy #{auth.inspect}. Expected :none, :clerk_token, or an auth object."
|
|
54
|
+
else
|
|
55
|
+
auth
|
|
56
|
+
end
|
|
57
|
+
end
|
|
58
|
+
private_class_method :resolve_auth
|
|
59
|
+
|
|
60
|
+
generators do
|
|
61
|
+
require "braintrust/contrib/rails/server/generator"
|
|
62
|
+
end
|
|
63
|
+
end
|
|
64
|
+
end
|
|
65
|
+
end
|
|
66
|
+
end
|
|
67
|
+
end
|
|
68
|
+
|
|
69
|
+
require_relative "application_controller"
|
|
70
|
+
require_relative "health_controller"
|
|
71
|
+
require_relative "list_controller"
|
|
72
|
+
require_relative "eval_controller"
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Braintrust
|
|
4
|
+
module Contrib
|
|
5
|
+
module Rails
|
|
6
|
+
module Server
|
|
7
|
+
class EvalController < ApplicationController
|
|
8
|
+
include ActionController::Live
|
|
9
|
+
|
|
10
|
+
def create
|
|
11
|
+
body = parse_json_body
|
|
12
|
+
unless body
|
|
13
|
+
render json: {"error" => "Invalid JSON body"}, status: :bad_request
|
|
14
|
+
return
|
|
15
|
+
end
|
|
16
|
+
|
|
17
|
+
result = Engine.eval_service.validate(body)
|
|
18
|
+
if result[:error]
|
|
19
|
+
render json: {"error" => result[:error]}, status: result[:status]
|
|
20
|
+
return
|
|
21
|
+
end
|
|
22
|
+
|
|
23
|
+
response.headers["Content-Type"] = "text/event-stream"
|
|
24
|
+
response.headers["Cache-Control"] = "no-cache"
|
|
25
|
+
response.headers["Connection"] = "keep-alive"
|
|
26
|
+
|
|
27
|
+
sse = Braintrust::Server::SSEWriter.new { |chunk| response.stream.write(chunk) }
|
|
28
|
+
Engine.eval_service.stream(result, auth: @braintrust_auth, sse: sse)
|
|
29
|
+
ensure
|
|
30
|
+
response.stream.close
|
|
31
|
+
end
|
|
32
|
+
end
|
|
33
|
+
end
|
|
34
|
+
end
|
|
35
|
+
end
|
|
36
|
+
end
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "rails/generators"
|
|
4
|
+
|
|
5
|
+
module Braintrust
|
|
6
|
+
module Contrib
|
|
7
|
+
module Rails
|
|
8
|
+
module Server
|
|
9
|
+
module Generators
|
|
10
|
+
class ServerGenerator < ::Rails::Generators::Base
|
|
11
|
+
namespace "braintrust:server"
|
|
12
|
+
source_root File.expand_path("templates", __dir__)
|
|
13
|
+
|
|
14
|
+
def create_initializer
|
|
15
|
+
@evaluators = discovered_evaluators
|
|
16
|
+
template "initializer.rb.tt", "config/initializers/braintrust_server.rb"
|
|
17
|
+
end
|
|
18
|
+
|
|
19
|
+
private
|
|
20
|
+
|
|
21
|
+
def discovered_evaluators
|
|
22
|
+
evaluator_roots.flat_map do |root|
|
|
23
|
+
Dir[File.join(destination_root, root, "**/*.rb")].sort.map do |file|
|
|
24
|
+
relative_path = file.delete_prefix("#{File.join(destination_root, root)}/").sub(/\.rb\z/, "")
|
|
25
|
+
{
|
|
26
|
+
class_name: relative_path.split("/").map(&:camelize).join("::"),
|
|
27
|
+
slug: relative_path.tr("/", "-").tr("_", "-")
|
|
28
|
+
}
|
|
29
|
+
end
|
|
30
|
+
end
|
|
31
|
+
end
|
|
32
|
+
|
|
33
|
+
def evaluator_roots
|
|
34
|
+
%w[app/evaluators evaluators].select do |root|
|
|
35
|
+
Dir.exist?(File.join(destination_root, root))
|
|
36
|
+
end
|
|
37
|
+
end
|
|
38
|
+
end
|
|
39
|
+
end
|
|
40
|
+
end
|
|
41
|
+
end
|
|
42
|
+
end
|
|
43
|
+
end
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Braintrust
|
|
4
|
+
module Contrib
|
|
5
|
+
module Rails
|
|
6
|
+
module Server
|
|
7
|
+
class ListController < ApplicationController
|
|
8
|
+
def show
|
|
9
|
+
result = Engine.list_service.call
|
|
10
|
+
render json: result
|
|
11
|
+
end
|
|
12
|
+
end
|
|
13
|
+
end
|
|
14
|
+
end
|
|
15
|
+
end
|
|
16
|
+
end
|