braintrust 0.1.4 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,7 +1,9 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require_relative "eval/scorer"
4
+ require_relative "eval/evaluator"
4
5
  require_relative "eval/runner"
6
+ require_relative "eval/functions"
5
7
  require_relative "api/internal/projects"
6
8
  require_relative "api/internal/experiments"
7
9
  require_relative "dataset"
@@ -186,14 +188,17 @@ module Braintrust
186
188
  end
187
189
 
188
190
  # Run an evaluation
189
- # @param project [String] The project name
190
- # @param experiment [String] The experiment name
191
+ # @param project [String, nil] The project name (triggers full API mode: creates project + experiment)
192
+ # @param experiment [String, nil] The experiment name
191
193
  # @param cases [Array, Enumerable, nil] The test cases (mutually exclusive with dataset)
192
194
  # @param dataset [String, Hash, nil] Dataset to fetch (mutually exclusive with cases)
193
195
  # - String: dataset name (fetches from same project)
194
196
  # - Hash: {name:, id:, project:, version:, limit:}
195
197
  # @param task [#call] The task to evaluate (must be callable)
196
198
  # @param scorers [Array<Scorer, #call>] The scorers to use (Scorer objects or callables)
199
+ # @param on_progress [#call, nil] Optional callback fired after each test case.
200
+ # Receives a Hash: {"data" => output, "scores" => {name => value}} on success,
201
+ # or {"error" => message} on failure.
197
202
  # @param parallelism [Integer] Number of parallel workers (default: 1).
198
203
  # When parallelism > 1, test cases are executed concurrently using a thread pool.
199
204
  # The task and scorers MUST be thread-safe when using parallelism > 1.
@@ -201,53 +206,45 @@ module Braintrust
201
206
  # @param metadata [Hash] Optional experiment metadata
202
207
  # @param update [Boolean] If true, allow reusing existing experiment (default: false)
203
208
  # @param quiet [Boolean] If true, suppress result output (default: false)
204
- # @param api [API, nil] Braintrust API client (defaults to API.new using global state)
209
+ # @param state [State, nil] Braintrust state (defaults to global state)
205
210
  # @param tracer_provider [TracerProvider, nil] OpenTelemetry tracer provider (defaults to global)
206
211
  # @return [Result]
207
- def run(project:, experiment:, task:, scorers:,
208
- cases: nil, dataset: nil,
212
+ def run(task:, scorers:, project: nil, experiment: nil,
213
+ cases: nil, dataset: nil, on_progress: nil,
209
214
  parallelism: 1, tags: nil, metadata: nil, update: false, quiet: false,
210
- api: nil, tracer_provider: nil)
215
+ state: nil, tracer_provider: nil, project_id: nil, parent: nil)
211
216
  # Validate required parameters
212
- validate_params!(project: project, experiment: experiment,
213
- cases: cases, dataset: dataset, task: task, scorers: scorers)
217
+ validate_params!(task: task, scorers: scorers, cases: cases, dataset: dataset)
214
218
 
215
- # Get API from parameter or create from global state
216
- api ||= API.new
219
+ # Resolve any ScorerId entries to real Scorer objects
220
+ scorers = resolve_scorers(scorers, state: state, tracer_provider: tracer_provider)
217
221
 
218
- # Ensure logged in (to populate org_name, etc.)
219
- # login is idempotent and returns early if already logged in
220
- api.login
222
+ experiment_id = nil
223
+ project_name = project
221
224
 
222
- # Resolve dataset to cases if dataset parameter provided
223
- dataset_id = nil
224
- dataset_version = nil
225
+ # Full API mode: project name or project_id provided, resolve via API
226
+ if project || project_id
227
+ state ||= Braintrust.current_state
228
+ state.login
225
229
 
226
- if dataset
227
- resolved = resolve_dataset(dataset, project, api)
228
- cases = resolved[:cases]
229
- dataset_id = resolved[:dataset_id]
230
- dataset_version = resolved[:dataset_version]
231
- end
232
-
233
- # Register project and experiment via internal API
234
- projects_api = API::Internal::Projects.new(api.state)
235
- experiments_api = API::Internal::Experiments.new(api.state)
230
+ if dataset
231
+ resolved = resolve_dataset(dataset, project, state)
232
+ cases = resolved[:cases]
233
+ end
236
234
 
237
- project_result = projects_api.create(name: project)
238
- experiment_result = experiments_api.create(
239
- name: experiment,
240
- project_id: project_result["id"],
241
- ensure_new: !update,
242
- tags: tags,
243
- metadata: metadata,
244
- dataset_id: dataset_id,
245
- dataset_version: dataset_version
246
- )
247
-
248
- experiment_id = experiment_result["id"]
249
- project_id = project_result["id"]
250
- project_name = project_result["name"]
235
+ # Skip experiment creation for remote evals (parent present).
236
+ # The OTLP backend creates experiments from ingested spans.
237
+ unless parent
238
+ project_id, project_name = resolve_project(state, project, project_id)
239
+ experiment_id = create_experiment(
240
+ state, experiment, project_id,
241
+ update: update, tags: tags, metadata: metadata,
242
+ dataset_id: resolved&.dig(:dataset_id),
243
+ dataset_version: resolved&.dig(:dataset_version)
244
+ )
245
+ parent = {object_type: "experiment_id", object_id: experiment_id}
246
+ end
247
+ end
251
248
 
252
249
  # Instantiate Runner and run evaluation
253
250
  runner = Runner.new(
@@ -257,8 +254,10 @@ module Braintrust
257
254
  project_name: project_name,
258
255
  task: task,
259
256
  scorers: scorers,
260
- api: api,
261
- tracer_provider: tracer_provider
257
+ state: state,
258
+ tracer_provider: tracer_provider,
259
+ on_progress: on_progress,
260
+ parent: parent
262
261
  )
263
262
  result = runner.run(cases, parallelism: parallelism)
264
263
 
@@ -276,11 +275,29 @@ module Braintrust
276
275
  puts result.to_pretty
277
276
  end
278
277
 
278
+ # Resolve scorers array: ScorerId entries become real Scorer objects, others pass through
279
+ # @param scorers [Array] Scorers (Scorer, callable, or ScorerId)
280
+ # @param state [State, nil] Braintrust state (required for ScorerId resolution)
281
+ # @param tracer_provider [TracerProvider, nil] OpenTelemetry tracer provider
282
+ # @return [Array<Scorer, #call>] Resolved scorers
283
def resolve_scorers(scorers, state: nil, tracer_provider: nil)
  scorers.map do |entry|
    # Anything that is not a ScorerId reference is handed back untouched.
    next entry unless entry.is_a?(ScorerId)

    # ScorerId entries are swapped for whatever Functions.scorer_by_id returns.
    Functions.scorer_by_id(
      id: entry.function_id,
      version: entry.version,
      state: state,
      tracer_provider: tracer_provider
    )
  end
end
297
+
279
298
  # Validate required parameters
280
299
  # @raise [ArgumentError] if validation fails
281
- def validate_params!(project:, experiment:, cases:, dataset:, task:, scorers:)
282
- raise ArgumentError, "project is required" unless project
283
- raise ArgumentError, "experiment is required" unless experiment
300
+ def validate_params!(task:, scorers:, cases:, dataset:)
284
301
  raise ArgumentError, "task is required" unless task
285
302
  raise ArgumentError, "scorers is required" unless scorers
286
303
 
@@ -300,27 +317,57 @@ module Braintrust
300
317
  end
301
318
  end
302
319
 
320
+ # Resolve project by name or ID. Creates if needed.
321
+ # @return [Array(String, String)] [project_id, project_name]
322
def resolve_project(state, project, project_id)
  # A caller-supplied ID skips the API call; the name is passed through as given
  # (it may be nil when only an ID was provided).
  return [project_id, project] if project_id

  # No ID: ask the projects API to create the project by name and use what it reports back.
  record = API::Internal::Projects.new(state).create(name: project)
  [record["id"], record["name"]]
end
330
+
331
+ # Create an experiment in the given project.
332
+ # @return [String] experiment_id
333
def create_experiment(state, name, project_id,
  update: false, tags: nil, metadata: nil,
  dataset_id: nil, dataset_version: nil)
  experiments = API::Internal::Experiments.new(state)
  attributes = {
    name: name,
    project_id: project_id,
    # ensure_new is simply the negation of the caller's update flag.
    ensure_new: !update,
    tags: tags,
    metadata: metadata,
    dataset_id: dataset_id,
    dataset_version: dataset_version
  }

  # Only the "id" field of the API response is returned to the caller.
  experiments.create(**attributes)["id"]
end
347
+
303
348
  # Resolve dataset parameter to cases with metadata for experiment linking
304
349
  # @param dataset [String, Hash, Dataset] Dataset specifier or instance
305
350
  # @param project [String] Project name (used as default if not specified)
306
- # @param api [API] Braintrust API client
351
+ # @param state [State] Braintrust state
307
352
  # @return [Hash] Hash with :cases, :dataset_id, and :dataset_version
308
- def resolve_dataset(dataset, project, api)
353
+ def resolve_dataset(dataset, project, state)
309
354
  limit = nil
310
355
 
311
356
  dataset_obj = case dataset
312
357
  when Dataset
313
358
  dataset
359
+ when DatasetId
360
+ Dataset.new(id: dataset.id, state: state)
314
361
  when String
315
- Dataset.new(name: dataset, project: project, api: api)
362
+ Dataset.new(name: dataset, project: project, state: state)
316
363
  when Hash
317
364
  opts = dataset.dup
318
365
  limit = opts.delete(:limit)
319
366
  opts[:project] ||= project
320
- opts[:api] = api
367
+ opts[:state] = state
321
368
  Dataset.new(**opts)
322
369
  else
323
- raise ArgumentError, "dataset must be String, Hash, or Dataset, got #{dataset.class}"
370
+ raise ArgumentError, "dataset must be String, Hash, Dataset, or DatasetId, got #{dataset.class}"
324
371
  end
325
372
 
326
373
  cases = dataset_obj.fetch_all(limit: limit)
@@ -0,0 +1,68 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "net/http"
4
+ require "json"
5
+
6
module Braintrust
  module Server
    module Auth
      # Authentication strategy for Clerk session tokens.
      #
      # The token is read from the Authorization header as a bearer credential
      # and validated by POSTing it to LOGIN_PATH on the configured app server.
      class ClerkToken
        DEFAULT_APP_URL = "https://www.braintrust.dev"
        RACK_AUTH_HEADER = "HTTP_AUTHORIZATION"
        RACK_ORG_NAME_HEADER = "HTTP_X_BT_ORG_NAME"
        # The auth scheme is case-insensitive (RFC 7235), so "bearer" and
        # "BEARER" are accepted too; one or more spaces may separate it from
        # the token.
        BEARER_PATTERN = /\ABearer +(.+)\z/i
        LOGIN_PATH = "/api/apikey/login"

        # @param app_url [String, nil] Base URL of the app server
        #   (DEFAULT_APP_URL when nil)
        def initialize(app_url: nil)
          @app_url = app_url || DEFAULT_APP_URL
        end

        # Authenticate a Rack request.
        # @param env [Hash] Rack environment
        # @return [Hash, nil] auth context with "api_key", "org_id", "org_name",
        #   "app_url" and "api_url" keys, or nil when the header is missing,
        #   is not a bearer credential, or the token fails validation
        def authenticate(env)
          token = extract_bearer_token(env)
          return nil unless token

          login_response = validate_token(token)
          return nil unless login_response

          # An explicit org-name header takes precedence over the login response.
          org_name = env[RACK_ORG_NAME_HEADER]

          {
            "api_key" => token,
            "org_id" => login_response["org_id"],
            "org_name" => org_name || login_response["org_name"],
            "app_url" => @app_url,
            "api_url" => login_response["api_url"] || @app_url
          }
        end

        private

        # @return [String, nil] the bearer token, or nil when absent/malformed
        def extract_bearer_token(env)
          header = env[RACK_AUTH_HEADER]
          return nil unless header
          header[BEARER_PATTERN, 1]
        end

        # POST the token to the login endpoint.
        # Deliberately best-effort: any failure (network error, non-200 status,
        # unusable body) is reported as nil so the caller rejects the request.
        # @return [Hash, nil] decoded login response
        def validate_token(token)
          uri = URI("#{@app_url}#{LOGIN_PATH}")
          http = Net::HTTP.new(uri.host, uri.port)
          http.use_ssl = uri.scheme == "https"

          request = Net::HTTP::Post.new(uri)
          request["Content-Type"] = "application/json"
          request.body = JSON.dump({token: token})

          response = http.request(request)
          return nil unless response.code == "200"

          parse_login_body(response.body)
        rescue StandardError
          nil
        end

        # Decode a login response body. Only a JSON object is usable, because
        # #authenticate indexes the result by string keys; arrays, scalars and
        # unparseable bodies are treated as a failed validation.
        # @return [Hash, nil]
        def parse_login_body(raw)
          parsed = JSON.parse(raw)
          parsed.is_a?(Hash) ? parsed : nil
        rescue JSON::ParserError, TypeError
          nil
        end
      end
    end
  end
end
@@ -0,0 +1,14 @@
1
+ # frozen_string_literal: true
2
+
3
module Braintrust
  module Server
    module Auth
      # Pass-through strategy intended for tests and local development:
      # every request is treated as authenticated.
      class NoAuth
        # @param _env [Hash] Rack environment (never inspected)
        # @return [true] unconditionally
        def authenticate(_env)
          # No credential check is performed and no auth context Hash is produced.
          true
        end
      end
    end
  end
end
@@ -0,0 +1,217 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "json"
4
+
5
module Braintrust
  module Server
    module Handlers
      # POST /eval — adapter that maps an HTTP request onto Evaluator#run and
      # streams the results back as server-sent events.
      # Handles auth passthrough, datasets, remote scorers, project_id, and parent.
      class Eval
        # Keys of the request's "data" object that each name a data source.
        DATA_SOURCE_KEYS = %w[data dataset_name dataset_id].freeze
        # Maximum number of State objects kept in the per-handler cache.
        STATE_CACHE_LIMIT = 64

        # @param evaluators [Hash] evaluators keyed by name
        def initialize(evaluators)
          @evaluators = evaluators
          # Built eagerly: creating the mutex lazily inside a request would let
          # two concurrent first requests each construct their own lock.
          @state_mutex = Mutex.new
          @state_cache = {}
        end

        # Rack entry point.
        # @param env [Hash] Rack environment
        # @return [Array] Rack response triple; a JSON error body for 400/404,
        #   otherwise a 200 whose body streams SSE events
        def call(env)
          body = parse_body(env)
          return error_response(400, "Invalid JSON body") unless body

          name = body["name"]
          return error_response(400, "Missing required field: name") unless name

          evaluator = @evaluators[name]
          return error_response(404, "Evaluator '#{name}' not found") unless evaluator

          data = body["data"]
          return error_response(400, "Missing required field: data") unless data
          return error_response(400, "Field 'data' must be an object") unless data.is_a?(Hash)

          # Validate exactly one data source
          data_sources = DATA_SOURCE_KEYS.count { |k| data.key?(k) }
          return error_response(400, "Exactly one data source required") if data_sources != 1
          unless valid_inline_rows?(data)
            return error_response(400, "Field 'data.data' must be an array of objects")
          end

          experiment_name = body["experiment_name"]

          # Resolve data source
          cases, dataset = resolve_data_source(data)

          # Resolve remote scorers from request
          remote_scorer_ids = resolve_remote_scorers(body["scores"])

          # Resolve parent span context
          parent = resolve_parent(body["parent"])

          # Build state from auth context (if present)
          state = build_state(env)

          # The protocol-rack adapter (used by Falcon and any server built on
          # protocol-http) buffers `each`-based bodies through an Enumerable path.
          # Detect it via the "protocol.http.request" env key it injects, and use
          # SSEStreamBody (call-only) so it dispatches through the Streaming path.
          body_class = env.key?("protocol.http.request") ? SSEStreamBody : SSEBody

          sse_body = body_class.new do |sse|
            run_opts = {
              on_progress: progress_callback(sse, name),
              quiet: true
            }
            run_opts[:parent] = parent if parent
            run_opts[:scorers] = remote_scorer_ids if remote_scorer_ids
            run_opts[:dataset] = dataset if dataset

            # Only pass state/experiment/project params when state is available
            if state
              run_opts[:state] = state
              run_opts[:experiment] = experiment_name if experiment_name
              run_opts[:project_id] = body["project_id"] if body["project_id"]
            end

            result = evaluator.run(cases, **run_opts)

            # Flush buffered OTLP spans before sending completion events.
            # The BatchSpanProcessor exports every ~5s; fast evals can finish
            # before a single export fires, causing the UI to see no results.
            Braintrust::Trace.flush_spans

            # Build summary from result scores
            averaged_scores = {}
            result.scorer_stats.each do |scorer_name, stats|
              averaged_scores[scorer_name] = stats.score_mean
            end

            sse.event("summary", JSON.dump({
              "scores" => averaged_scores,
              "experiment_name" => experiment_name,
              "experiment_id" => result.experiment_id,
              "project_id" => result.project_id
            }))

            sse.event("done", "")
          end

          [200, {"content-type" => "text/event-stream", "cache-control" => "no-cache", "connection" => "keep-alive"}, sse_body]
        end

        private

        # Build the on_progress callback that turns the runner's generic
        # progress Hash into remote eval protocol events.
        # Runner provides: id, data/error, scores (optional), origin (optional).
        # Protocol requires: id, object_type, origin, name, format, output_type, event, data.
        # @param sse [#event] SSE writer yielded by the body class
        # @param name [String] evaluator name from the request
        # @return [Proc] lambda taking one progress Hash
        def progress_callback(sse, name)
          lambda do |progress_data|
            base = {
              "object_type" => "task",
              "name" => name,
              "format" => "code",
              "output_type" => "completion"
            }
            base["id"] = progress_data["id"] if progress_data["id"]
            base["origin"] = progress_data["origin"] if progress_data["origin"]

            if progress_data.key?("error")
              sse.event("progress", JSON.dump(base.merge("event" => "error", "data" => progress_data["error"])))
            else
              # The payload is JSON-encoded twice: "data" carries a JSON string.
              sse.event("progress", JSON.dump(base.merge("event" => "json_delta", "data" => JSON.dump(progress_data["data"]))))
            end

            # Signal per-cell completion so the UI exits "Streaming..." state
            # and updates the progress bar immediately.
            sse.event("progress", JSON.dump(base.merge("event" => "done", "data" => "")))
          end
        end

        # Inline rows, when present, must be an Array of Hashes because
        # #resolve_data_source maps over them and reads "input"/"expected".
        # @return [Boolean] true when no inline rows are given or they are well-formed
        def valid_inline_rows?(data)
          return true unless data.key?("data")
          rows = data["data"]
          rows.is_a?(Array) && rows.all? { |row| row.is_a?(Hash) }
        end

        # Resolve data source from the data field.
        # Returns [cases, dataset] where exactly one is non-nil.
        def resolve_data_source(data)
          if data.key?("data")
            cases = data["data"].map do |d|
              {input: d["input"], expected: d["expected"]}
            end
            [cases, nil]
          elsif data.key?("dataset_id")
            [nil, Braintrust::DatasetId.new(id: data["dataset_id"])]
          elsif data.key?("dataset_name")
            dataset_opts = {name: data["dataset_name"]}
            dataset_opts[:project] = data["project_name"] if data["project_name"]
            [nil, dataset_opts]
          else
            [nil, nil]
          end
        end

        # Map request scores array to ScorerId structs.
        # The UI sends function_id as a nested object: {"function_id": "uuid"}.
        # @return [Array<ScorerId>, nil] nil when no scores were requested
        def resolve_remote_scorers(scores)
          return nil if scores.nil? || scores.empty?
          scores.map do |s|
            func_id = s["function_id"]
            func_id = func_id["function_id"] if func_id.is_a?(Hash)
            Braintrust::ScorerId.new(
              function_id: func_id,
              version: s["version"]
            )
          end
        end

        # Map request parent to symbol-keyed Hash.
        # Hardcode playground_id to match Java SDK behavior.
        # Also extracts generation from propagated_event for span_attributes.
        # @return [Hash, nil] nil when parent is not a Hash or has no object_id
        def resolve_parent(parent)
          return nil unless parent.is_a?(Hash)
          object_id = parent["object_id"]
          return nil unless object_id

          generation = parent.dig("propagated_event", "span_attributes", "generation")

          result = {object_type: "playground_id", object_id: object_id}
          result[:generation] = generation if generation
          result
        end

        # Build State from auth context set by Auth middleware.
        # Returns nil when no auth context is present (e.g. NoAuth strategy).
        # States are kept in an LRU cache (at most STATE_CACHE_LIMIT entries)
        # keyed by [api_key, app_url, org_name].
        def build_state(env)
          auth = env["braintrust.auth"]
          return nil unless auth.is_a?(Hash)

          cache_key = [auth["api_key"], auth["app_url"], auth["org_name"]]

          @state_mutex.synchronize do
            # Hash preserves insertion order, so delete + re-insert moves a hit
            # to the "most recently used" end.
            if (cached = @state_cache.delete(cache_key))
              @state_cache[cache_key] = cached
              return cached
            end

            state = Braintrust::State.new(
              api_key: auth["api_key"],
              org_id: auth["org_id"],
              org_name: auth["org_name"],
              app_url: auth["app_url"],
              api_url: auth["api_url"],
              enable_tracing: false
            )

            # Evict the least recently used entry (front of the Hash) if full
            @state_cache.shift if @state_cache.size >= STATE_CACHE_LIMIT

            @state_cache[cache_key] = state
            state
          end
        end

        # Read and decode the request body.
        # @return [Hash, nil] nil when the body is missing, empty, not valid
        #   JSON, or valid JSON that is not an object (the handler indexes the
        #   body by string keys, which would raise on arrays and scalars)
        def parse_body(env)
          body = env["rack.input"]&.read
          return nil if body.nil? || body.empty?
          parsed = JSON.parse(body)
          parsed.is_a?(Hash) ? parsed : nil
        rescue JSON::ParserError
          nil
        end

        # @return [Array] Rack response triple with a JSON {"error" => message} body
        def error_response(status, message)
          [status, {"content-type" => "application/json"},
            [JSON.dump({"error" => message})]]
        end
      end
    end
  end
end
@@ -0,0 +1,16 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "json"
4
+
5
module Braintrust
  module Server
    module Handlers
      # GET / — liveness probe that always answers 200 with {"status": "ok"}.
      class Health
        # @param _env [Hash] Rack environment (ignored)
        # @return [Array] Rack response triple with a JSON body
        def call(_env)
          payload = JSON.dump({"status" => "ok"})
          headers = {"content-type" => "application/json"}
          [200, headers, [payload]]
        end
      end
    end
  end
end
@@ -0,0 +1,74 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "json"
4
+
5
module Braintrust
  module Server
    module Handlers
      # GET/POST /list — describes every registered evaluator, keyed by name.
      #
      # Response shape (Braintrust dev server protocol):
      #   {
      #     "evaluator-name": {
      #       "parameters": {              # omitted when the evaluator has none
      #         "type": "braintrust.staticParameters",
      #         "schema": {
      #           "param_name": { "type": "data", "schema": {...}, "default": ..., "description": ... }
      #         },
      #         "source": null
      #       },
      #       "scores": [{ "name": "scorer_name" }, ...]
      #     }
      #   }
      class List
        # @param evaluators [Hash] evaluators keyed by name
        def initialize(evaluators)
          @evaluators = evaluators
        end

        # @param _env [Hash] Rack environment (ignored)
        # @return [Array] Rack response triple with the JSON listing as body
        def call(_env)
          listing = {}
          @evaluators.each do |name, evaluator|
            listing[name] = describe(evaluator)
          end

          payload = JSON.dump(listing)
          [200, {"content-type" => "application/json"}, [payload]]
        end

        private

        # Build the listing entry for a single evaluator.
        # "parameters" is only included when there is something to report.
        def describe(evaluator)
          entry = {"scores" => score_entries(evaluator.scorers)}
          params = serialize_parameters(evaluator.parameters)
          params ? entry.merge("parameters" => params) : entry
        end

        # One {"name" => ...} Hash per scorer. Scorers without a #name method
        # get a positional placeholder ("score_<index>").
        def score_entries(scorers)
          (scorers || []).map.with_index do |scorer, position|
            label = scorer.respond_to?(:name) ? scorer.name : "score_#{position}"
            {"name" => label}
          end
        end

        # Convert user-defined parameters to the dev server protocol format:
        # a staticParameters container whose schema holds "data" typed entries.
        # Specs that are not Hashes are left out of the schema.
        # @return [Hash, nil] nil when there are no parameters
        def serialize_parameters(parameters)
          return nil if !parameters || parameters.empty?

          schema = parameters.each_with_object({}) do |(param_name, spec), acc|
            next unless spec.is_a?(Hash)

            # Accept symbol- or string-keyed specs alike.
            normalized = spec.transform_keys(&:to_s)
            acc[param_name.to_s] = {
              "type" => "data",
              "schema" => {"type" => normalized["type"] || "string"},
              "default" => normalized["default"],
              "description" => normalized["description"]
            }
          end

          {
            "type" => "braintrust.staticParameters",
            "schema" => schema,
            "source" => nil
          }
        end
      end
    end
  end
end
@@ -0,0 +1,29 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "json"
4
+
5
module Braintrust
  module Server
    module Middleware
      # Rack middleware that gates requests behind a pluggable auth strategy.
      # On success the strategy's result is stored in env["braintrust.auth"]
      # and the request continues down the stack; otherwise a 401 is returned.
      class Auth
        # @param app [#call] downstream Rack application
        # @param strategy [#authenticate] object whose #authenticate(env)
        #   returns a truthy value on success and nil/false on failure
        def initialize(app, strategy:)
          @app = app
          @strategy = strategy
        end

        # @param env [Hash] Rack environment
        # @return [Array] Rack response triple
        def call(env)
          identity = @strategy.authenticate(env)
          return unauthorized unless identity

          env["braintrust.auth"] = identity
          @app.call(env)
        end

        private

        # JSON 401 response used whenever the strategy rejects a request.
        def unauthorized
          payload = JSON.dump({"error" => "Unauthorized"})
          [401, {"content-type" => "application/json"}, [payload]]
        end
      end
    end
  end
end