braintrust 0.2.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: d67e6d0faeb24297af8a5f43ac1bd1ceacff1f37df2610244ae5f81e34c4ae5f
4
- data.tar.gz: 489ec68fee424aa8aa1880b73b58f1f26529493d8898cd0ae5876d3b919fcb7c
3
+ metadata.gz: 747b190f21c7de342f85390f8a51b17628e23fa2436776989a3ebe637bf9d596
4
+ data.tar.gz: 1e6c0c59c9ce56d499a04d8424506c56e2c2ad359506a6d5175c7173dc4ab238
5
5
  SHA512:
6
- metadata.gz: cd876122ad92c5439ff45e975fd84418bfcc7d72d6f9398e48b1ac4c60f09fb96c2b85b46ee1c8de6a75291c0b7d2754ee2fa069f77f8a2f8a4c069132c59d94
7
- data.tar.gz: 45d3f80f69ac9725d93aa0db24815da093bfd992b5418f8551c8d25e8caef9299f270a92fa922a4bc4bf3190d9f823a35c7203f9a74bd58daee31869b987f103
6
+ metadata.gz: 3f652583ec04f5b874e3417db4cc0dff7f43341eeffe686466b8caad5614ed336e8580ac7533ef100726f09cdb264900e0f454edd11328e611513ffc8f77d3cb
7
+ data.tar.gz: 3316d0cb4ccc77e2d0c0ae48c033b6f5c026237d85c85e75e139434023f713820c3790a98090dac636ae6d44127692279404bcd3ab88b0d50a3de3127d38e3a6
data/README.md CHANGED
@@ -252,9 +252,9 @@ Braintrust::Eval.run(
252
252
  {input: "apple", expected: "fruit"},
253
253
  {input: "carrot", expected: "vegetable"}
254
254
  ],
255
- task: ->(input) { classify(input) },
255
+ task: ->(input:) { classify(input) },
256
256
  scorers: [
257
- ->(input, expected, output) { output == expected ? 1.0 : 0.0 }
257
+ ->(expected:, output:) { output == expected ? 1.0 : 0.0 }
258
258
  ]
259
259
  )
260
260
  ```
@@ -267,7 +267,7 @@ Use test cases from a Braintrust dataset:
267
267
  Braintrust::Eval.run(
268
268
  project: "my-project",
269
269
  dataset: "my-dataset",
270
- task: ->(input) { classify(input) },
270
+ task: ->(input:) { classify(input) },
271
271
  scorers: [...]
272
272
  )
273
273
  ```
@@ -282,7 +282,7 @@ Braintrust::Eval.run(
282
282
  {input: "apple", expected: "fruit", tags: ["produce"], metadata: {difficulty: "easy"}},
283
283
  {input: "salmon", expected: "protein", tags: ["seafood"], metadata: {difficulty: "medium"}}
284
284
  ],
285
- task: ->(input) { classify(input) },
285
+ task: ->(input:) { classify(input) },
286
286
  scorers: [...]
287
287
  )
288
288
  ```
@@ -295,29 +295,56 @@ Use scoring functions defined in Braintrust:
295
295
  Braintrust::Eval.run(
296
296
  project: "my-project",
297
297
  cases: [...],
298
- task: ->(input) { ... },
298
+ task: ->(input:) { ... },
299
+ scorers: ["accuracy-scorer"]
300
+ )
301
+ ```
302
+
303
+ Or define scorers inline with `Scorer.new`:
304
+
305
+ ```ruby
306
+ Braintrust::Eval.run(
307
+ project: "my-project",
308
+ cases: [...],
309
+ task: ->(input:) { ... },
299
310
  scorers: [
300
- Braintrust::Eval::Functions.scorer(project: "my-project", slug: "accuracy-scorer")
311
+ Braintrust::Scorer.new("exact_match") do |expected:, output:|
312
+ output == expected ? 1.0 : 0.0
313
+ end
301
314
  ]
302
315
  )
303
316
  ```
304
317
 
305
- Or define scorers inline with `Eval.scorer`:
318
+ #### Trace scoring
319
+
320
+ Scorers can access the full evaluation trace (all spans generated by the task) by declaring a `trace:` keyword parameter. This is useful for inspecting intermediate LLM calls, validating tool usage, or checking the message thread:
306
321
 
307
322
  ```ruby
308
323
  Braintrust::Eval.run(
309
324
  project: "my-project",
310
- cases: [...],
311
- task: ->(input) { ... },
325
+ cases: [{input: "What is 2+2?", expected: "4"}],
326
+ task: Braintrust::Task.new { |input:| my_llm_pipeline(input) },
312
327
  scorers: [
313
- Braintrust::Eval.scorer("exact_match") do |input, expected, output|
328
+ # Access the full trace to inspect LLM spans
329
+ Braintrust::Scorer.new("uses_system_prompt") do |output:, trace:|
330
+ messages = trace.thread # reconstructed message thread from LLM spans
331
+ messages.any? { |m| m["role"] == "system" } ? 1.0 : 0.0
332
+ end,
333
+
334
+ # Filter spans by type
335
+ Braintrust::Scorer.new("single_llm_call") do |output:, trace:|
336
+ trace.spans(span_type: "llm").length == 1 ? 1.0 : 0.0
337
+ end,
338
+
339
+ # Scorers that do not declare `trace:` still work — the argument is filtered out automatically
340
+ Braintrust::Scorer.new("exact_match") do |output:, expected:|
314
341
  output == expected ? 1.0 : 0.0
315
342
  end
316
343
  ]
317
344
  )
318
345
  ```
319
346
 
320
- See examples: [eval.rb](./examples/eval.rb), [dataset.rb](./examples/eval/dataset.rb), [remote_functions.rb](./examples/eval/remote_functions.rb)
347
+ See examples: [eval.rb](./examples/eval.rb), [dataset.rb](./examples/eval/dataset.rb), [remote_functions.rb](./examples/eval/remote_functions.rb), [trace_scoring.rb](./examples/eval/trace_scoring.rb)
321
348
 
322
349
  ### Dev Server
323
350
 
@@ -330,9 +357,9 @@ require "braintrust/server"
330
357
 
331
358
  # Define evaluators — these can reference your application code (models, services, etc.)
332
359
  food_classifier = Braintrust::Eval::Evaluator.new(
333
- task: ->(input) { FoodClassifier.classify(input) },
360
+ task: ->(input:) { FoodClassifier.classify(input) },
334
361
  scorers: [
335
- Braintrust::Eval.scorer("exact_match") { |input, expected, output| output == expected ? 1.0 : 0.0 }
362
+ Braintrust::Scorer.new("exact_match") { |expected:, output:| output == expected ? 1.0 : 0.0 }
336
363
  ]
337
364
  )
338
365
 
@@ -358,11 +385,11 @@ Evaluators can also be defined as subclasses:
358
385
  ```ruby
359
386
  class FoodClassifier < Braintrust::Eval::Evaluator
360
387
  def task
361
- ->(input) { classify(input) }
388
+ ->(input:) { classify(input) }
362
389
  end
363
390
 
364
391
  def scorers
365
- [Braintrust::Eval.scorer("exact_match") { |i, e, o| o == e ? 1.0 : 0.0 }]
392
+ [Braintrust::Scorer.new("exact_match") { |expected:, output:| output == expected ? 1.0 : 0.0 }]
366
393
  end
367
394
  end
368
395
  ```
@@ -0,0 +1,124 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "net/http"
4
+ require "json"
5
+ require "uri"
6
+ require_relative "../../internal/http"
7
+
8
+ module Braintrust
9
+ class API
10
+ module Internal
11
+ # Internal BTQL client for querying spans.
12
+ # Not part of the public API — instantiated directly where needed.
13
+ class BTQL
14
+ # Maximum number of retries before returning whatever the last query produced (possibly partial or empty).
15
+ # Covers both freshness lag (partially indexed) and ingestion lag
16
+ # (spans not yet visible to BTQL after OTel flush).
17
+ MAX_FRESHNESS_RETRIES = 7
18
+
19
+ # Base delay (seconds) between retries (doubles each attempt, capped).
20
+ FRESHNESS_BASE_DELAY = 1.0
21
+
22
+ # Maximum delay (seconds) between retries. Caps exponential growth
23
+ # so we keep polling at a reasonable rate in the later window.
24
+ # Schedule: 1, 2, 4, 8, 8, 8, 8 = ~39s total worst-case.
25
+ MAX_FRESHNESS_DELAY = 8.0
26
+
27
+ def initialize(state)
28
+ @state = state
29
+ end
30
+
31
+ # Query spans belonging to a specific trace within an object.
32
+ #
33
+ # Builds a BTQL SQL query that matches the root_span_id and excludes scorer spans.
34
+ # Retries with capped exponential backoff while the response is not yet fresh or contains no rows.
35
+ #
36
+ # @param object_type [String] e.g. "experiment"
37
+ # @param object_id [String] Object UUID
38
+ # @param root_span_id [String] Hex trace ID of the root span
39
+ # @return [Array<Hash>] Parsed span data; an empty array if no spans were found or the query failed (failures are logged as a warning)
40
+ def trace_spans(object_type:, object_id:, root_span_id:)
41
+ query = build_trace_query(
42
+ object_type: object_type,
43
+ object_id: object_id,
44
+ root_span_id: root_span_id
45
+ )
46
+ payload = {query: query, fmt: "jsonl"}
47
+
48
+ retries = 0
49
+ loop do
50
+ rows, freshness = execute_query(payload)
51
+ # Return when data is fresh AND non-empty, or we've exhausted retries.
52
+ # We retry on empty even when "complete" because there is ingestion lag
53
+ # between OTel flush and BTQL indexing — the server may report "complete"
54
+ # before it knows about newly-flushed spans.
55
+ return rows if (freshness == "complete" && !rows.empty?) || retries >= MAX_FRESHNESS_RETRIES
56
+
57
+ retries += 1
58
+ delay = [FRESHNESS_BASE_DELAY * (2**(retries - 1)), MAX_FRESHNESS_DELAY].min
59
+ sleep(delay)
60
+ end
61
+ rescue => e
62
+ Braintrust::Log.warn("[BTQL] Query failed: #{e.message}")
63
+ []
64
+ end
65
+
66
+ private
67
+
68
+ # Build a BTQL SQL query string for fetching trace spans.
69
+ #
70
+ # Selects spans for a given root_span_id (at most 1000 rows), excluding scorer spans
71
+ # (span_attributes.type = 'score').
72
+ #
73
+ # @param object_type [String] e.g. "experiment"
74
+ # @param object_id [String] Object UUID
75
+ # @param root_span_id [String] Hex trace ID
76
+ # @return [String] BTQL SQL query
77
+ def build_trace_query(object_type:, object_id:, root_span_id:)
78
+ escaped_root = root_span_id.gsub("'", "''")
79
+ escaped_id = object_id.gsub("'", "''")
80
+
81
+ "SELECT * FROM #{object_type}('#{escaped_id}') " \
82
+ "WHERE root_span_id = '#{escaped_root}' " \
83
+ "AND span_attributes.type != 'score' " \
84
+ "LIMIT 1000"
85
+ end
86
+
87
+ # Execute a BTQL query and parse the JSONL response.
88
+ #
89
+ # @param payload [Hash] BTQL request payload
90
+ # @return [Array(Array<Hash>, String)] [parsed_rows, freshness_state]
91
+ def execute_query(payload)
92
+ uri = URI("#{@state.api_url}/btql")
93
+
94
+ request = Net::HTTP::Post.new(uri)
95
+ request["Content-Type"] = "application/json"
96
+ request["Authorization"] = "Bearer #{@state.api_key}"
97
+ request["Accept"] = "application/x-jsonlines"
98
+ request.body = JSON.dump(payload)
99
+
100
+ response = Braintrust::Internal::Http.with_redirects(uri, request)
101
+
102
+ unless response.is_a?(Net::HTTPSuccess)
103
+ raise Braintrust::Error, "HTTP #{response.code} for POST #{uri}: #{response.body}"
104
+ end
105
+
106
+ freshness = response["x-bt-freshness-state"] || "complete"
107
+ [parse_jsonl(response.body), freshness]
108
+ end
109
+
110
+ # Parse a JSONL response body into an array of hashes.
111
+ #
112
+ # @param body [String] JSONL response body
113
+ # @return [Array<Hash>]
114
+ def parse_jsonl(body)
115
+ body.each_line.filter_map do |line|
116
+ line = line.strip
117
+ next if line.empty?
118
+ JSON.parse(line)
119
+ end
120
+ end
121
+ end
122
+ end
123
+ end
124
+ end
@@ -50,6 +50,25 @@ module Braintrust
50
50
 
51
51
  JSON.parse(response.body)
52
52
  end
53
+
54
+ # Delete an experiment
55
+ # DELETE /v1/experiment/:id
56
+ # @param id [String] Experiment ID
57
+ # @return [Hash] Deleted experiment data
58
+ def delete(id:)
59
+ uri = URI("#{@state.api_url}/v1/experiment/#{id}")
60
+
61
+ request = Net::HTTP::Delete.new(uri)
62
+ request["Authorization"] = "Bearer #{@state.api_key}"
63
+
64
+ response = Braintrust::Internal::Http.with_redirects(uri, request)
65
+
66
+ unless response.is_a?(Net::HTTPSuccess)
67
+ raise Error, "HTTP #{response.code} for DELETE #{uri}: #{response.body}"
68
+ end
69
+
70
+ JSON.parse(response.body)
71
+ end
53
72
  end
54
73
  end
55
74
  end
@@ -35,6 +35,25 @@ module Braintrust
35
35
 
36
36
  JSON.parse(response.body)
37
37
  end
38
+
39
+ # Delete a project
40
+ # DELETE /v1/project/:id
41
+ # @param id [String] Project UUID
42
+ # @return [Hash] Deleted project data
43
+ def delete(id:)
44
+ uri = URI("#{@state.api_url}/v1/project/#{id}")
45
+
46
+ request = Net::HTTP::Delete.new(uri)
47
+ request["Authorization"] = "Bearer #{@state.api_key}"
48
+
49
+ response = Braintrust::Internal::Http.with_redirects(uri, request)
50
+
51
+ unless response.is_a?(Net::HTTPSuccess)
52
+ raise Error, "HTTP #{response.code} for DELETE #{uri}: #{response.body}"
53
+ end
54
+
55
+ JSON.parse(response.body)
56
+ end
38
57
  end
39
58
  end
40
59
  end
@@ -181,9 +181,12 @@ module Braintrust
181
181
  created: raw["created"]
182
182
  )
183
183
  end
184
+
185
+ # Value object wrapping a dataset UUID for resolution by ID.
186
+ # Used by Eval.run to distinguish dataset-by-ID from dataset-by-name.
187
+ ID = Struct.new(:id, keyword_init: true)
184
188
  end
185
189
 
186
- # Value object wrapping a dataset UUID for resolution by ID.
187
- # Used by Eval.run to distinguish dataset-by-ID from dataset-by-name.
188
- DatasetId = Struct.new(:id, keyword_init: true)
190
+ # @deprecated Use {Braintrust::Dataset::ID} instead.
191
+ DatasetId = Dataset::ID
189
192
  end
@@ -0,0 +1,131 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "cases"
4
+
5
+ module Braintrust
6
+ module Eval
7
+ # Holds all normalized, ready-to-execute eval components.
8
+ # Use Context.build to construct from raw user inputs.
9
+ class Context
10
+ attr_reader :task, :scorers, :cases, :experiment_id, :experiment_name,
11
+ :project_id, :project_name, :state, :tracer_provider,
12
+ :on_progress, :parent_span_attr, :generation
13
+
14
+ def initialize(task:, scorers:, cases:, experiment_id: nil, experiment_name: nil,
15
+ project_id: nil, project_name: nil, state: nil, tracer_provider: nil,
16
+ on_progress: nil, parent_span_attr: nil, generation: nil)
17
+ @task = task
18
+ @scorers = scorers
19
+ @cases = cases
20
+ @experiment_id = experiment_id
21
+ @experiment_name = experiment_name
22
+ @project_id = project_id
23
+ @project_name = project_name
24
+ @state = state
25
+ @tracer_provider = tracer_provider
26
+ @on_progress = on_progress
27
+ @parent_span_attr = parent_span_attr
28
+ @generation = generation
29
+ end
30
+
31
+ # Build a Context from raw user inputs.
32
+ # Factory normalizes task, scorers, and cases into typed wrappers.
33
+ # Parent is resolved into parent_span_attr and generation.
34
+ def self.build(task:, scorers:, cases:, experiment_id: nil, experiment_name: nil,
35
+ project_id: nil, project_name: nil, state: nil, tracer_provider: nil,
36
+ on_progress: nil, parent: nil)
37
+ factory = Factory.new(state: state, tracer_provider: tracer_provider, project_name: project_name)
38
+
39
+ Context.new(
40
+ task: factory.normalize_task(task),
41
+ scorers: factory.normalize_scorers(scorers),
42
+ cases: factory.normalize_cases(cases),
43
+ experiment_id: experiment_id,
44
+ experiment_name: experiment_name,
45
+ project_id: project_id,
46
+ project_name: project_name,
47
+ state: state,
48
+ tracer_provider: tracer_provider,
49
+ on_progress: on_progress,
50
+ parent_span_attr: factory.resolve_parent_span_attr(parent),
51
+ generation: parent&.dig(:generation)
52
+ )
53
+ end
54
+
55
+ # Encapsulates normalization of raw user inputs into typed wrappers.
56
+ class Factory
57
+ def initialize(state: nil, tracer_provider: nil, project_name: nil)
58
+ @state = state
59
+ @tracer_provider = tracer_provider
60
+ @project_name = project_name
61
+ end
62
+
63
+ def normalize_cases(raw)
64
+ case raw
65
+ when Cases
66
+ raw
67
+ when Array, Enumerable
68
+ Cases.new(raw)
69
+ else
70
+ if raw.respond_to?(:each)
71
+ Cases.new(raw)
72
+ else
73
+ raise ArgumentError, "cases must be Array or Enumerable"
74
+ end
75
+ end
76
+ end
77
+
78
+ def resolve_parent_span_attr(parent)
79
+ return nil unless parent
80
+ "#{parent[:object_type]}:#{parent[:object_id]}"
81
+ end
82
+
83
+ def normalize_task(raw)
84
+ case raw
85
+ when Task
86
+ raw
87
+ when Proc
88
+ # Pass Proc/Lambda directly to preserve keyword arg info.
89
+ # Legacy positional lambdas (arity 1) are auto-wrapped by Task#wrap_block.
90
+ Task.new(&raw)
91
+ else
92
+ # Callable class: wrap via method(:call) to preserve keyword arg info
93
+ name = raw.respond_to?(:name) ? raw.name : nil
94
+ Task.new(name, &raw.method(:call))
95
+ end
96
+ end
97
+
98
+ def normalize_scorers(raw)
99
+ raw.map do |scorer|
100
+ case scorer
101
+ when String
102
+ raise ArgumentError, "project is required to resolve scorer slug '#{scorer}'" unless @project_name
103
+ Braintrust::Functions.scorer(
104
+ project: @project_name,
105
+ slug: scorer,
106
+ state: @state,
107
+ tracer_provider: @tracer_provider
108
+ )
109
+ when Braintrust::Scorer::ID
110
+ Braintrust::Functions.scorer(
111
+ id: scorer.function_id,
112
+ version: scorer.version,
113
+ state: @state,
114
+ tracer_provider: @tracer_provider
115
+ )
116
+ when Braintrust::Scorer
117
+ scorer
118
+ when Proc
119
+ # Pass Proc/Lambda directly to preserve keyword arg info
120
+ # (method(:call) loses parameter metadata)
121
+ Braintrust::Scorer.new(&scorer)
122
+ else
123
+ name = scorer.respond_to?(:name) ? scorer.name : nil
124
+ Braintrust::Scorer.new(name, &scorer.method(:call))
125
+ end
126
+ end
127
+ end
128
+ end
129
+ end
130
+ end
131
+ end
@@ -5,21 +5,27 @@ module Braintrust
5
5
  # Base class for evaluators. Subclass and override #task and #scorers,
6
6
  # or instantiate directly with keyword arguments.
7
7
  #
8
+ # Evaluators are used with the dev server, which reports scorer names
9
+ # to the Braintrust UI. Always use named scorers (via Scorer.new or
10
+ # subclass) so they display meaningfully.
11
+ #
8
12
  # @example Subclass pattern
9
13
  # class FoodClassifier < Braintrust::Eval::Evaluator
10
14
  # def task
11
- # ->(input) { classify(input) }
15
+ # ->(input:) { classify(input) }
12
16
  # end
13
17
  #
14
18
  # def scorers
15
- # [Braintrust::Eval.scorer("exact_match") { |i, e, o| o == e ? 1.0 : 0.0 }]
19
+ # [Braintrust::Scorer.new("exact_match") { |expected:, output:| output == expected ? 1.0 : 0.0 }]
16
20
  # end
17
21
  # end
18
22
  #
19
23
  # @example Inline pattern
20
24
  # Braintrust::Eval::Evaluator.new(
21
- # task: ->(input) { input.upcase },
22
- # scorers: [my_scorer]
25
+ # task: ->(input:) { input.upcase },
26
+ # scorers: [
27
+ # Braintrust::Scorer.new("exact_match") { |expected:, output:| output == expected ? 1.0 : 0.0 }
28
+ # ]
23
29
  # )
24
30
  class Evaluator
25
31
  attr_accessor :task, :scorers, :parameters
@@ -48,7 +54,7 @@ module Braintrust
48
54
  # @param project [String, nil] Project name
49
55
  # @param experiment [String, nil] Experiment name
50
56
  # @param project_id [String, nil] Project UUID (skips project creation)
51
- # @param dataset [String, Hash, Dataset, DatasetId, nil] Dataset to fetch
57
+ # @param dataset [String, Hash, Dataset, Dataset::ID, nil] Dataset to fetch
52
58
  # @param scorers [Array, nil] Additional scorers (merged with evaluator's own)
53
59
  # @param parent [Hash, nil] Parent span context
54
60
  # @param state [State, nil] Braintrust state