braintrust 0.2.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +42 -15
- data/lib/braintrust/api/internal/btql.rb +124 -0
- data/lib/braintrust/api/internal/experiments.rb +19 -0
- data/lib/braintrust/api/internal/projects.rb +19 -0
- data/lib/braintrust/dataset.rb +6 -3
- data/lib/braintrust/eval/context.rb +131 -0
- data/lib/braintrust/eval/evaluator.rb +11 -5
- data/lib/braintrust/eval/functions.rb +10 -166
- data/lib/braintrust/eval/runner.rb +100 -108
- data/lib/braintrust/eval/scorer.rb +24 -96
- data/lib/braintrust/eval/trace.rb +129 -0
- data/lib/braintrust/eval.rb +60 -132
- data/lib/braintrust/functions.rb +168 -0
- data/lib/braintrust/internal/callable.rb +83 -0
- data/lib/braintrust/logger.rb +9 -0
- data/lib/braintrust/scorer.rb +122 -0
- data/lib/braintrust/server/handlers/eval.rb +3 -3
- data/lib/braintrust/task.rb +108 -0
- data/lib/braintrust/version.rb +1 -1
- metadata +8 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 747b190f21c7de342f85390f8a51b17628e23fa2436776989a3ebe637bf9d596
|
|
4
|
+
data.tar.gz: 1e6c0c59c9ce56d499a04d8424506c56e2c2ad359506a6d5175c7173dc4ab238
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 3f652583ec04f5b874e3417db4cc0dff7f43341eeffe686466b8caad5614ed336e8580ac7533ef100726f09cdb264900e0f454edd11328e611513ffc8f77d3cb
|
|
7
|
+
data.tar.gz: 3316d0cb4ccc77e2d0c0ae48c033b6f5c026237d85c85e75e139434023f713820c3790a98090dac636ae6d44127692279404bcd3ab88b0d50a3de3127d38e3a6
|
data/README.md
CHANGED
|
@@ -252,9 +252,9 @@ Braintrust::Eval.run(
|
|
|
252
252
|
{input: "apple", expected: "fruit"},
|
|
253
253
|
{input: "carrot", expected: "vegetable"}
|
|
254
254
|
],
|
|
255
|
-
task: ->(input) { classify(input) },
|
|
255
|
+
task: ->(input:) { classify(input) },
|
|
256
256
|
scorers: [
|
|
257
|
-
->(
|
|
257
|
+
->(expected:, output:) { output == expected ? 1.0 : 0.0 }
|
|
258
258
|
]
|
|
259
259
|
)
|
|
260
260
|
```
|
|
@@ -267,7 +267,7 @@ Use test cases from a Braintrust dataset:
|
|
|
267
267
|
Braintrust::Eval.run(
|
|
268
268
|
project: "my-project",
|
|
269
269
|
dataset: "my-dataset",
|
|
270
|
-
task: ->(input) { classify(input) },
|
|
270
|
+
task: ->(input:) { classify(input) },
|
|
271
271
|
scorers: [...]
|
|
272
272
|
)
|
|
273
273
|
```
|
|
@@ -282,7 +282,7 @@ Braintrust::Eval.run(
|
|
|
282
282
|
{input: "apple", expected: "fruit", tags: ["produce"], metadata: {difficulty: "easy"}},
|
|
283
283
|
{input: "salmon", expected: "protein", tags: ["seafood"], metadata: {difficulty: "medium"}}
|
|
284
284
|
],
|
|
285
|
-
task: ->(input) { classify(input) },
|
|
285
|
+
task: ->(input:) { classify(input) },
|
|
286
286
|
scorers: [...]
|
|
287
287
|
)
|
|
288
288
|
```
|
|
@@ -295,29 +295,56 @@ Use scoring functions defined in Braintrust:
|
|
|
295
295
|
Braintrust::Eval.run(
|
|
296
296
|
project: "my-project",
|
|
297
297
|
cases: [...],
|
|
298
|
-
task: ->(input) { ... },
|
|
298
|
+
task: ->(input:) { ... },
|
|
299
|
+
scorers: ["accuracy-scorer"]
|
|
300
|
+
)
|
|
301
|
+
```
|
|
302
|
+
|
|
303
|
+
Or define scorers inline with `Scorer.new`:
|
|
304
|
+
|
|
305
|
+
```ruby
|
|
306
|
+
Braintrust::Eval.run(
|
|
307
|
+
project: "my-project",
|
|
308
|
+
cases: [...],
|
|
309
|
+
task: ->(input:) { ... },
|
|
299
310
|
scorers: [
|
|
300
|
-
Braintrust::
|
|
311
|
+
Braintrust::Scorer.new("exact_match") do |expected:, output:|
|
|
312
|
+
output == expected ? 1.0 : 0.0
|
|
313
|
+
end
|
|
301
314
|
]
|
|
302
315
|
)
|
|
303
316
|
```
|
|
304
317
|
|
|
305
|
-
|
|
318
|
+
#### Trace scoring
|
|
319
|
+
|
|
320
|
+
Scorers can access the full evaluation trace (all spans generated by the task) by declaring a `trace:` keyword parameter. This is useful for inspecting intermediate LLM calls, validating tool usage, or checking the message thread:
|
|
306
321
|
|
|
307
322
|
```ruby
|
|
308
323
|
Braintrust::Eval.run(
|
|
309
324
|
project: "my-project",
|
|
310
|
-
cases: [
|
|
311
|
-
task:
|
|
325
|
+
cases: [{input: "What is 2+2?", expected: "4"}],
|
|
326
|
+
task: Braintrust::Task.new { |input:| my_llm_pipeline(input) },
|
|
312
327
|
scorers: [
|
|
313
|
-
|
|
328
|
+
# Access the full trace to inspect LLM spans
|
|
329
|
+
Braintrust::Scorer.new("uses_system_prompt") do |output:, trace:|
|
|
330
|
+
messages = trace.thread # reconstructed message thread from LLM spans
|
|
331
|
+
messages.any? { |m| m["role"] == "system" } ? 1.0 : 0.0
|
|
332
|
+
end,
|
|
333
|
+
|
|
334
|
+
# Filter spans by type
|
|
335
|
+
Braintrust::Scorer.new("single_llm_call") do |output:, trace:|
|
|
336
|
+
trace.spans(span_type: "llm").length == 1 ? 1.0 : 0.0
|
|
337
|
+
end,
|
|
338
|
+
|
|
339
|
+
# Scorers that don't declare trace: still work — the unused argument is filtered out automatically
|
|
340
|
+
Braintrust::Scorer.new("exact_match") do |output:, expected:|
|
|
314
341
|
output == expected ? 1.0 : 0.0
|
|
315
342
|
end
|
|
316
343
|
]
|
|
317
344
|
)
|
|
318
345
|
```
|
|
319
346
|
|
|
320
|
-
See examples: [eval.rb](./examples/eval.rb), [dataset.rb](./examples/eval/dataset.rb), [remote_functions.rb](./examples/eval/remote_functions.rb)
|
|
347
|
+
See examples: [eval.rb](./examples/eval.rb), [dataset.rb](./examples/eval/dataset.rb), [remote_functions.rb](./examples/eval/remote_functions.rb), [trace_scoring.rb](./examples/eval/trace_scoring.rb)
|
|
321
348
|
|
|
322
349
|
### Dev Server
|
|
323
350
|
|
|
@@ -330,9 +357,9 @@ require "braintrust/server"
|
|
|
330
357
|
|
|
331
358
|
# Define evaluators — these can reference your application code (models, services, etc.)
|
|
332
359
|
food_classifier = Braintrust::Eval::Evaluator.new(
|
|
333
|
-
task: ->(input) { FoodClassifier.classify(input) },
|
|
360
|
+
task: ->(input:) { FoodClassifier.classify(input) },
|
|
334
361
|
scorers: [
|
|
335
|
-
Braintrust::
|
|
362
|
+
Braintrust::Scorer.new("exact_match") { |expected:, output:| output == expected ? 1.0 : 0.0 }
|
|
336
363
|
]
|
|
337
364
|
)
|
|
338
365
|
|
|
@@ -358,11 +385,11 @@ Evaluators can also be defined as subclasses:
|
|
|
358
385
|
```ruby
|
|
359
386
|
class FoodClassifier < Braintrust::Eval::Evaluator
|
|
360
387
|
def task
|
|
361
|
-
->(input) { classify(input) }
|
|
388
|
+
->(input:) { classify(input) }
|
|
362
389
|
end
|
|
363
390
|
|
|
364
391
|
def scorers
|
|
365
|
-
[Braintrust::
|
|
392
|
+
[Braintrust::Scorer.new("exact_match") { |expected:, output:| output == expected ? 1.0 : 0.0 }]
|
|
366
393
|
end
|
|
367
394
|
end
|
|
368
395
|
```
|
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "net/http"
|
|
4
|
+
require "json"
|
|
5
|
+
require "uri"
|
|
6
|
+
require_relative "../../internal/http"
|
|
7
|
+
|
|
8
|
+
module Braintrust
  class API
    module Internal
      # Internal BTQL client for querying spans.
      # Not part of the public API — instantiated directly where needed.
      class BTQL
        # Maximum number of retries before returning partial results.
        # Covers both freshness lag (partially indexed) and ingestion lag
        # (spans not yet visible to BTQL after OTel flush).
        MAX_FRESHNESS_RETRIES = 7

        # Base delay (seconds) between retries (doubles each attempt, capped).
        FRESHNESS_BASE_DELAY = 1.0

        # Maximum delay (seconds) between retries. Caps exponential growth
        # so we keep polling at a reasonable rate in the later window.
        # Schedule: 1, 2, 4, 8, 8, 8, 8 = ~39s total worst-case.
        MAX_FRESHNESS_DELAY = 8.0

        # Shape an object type must have before it is spliced into a query.
        # The object type is used as a bare function name in the FROM clause,
        # so it cannot be quote-escaped like the ID values; it is restricted
        # to a plain identifier instead.
        OBJECT_TYPE_PATTERN = /\A[A-Za-z_][A-Za-z0-9_]*\z/
        private_constant :OBJECT_TYPE_PATTERN

        # @param state [Object] must respond to #api_url and #api_key
        def initialize(state)
          @state = state
        end

        # Query spans belonging to a specific trace within an object.
        #
        # Builds a BTQL SQL query that matches the root_span_id and excludes scorer spans.
        # Retries with exponential backoff if the response indicates data is not yet fresh.
        # Any error (HTTP failure, invalid arguments, malformed JSON) is logged as a
        # warning and results in an empty array rather than an exception.
        #
        # @param object_type [String] e.g. "experiment"
        # @param object_id [String] Object UUID
        # @param root_span_id [String] Hex trace ID of the root span
        # @return [Array<Hash>] Parsed span data (possibly partial or empty)
        def trace_spans(object_type:, object_id:, root_span_id:)
          query = build_trace_query(
            object_type: object_type,
            object_id: object_id,
            root_span_id: root_span_id
          )
          payload = {query: query, fmt: "jsonl"}

          retries = 0
          loop do
            rows, freshness = execute_query(payload)
            # Return when data is fresh AND non-empty, or we've exhausted retries.
            # We retry on empty even when "complete" because there is ingestion lag
            # between OTel flush and BTQL indexing — the server may report "complete"
            # before it knows about newly-flushed spans.
            return rows if (freshness == "complete" && !rows.empty?) || retries >= MAX_FRESHNESS_RETRIES

            retries += 1
            delay = [FRESHNESS_BASE_DELAY * (2**(retries - 1)), MAX_FRESHNESS_DELAY].min
            sleep(delay)
          end
        rescue => e
          Braintrust::Log.warn("[BTQL] Query failed: #{e.message}")
          []
        end

        private

        # Build a BTQL SQL query string for fetching trace spans.
        #
        # Selects all spans for a given root_span_id, excluding scorer spans
        # (span_attributes.type = 'score'). The two ID values are embedded as
        # single-quoted literals with embedded quotes doubled; the object type
        # is validated as a plain identifier because it is emitted unquoted.
        #
        # @param object_type [String] e.g. "experiment"
        # @param object_id [String] Object UUID
        # @param root_span_id [String] Hex trace ID
        # @return [String] BTQL SQL query
        # @raise [ArgumentError] if object_type is not a plain identifier
        def build_trace_query(object_type:, object_id:, root_span_id:)
          source = object_type.to_s
          unless source.match?(OBJECT_TYPE_PATTERN)
            raise ArgumentError, "invalid BTQL object type: #{object_type.inspect}"
          end

          escaped_root = root_span_id.gsub("'", "''")
          escaped_id = object_id.gsub("'", "''")

          "SELECT * FROM #{source}('#{escaped_id}') " \
            "WHERE root_span_id = '#{escaped_root}' " \
            "AND span_attributes.type != 'score' " \
            "LIMIT 1000"
        end

        # Execute a BTQL query and parse the JSONL response.
        #
        # @param payload [Hash] BTQL request payload
        # @return [Array(Array<Hash>, String)] [parsed_rows, freshness_state]
        # @raise [Braintrust::Error] on any non-2xx response
        def execute_query(payload)
          uri = URI("#{@state.api_url}/btql")

          request = Net::HTTP::Post.new(uri)
          request["Content-Type"] = "application/json"
          request["Authorization"] = "Bearer #{@state.api_key}"
          request["Accept"] = "application/x-jsonlines"
          request.body = JSON.dump(payload)

          response = Braintrust::Internal::Http.with_redirects(uri, request)

          unless response.is_a?(Net::HTTPSuccess)
            raise Braintrust::Error, "HTTP #{response.code} for POST #{uri}: #{response.body}"
          end

          # A missing freshness header is treated as "complete".
          freshness = response["x-bt-freshness-state"] || "complete"
          [parse_jsonl(response.body), freshness]
        end

        # Parse a JSONL response body into an array of hashes.
        # Blank lines are skipped; a nil body (response without content)
        # yields an empty array.
        #
        # @param body [String, nil] JSONL response body
        # @return [Array<Hash>]
        def parse_jsonl(body)
          return [] if body.nil?

          body.each_line.filter_map do |line|
            line = line.strip
            next if line.empty?
            JSON.parse(line)
          end
        end
      end
    end
  end
end
|
|
@@ -50,6 +50,25 @@ module Braintrust
|
|
|
50
50
|
|
|
51
51
|
JSON.parse(response.body)
|
|
52
52
|
end
|
|
53
|
+
|
|
54
|
+
# Delete an experiment.
# Issues DELETE /v1/experiment/:id with the state's API key as bearer token.
# @param id [String] Experiment ID
# @return [Hash] Parsed JSON body of the successful response (deleted experiment data)
# @raise [Error] if the response is not a 2xx success
def delete(id:)
  endpoint = URI("#{@state.api_url}/v1/experiment/#{id}")
  req = Net::HTTP::Delete.new(endpoint).tap do |r|
    r["Authorization"] = "Bearer #{@state.api_key}"
  end

  res = Braintrust::Internal::Http.with_redirects(endpoint, req)
  return JSON.parse(res.body) if res.is_a?(Net::HTTPSuccess)

  raise Error, "HTTP #{res.code} for DELETE #{endpoint}: #{res.body}"
end
|
|
53
72
|
end
|
|
54
73
|
end
|
|
55
74
|
end
|
|
@@ -35,6 +35,25 @@ module Braintrust
|
|
|
35
35
|
|
|
36
36
|
JSON.parse(response.body)
|
|
37
37
|
end
|
|
38
|
+
|
|
39
|
+
# Delete a project.
# Issues DELETE /v1/project/:id with the state's API key as bearer token.
# @param id [String] Project UUID
# @return [Hash] Parsed JSON body of the successful response (deleted project data)
# @raise [Error] if the response is not a 2xx success
def delete(id:)
  endpoint = URI("#{@state.api_url}/v1/project/#{id}")
  req = Net::HTTP::Delete.new(endpoint).tap do |r|
    r["Authorization"] = "Bearer #{@state.api_key}"
  end

  res = Braintrust::Internal::Http.with_redirects(endpoint, req)
  return JSON.parse(res.body) if res.is_a?(Net::HTTPSuccess)

  raise Error, "HTTP #{res.code} for DELETE #{endpoint}: #{res.body}"
end
|
|
38
57
|
end
|
|
39
58
|
end
|
|
40
59
|
end
|
data/lib/braintrust/dataset.rb
CHANGED
|
@@ -181,9 +181,12 @@ module Braintrust
|
|
|
181
181
|
created: raw["created"]
|
|
182
182
|
)
|
|
183
183
|
end
|
|
184
|
+
|
|
185
|
+
# Value object wrapping a dataset UUID for resolution by ID.
|
|
186
|
+
# Used by Eval.run to distinguish dataset-by-ID from dataset-by-name.
|
|
187
|
+
ID = Struct.new(:id, keyword_init: true)
|
|
184
188
|
end
|
|
185
189
|
|
|
186
|
-
#
|
|
187
|
-
|
|
188
|
-
DatasetId = Struct.new(:id, keyword_init: true)
|
|
190
|
+
# @deprecated Use {Braintrust::Dataset::ID} instead.
|
|
191
|
+
DatasetId = Dataset::ID
|
|
189
192
|
end
|
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "cases"
|
|
4
|
+
|
|
5
|
+
module Braintrust
  module Eval
    # Holds all normalized, ready-to-execute eval components.
    # Use Context.build to construct from raw user inputs.
    class Context
      attr_reader :task, :scorers, :cases, :experiment_id, :experiment_name,
        :project_id, :project_name, :state, :tracer_provider,
        :on_progress, :parent_span_attr, :generation

      # Stores the given components as-is; no normalization happens here.
      # Prefer Context.build when starting from raw user inputs.
      def initialize(task:, scorers:, cases:, experiment_id: nil, experiment_name: nil,
        project_id: nil, project_name: nil, state: nil, tracer_provider: nil,
        on_progress: nil, parent_span_attr: nil, generation: nil)
        @task = task
        @scorers = scorers
        @cases = cases
        @experiment_id = experiment_id
        @experiment_name = experiment_name
        @project_id = project_id
        @project_name = project_name
        @state = state
        @tracer_provider = tracer_provider
        @on_progress = on_progress
        @parent_span_attr = parent_span_attr
        @generation = generation
      end

      # Build a Context from raw user inputs.
      # Factory normalizes task, scorers, and cases into typed wrappers.
      # Parent is resolved into parent_span_attr and generation.
      #
      # @param parent [Hash, nil] optional hash read for :object_type, :object_id and :generation
      # @return [Context]
      # @raise [ArgumentError] if cases are not enumerable, a task/scorer is not
      #   callable, or a scorer slug is given without a project name
      def self.build(task:, scorers:, cases:, experiment_id: nil, experiment_name: nil,
        project_id: nil, project_name: nil, state: nil, tracer_provider: nil,
        on_progress: nil, parent: nil)
        factory = Factory.new(state: state, tracer_provider: tracer_provider, project_name: project_name)

        Context.new(
          task: factory.normalize_task(task),
          scorers: factory.normalize_scorers(scorers),
          cases: factory.normalize_cases(cases),
          experiment_id: experiment_id,
          experiment_name: experiment_name,
          project_id: project_id,
          project_name: project_name,
          state: state,
          tracer_provider: tracer_provider,
          on_progress: on_progress,
          parent_span_attr: factory.resolve_parent_span_attr(parent),
          generation: parent&.dig(:generation)
        )
      end

      # Encapsulates normalization of raw user inputs into typed wrappers.
      class Factory
        def initialize(state: nil, tracer_provider: nil, project_name: nil)
          @state = state
          @tracer_provider = tracer_provider
          @project_name = project_name
        end

        # Wrap raw cases in a Cases collection (an existing Cases is returned untouched).
        #
        # @param raw [Cases, Enumerable, #each]
        # @return [Cases]
        # @raise [ArgumentError] if raw is neither Enumerable nor responds to #each
        def normalize_cases(raw)
          return raw if raw.is_a?(Cases)

          unless raw.is_a?(Enumerable) || raw.respond_to?(:each)
            raise ArgumentError, "cases must be Array or Enumerable"
          end

          Cases.new(raw)
        end

        # Format the parent reference as "<object_type>:<object_id>".
        #
        # @param parent [Hash, nil]
        # @return [String, nil] nil when no parent is given
        def resolve_parent_span_attr(parent)
          return nil unless parent
          "#{parent[:object_type]}:#{parent[:object_id]}"
        end

        # Wrap a raw task in a Task (an existing Task is returned untouched).
        #
        # @param raw [Task, Proc, #call]
        # @return [Task]
        # @raise [ArgumentError] if raw has no #call method
        def normalize_task(raw)
          case raw
          when Task
            raw
          when Proc
            # Pass Proc/Lambda directly to preserve keyword arg info.
            # Legacy positional lambdas (arity 1) are auto-wrapped by Task#wrap_block.
            Task.new(&raw)
          else
            # Callable class: wrap via method(:call) to preserve keyword arg info
            Task.new(callable_name(raw), &callable_method(raw, "task"))
          end
        end

        # Normalize each raw scorer: slugs and Scorer::ID references are resolved
        # through Braintrust::Functions.scorer, Scorer instances pass through,
        # and procs / callable objects are wrapped in a Scorer.
        #
        # @param raw [Array<String, Braintrust::Scorer::ID, Braintrust::Scorer, Proc, #call>]
        # @return [Array] normalized scorers, in input order
        # @raise [ArgumentError] if a slug is given without a project name, or a
        #   scorer has no #call method
        def normalize_scorers(raw)
          raw.map do |scorer|
            case scorer
            when String
              raise ArgumentError, "project is required to resolve scorer slug '#{scorer}'" unless @project_name
              Braintrust::Functions.scorer(
                project: @project_name,
                slug: scorer,
                state: @state,
                tracer_provider: @tracer_provider
              )
            when Braintrust::Scorer::ID
              Braintrust::Functions.scorer(
                id: scorer.function_id,
                version: scorer.version,
                state: @state,
                tracer_provider: @tracer_provider
              )
            when Braintrust::Scorer
              scorer
            when Proc
              # Pass Proc/Lambda directly to preserve keyword arg info
              # (method(:call) loses parameter metadata)
              Braintrust::Scorer.new(&scorer)
            else
              Braintrust::Scorer.new(callable_name(scorer), &callable_method(scorer, "scorer"))
            end
          end
        end

        private

        # Name reported by a callable object, or nil when it does not expose #name.
        def callable_name(callable)
          callable.respond_to?(:name) ? callable.name : nil
        end

        # Fetch the object's #call method, turning the NameError raised for a
        # non-callable object into an ArgumentError that names the offending role.
        def callable_method(callable, role)
          callable.method(:call)
        rescue NameError
          raise ArgumentError, "#{role} must be callable (respond to #call), got #{callable.class}"
        end
      end
    end
  end
end
|
|
@@ -5,21 +5,27 @@ module Braintrust
|
|
|
5
5
|
# Base class for evaluators. Subclass and override #task and #scorers,
|
|
6
6
|
# or instantiate directly with keyword arguments.
|
|
7
7
|
#
|
|
8
|
+
# Evaluators are used with the dev server, which reports scorer names
|
|
9
|
+
# to the Braintrust UI. Always use named scorers (via Scorer.new or
|
|
10
|
+
# subclass) so they display meaningfully.
|
|
11
|
+
#
|
|
8
12
|
# @example Subclass pattern
|
|
9
13
|
# class FoodClassifier < Braintrust::Eval::Evaluator
|
|
10
14
|
# def task
|
|
11
|
-
# ->(input) { classify(input) }
|
|
15
|
+
# ->(input:) { classify(input) }
|
|
12
16
|
# end
|
|
13
17
|
#
|
|
14
18
|
# def scorers
|
|
15
|
-
# [Braintrust::
|
|
19
|
+
# [Braintrust::Scorer.new("exact_match") { |expected:, output:| output == expected ? 1.0 : 0.0 }]
|
|
16
20
|
# end
|
|
17
21
|
# end
|
|
18
22
|
#
|
|
19
23
|
# @example Inline pattern
|
|
20
24
|
# Braintrust::Eval::Evaluator.new(
|
|
21
|
-
# task: ->(input) { input.upcase },
|
|
22
|
-
# scorers: [
|
|
25
|
+
# task: ->(input:) { input.upcase },
|
|
26
|
+
# scorers: [
|
|
27
|
+
# Braintrust::Scorer.new("exact_match") { |expected:, output:| output == expected ? 1.0 : 0.0 }
|
|
28
|
+
# ]
|
|
23
29
|
# )
|
|
24
30
|
class Evaluator
|
|
25
31
|
attr_accessor :task, :scorers, :parameters
|
|
@@ -48,7 +54,7 @@ module Braintrust
|
|
|
48
54
|
# @param project [String, nil] Project name
|
|
49
55
|
# @param experiment [String, nil] Experiment name
|
|
50
56
|
# @param project_id [String, nil] Project UUID (skips project creation)
|
|
51
|
-
# @param dataset [String, Hash, Dataset,
|
|
57
|
+
# @param dataset [String, Hash, Dataset, Dataset::ID, nil] Dataset to fetch
|
|
52
58
|
# @param scorers [Array, nil] Additional scorers (merged with evaluator's own)
|
|
53
59
|
# @param parent [Hash, nil] Parent span context
|
|
54
60
|
# @param state [State, nil] Braintrust state
|