braintrust 0.2.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +42 -15
- data/lib/braintrust/api/internal/btql.rb +124 -0
- data/lib/braintrust/api/internal/experiments.rb +19 -0
- data/lib/braintrust/api/internal/projects.rb +19 -0
- data/lib/braintrust/dataset.rb +6 -3
- data/lib/braintrust/eval/context.rb +131 -0
- data/lib/braintrust/eval/evaluator.rb +11 -5
- data/lib/braintrust/eval/functions.rb +10 -166
- data/lib/braintrust/eval/runner.rb +100 -108
- data/lib/braintrust/eval/scorer.rb +24 -96
- data/lib/braintrust/eval/trace.rb +129 -0
- data/lib/braintrust/eval.rb +60 -132
- data/lib/braintrust/functions.rb +168 -0
- data/lib/braintrust/internal/callable.rb +83 -0
- data/lib/braintrust/logger.rb +9 -0
- data/lib/braintrust/scorer.rb +122 -0
- data/lib/braintrust/server/handlers/eval.rb +3 -3
- data/lib/braintrust/task.rb +108 -0
- data/lib/braintrust/version.rb +1 -1
- metadata +8 -1
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Braintrust
  module Eval
    # Read-only trace data accessor for scorers.
    #
    # Per-case throwaway object — no global cache, no shared state.
    # Accepts lazy (lambda) or eager (Array) span sources.
    #
    # BTQL span shape (string keys from JSON):
    #   "span_attributes" => {"type" => "llm", "name" => "Chat Completion"}
    #   "input"  => [{"role" => "user", "content" => "..."}]          # flat message array
    #   "output" => [{"message" => {"role" => "assistant", ...}}]     # flat choices array
    #
    # @example Lazy loading from BTQL
    #   trace = Trace.new(spans: -> { btql.trace_spans(...) })
    #   trace.spans  # triggers BTQL query on first access
    #   trace.spans  # returns memoized result
    #
    # @example Eager loading
    #   trace = Trace.new(spans: [span1, span2])
    #   trace.spans  # returns array directly
    class Trace
      # @param spans [Proc, Array, nil] Span source — anything responding to
      #   +call+ is resolved lazily on first access; any other value is used
      #   as-is. A nil source (or a callable returning nil) resolves to [].
      def initialize(spans:)
        @spans_source = spans
        @spans_resolved = false
        @spans_memo = nil
      end

      # Resolve and return spans, optionally filtered by type.
      #
      # The type lives at span_attributes.type in BTQL rows (e.g. "llm", "eval", "task").
      #
      # @param span_type [String, Array<String>, nil] Filter to spans matching this type.
      #   Accepts a single type string or an array of types (returns the union).
      #   Returns all spans when nil.
      # @return [Array<Hash>] Matching spans.
      def spans(span_type: nil)
        resolved = resolve_spans
        return resolved unless span_type

        wanted = Array(span_type)
        resolved.select { |span| wanted.include?(span_type_for(span)) }
      end

      # Convenience method: extract a chronological message thread from LLM spans.
      #
      # Walks LLM spans in order. Input messages are added once each
      # (deduplicated by value against previously seen input messages); output
      # messages are always appended. Spans whose input/output is not a message
      # array or a {messages:}/{choices:} wrapper (e.g. a plain String)
      # contribute nothing instead of raising.
      #
      # BTQL LLM span format:
      #   input:  flat array of messages [{"role" => "user", "content" => "..."}]
      #   output: flat array of choices  [{"message" => {"role" => "assistant", ...}}]
      #
      # @return [Array<Hash>] Ordered message list.
      def thread
        # Hash used as a set keyed by the message itself, so membership is
        # decided by #hash AND #eql?. Keying on the bare hash code would drop
        # distinct messages whose hash codes happen to collide.
        seen_inputs = {}
        messages = []

        spans(span_type: "llm").each do |span|
          # Input: flat message array or {messages: [...]} wrapper
          input = span["input"] || span[:input]
          extract_input_messages(input)&.each do |msg|
            next if seen_inputs.key?(msg)

            seen_inputs[msg] = true
            messages << msg
          end

          # Output: flat choices array or {choices: [...]} wrapper
          output = span["output"] || span[:output]
          messages.concat(extract_output_messages(output) || [])
        end

        messages
      end

      private

      # Extract the span type from a span hash.
      # Handles both string and symbol keys for span_attributes.type.
      # Returns nil when span_attributes is missing or is not a Hash.
      def span_type_for(span)
        attrs = span["span_attributes"] || span[:span_attributes]
        return nil unless attrs.is_a?(Hash)

        attrs["type"] || attrs[:type]
      end

      # Extract input messages from a span's input field.
      # Handles both flat array format (BTQL) and {messages: [...]} wrapper.
      # Any other payload (String, Numeric, ...) yields nil: indexing a String
      # with "messages" would return a substring, and indexing it with
      # :messages raises TypeError.
      def extract_input_messages(input)
        return input if input.is_a?(Array)
        return nil unless input.is_a?(Hash)

        wrapped = input["messages"] || input[:messages]
        wrapped if wrapped.is_a?(Array)
      end

      # Extract output messages from a span's output field.
      # Handles both flat choices array (BTQL) and {choices: [...]} wrapper.
      # Non-Hash choice entries and choices without a message are skipped.
      def extract_output_messages(output)
        choices =
          if output.is_a?(Array)
            output
          elsif output.is_a?(Hash)
            output["choices"] || output[:choices]
          end
        return nil unless choices.is_a?(Array)

        choices.filter_map { |choice| (choice["message"] || choice[:message]) if choice.is_a?(Hash) }
      end

      # Resolve the span source once and memoize the result.
      # The resolved flag is set only after a successful resolution, so a
      # callable that raises is invoked again on the next access.
      def resolve_spans
        unless @spans_resolved
          source = @spans_source
          @spans_memo = (source.respond_to?(:call) ? source.call : source) || []
          @spans_resolved = true
        end
        @spans_memo
      end
    end
  end
end
|
data/lib/braintrust/eval.rb
CHANGED
|
@@ -1,9 +1,13 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
-
require_relative "
|
|
3
|
+
require_relative "scorer"
|
|
4
|
+
require_relative "task"
|
|
5
|
+
require_relative "functions"
|
|
6
|
+
require_relative "eval/context"
|
|
4
7
|
require_relative "eval/evaluator"
|
|
5
|
-
require_relative "eval/runner"
|
|
6
8
|
require_relative "eval/functions"
|
|
9
|
+
require_relative "eval/runner"
|
|
10
|
+
require_relative "eval/scorer"
|
|
7
11
|
require_relative "api/internal/projects"
|
|
8
12
|
require_relative "api/internal/experiments"
|
|
9
13
|
require_relative "dataset"
|
|
@@ -17,18 +21,21 @@ module Braintrust
|
|
|
17
21
|
# The Eval module provides tools for running systematic evaluations of your AI systems. An
|
|
18
22
|
# evaluation consists of:
|
|
19
23
|
# - **Cases**: Test inputs with optional expected outputs
|
|
20
|
-
# - **Task**: The code/model being evaluated
|
|
21
|
-
# - **Scorers**: Functions that judge the quality of outputs
|
|
24
|
+
# - **Task**: The code/model being evaluated (a {Braintrust::Task} or callable)
|
|
25
|
+
# - **Scorers**: Functions that judge the quality of outputs (String name, {Braintrust::Scorer}, or callable)
|
|
26
|
+
#
|
|
27
|
+
# Tasks and scorers use keyword arguments. Only declare the keywords you need —
|
|
28
|
+
# extra kwargs are automatically filtered out.
|
|
29
|
+
#
|
|
30
|
+
# When using multiple scorers, each must have a unique name — scores are keyed
|
|
31
|
+
# by name, so duplicates overwrite each other. Use +Scorer.new("name")+ or a
|
|
32
|
+
# Scorer subclass to assign names. Anonymous lambdas default to "scorer".
|
|
22
33
|
#
|
|
23
34
|
# @example Basic evaluation with inline cases
|
|
24
35
|
# require "braintrust"
|
|
25
36
|
#
|
|
26
37
|
# Braintrust.init
|
|
27
38
|
#
|
|
28
|
-
# # Define a simple task (the code being evaluated)
|
|
29
|
-
# task = ->(input) { input.include?("a") ? "fruit" : "vegetable" }
|
|
30
|
-
#
|
|
31
|
-
# # Run evaluation with inline cases
|
|
32
39
|
# Braintrust::Eval.run(
|
|
33
40
|
# project: "my-project",
|
|
34
41
|
# experiment: "food-classifier",
|
|
@@ -37,114 +44,65 @@ module Braintrust
|
|
|
37
44
|
# {input: "carrot", expected: "vegetable"},
|
|
38
45
|
# {input: "banana", expected: "fruit"}
|
|
39
46
|
# ],
|
|
40
|
-
# task:
|
|
47
|
+
# task: ->(input:) { input.include?("a") ? "fruit" : "vegetable" },
|
|
41
48
|
# scorers: [
|
|
42
|
-
#
|
|
43
|
-
# Braintrust::Eval.scorer("exact_match") do |input, expected, output|
|
|
44
|
-
# output == expected ? 1.0 : 0.0
|
|
45
|
-
# end
|
|
49
|
+
# ->(expected:, output:) { output == expected ? 1.0 : 0.0 }
|
|
46
50
|
# ]
|
|
47
51
|
# )
|
|
48
52
|
#
|
|
49
|
-
# @example Different ways to define scorers
|
|
50
|
-
# #
|
|
51
|
-
#
|
|
52
|
-
# output == expected ? 1.0 : 0.0
|
|
53
|
-
# end
|
|
53
|
+
# @example Different ways to define scorers
|
|
54
|
+
# # String — references a scorer defined in your Braintrust project
|
|
55
|
+
# scorers: ["accuracy-scorer", "relevance-scorer"]
|
|
54
56
|
#
|
|
55
|
-
# #
|
|
56
|
-
#
|
|
57
|
-
# output.downcase == expected.downcase ? 1.0 : 0.0
|
|
58
|
-
# end
|
|
57
|
+
# # Lambda — declare only the kwargs you need (input:, expected:, output:, metadata:, tags:)
|
|
58
|
+
# exact = ->(expected:, output:) { output == expected ? 1.0 : 0.0 }
|
|
59
59
|
#
|
|
60
|
-
# #
|
|
61
|
-
#
|
|
62
|
-
# def name
|
|
63
|
-
# "fuzzy_match"
|
|
64
|
-
# end
|
|
60
|
+
# # Named scorer with Scorer.new
|
|
61
|
+
# named = Braintrust::Scorer.new("case_insensitive") { |expected:, output:| output.downcase == expected.downcase ? 1.0 : 0.0 }
|
|
65
62
|
#
|
|
66
|
-
#
|
|
67
|
-
#
|
|
63
|
+
# # Class-based pattern (auto-derives name from class: "fuzzy_match")
|
|
64
|
+
# class FuzzyMatch
|
|
65
|
+
# include Braintrust::Scorer
|
|
66
|
+
# def call(expected:, output:)
|
|
68
67
|
# # scoring logic here
|
|
69
68
|
# 1.0
|
|
70
69
|
# end
|
|
71
70
|
# end
|
|
72
71
|
#
|
|
73
|
-
# # Anonymous lambda that returns named score object
|
|
74
|
-
# multi_score = ->(input, expected, output) {
|
|
75
|
-
# [
|
|
76
|
-
# {name: "exact_match", score: output == expected ? 1.0 : 0.0},
|
|
77
|
-
# {name: "length_match", score: output.length == expected.length ? 1.0 : 0.0}
|
|
78
|
-
# ]
|
|
79
|
-
# }
|
|
80
|
-
#
|
|
81
|
-
# # All can be used together
|
|
82
|
-
# Braintrust::Eval.run(
|
|
83
|
-
# project: "my-project",
|
|
84
|
-
# experiment: "scorer-examples",
|
|
85
|
-
# cases: [{input: "test", expected: "test"}],
|
|
86
|
-
# task: ->(input) { input },
|
|
87
|
-
# scorers: [method(:exact_match), case_insensitive, FuzzyMatch.new, multi_score]
|
|
88
|
-
# )
|
|
89
|
-
#
|
|
90
72
|
# @example Different ways to define tasks
|
|
91
|
-
# # Lambda
|
|
92
|
-
#
|
|
73
|
+
# # Lambda with keyword args
|
|
74
|
+
# task = ->(input:) { process(input) }
|
|
93
75
|
#
|
|
94
|
-
# #
|
|
95
|
-
#
|
|
96
|
-
#
|
|
97
|
-
# # Method reference
|
|
98
|
-
# def my_task(input)
|
|
99
|
-
# "result"
|
|
100
|
-
# end
|
|
101
|
-
# task_method = method(:my_task)
|
|
76
|
+
# # Named task with Task.new
|
|
77
|
+
# task = Braintrust::Task.new("my_task") { |input:| process(input) }
|
|
102
78
|
#
|
|
103
|
-
# #
|
|
79
|
+
# # Class-based pattern
|
|
104
80
|
# class MyTask
|
|
105
|
-
#
|
|
106
|
-
#
|
|
81
|
+
# include Braintrust::Task
|
|
82
|
+
# def call(input:)
|
|
83
|
+
# process(input)
|
|
107
84
|
# end
|
|
108
85
|
# end
|
|
109
|
-
# task_class = MyTask.new
|
|
110
86
|
#
|
|
111
|
-
# #
|
|
112
|
-
#
|
|
113
|
-
# project: "my-project",
|
|
114
|
-
# experiment: "task-examples",
|
|
115
|
-
# cases: [{input: "test"}],
|
|
116
|
-
# task: task_lambda, # or task_proc, task_method, task_class
|
|
117
|
-
# scorers: [
|
|
118
|
-
# Braintrust::Eval.scorer("my_scorer") { |input, expected, output| 1.0 }
|
|
119
|
-
# ]
|
|
120
|
-
# )
|
|
87
|
+
# # Legacy lambdas (positional args) are also accepted for backwards compatibility
|
|
88
|
+
# legacy_task = ->(input) { process(input) }
|
|
121
89
|
#
|
|
122
90
|
# @example Using datasets instead of inline cases
|
|
123
|
-
# # Fetch cases from a dataset stored in Braintrust
|
|
124
91
|
# Braintrust::Eval.run(
|
|
125
92
|
# project: "my-project",
|
|
126
93
|
# experiment: "with-dataset",
|
|
127
94
|
# dataset: "my-dataset-name", # fetches from same project
|
|
128
|
-
# task: ->(input) {
|
|
129
|
-
# scorers: [
|
|
130
|
-
# Braintrust::Eval.scorer("my_scorer") { |input, expected, output| 1.0 }
|
|
131
|
-
# ]
|
|
95
|
+
# task: ->(input:) { input.upcase },
|
|
96
|
+
# scorers: [->(expected:, output:) { output == expected ? 1.0 : 0.0 }]
|
|
132
97
|
# )
|
|
133
98
|
#
|
|
134
99
|
# # Or with more options
|
|
135
100
|
# Braintrust::Eval.run(
|
|
136
101
|
# project: "my-project",
|
|
137
102
|
# experiment: "with-dataset-options",
|
|
138
|
-
# dataset: {
|
|
139
|
-
#
|
|
140
|
-
#
|
|
141
|
-
# version: "1.0",
|
|
142
|
-
# limit: 100
|
|
143
|
-
# },
|
|
144
|
-
# task: ->(input) { "result" },
|
|
145
|
-
# scorers: [
|
|
146
|
-
# Braintrust::Eval.scorer("my_scorer") { |input, expected, output| 1.0 }
|
|
147
|
-
# ]
|
|
103
|
+
# dataset: { name: "my-dataset", project: "other-project", version: "1.0", limit: 100 },
|
|
104
|
+
# task: ->(input:) { input.upcase },
|
|
105
|
+
# scorers: [->(expected:, output:) { output == expected ? 1.0 : 0.0 }]
|
|
148
106
|
# )
|
|
149
107
|
#
|
|
150
108
|
# @example Using metadata and tags
|
|
@@ -159,32 +117,24 @@ module Braintrust
|
|
|
159
117
|
# metadata: {threshold: 0.9, category: "produce"}
|
|
160
118
|
# }
|
|
161
119
|
# ],
|
|
162
|
-
# task: ->(input) { "fruit" },
|
|
120
|
+
# task: ->(input:) { "fruit" },
|
|
163
121
|
# scorers: [
|
|
164
|
-
#
|
|
165
|
-
# Braintrust::Eval.scorer("threshold_match") do |input, expected, output, metadata|
|
|
122
|
+
# ->(expected:, output:, metadata:) {
|
|
166
123
|
# threshold = metadata[:threshold] || 0.5
|
|
167
124
|
# # scoring logic using threshold
|
|
168
125
|
# 1.0
|
|
169
|
-
#
|
|
126
|
+
# }
|
|
170
127
|
# ],
|
|
171
|
-
# # Experiment-level tags and metadata
|
|
172
128
|
# tags: ["v1", "production"],
|
|
173
|
-
# metadata: {
|
|
174
|
-
# model: "gpt-4",
|
|
175
|
-
# temperature: 0.7,
|
|
176
|
-
# version: "1.0.0"
|
|
177
|
-
# }
|
|
129
|
+
# metadata: { model: "gpt-4", temperature: 0.7, version: "1.0.0" }
|
|
178
130
|
# )
|
|
179
131
|
module Eval
|
|
180
132
|
class << self
|
|
181
|
-
#
|
|
182
|
-
# @param name [String] The scorer name
|
|
183
|
-
# @param callable [#call, nil] Optional callable (if not using block)
|
|
184
|
-
# @param block [Proc] The scorer block
|
|
185
|
-
# @return [Scorer]
|
|
133
|
+
# @deprecated Use {Braintrust::Scorer.new} instead
|
|
186
134
|
def scorer(name, callable = nil, &block)
|
|
187
|
-
|
|
135
|
+
Log.warn_once(:eval_scorer, "Braintrust::Eval.scorer is deprecated: use Braintrust::Scorer.new instead.")
|
|
136
|
+
block = callable.method(:call) if callable && !block
|
|
137
|
+
Scorer.new(name, &block)
|
|
188
138
|
end
|
|
189
139
|
|
|
190
140
|
# Run an evaluation
|
|
@@ -195,7 +145,7 @@ module Braintrust
|
|
|
195
145
|
# - String: dataset name (fetches from same project)
|
|
196
146
|
# - Hash: {name:, id:, project:, version:, limit:}
|
|
197
147
|
# @param task [#call] The task to evaluate (must be callable)
|
|
198
|
-
# @param scorers [Array<Scorer, #call>] The scorers to use (Scorer objects or callables)
|
|
148
|
+
# @param scorers [Array<String, Scorer, #call>] The scorers to use (String names, Scorer objects, or callables)
|
|
199
149
|
# @param on_progress [#call, nil] Optional callback fired after each test case.
|
|
200
150
|
# Receives a Hash: {"data" => output, "scores" => {name => value}} on success,
|
|
201
151
|
# or {"error" => message} on failure.
|
|
@@ -216,9 +166,6 @@ module Braintrust
|
|
|
216
166
|
# Validate required parameters
|
|
217
167
|
validate_params!(task: task, scorers: scorers, cases: cases, dataset: dataset)
|
|
218
168
|
|
|
219
|
-
# Resolve any ScorerId entries to real Scorer objects
|
|
220
|
-
scorers = resolve_scorers(scorers, state: state, tracer_provider: tracer_provider)
|
|
221
|
-
|
|
222
169
|
experiment_id = nil
|
|
223
170
|
project_name = project
|
|
224
171
|
|
|
@@ -246,20 +193,21 @@ module Braintrust
|
|
|
246
193
|
end
|
|
247
194
|
end
|
|
248
195
|
|
|
249
|
-
#
|
|
250
|
-
|
|
196
|
+
# Build normalized context and run
|
|
197
|
+
context = Context.build(
|
|
198
|
+
task: task,
|
|
199
|
+
scorers: scorers,
|
|
200
|
+
cases: cases,
|
|
251
201
|
experiment_id: experiment_id,
|
|
252
202
|
experiment_name: experiment,
|
|
253
203
|
project_id: project_id,
|
|
254
204
|
project_name: project_name,
|
|
255
|
-
task: task,
|
|
256
|
-
scorers: scorers,
|
|
257
205
|
state: state,
|
|
258
206
|
tracer_provider: tracer_provider,
|
|
259
207
|
on_progress: on_progress,
|
|
260
208
|
parent: parent
|
|
261
209
|
)
|
|
262
|
-
result =
|
|
210
|
+
result = Runner.new(context).run(parallelism: parallelism)
|
|
263
211
|
|
|
264
212
|
# Print result summary unless quiet
|
|
265
213
|
print_result(result) unless quiet
|
|
@@ -275,26 +223,6 @@ module Braintrust
|
|
|
275
223
|
puts result.to_pretty
|
|
276
224
|
end
|
|
277
225
|
|
|
278
|
-
# Resolve scorers array: ScorerId entries become real Scorer objects, others pass through
|
|
279
|
-
# @param scorers [Array] Scorers (Scorer, callable, or ScorerId)
|
|
280
|
-
# @param state [State, nil] Braintrust state (required for ScorerId resolution)
|
|
281
|
-
# @param tracer_provider [TracerProvider, nil] OpenTelemetry tracer provider
|
|
282
|
-
# @return [Array<Scorer, #call>] Resolved scorers
|
|
283
|
-
def resolve_scorers(scorers, state: nil, tracer_provider: nil)
|
|
284
|
-
scorers.map do |scorer|
|
|
285
|
-
if scorer.is_a?(ScorerId)
|
|
286
|
-
Functions.scorer_by_id(
|
|
287
|
-
id: scorer.function_id,
|
|
288
|
-
version: scorer.version,
|
|
289
|
-
state: state,
|
|
290
|
-
tracer_provider: tracer_provider
|
|
291
|
-
)
|
|
292
|
-
else
|
|
293
|
-
scorer
|
|
294
|
-
end
|
|
295
|
-
end
|
|
296
|
-
end
|
|
297
|
-
|
|
298
226
|
# Validate required parameters
|
|
299
227
|
# @raise [ArgumentError] if validation fails
|
|
300
228
|
def validate_params!(task:, scorers:, cases:, dataset:)
|
|
@@ -356,7 +284,7 @@ module Braintrust
|
|
|
356
284
|
dataset_obj = case dataset
|
|
357
285
|
when Dataset
|
|
358
286
|
dataset
|
|
359
|
-
when
|
|
287
|
+
when Dataset::ID
|
|
360
288
|
Dataset.new(id: dataset.id, state: state)
|
|
361
289
|
when String
|
|
362
290
|
Dataset.new(name: dataset, project: project, state: state)
|
|
@@ -367,7 +295,7 @@ module Braintrust
|
|
|
367
295
|
opts[:state] = state
|
|
368
296
|
Dataset.new(**opts)
|
|
369
297
|
else
|
|
370
|
-
raise ArgumentError, "dataset must be String, Hash, Dataset, or
|
|
298
|
+
raise ArgumentError, "dataset must be String, Hash, Dataset, or Dataset::ID, got #{dataset.class}"
|
|
371
299
|
end
|
|
372
300
|
|
|
373
301
|
cases = dataset_obj.fetch_all(limit: limit)
|
|
@@ -0,0 +1,168 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "api"
|
|
4
|
+
require_relative "scorer"
|
|
5
|
+
require_relative "task"
|
|
6
|
+
require "opentelemetry/sdk"
|
|
7
|
+
require "json"
|
|
8
|
+
|
|
9
|
+
module Braintrust
  # Functions provides remote function execution capabilities.
  # Allows calling prompts hosted on Braintrust servers as tasks or scorers.
  module Functions
    class << self
      # Create a Task that invokes a remote function.
      #
      # The function is looked up once, when this method is called; the
      # returned Task invokes it by id and records a span for every call.
      #
      # @param project [String] Project name
      # @param slug [String] Function slug
      # @param state [State, nil] Braintrust state (defaults to global)
      # @param tracer_provider [TracerProvider, nil] OpenTelemetry tracer provider
      # @return [Task] Task object that invokes remote function
      # @raise [Error] if no state is available or the function cannot be found
      def task(project:, slug:, state: nil, tracer_provider: nil)
        state ||= Braintrust.current_state
        raise Error, "No state available" unless state

        # Resolve function ID from project + slug
        api = API.new(state: state)
        function_metadata = resolve_function(api, project, slug)
        function_id = function_metadata["id"]
        function_name = function_metadata["name"] || slug

        # Get tracer for creating spans
        tracer_provider ||= OpenTelemetry.tracer_provider
        tracer = tracer_provider.tracer("braintrust.functions")

        Task.new(function_name) do |input:|
          tracer.in_span("function: #{slug}") do |span|
            span.set_attribute("braintrust.span_attributes", JSON.dump({type: "function"}))
            span.set_attribute("braintrust.input_json", JSON.dump(input))
            span.set_attribute("braintrust.function.name", function_name)
            span.set_attribute("braintrust.function.id", function_id)
            span.set_attribute("braintrust.function.slug", slug)

            begin
              output = api.functions.invoke(id: function_id, input: input)
              span.set_attribute("braintrust.output_json", JSON.dump(output))
              output
            rescue => e
              # Record the failure on the span, then let the caller see it.
              span.record_exception(e)
              span.status = OpenTelemetry::Trace::Status.error(e.message)
              raise
            end
          end
        end
      end

      # Create a scorer that invokes a remote function.
      # Resolve by project + slug, or by function UUID (id). When both are
      # given, id takes precedence.
      #
      # @param project [String, nil] Project name (used with slug)
      # @param slug [String, nil] Function slug (used with project)
      # @param id [String, nil] Function UUID (alternative to project + slug)
      # @param version [String, nil] Optional version to pin to (used with id)
      # @param state [State, nil] Braintrust state (defaults to global)
      # @param tracer_provider [TracerProvider, nil] OpenTelemetry tracer provider
      # @return [Scorer] Scorer object that invokes remote function
      # @raise [ArgumentError] if neither id: nor both project: and slug: are given
      # @raise [Error] if no state is available or the function cannot be found
      def scorer(project: nil, slug: nil, id: nil, version: nil, state: nil, tracer_provider: nil)
        by_id = !id.nil?
        by_project_slug = !project.nil? && !slug.nil?

        unless by_id || by_project_slug
          raise ArgumentError, "scorer requires either id: or both project: and slug:"
        end

        state ||= Braintrust.current_state
        raise Error, "No state available" unless state

        api = API.new(state: state)

        function_metadata = if id
          api.login
          api.functions.get(id: id, version: version)
        else
          resolve_function(api, project, slug)
        end

        function_id = function_metadata["id"]
        function_name = function_metadata["name"] || id || slug

        tracer_provider ||= OpenTelemetry.tracer_provider
        tracer = tracer_provider.tracer("braintrust.functions")

        build_scorer(function_id: function_id, function_name: function_name, api: api, tracer: tracer)
      end

      private

      # Build a Scorer that invokes a remote function.
      #
      # The scorer sends {input:, expected:, output:, metadata:} to the remote
      # function and normalizes whatever comes back with {#coerce_score}.
      #
      # @param function_id [String] Function UUID
      # @param function_name [String] Function display name
      # @param api [API] Braintrust API client
      # @param tracer [OpenTelemetry::Trace::Tracer] Tracer instance
      # @return [Scorer]
      def build_scorer(function_id:, function_name:, api:, tracer:)
        Scorer.new(function_name) do |input:, expected:, output:, metadata:|
          tracer.in_span("function: #{function_name}") do |span|
            scorer_input = {
              input: input,
              expected: expected,
              output: output,
              metadata: metadata
            }

            span.set_attribute("braintrust.span_attributes", JSON.dump({type: "function"}))
            span.set_attribute("braintrust.input_json", JSON.dump(scorer_input))
            span.set_attribute("braintrust.function.name", function_name)
            span.set_attribute("braintrust.function.id", function_id)

            begin
              result = api.functions.invoke(id: function_id, input: scorer_input)
              score = coerce_score(result)

              span.set_attribute("braintrust.output_json", JSON.dump(score))
              score
            rescue => e
              # Record the failure on the span, then let the caller see it.
              span.record_exception(e)
              span.status = OpenTelemetry::Trace::Status.error(e.message)
              raise
            end
          end
        end
      end

      # Normalize a remote function result into a Float score (or nil).
      #
      # A Hash must carry a "score" key; its value is normalized with the same
      # rules as a bare result, so {"score" => nil} stays nil (instead of
      # becoming 0.0 via nil.to_f) and {"score" => true} becomes 1.0.
      #
      # @param result [Numeric, Boolean, String, Hash, nil] Raw function result
      # @return [Float, nil] Normalized score; nil when the result/score is nil
      # @raise [Error] if a Hash has no "score" key or the type is unsupported
      def coerce_score(result)
        case result
        when Numeric, String
          result.to_f
        when true
          1.0
        when false
          0.0
        when nil
          nil
        when Hash
          raise Error, "Hash result must contain 'score' key" unless result.key?("score")

          coerce_score(result["score"])
        else
          raise Error, "Unsupported result type: #{result.class}"
        end
      end

      # Resolve function metadata from project name and slug.
      #
      # @param api [API] API client
      # @param project [String] Project name
      # @param slug [String] Function slug
      # @return [Hash] Function metadata (first match returned by the API)
      # @raise [Error] if the listing contains no matching function
      def resolve_function(api, project, slug)
        functions = api.functions.list(project_name: project, slug: slug)["objects"]

        if functions.nil? || functions.empty?
          raise Error, "Function '#{slug}' not found in project '#{project}'"
        end

        functions.first
      end
    end
  end
end
|