braintrust 0.0.1.alpha.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +24 -0
- data/lib/braintrust/api/datasets.rb +198 -0
- data/lib/braintrust/api/functions.rb +152 -0
- data/lib/braintrust/api/internal/auth.rb +97 -0
- data/lib/braintrust/api.rb +29 -0
- data/lib/braintrust/config.rb +30 -0
- data/lib/braintrust/eval/case.rb +12 -0
- data/lib/braintrust/eval/cases.rb +58 -0
- data/lib/braintrust/eval/functions.rb +137 -0
- data/lib/braintrust/eval/result.rb +53 -0
- data/lib/braintrust/eval/scorer.rb +108 -0
- data/lib/braintrust/eval.rb +418 -0
- data/lib/braintrust/internal/experiments.rb +129 -0
- data/lib/braintrust/logger.rb +32 -0
- data/lib/braintrust/state.rb +121 -0
- data/lib/braintrust/trace/openai.rb +87 -0
- data/lib/braintrust/trace/span_processor.rb +71 -0
- data/lib/braintrust/trace.rb +108 -0
- data/lib/braintrust/version.rb +5 -0
- data/lib/braintrust.rb +110 -0
- metadata +176 -0
|
@@ -0,0 +1,137 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "../api"
|
|
4
|
+
require_relative "scorer"
|
|
5
|
+
require "opentelemetry/sdk"
|
|
6
|
+
require "json"
|
|
7
|
+
|
|
8
|
+
module Braintrust
|
|
9
|
+
module Eval
|
|
10
|
+
# Functions provides remote function execution capabilities.
# It builds callables that invoke functions hosted on Braintrust servers
# (looked up by project name + slug) so they can be used as tasks or scorers.
module Functions
  class << self
    # Create a task callable that invokes a remote function.
    # The function is resolved once, when this method is called; each call of
    # the returned lambda opens a "function: <slug>" span and invokes it.
    # @param project [String] Project name
    # @param slug [String] Function slug
    # @param state [State, nil] Braintrust state (defaults to global)
    # @param tracer_provider [TracerProvider, nil] OpenTelemetry tracer provider
    # @return [Proc] Callable that accepts input and returns output
    # @raise [Error] if no state is available or the function cannot be found
    def task(project:, slug:, state: nil, tracer_provider: nil)
      api, function_id, function_name, tracer =
        resolve_context(project, slug, state, tracer_provider)

      lambda do |input|
        invoke_with_span(api, tracer, function_id, function_name, slug, input)
      end
    end

    # Create a scorer that invokes a remote function.
    # The remote function receives a hash with :input, :expected, :output and
    # :metadata, and its result is coerced to a Float score.
    # @param project [String] Project name
    # @param slug [String] Function slug
    # @param state [State, nil] Braintrust state (defaults to global)
    # @param tracer_provider [TracerProvider, nil] OpenTelemetry tracer provider
    # @return [Scorer] Scorer object that invokes remote function
    # @raise [Error] if no state is available or the function cannot be found
    def scorer(project:, slug:, state: nil, tracer_provider: nil)
      api, function_id, function_name, tracer =
        resolve_context(project, slug, state, tracer_provider)

      Scorer.new(slug) do |input, expected, output, metadata|
        # The remote scorer receives all scorer arguments
        scorer_input = {
          input: input,
          expected: expected,
          output: output,
          metadata: metadata
        }

        invoke_with_span(api, tracer, function_id, function_name, slug, scorer_input) do |result|
          coerce_score(result)
        end
      end
    end

    private

    # Shared setup for task and scorer: validate state, resolve the function
    # and obtain a tracer.
    # @param project [String] Project name
    # @param slug [String] Function slug
    # @param state [State, nil] Braintrust state (defaults to global)
    # @param tracer_provider [TracerProvider, nil] OpenTelemetry tracer provider
    # @return [Array] [api, function_id, function_name, tracer]
    # @raise [Error] if no state is available or the function cannot be found
    def resolve_context(project, slug, state, tracer_provider)
      state ||= Braintrust.current_state
      raise Error, "No state available" unless state

      # Resolve function ID from project + slug
      api = API.new(state: state)
      function_metadata = resolve_function(api, project, slug)

      # Fall back to the global provider when none was injected
      tracer_provider ||= OpenTelemetry.tracer_provider
      tracer = tracer_provider.tracer("braintrust.functions")

      [api, function_metadata["id"], function_metadata["name"] || slug, tracer]
    end

    # Invoke the remote function inside a span, recording input, output and
    # function identity as span attributes.
    # When a block is given, the raw result is passed through it and the
    # block's value is what gets recorded and returned.
    # @param api [API] API client
    # @param tracer [Tracer] Tracer used to create the span
    # @param function_id [String] Resolved function ID
    # @param function_name [String] Function display name
    # @param slug [String] Function slug
    # @param input [Object] JSON-serializable input sent to the function
    # @return [Object] The (optionally post-processed) function result
    def invoke_with_span(api, tracer, function_id, function_name, slug, input)
      tracer.in_span("function: #{slug}") do |span|
        span.set_attribute("braintrust.span_attributes", JSON.dump({type: "function"}))
        span.set_attribute("braintrust.input_json", JSON.dump(input))
        span.set_attribute("braintrust.function.name", function_name)
        span.set_attribute("braintrust.function.id", function_id)
        span.set_attribute("braintrust.function.slug", slug)

        begin
          # Invoke the function via API
          result = api.functions.invoke(id: function_id, input: input)
          result = yield(result) if block_given?
          span.set_attribute("braintrust.output_json", JSON.dump(result))
          result
        rescue => e
          # Record exception and set error status, then let the caller see it
          span.record_exception(e)
          span.status = OpenTelemetry::Trace::Status.error(e.message)
          raise
        end
      end
    end

    # Coerce a remote function result into a Float score.
    # Numerics are converted directly; anything else goes through to_s.to_f,
    # which yields 0.0 for nil or non-numeric text.
    # A Hash carrying a "score"/:score key is unwrapped first; previously such
    # a result was stringified as a whole and always scored 0.0.
    # NOTE(review): assumes remote scorers may return a score object such as
    # {"score" => 0.8} - confirm against the function invoke API.
    # @param result [Object] Raw value returned by the remote function
    # @return [Float] The score
    def coerce_score(result)
      value = result
      if result.is_a?(Hash)
        value = result.fetch("score") { result[:score] }
      end

      value.is_a?(Numeric) ? value.to_f : value.to_s.to_f
    end

    # Resolve function metadata from project name and slug.
    # Returns the first entry of the "objects" list in the API response.
    # @param api [API] API client
    # @param project [String] Project name
    # @param slug [String] Function slug
    # @return [Hash] Function metadata
    # @raise [Error] if the listing is missing or empty
    def resolve_function(api, project, slug)
      result = api.functions.list(project_name: project, slug: slug)
      functions = result["objects"]

      if functions.nil? || functions.empty?
        raise Error, "Function '#{slug}' not found in project '#{project}'"
      end

      functions.first
    end
  end
end
|
|
136
|
+
end
|
|
137
|
+
end
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Braintrust
|
|
4
|
+
module Eval
|
|
5
|
+
# Result is the value object returned from an evaluation run.
# It carries the experiment's identifiers, a UI link, the collected errors
# and how long the run took.
class Result
  attr_reader :experiment_id
  attr_reader :experiment_name
  attr_reader :project_id
  attr_reader :permalink
  attr_reader :errors
  attr_reader :duration

  # Build a result from its parts (all keywords are required)
  # @param experiment_id [String] The experiment ID
  # @param experiment_name [String] The experiment name
  # @param project_id [String] The project ID
  # @param permalink [String] Link to view the experiment in Braintrust UI
  # @param errors [Array<String>] List of errors that occurred
  # @param duration [Float] Duration in seconds
  def initialize(experiment_id:, experiment_name:, project_id:,
    permalink:, errors:, duration:)
    @experiment_id, @experiment_name, @project_id =
      experiment_id, experiment_name, project_id
    @permalink, @errors, @duration = permalink, errors, duration
  end

  # True when the run collected no errors
  # @return [Boolean]
  def success?
    errors.empty?
  end

  # True when the run collected at least one error
  # @return [Boolean]
  def failed?
    return false if success?

    true
  end

  # Render a multi-line, human-readable summary (Go SDK format):
  # one "Label: value" line per field, duration rounded to 2 decimals
  # @return [String]
  def to_s
    summary = {
      "Experiment" => experiment_name,
      "ID" => experiment_id,
      "Link" => permalink,
      "Duration" => "#{duration.round(2)}s",
      "Errors" => errors.length
    }

    summary.map { |label, value| "#{label}: #{value}" }.join("\n")
  end
end
|
|
52
|
+
end
|
|
53
|
+
end
|
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Braintrust
|
|
4
|
+
module Eval
|
|
5
|
+
# Scorer wraps a scoring function that evaluates task output against expected values
# Scorers can accept 3 params (input, expected, output) or 4 params (input, expected, output, metadata)
# They can return a float, hash, or array of hashes
class Scorer
  # Name used when none is given and none can be detected from the callable
  DEFAULT_NAME = "scorer"

  attr_reader :name

  # Create a new scorer
  # @param name_or_callable [String, Symbol, #call] Name or callable (if callable, name is auto-detected)
  # @param callable [#call, nil] Callable if name was provided separately
  # @param block [Proc, nil] Block if no callable provided
  # @raise [ArgumentError] if no callable/block is given, the object does not
  #   respond to :call, or its arity is not supported
  def initialize(name_or_callable = nil, callable = nil, &block)
    if name_or_callable.nil? && callable.nil? && block.nil?
      raise ArgumentError, "Must provide callable or block"
    end

    if name_or_callable.is_a?(String) || name_or_callable.is_a?(Symbol)
      # Explicit name: the callable comes from the second arg or the block
      @name = name_or_callable.to_s
      @callable = callable || block
      raise ArgumentError, "Must provide callable or block" unless @callable
    else
      # First arg is the callable (or absent), try to auto-detect name
      @callable = name_or_callable || callable || block
      @name = detect_name(@callable)
    end

    unless @callable.respond_to?(:call)
      raise ArgumentError, "Scorer must be callable (respond to :call)"
    end

    # Normalize to a callable that always takes 4 arguments
    @wrapped_callable = wrap_callable(@callable)
  end

  # Call the scorer
  # @param input [Object] The input to the task
  # @param expected [Object] The expected output
  # @param output [Object] The actual output from the task
  # @param metadata [Hash] Optional metadata
  # @return [Float, Hash, Array] Score value(s)
  def call(input, expected, output, metadata = {})
    @wrapped_callable.call(input, expected, output, metadata)
  end

  private

  # Detect the name from a callable object
  # Uses the object's #name (e.g. Method objects) when it has one, and falls
  # back to DEFAULT_NAME when there is no #name or it is nil/empty
  # (e.g. an anonymous class that defines .call)
  # @param callable [#call] The callable
  # @return [String] The detected name
  def detect_name(callable)
    return DEFAULT_NAME unless callable.respond_to?(:name)

    detected = callable.name.to_s
    detected.empty? ? DEFAULT_NAME : detected
  end

  # Wrap the callable to always accept 4 parameters
  # @param callable [#call] The callable to wrap
  # @return [#call] Callable that accepts 4 params
  # @raise [ArgumentError] if the arity is not 3, 4, -4 or -1
  def wrap_callable(callable)
    arity = callable_arity(callable)

    case arity
    when 3
      # Callable takes 3 params - wrap to ignore metadata
      ->(input, expected, output, _metadata) {
        callable.call(input, expected, output)
      }
    when 4, -4, -1
      # Callable takes 4 params (or variadic with 4+)
      # -4 means 3 required params plus optional/splat params
      # -1 means no required params (e.g. *args)
      callable
    else
      raise ArgumentError, "Scorer must accept 3 or 4 parameters (got arity #{arity})"
    end
  end

  # Get the arity of a callable
  # Procs and Method objects report their own arity; for any other object
  # the arity of its #call method is used
  # @param callable [#call] The callable
  # @return [Integer] The arity
  def callable_arity(callable)
    return callable.arity if callable.respond_to?(:arity)

    callable.method(:call).arity
  end
end
|
|
107
|
+
end
|
|
108
|
+
end
|