braintrust 0.2.0 → 0.3.0
This diff shows the changes between two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in the respective public registry.
- checksums.yaml +4 -4
- data/README.md +148 -24
- data/lib/braintrust/api/internal/btql.rb +124 -0
- data/lib/braintrust/api/internal/experiments.rb +19 -0
- data/lib/braintrust/api/internal/projects.rb +19 -0
- data/lib/braintrust/contrib/rails/server/application_controller.rb +34 -0
- data/lib/braintrust/contrib/rails/server/engine.rb +72 -0
- data/lib/braintrust/contrib/rails/server/eval_controller.rb +36 -0
- data/lib/braintrust/contrib/rails/server/generator.rb +43 -0
- data/lib/braintrust/contrib/rails/server/health_controller.rb +15 -0
- data/lib/braintrust/contrib/rails/server/list_controller.rb +16 -0
- data/lib/braintrust/contrib/rails/server/routes.rb +8 -0
- data/lib/braintrust/contrib/rails/server.rb +20 -0
- data/lib/braintrust/dataset.rb +6 -3
- data/lib/braintrust/eval/context.rb +131 -0
- data/lib/braintrust/eval/evaluator.rb +11 -5
- data/lib/braintrust/eval/functions.rb +10 -166
- data/lib/braintrust/eval/runner.rb +165 -145
- data/lib/braintrust/eval/scorer.rb +24 -96
- data/lib/braintrust/eval/trace.rb +129 -0
- data/lib/braintrust/eval.rb +60 -132
- data/lib/braintrust/functions.rb +168 -0
- data/lib/braintrust/internal/callable.rb +83 -0
- data/lib/braintrust/logger.rb +9 -0
- data/lib/braintrust/scorer.rb +173 -0
- data/lib/braintrust/server/handlers/eval.rb +8 -168
- data/lib/braintrust/server/handlers/list.rb +3 -41
- data/lib/braintrust/server/rack.rb +2 -0
- data/lib/braintrust/server/services/eval_service.rb +214 -0
- data/lib/braintrust/server/services/list_service.rb +64 -0
- data/lib/braintrust/task.rb +108 -0
- data/lib/braintrust/trace/span_processor.rb +0 -5
- data/lib/braintrust/version.rb +1 -1
- metadata +18 -1
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
begin
|
|
4
|
+
require "action_controller"
|
|
5
|
+
require "rails/engine"
|
|
6
|
+
rescue LoadError
|
|
7
|
+
raise LoadError,
|
|
8
|
+
"Rails (actionpack + railties) is required for the Braintrust Rails server engine. " \
|
|
9
|
+
"Add `gem 'rails'` or `gem 'actionpack'` and `gem 'railties'` to your Gemfile."
|
|
10
|
+
end
|
|
11
|
+
|
|
12
|
+
require "json"
|
|
13
|
+
require_relative "../../eval"
|
|
14
|
+
require_relative "../../server/sse"
|
|
15
|
+
require_relative "../../server/auth/no_auth"
|
|
16
|
+
require_relative "../../server/auth/clerk_token"
|
|
17
|
+
require_relative "../../server/middleware/cors"
|
|
18
|
+
require_relative "../../server/services/list_service"
|
|
19
|
+
require_relative "../../server/services/eval_service"
|
|
20
|
+
require_relative "server/engine"
|
data/lib/braintrust/dataset.rb
CHANGED
|
@@ -181,9 +181,12 @@ module Braintrust
|
|
|
181
181
|
created: raw["created"]
|
|
182
182
|
)
|
|
183
183
|
end
|
|
184
|
+
|
|
185
|
+
# Value object wrapping a dataset UUID for resolution by ID.
|
|
186
|
+
# Used by Eval.run to distinguish dataset-by-ID from dataset-by-name.
|
|
187
|
+
ID = Struct.new(:id, keyword_init: true)
|
|
184
188
|
end
|
|
185
189
|
|
|
186
|
-
#
|
|
187
|
-
|
|
188
|
-
DatasetId = Struct.new(:id, keyword_init: true)
|
|
190
|
+
# @deprecated Use {Braintrust::Dataset::ID} instead.
|
|
191
|
+
DatasetId = Dataset::ID
|
|
189
192
|
end
|
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "cases"
|
|
4
|
+
|
|
5
|
+
module Braintrust
|
|
6
|
+
module Eval
|
|
7
|
+
# Holds all normalized, ready-to-execute eval components.
|
|
8
|
+
# Use Context.build to construct from raw user inputs.
|
|
9
|
+
class Context
|
|
10
|
+
attr_reader :task, :scorers, :cases, :experiment_id, :experiment_name,
|
|
11
|
+
:project_id, :project_name, :state, :tracer_provider,
|
|
12
|
+
:on_progress, :parent_span_attr, :generation
|
|
13
|
+
|
|
14
|
+
def initialize(task:, scorers:, cases:, experiment_id: nil, experiment_name: nil,
|
|
15
|
+
project_id: nil, project_name: nil, state: nil, tracer_provider: nil,
|
|
16
|
+
on_progress: nil, parent_span_attr: nil, generation: nil)
|
|
17
|
+
@task = task
|
|
18
|
+
@scorers = scorers
|
|
19
|
+
@cases = cases
|
|
20
|
+
@experiment_id = experiment_id
|
|
21
|
+
@experiment_name = experiment_name
|
|
22
|
+
@project_id = project_id
|
|
23
|
+
@project_name = project_name
|
|
24
|
+
@state = state
|
|
25
|
+
@tracer_provider = tracer_provider
|
|
26
|
+
@on_progress = on_progress
|
|
27
|
+
@parent_span_attr = parent_span_attr
|
|
28
|
+
@generation = generation
|
|
29
|
+
end
|
|
30
|
+
|
|
31
|
+
# Build a Context from raw user inputs.
|
|
32
|
+
# Factory normalizes task, scorers, and cases into typed wrappers.
|
|
33
|
+
# Parent is resolved into parent_span_attr and generation.
|
|
34
|
+
def self.build(task:, scorers:, cases:, experiment_id: nil, experiment_name: nil,
|
|
35
|
+
project_id: nil, project_name: nil, state: nil, tracer_provider: nil,
|
|
36
|
+
on_progress: nil, parent: nil)
|
|
37
|
+
factory = Factory.new(state: state, tracer_provider: tracer_provider, project_name: project_name)
|
|
38
|
+
|
|
39
|
+
Context.new(
|
|
40
|
+
task: factory.normalize_task(task),
|
|
41
|
+
scorers: factory.normalize_scorers(scorers),
|
|
42
|
+
cases: factory.normalize_cases(cases),
|
|
43
|
+
experiment_id: experiment_id,
|
|
44
|
+
experiment_name: experiment_name,
|
|
45
|
+
project_id: project_id,
|
|
46
|
+
project_name: project_name,
|
|
47
|
+
state: state,
|
|
48
|
+
tracer_provider: tracer_provider,
|
|
49
|
+
on_progress: on_progress,
|
|
50
|
+
parent_span_attr: factory.resolve_parent_span_attr(parent),
|
|
51
|
+
generation: parent&.dig(:generation)
|
|
52
|
+
)
|
|
53
|
+
end
|
|
54
|
+
|
|
55
|
+
# Encapsulates normalization of raw user inputs into typed wrappers.
|
|
56
|
+
class Factory
|
|
57
|
+
def initialize(state: nil, tracer_provider: nil, project_name: nil)
|
|
58
|
+
@state = state
|
|
59
|
+
@tracer_provider = tracer_provider
|
|
60
|
+
@project_name = project_name
|
|
61
|
+
end
|
|
62
|
+
|
|
63
|
+
def normalize_cases(raw)
|
|
64
|
+
case raw
|
|
65
|
+
when Cases
|
|
66
|
+
raw
|
|
67
|
+
when Array, Enumerable
|
|
68
|
+
Cases.new(raw)
|
|
69
|
+
else
|
|
70
|
+
if raw.respond_to?(:each)
|
|
71
|
+
Cases.new(raw)
|
|
72
|
+
else
|
|
73
|
+
raise ArgumentError, "cases must be Array or Enumerable"
|
|
74
|
+
end
|
|
75
|
+
end
|
|
76
|
+
end
|
|
77
|
+
|
|
78
|
+
def resolve_parent_span_attr(parent)
|
|
79
|
+
return nil unless parent
|
|
80
|
+
"#{parent[:object_type]}:#{parent[:object_id]}"
|
|
81
|
+
end
|
|
82
|
+
|
|
83
|
+
def normalize_task(raw)
|
|
84
|
+
case raw
|
|
85
|
+
when Task
|
|
86
|
+
raw
|
|
87
|
+
when Proc
|
|
88
|
+
# Pass Proc/Lambda directly to preserve keyword arg info.
|
|
89
|
+
# Legacy positional lambdas (arity 1) are auto-wrapped by Task#wrap_block.
|
|
90
|
+
Task.new(&raw)
|
|
91
|
+
else
|
|
92
|
+
# Callable class: wrap via method(:call) to preserve keyword arg info
|
|
93
|
+
name = raw.respond_to?(:name) ? raw.name : nil
|
|
94
|
+
Task.new(name, &raw.method(:call))
|
|
95
|
+
end
|
|
96
|
+
end
|
|
97
|
+
|
|
98
|
+
def normalize_scorers(raw)
|
|
99
|
+
raw.map do |scorer|
|
|
100
|
+
case scorer
|
|
101
|
+
when String
|
|
102
|
+
raise ArgumentError, "project is required to resolve scorer slug '#{scorer}'" unless @project_name
|
|
103
|
+
Braintrust::Functions.scorer(
|
|
104
|
+
project: @project_name,
|
|
105
|
+
slug: scorer,
|
|
106
|
+
state: @state,
|
|
107
|
+
tracer_provider: @tracer_provider
|
|
108
|
+
)
|
|
109
|
+
when Braintrust::Scorer::ID
|
|
110
|
+
Braintrust::Functions.scorer(
|
|
111
|
+
id: scorer.function_id,
|
|
112
|
+
version: scorer.version,
|
|
113
|
+
state: @state,
|
|
114
|
+
tracer_provider: @tracer_provider
|
|
115
|
+
)
|
|
116
|
+
when Braintrust::Scorer
|
|
117
|
+
scorer
|
|
118
|
+
when Proc
|
|
119
|
+
# Pass Proc/Lambda directly to preserve keyword arg info
|
|
120
|
+
# (method(:call) loses parameter metadata)
|
|
121
|
+
Braintrust::Scorer.new(&scorer)
|
|
122
|
+
else
|
|
123
|
+
name = scorer.respond_to?(:name) ? scorer.name : nil
|
|
124
|
+
Braintrust::Scorer.new(name, &scorer.method(:call))
|
|
125
|
+
end
|
|
126
|
+
end
|
|
127
|
+
end
|
|
128
|
+
end
|
|
129
|
+
end
|
|
130
|
+
end
|
|
131
|
+
end
|
|
@@ -5,21 +5,27 @@ module Braintrust
|
|
|
5
5
|
# Base class for evaluators. Subclass and override #task and #scorers,
|
|
6
6
|
# or instantiate directly with keyword arguments.
|
|
7
7
|
#
|
|
8
|
+
# Evaluators are used with the dev server, which reports scorer names
|
|
9
|
+
# to the Braintrust UI. Always use named scorers (via Scorer.new or
|
|
10
|
+
# subclass) so they display meaningfully.
|
|
11
|
+
#
|
|
8
12
|
# @example Subclass pattern
|
|
9
13
|
# class FoodClassifier < Braintrust::Eval::Evaluator
|
|
10
14
|
# def task
|
|
11
|
-
# ->(input) { classify(input) }
|
|
15
|
+
# ->(input:) { classify(input) }
|
|
12
16
|
# end
|
|
13
17
|
#
|
|
14
18
|
# def scorers
|
|
15
|
-
# [Braintrust::
|
|
19
|
+
# [Braintrust::Scorer.new("exact_match") { |expected:, output:| output == expected ? 1.0 : 0.0 }]
|
|
16
20
|
# end
|
|
17
21
|
# end
|
|
18
22
|
#
|
|
19
23
|
# @example Inline pattern
|
|
20
24
|
# Braintrust::Eval::Evaluator.new(
|
|
21
|
-
# task: ->(input) { input.upcase },
|
|
22
|
-
# scorers: [
|
|
25
|
+
# task: ->(input:) { input.upcase },
|
|
26
|
+
# scorers: [
|
|
27
|
+
# Braintrust::Scorer.new("exact_match") { |expected:, output:| output == expected ? 1.0 : 0.0 }
|
|
28
|
+
# ]
|
|
23
29
|
# )
|
|
24
30
|
class Evaluator
|
|
25
31
|
attr_accessor :task, :scorers, :parameters
|
|
@@ -48,7 +54,7 @@ module Braintrust
|
|
|
48
54
|
# @param project [String, nil] Project name
|
|
49
55
|
# @param experiment [String, nil] Experiment name
|
|
50
56
|
# @param project_id [String, nil] Project UUID (skips project creation)
|
|
51
|
-
# @param dataset [String, Hash, Dataset,
|
|
57
|
+
# @param dataset [String, Hash, Dataset, Dataset::ID, nil] Dataset to fetch
|
|
52
58
|
# @param scorers [Array, nil] Additional scorers (merged with evaluator's own)
|
|
53
59
|
# @param parent [Hash, nil] Parent span context
|
|
54
60
|
# @param state [State, nil] Braintrust state
|
|
@@ -1,178 +1,22 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
-
require_relative "../
|
|
4
|
-
require_relative "scorer"
|
|
5
|
-
require "opentelemetry/sdk"
|
|
6
|
-
require "json"
|
|
3
|
+
require_relative "../functions"
|
|
7
4
|
|
|
8
5
|
module Braintrust
|
|
9
6
|
module Eval
|
|
10
|
-
#
|
|
11
|
-
# Allows calling prompts hosted on Braintrust servers as tasks or scorers
|
|
7
|
+
# @deprecated Use {Braintrust::Functions} instead.
|
|
12
8
|
module Functions
|
|
13
9
|
class << self
|
|
14
|
-
#
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
# @param tracer_provider [TracerProvider, nil] OpenTelemetry tracer provider
|
|
19
|
-
# @return [Proc] Callable that accepts input and returns output
|
|
20
|
-
def task(project:, slug:, state: nil, tracer_provider: nil)
|
|
21
|
-
state ||= Braintrust.current_state
|
|
22
|
-
raise Error, "No state available" unless state
|
|
23
|
-
|
|
24
|
-
# Resolve function ID from project + slug
|
|
25
|
-
api = API.new(state: state)
|
|
26
|
-
function_metadata = resolve_function(api, project, slug)
|
|
27
|
-
function_id = function_metadata["id"]
|
|
28
|
-
function_name = function_metadata["name"] || slug
|
|
29
|
-
|
|
30
|
-
# Get tracer for creating spans
|
|
31
|
-
tracer_provider ||= OpenTelemetry.tracer_provider
|
|
32
|
-
tracer = tracer_provider.tracer("braintrust.functions")
|
|
33
|
-
|
|
34
|
-
# Return a lambda that invokes the remote function with tracing
|
|
35
|
-
lambda do |input|
|
|
36
|
-
# Create a span for the function invocation
|
|
37
|
-
tracer.in_span("function: #{slug}") do |span|
|
|
38
|
-
span.set_attribute("braintrust.span_attributes", JSON.dump({type: "function"}))
|
|
39
|
-
span.set_attribute("braintrust.input_json", JSON.dump(input))
|
|
40
|
-
span.set_attribute("braintrust.function.name", function_name)
|
|
41
|
-
span.set_attribute("braintrust.function.id", function_id)
|
|
42
|
-
span.set_attribute("braintrust.function.slug", slug)
|
|
43
|
-
|
|
44
|
-
begin
|
|
45
|
-
# Invoke the function via API
|
|
46
|
-
output = api.functions.invoke(id: function_id, input: input)
|
|
47
|
-
span.set_attribute("braintrust.output_json", JSON.dump(output))
|
|
48
|
-
output
|
|
49
|
-
rescue => e
|
|
50
|
-
# Record exception and set error status
|
|
51
|
-
span.record_exception(e)
|
|
52
|
-
span.status = OpenTelemetry::Trace::Status.error(e.message)
|
|
53
|
-
raise
|
|
54
|
-
end
|
|
55
|
-
end
|
|
56
|
-
end
|
|
57
|
-
end
|
|
58
|
-
|
|
59
|
-
# Create a scorer that invokes a remote function by ID
|
|
60
|
-
# @param id [String] Function UUID
|
|
61
|
-
# @param version [String, nil] Optional version to pin to
|
|
62
|
-
# @param state [State, nil] Braintrust state (defaults to global)
|
|
63
|
-
# @param tracer_provider [TracerProvider, nil] OpenTelemetry tracer provider
|
|
64
|
-
# @return [Scorer] Scorer object that invokes remote function
|
|
65
|
-
def scorer_by_id(id:, state: nil, version: nil, tracer_provider: nil)
|
|
66
|
-
state ||= Braintrust.current_state
|
|
67
|
-
api = API.new(state: state)
|
|
68
|
-
api.login
|
|
69
|
-
|
|
70
|
-
function_metadata = api.functions.get(id: id, version: version)
|
|
71
|
-
function_id = function_metadata["id"]
|
|
72
|
-
function_name = function_metadata["name"] || id
|
|
73
|
-
|
|
74
|
-
tracer_provider ||= OpenTelemetry.tracer_provider
|
|
75
|
-
tracer = tracer_provider.tracer("braintrust.functions")
|
|
76
|
-
|
|
77
|
-
build_scorer(function_id: function_id, function_name: function_name, api: api, tracer: tracer)
|
|
10
|
+
# @deprecated Use {Braintrust::Functions.task} instead.
|
|
11
|
+
def task(**kwargs)
|
|
12
|
+
Log.warn_once(:eval_functions_task, "Braintrust::Eval::Functions.task is deprecated: use Braintrust::Functions.task instead.")
|
|
13
|
+
Braintrust::Functions.task(**kwargs)
|
|
78
14
|
end
|
|
79
15
|
|
|
80
|
-
#
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
# @param tracer_provider [TracerProvider, nil] OpenTelemetry tracer provider
|
|
85
|
-
# @return [Scorer] Scorer object that invokes remote function
|
|
86
|
-
def scorer(project:, slug:, state: nil, tracer_provider: nil)
|
|
87
|
-
state ||= Braintrust.current_state
|
|
88
|
-
raise Error, "No state available" unless state
|
|
89
|
-
|
|
90
|
-
# Resolve function ID from project + slug
|
|
91
|
-
api = API.new(state: state)
|
|
92
|
-
function_metadata = resolve_function(api, project, slug)
|
|
93
|
-
function_id = function_metadata["id"]
|
|
94
|
-
function_name = function_metadata["name"] || slug
|
|
95
|
-
|
|
96
|
-
# Get tracer for creating spans
|
|
97
|
-
tracer_provider ||= OpenTelemetry.tracer_provider
|
|
98
|
-
tracer = tracer_provider.tracer("braintrust.functions")
|
|
99
|
-
|
|
100
|
-
build_scorer(function_id: function_id, function_name: function_name, api: api, tracer: tracer)
|
|
101
|
-
end
|
|
102
|
-
|
|
103
|
-
private
|
|
104
|
-
|
|
105
|
-
# Build a Scorer that invokes a remote function
|
|
106
|
-
# Shared implementation used by both scorer and scorer_by_id
|
|
107
|
-
# @param function_id [String] Function UUID
|
|
108
|
-
# @param function_name [String] Function display name
|
|
109
|
-
# @param api [API] Braintrust API client
|
|
110
|
-
# @param tracer [OpenTelemetry::Trace::Tracer] Tracer instance
|
|
111
|
-
# @return [Scorer]
|
|
112
|
-
def build_scorer(function_id:, function_name:, api:, tracer:)
|
|
113
|
-
Scorer.new(function_name) do |input, expected, output, metadata|
|
|
114
|
-
tracer.in_span("function: #{function_name}") do |span|
|
|
115
|
-
scorer_input = {
|
|
116
|
-
input: input,
|
|
117
|
-
expected: expected,
|
|
118
|
-
output: output,
|
|
119
|
-
metadata: metadata
|
|
120
|
-
}
|
|
121
|
-
|
|
122
|
-
span.set_attribute("braintrust.span_attributes", JSON.dump({type: "function"}))
|
|
123
|
-
span.set_attribute("braintrust.input_json", JSON.dump(scorer_input))
|
|
124
|
-
span.set_attribute("braintrust.function.name", function_name)
|
|
125
|
-
span.set_attribute("braintrust.function.id", function_id)
|
|
126
|
-
|
|
127
|
-
begin
|
|
128
|
-
result = api.functions.invoke(id: function_id, input: scorer_input)
|
|
129
|
-
|
|
130
|
-
score = case result
|
|
131
|
-
when Numeric
|
|
132
|
-
result.to_f
|
|
133
|
-
when true
|
|
134
|
-
1.0
|
|
135
|
-
when false
|
|
136
|
-
0.0
|
|
137
|
-
when Hash
|
|
138
|
-
if result.key?("score")
|
|
139
|
-
result["score"].to_f
|
|
140
|
-
else
|
|
141
|
-
raise Error, "Hash result must contain 'score' key"
|
|
142
|
-
end
|
|
143
|
-
when String
|
|
144
|
-
result.to_f
|
|
145
|
-
when nil
|
|
146
|
-
nil
|
|
147
|
-
else
|
|
148
|
-
raise Error, "Unsupported result type: #{result.class}"
|
|
149
|
-
end
|
|
150
|
-
|
|
151
|
-
span.set_attribute("braintrust.output_json", JSON.dump(score))
|
|
152
|
-
score
|
|
153
|
-
rescue => e
|
|
154
|
-
span.record_exception(e)
|
|
155
|
-
span.status = OpenTelemetry::Trace::Status.error(e.message)
|
|
156
|
-
raise
|
|
157
|
-
end
|
|
158
|
-
end
|
|
159
|
-
end
|
|
160
|
-
end
|
|
161
|
-
|
|
162
|
-
# Resolve function ID from project name and slug
|
|
163
|
-
# @param api [API] API client
|
|
164
|
-
# @param project [String] Project name
|
|
165
|
-
# @param slug [String] Function slug
|
|
166
|
-
# @return [Hash] Function metadata
|
|
167
|
-
def resolve_function(api, project, slug)
|
|
168
|
-
result = api.functions.list(project_name: project, slug: slug)
|
|
169
|
-
functions = result["objects"]
|
|
170
|
-
|
|
171
|
-
if functions.nil? || functions.empty?
|
|
172
|
-
raise Error, "Function '#{slug}' not found in project '#{project}'"
|
|
173
|
-
end
|
|
174
|
-
|
|
175
|
-
functions.first
|
|
16
|
+
# @deprecated Use {Braintrust::Functions.scorer} instead.
|
|
17
|
+
def scorer(**kwargs)
|
|
18
|
+
Log.warn_once(:eval_functions_scorer, "Braintrust::Eval::Functions.scorer is deprecated: use Braintrust::Functions.scorer instead.")
|
|
19
|
+
Braintrust::Functions.scorer(**kwargs)
|
|
176
20
|
end
|
|
177
21
|
end
|
|
178
22
|
end
|