braintrust 0.2.1 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +163 -10
- data/lib/braintrust/api/functions.rb +3 -1
- data/lib/braintrust/api/internal/btql.rb +3 -33
- data/lib/braintrust/contrib/rails/server/application_controller.rb +34 -0
- data/lib/braintrust/contrib/rails/server/engine.rb +72 -0
- data/lib/braintrust/contrib/rails/server/eval_controller.rb +36 -0
- data/lib/braintrust/contrib/rails/server/generator.rb +43 -0
- data/lib/braintrust/contrib/rails/server/health_controller.rb +15 -0
- data/lib/braintrust/contrib/rails/server/list_controller.rb +16 -0
- data/lib/braintrust/contrib/rails/server/routes.rb +8 -0
- data/lib/braintrust/contrib/rails/server.rb +20 -0
- data/lib/braintrust/eval/context.rb +84 -21
- data/lib/braintrust/eval/evaluator.rb +16 -2
- data/lib/braintrust/eval/runner.rb +120 -75
- data/lib/braintrust/eval.rb +22 -2
- data/lib/braintrust/internal/retry.rb +41 -0
- data/lib/braintrust/prompt.rb +11 -5
- data/lib/braintrust/scorer.rb +55 -4
- data/lib/braintrust/server/handlers/eval.rb +8 -168
- data/lib/braintrust/server/handlers/list.rb +3 -41
- data/lib/braintrust/server/rack.rb +2 -0
- data/lib/braintrust/server/services/eval_service.rb +226 -0
- data/lib/braintrust/server/services/list_service.rb +64 -0
- data/lib/braintrust/trace/span_processor.rb +0 -5
- data/lib/braintrust/version.rb +1 -1
- metadata +26 -127
data/lib/braintrust/scorer.rb
CHANGED
|
@@ -40,12 +40,52 @@ module Braintrust
|
|
|
40
40
|
Block.new(name: name || DEFAULT_NAME, &block)
|
|
41
41
|
end
|
|
42
42
|
|
|
43
|
-
# Included into classes that +include Scorer+. Prepends KeywordFilter
|
|
44
|
-
# so #call receives only
|
|
43
|
+
# Included into classes that +include Scorer+. Prepends KeywordFilter and
|
|
44
|
+
# ResultNormalizer so #call receives only declared kwargs and always returns
|
|
45
|
+
# Array<Hash>. Also provides a default #name and #call_parameters.
|
|
45
46
|
module Callable
|
|
47
|
+
# Normalizes the raw return value of #call into Array<Hash>.
|
|
48
|
+
# Nested inside Callable because it depends on #name which Callable provides.
|
|
49
|
+
module ResultNormalizer
|
|
50
|
+
# @return [Array<Hash>] normalized score hashes with :score, :metadata, :name keys
|
|
51
|
+
def call(**kwargs)
|
|
52
|
+
normalize_score_result(super)
|
|
53
|
+
end
|
|
54
|
+
|
|
55
|
+
private
|
|
56
|
+
|
|
57
|
+
# @param result [Numeric, Hash, Array<Hash>] raw return value from #call
|
|
58
|
+
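# @return [Array<Hash>] score hashes with :score and :name keys (empty if the scorer returned an empty Array); :metadata is nil for bare Numeric results and otherwise left as the scorer supplied it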
|
|
59
|
+
# @raise [ArgumentError] if any score value is not Numeric
|
|
60
|
+
def normalize_score_result(result)
|
|
61
|
+
case result
|
|
62
|
+
when Array then result.map { |item| normalize_score_item(item) }
|
|
63
|
+
when Hash then [normalize_score_item(result)]
|
|
64
|
+
else
|
|
65
|
+
raise ArgumentError, "#{name}: score must be Numeric, got #{result.inspect}" unless result.is_a?(Numeric)
|
|
66
|
+
[{score: result, metadata: nil, name: name}]
|
|
67
|
+
end
|
|
68
|
+
end
|
|
69
|
+
|
|
70
|
+
# Fills in missing :name from the scorer and validates :score.
|
|
71
|
+
# @param item [Hash] a score hash with at least a :score key
|
|
72
|
+
# @return [Hash] the same hash with :name set
|
|
73
|
+
# @raise [ArgumentError] if :score is not Numeric
|
|
74
|
+
def normalize_score_item(item)
|
|
75
|
+
item[:name] ||= name
|
|
76
|
+
raise ArgumentError, "#{item[:name]}: score must be Numeric, got #{item[:score].inspect}" unless item[:score].is_a?(Numeric)
|
|
77
|
+
item
|
|
78
|
+
end
|
|
79
|
+
end
|
|
80
|
+
|
|
81
|
+
# Infrastructure modules prepended onto every scorer class.
|
|
82
|
+
# Used both to set up the ancestor chain and to skip past them in
|
|
83
|
+
# #call_parameters so KeywordFilter sees the real call signature.
|
|
84
|
+
PREPENDED = [Internal::Callable::KeywordFilter, ResultNormalizer].freeze
|
|
85
|
+
|
|
46
86
|
# @param base [Class] the class including Callable
|
|
47
87
|
def self.included(base)
|
|
48
|
-
base.prepend(
|
|
88
|
+
PREPENDED.each { |mod| base.prepend(mod) }
|
|
49
89
|
end
|
|
50
90
|
|
|
51
91
|
# Default name derived from the class name (e.g. FuzzyMatch -> "fuzzy_match").
|
|
@@ -55,6 +95,17 @@ module Braintrust
|
|
|
55
95
|
return Scorer::DEFAULT_NAME unless klass
|
|
56
96
|
klass.gsub(/([a-z])([A-Z])/, '\1_\2').downcase
|
|
57
97
|
end
|
|
98
|
+
|
|
99
|
+
# Provides KeywordFilter with the actual call signature of the subclass.
|
|
100
|
+
# Walks past PREPENDED modules in the ancestor chain so that user-defined
|
|
101
|
+
# #call keyword params are correctly introspected.
|
|
102
|
+
# Block overrides this to point directly at @block.parameters.
|
|
103
|
+
# @return [Array<Array>] parameter list
|
|
104
|
+
def call_parameters
|
|
105
|
+
meth = method(:call)
|
|
106
|
+
meth = meth.super_method while meth.super_method && PREPENDED.include?(meth.owner)
|
|
107
|
+
meth.parameters
|
|
108
|
+
end
|
|
58
109
|
end
|
|
59
110
|
|
|
60
111
|
# Block-based scorer. Stores a Proc and delegates #call to it.
|
|
@@ -75,7 +126,7 @@ module Braintrust
|
|
|
75
126
|
end
|
|
76
127
|
|
|
77
128
|
# @param kwargs [Hash] keyword arguments (filtered by KeywordFilter)
|
|
78
|
-
# @return [
|
|
129
|
+
# @return [Array<Hash>] normalized score results
|
|
79
130
|
def call(**kwargs)
|
|
80
131
|
@block.call(**kwargs)
|
|
81
132
|
end
|
|
@@ -10,38 +10,15 @@ module Braintrust
|
|
|
10
10
|
class Eval
|
|
11
11
|
def initialize(evaluators)
|
|
12
12
|
@evaluators = evaluators
|
|
13
|
+
@service = Services::Eval.new(evaluators)
|
|
13
14
|
end
|
|
14
15
|
|
|
15
16
|
def call(env)
|
|
16
17
|
body = parse_body(env)
|
|
17
18
|
return error_response(400, "Invalid JSON body") unless body
|
|
18
19
|
|
|
19
|
-
|
|
20
|
-
return error_response(
|
|
21
|
-
|
|
22
|
-
evaluator = @evaluators[name]
|
|
23
|
-
return error_response(404, "Evaluator '#{name}' not found") unless evaluator
|
|
24
|
-
|
|
25
|
-
data = body["data"]
|
|
26
|
-
return error_response(400, "Missing required field: data") unless data
|
|
27
|
-
|
|
28
|
-
# Validate exactly one data source
|
|
29
|
-
data_sources = ["data", "dataset_name", "dataset_id"].count { |k| data.key?(k) }
|
|
30
|
-
return error_response(400, "Exactly one data source required") if data_sources != 1
|
|
31
|
-
|
|
32
|
-
experiment_name = body["experiment_name"]
|
|
33
|
-
|
|
34
|
-
# Resolve data source
|
|
35
|
-
cases, dataset = resolve_data_source(data)
|
|
36
|
-
|
|
37
|
-
# Resolve remote scorers from request
|
|
38
|
-
remote_scorer_ids = resolve_remote_scorers(body["scores"])
|
|
39
|
-
|
|
40
|
-
# Resolve parent span context
|
|
41
|
-
parent = resolve_parent(body["parent"])
|
|
42
|
-
|
|
43
|
-
# Build state from auth context (if present)
|
|
44
|
-
state = build_state(env)
|
|
20
|
+
result = @service.validate(body)
|
|
21
|
+
return error_response(result[:status], result[:error]) if result[:error]
|
|
45
22
|
|
|
46
23
|
# The protocol-rack adapter (used by Falcon and any server built on
|
|
47
24
|
# protocol-http) buffers `each`-based bodies through an Enumerable path.
|
|
@@ -50,64 +27,7 @@ module Braintrust
|
|
|
50
27
|
body_class = env.key?("protocol.http.request") ? SSEStreamBody : SSEBody
|
|
51
28
|
|
|
52
29
|
sse_body = body_class.new do |sse|
|
|
53
|
-
|
|
54
|
-
run_opts = {
|
|
55
|
-
on_progress: ->(progress_data) {
|
|
56
|
-
# Build remote eval protocol events from generic progress data.
|
|
57
|
-
# Runner provides: id, data/error, scores (optional), origin (optional).
|
|
58
|
-
# Protocol requires: id, object_type, origin, name, format, output_type, event, data.
|
|
59
|
-
base = {
|
|
60
|
-
"object_type" => "task",
|
|
61
|
-
"name" => name,
|
|
62
|
-
"format" => "code",
|
|
63
|
-
"output_type" => "completion"
|
|
64
|
-
}
|
|
65
|
-
base["id"] = progress_data["id"] if progress_data["id"]
|
|
66
|
-
base["origin"] = progress_data["origin"] if progress_data["origin"]
|
|
67
|
-
|
|
68
|
-
if progress_data.key?("error")
|
|
69
|
-
sse.event("progress", JSON.dump(base.merge("event" => "error", "data" => progress_data["error"])))
|
|
70
|
-
else
|
|
71
|
-
sse.event("progress", JSON.dump(base.merge("event" => "json_delta", "data" => JSON.dump(progress_data["data"]))))
|
|
72
|
-
end
|
|
73
|
-
|
|
74
|
-
# Signal per-cell completion so the UI exits "Streaming..." state
|
|
75
|
-
# and updates the progress bar immediately.
|
|
76
|
-
sse.event("progress", JSON.dump(base.merge("event" => "done", "data" => "")))
|
|
77
|
-
},
|
|
78
|
-
quiet: true
|
|
79
|
-
}
|
|
80
|
-
run_opts[:parent] = parent if parent
|
|
81
|
-
run_opts[:scorers] = remote_scorer_ids if remote_scorer_ids
|
|
82
|
-
run_opts[:dataset] = dataset if dataset
|
|
83
|
-
|
|
84
|
-
if state
|
|
85
|
-
run_opts[:state] = state
|
|
86
|
-
run_opts[:experiment] = experiment_name if experiment_name
|
|
87
|
-
run_opts[:project_id] = body["project_id"] if body["project_id"]
|
|
88
|
-
end
|
|
89
|
-
|
|
90
|
-
result = evaluator.run(cases, **run_opts)
|
|
91
|
-
|
|
92
|
-
# Flush buffered OTLP spans before sending completion events.
|
|
93
|
-
# The BatchSpanProcessor exports every ~5s; fast evals can finish
|
|
94
|
-
# before a single export fires, causing the UI to see no results.
|
|
95
|
-
Braintrust::Trace.flush_spans
|
|
96
|
-
|
|
97
|
-
# Build summary from result scores
|
|
98
|
-
averaged_scores = {}
|
|
99
|
-
result.scorer_stats.each do |scorer_name, stats|
|
|
100
|
-
averaged_scores[scorer_name] = stats.score_mean
|
|
101
|
-
end
|
|
102
|
-
|
|
103
|
-
sse.event("summary", JSON.dump({
|
|
104
|
-
"scores" => averaged_scores,
|
|
105
|
-
"experiment_name" => experiment_name,
|
|
106
|
-
"experiment_id" => result.experiment_id,
|
|
107
|
-
"project_id" => result.project_id
|
|
108
|
-
}))
|
|
109
|
-
|
|
110
|
-
sse.event("done", "")
|
|
30
|
+
@service.stream(result, auth: env["braintrust.auth"], sse: sse)
|
|
111
31
|
end
|
|
112
32
|
|
|
113
33
|
[200, {"content-type" => "text/event-stream", "cache-control" => "no-cache", "connection" => "keep-alive"}, sse_body]
|
|
@@ -115,90 +35,6 @@ module Braintrust
|
|
|
115
35
|
|
|
116
36
|
private
|
|
117
37
|
|
|
118
|
-
# Resolve data source from the data field.
|
|
119
|
-
# Returns [cases, dataset] where exactly one is non-nil.
|
|
120
|
-
def resolve_data_source(data)
|
|
121
|
-
if data.key?("data")
|
|
122
|
-
cases = data["data"].map do |d|
|
|
123
|
-
{input: d["input"], expected: d["expected"]}
|
|
124
|
-
end
|
|
125
|
-
[cases, nil]
|
|
126
|
-
elsif data.key?("dataset_id")
|
|
127
|
-
[nil, Braintrust::Dataset::ID.new(id: data["dataset_id"])]
|
|
128
|
-
elsif data.key?("dataset_name")
|
|
129
|
-
dataset_opts = {name: data["dataset_name"]}
|
|
130
|
-
dataset_opts[:project] = data["project_name"] if data["project_name"]
|
|
131
|
-
[nil, dataset_opts]
|
|
132
|
-
else
|
|
133
|
-
[nil, nil]
|
|
134
|
-
end
|
|
135
|
-
end
|
|
136
|
-
|
|
137
|
-
# Map request scores array to Scorer::ID structs.
|
|
138
|
-
# The UI sends function_id as a nested object: {"function_id": "uuid"}.
|
|
139
|
-
def resolve_remote_scorers(scores)
|
|
140
|
-
return nil if scores.nil? || scores.empty?
|
|
141
|
-
scores.map do |s|
|
|
142
|
-
func_id = s["function_id"]
|
|
143
|
-
func_id = func_id["function_id"] if func_id.is_a?(Hash)
|
|
144
|
-
Braintrust::Scorer::ID.new(
|
|
145
|
-
function_id: func_id,
|
|
146
|
-
version: s["version"]
|
|
147
|
-
)
|
|
148
|
-
end
|
|
149
|
-
end
|
|
150
|
-
|
|
151
|
-
# Map request parent to symbol-keyed Hash.
|
|
152
|
-
# Hardcode playground_id to match Java SDK behavior.
|
|
153
|
-
# Also extracts generation from propagated_event for span_attributes.
|
|
154
|
-
def resolve_parent(parent)
|
|
155
|
-
return nil unless parent.is_a?(Hash)
|
|
156
|
-
object_id = parent["object_id"]
|
|
157
|
-
return nil unless object_id
|
|
158
|
-
|
|
159
|
-
generation = parent.dig("propagated_event", "span_attributes", "generation")
|
|
160
|
-
|
|
161
|
-
result = {object_type: "playground_id", object_id: object_id}
|
|
162
|
-
result[:generation] = generation if generation
|
|
163
|
-
result
|
|
164
|
-
end
|
|
165
|
-
|
|
166
|
-
# Build State from auth context set by Auth middleware.
|
|
167
|
-
# Returns nil when no auth context is present (e.g. NoAuth strategy).
|
|
168
|
-
# Uses an LRU-style cache (max 64 entries) keyed by [api_key, app_url, org_name].
|
|
169
|
-
def build_state(env)
|
|
170
|
-
auth = env["braintrust.auth"]
|
|
171
|
-
return nil unless auth.is_a?(Hash)
|
|
172
|
-
|
|
173
|
-
cache_key = [auth["api_key"], auth["app_url"], auth["org_name"]]
|
|
174
|
-
|
|
175
|
-
@state_mutex ||= Mutex.new
|
|
176
|
-
@state_cache ||= {}
|
|
177
|
-
|
|
178
|
-
@state_mutex.synchronize do
|
|
179
|
-
cached = @state_cache[cache_key]
|
|
180
|
-
return cached if cached
|
|
181
|
-
|
|
182
|
-
state = Braintrust::State.new(
|
|
183
|
-
api_key: auth["api_key"],
|
|
184
|
-
org_id: auth["org_id"],
|
|
185
|
-
org_name: auth["org_name"],
|
|
186
|
-
app_url: auth["app_url"],
|
|
187
|
-
api_url: auth["api_url"],
|
|
188
|
-
enable_tracing: false
|
|
189
|
-
)
|
|
190
|
-
|
|
191
|
-
# Evict oldest entry if cache is full
|
|
192
|
-
if @state_cache.size >= 64
|
|
193
|
-
oldest_key = @state_cache.keys.first
|
|
194
|
-
@state_cache.delete(oldest_key)
|
|
195
|
-
end
|
|
196
|
-
|
|
197
|
-
@state_cache[cache_key] = state
|
|
198
|
-
state
|
|
199
|
-
end
|
|
200
|
-
end
|
|
201
|
-
|
|
202
38
|
def parse_body(env)
|
|
203
39
|
body = env["rack.input"]&.read
|
|
204
40
|
return nil if body.nil? || body.empty?
|
|
@@ -211,6 +47,10 @@ module Braintrust
|
|
|
211
47
|
[status, {"content-type" => "application/json"},
|
|
212
48
|
[JSON.dump({"error" => message})]]
|
|
213
49
|
end
|
|
50
|
+
|
|
51
|
+
def build_state(env)
|
|
52
|
+
@service.build_state(env["braintrust.auth"])
|
|
53
|
+
end
|
|
214
54
|
end
|
|
215
55
|
end
|
|
216
56
|
end
|
|
@@ -23,50 +23,12 @@ module Braintrust
|
|
|
23
23
|
class List
|
|
24
24
|
def initialize(evaluators)
|
|
25
25
|
@evaluators = evaluators
|
|
26
|
+
@service = Services::List.new(evaluators)
|
|
26
27
|
end
|
|
27
28
|
|
|
28
29
|
def call(_env)
|
|
29
|
-
result =
|
|
30
|
-
|
|
31
|
-
scores = (evaluator.scorers || []).each_with_index.map do |scorer, i|
|
|
32
|
-
scorer_name = scorer.respond_to?(:name) ? scorer.name : "score_#{i}"
|
|
33
|
-
{"name" => scorer_name}
|
|
34
|
-
end
|
|
35
|
-
entry = {"scores" => scores}
|
|
36
|
-
params = serialize_parameters(evaluator.parameters)
|
|
37
|
-
entry["parameters"] = params if params
|
|
38
|
-
result[name] = entry
|
|
39
|
-
end
|
|
40
|
-
|
|
41
|
-
[200, {"content-type" => "application/json"},
|
|
42
|
-
[JSON.dump(result)]]
|
|
43
|
-
end
|
|
44
|
-
|
|
45
|
-
private
|
|
46
|
-
|
|
47
|
-
# Convert user-defined parameters to the dev server protocol format.
|
|
48
|
-
# Wraps in a staticParameters container with "data" typed entries.
|
|
49
|
-
def serialize_parameters(parameters)
|
|
50
|
-
return nil unless parameters && !parameters.empty?
|
|
51
|
-
|
|
52
|
-
schema = {}
|
|
53
|
-
parameters.each do |name, spec|
|
|
54
|
-
spec = spec.transform_keys(&:to_s) if spec.is_a?(Hash)
|
|
55
|
-
if spec.is_a?(Hash)
|
|
56
|
-
schema[name.to_s] = {
|
|
57
|
-
"type" => "data",
|
|
58
|
-
"schema" => {"type" => spec["type"] || "string"},
|
|
59
|
-
"default" => spec["default"],
|
|
60
|
-
"description" => spec["description"]
|
|
61
|
-
}
|
|
62
|
-
end
|
|
63
|
-
end
|
|
64
|
-
|
|
65
|
-
{
|
|
66
|
-
"type" => "braintrust.staticParameters",
|
|
67
|
-
"schema" => schema,
|
|
68
|
-
"source" => nil
|
|
69
|
-
}
|
|
30
|
+
result = @service.call
|
|
31
|
+
[200, {"content-type" => "application/json"}, [JSON.dump(result)]]
|
|
70
32
|
end
|
|
71
33
|
end
|
|
72
34
|
end
|
|
@@ -15,6 +15,8 @@ require_relative "auth/no_auth"
|
|
|
15
15
|
require_relative "auth/clerk_token"
|
|
16
16
|
require_relative "middleware/cors"
|
|
17
17
|
require_relative "middleware/auth"
|
|
18
|
+
require_relative "services/list_service"
|
|
19
|
+
require_relative "services/eval_service"
|
|
18
20
|
require_relative "handlers/health"
|
|
19
21
|
require_relative "handlers/list"
|
|
20
22
|
require_relative "handlers/eval"
|
|
@@ -0,0 +1,226 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "json"
|
|
4
|
+
|
|
5
|
+
module Braintrust
|
|
6
|
+
module Server
|
|
7
|
+
module Services
|
|
8
|
+
# Framework-agnostic service for running evaluations and streaming SSE results.
|
|
9
|
+
# Must be long-lived (not per-request) to preserve the @state_cache across requests.
|
|
10
|
+
class Eval
|
|
11
|
+
def initialize(evaluators)
|
|
12
|
+
@evaluators = evaluators
|
|
13
|
+
@state_mutex = Mutex.new
|
|
14
|
+
@state_cache = {}
|
|
15
|
+
end
|
|
16
|
+
|
|
17
|
+
# Validates request body. Returns:
|
|
18
|
+
# {error: String, status: Integer} on failure
|
|
19
|
+
# {evaluator:, name:, cases:, dataset:, ...} on success
|
|
20
|
+
def validate(body)
|
|
21
|
+
name = body["name"]
|
|
22
|
+
return {error: "Missing required field: name", status: 400} unless name
|
|
23
|
+
|
|
24
|
+
evaluator = current_evaluators[name]
|
|
25
|
+
return {error: "Evaluator '#{name}' not found", status: 404} unless evaluator
|
|
26
|
+
|
|
27
|
+
data = body["data"]
|
|
28
|
+
return {error: "Missing required field: data", status: 400} unless data
|
|
29
|
+
|
|
30
|
+
data_sources = ["data", "dataset_name", "dataset_id"].count { |k| data.key?(k) }
|
|
31
|
+
return {error: "Exactly one data source required", status: 400} if data_sources != 1
|
|
32
|
+
|
|
33
|
+
cases, dataset = resolve_data_source(data)
|
|
34
|
+
|
|
35
|
+
{
|
|
36
|
+
evaluator: evaluator,
|
|
37
|
+
name: name,
|
|
38
|
+
cases: cases,
|
|
39
|
+
dataset: dataset,
|
|
40
|
+
experiment_name: body["experiment_name"],
|
|
41
|
+
remote_scorer_ids: resolve_remote_scorers(body["scores"]),
|
|
42
|
+
parent: resolve_parent(body["parent"]),
|
|
43
|
+
project_id: body["project_id"],
|
|
44
|
+
parameters: resolve_parameters(body["parameters"], evaluator)
|
|
45
|
+
}
|
|
46
|
+
end
|
|
47
|
+
|
|
48
|
+
# Runs the validated eval and streams SSE events via the sse writer.
|
|
49
|
+
# +validated+ is the hash returned by #validate.
|
|
50
|
+
# +auth+ is the auth context hash (or nil/true for no-auth).
|
|
51
|
+
# +sse+ is an SSEWriter instance.
|
|
52
|
+
def stream(validated, auth:, sse:)
|
|
53
|
+
name = validated[:name]
|
|
54
|
+
evaluator = validated[:evaluator]
|
|
55
|
+
cases = validated[:cases]
|
|
56
|
+
dataset = validated[:dataset]
|
|
57
|
+
experiment_name = validated[:experiment_name]
|
|
58
|
+
remote_scorer_ids = validated[:remote_scorer_ids]
|
|
59
|
+
parent = validated[:parent]
|
|
60
|
+
project_id = validated[:project_id]
|
|
61
|
+
parameters = validated[:parameters]
|
|
62
|
+
|
|
63
|
+
state = build_state(auth)
|
|
64
|
+
|
|
65
|
+
# Base run options; state, experiment and project_id are added further down only when state is available
|
|
66
|
+
run_opts = {
|
|
67
|
+
on_progress: ->(progress_data) {
|
|
68
|
+
# Build remote eval protocol events from generic progress data.
|
|
69
|
+
# Runner provides: id, data/error, scores (optional), origin (optional).
|
|
70
|
+
# Protocol requires: id, object_type, origin, name, format, output_type, event, data.
|
|
71
|
+
base = {
|
|
72
|
+
"object_type" => "task",
|
|
73
|
+
"name" => name,
|
|
74
|
+
"format" => "code",
|
|
75
|
+
"output_type" => "completion"
|
|
76
|
+
}
|
|
77
|
+
base["id"] = progress_data["id"] if progress_data["id"]
|
|
78
|
+
base["origin"] = progress_data["origin"] if progress_data["origin"]
|
|
79
|
+
|
|
80
|
+
if progress_data.key?("error")
|
|
81
|
+
sse.event("progress", JSON.dump(base.merge("event" => "error", "data" => progress_data["error"])))
|
|
82
|
+
else
|
|
83
|
+
sse.event("progress", JSON.dump(base.merge("event" => "json_delta", "data" => JSON.dump(progress_data["data"]))))
|
|
84
|
+
end
|
|
85
|
+
|
|
86
|
+
# Signal per-cell completion so the UI exits "Streaming..." state
|
|
87
|
+
# and updates the progress bar immediately.
|
|
88
|
+
sse.event("progress", JSON.dump(base.merge("event" => "done", "data" => "")))
|
|
89
|
+
},
|
|
90
|
+
quiet: true
|
|
91
|
+
}
|
|
92
|
+
run_opts[:parent] = parent if parent
|
|
93
|
+
run_opts[:scorers] = remote_scorer_ids if remote_scorer_ids
|
|
94
|
+
run_opts[:parameters] = parameters if parameters && !parameters.empty?
|
|
95
|
+
run_opts[:dataset] = dataset if dataset
|
|
96
|
+
|
|
97
|
+
if state
|
|
98
|
+
run_opts[:state] = state
|
|
99
|
+
run_opts[:experiment] = experiment_name if experiment_name
|
|
100
|
+
run_opts[:project_id] = project_id if project_id
|
|
101
|
+
end
|
|
102
|
+
|
|
103
|
+
result = evaluator.run(cases, **run_opts)
|
|
104
|
+
|
|
105
|
+
# Flush buffered OTLP spans before sending completion events.
|
|
106
|
+
# The BatchSpanProcessor exports every ~5s; fast evals can finish
|
|
107
|
+
# before a single export fires, causing the UI to see no results.
|
|
108
|
+
Braintrust::Trace.flush_spans
|
|
109
|
+
|
|
110
|
+
# Build summary from result scores
|
|
111
|
+
averaged_scores = {}
|
|
112
|
+
result.scorer_stats.each do |scorer_name, stats|
|
|
113
|
+
averaged_scores[scorer_name] = stats.score_mean
|
|
114
|
+
end
|
|
115
|
+
|
|
116
|
+
sse.event("summary", JSON.dump({
|
|
117
|
+
"scores" => averaged_scores,
|
|
118
|
+
"experiment_name" => experiment_name,
|
|
119
|
+
"experiment_id" => result.experiment_id,
|
|
120
|
+
"project_id" => result.project_id
|
|
121
|
+
}))
|
|
122
|
+
|
|
123
|
+
sse.event("done", "")
|
|
124
|
+
end
|
|
125
|
+
|
|
126
|
+
# Build State from auth context hash.
|
|
127
|
+
# Returns nil when auth is not a Hash (e.g. NoAuth returns true).
|
|
128
|
+
# Uses a bounded cache (max 64 entries, evicting the oldest-inserted entry first) keyed by [api_key, app_url, org_name].
|
|
129
|
+
def build_state(auth)
|
|
130
|
+
return nil unless auth.is_a?(Hash)
|
|
131
|
+
|
|
132
|
+
cache_key = [auth["api_key"], auth["app_url"], auth["org_name"]]
|
|
133
|
+
|
|
134
|
+
@state_mutex ||= Mutex.new
|
|
135
|
+
@state_cache ||= {}
|
|
136
|
+
|
|
137
|
+
@state_mutex.synchronize do
|
|
138
|
+
cached = @state_cache[cache_key]
|
|
139
|
+
return cached if cached
|
|
140
|
+
|
|
141
|
+
state = Braintrust::State.new(
|
|
142
|
+
api_key: auth["api_key"],
|
|
143
|
+
org_id: auth["org_id"],
|
|
144
|
+
org_name: auth["org_name"],
|
|
145
|
+
app_url: auth["app_url"],
|
|
146
|
+
api_url: auth["api_url"],
|
|
147
|
+
enable_tracing: false
|
|
148
|
+
)
|
|
149
|
+
|
|
150
|
+
if @state_cache.size >= 64
|
|
151
|
+
oldest_key = @state_cache.keys.first
|
|
152
|
+
@state_cache.delete(oldest_key)
|
|
153
|
+
end
|
|
154
|
+
|
|
155
|
+
@state_cache[cache_key] = state
|
|
156
|
+
state
|
|
157
|
+
end
|
|
158
|
+
end
|
|
159
|
+
|
|
160
|
+
private
|
|
161
|
+
|
|
162
|
+
def current_evaluators
|
|
163
|
+
return @evaluators.call if @evaluators.respond_to?(:call)
|
|
164
|
+
@evaluators
|
|
165
|
+
end
|
|
166
|
+
|
|
167
|
+
# Merge request parameters with evaluator's parameter defaults.
|
|
168
|
+
# Request values override defaults. Returns a string-keyed Hash.
|
|
169
|
+
def resolve_parameters(raw_params, evaluator)
|
|
170
|
+
defaults = (evaluator.parameters || {}).to_h { |name, spec|
|
|
171
|
+
[name.to_s, spec.is_a?(Hash) ? (spec[:default] || spec["default"]) : nil]
|
|
172
|
+
}.compact
|
|
173
|
+
defaults.merge(raw_params || {})
|
|
174
|
+
end
|
|
175
|
+
|
|
176
|
+
# Resolve data source from the data field.
|
|
177
|
+
# Returns [cases, dataset] where at most one is non-nil (both are nil if no known data source key is present).
|
|
178
|
+
def resolve_data_source(data)
|
|
179
|
+
if data.key?("data")
|
|
180
|
+
cases = data["data"].map do |d|
|
|
181
|
+
{input: d["input"], expected: d["expected"]}
|
|
182
|
+
end
|
|
183
|
+
[cases, nil]
|
|
184
|
+
elsif data.key?("dataset_id")
|
|
185
|
+
[nil, Braintrust::Dataset::ID.new(id: data["dataset_id"])]
|
|
186
|
+
elsif data.key?("dataset_name")
|
|
187
|
+
dataset_opts = {name: data["dataset_name"]}
|
|
188
|
+
dataset_opts[:project] = data["project_name"] if data["project_name"]
|
|
189
|
+
[nil, dataset_opts]
|
|
190
|
+
else
|
|
191
|
+
[nil, nil]
|
|
192
|
+
end
|
|
193
|
+
end
|
|
194
|
+
|
|
195
|
+
# Map request scores array to Scorer::ID structs.
|
|
196
|
+
# The UI sends function_id as a nested object: {"function_id": "uuid"}.
|
|
197
|
+
def resolve_remote_scorers(scores)
|
|
198
|
+
return nil if scores.nil? || scores.empty?
|
|
199
|
+
scores.map do |s|
|
|
200
|
+
func_id = s["function_id"]
|
|
201
|
+
func_id = func_id["function_id"] if func_id.is_a?(Hash)
|
|
202
|
+
Braintrust::Scorer::ID.new(
|
|
203
|
+
function_id: func_id,
|
|
204
|
+
version: s["version"]
|
|
205
|
+
)
|
|
206
|
+
end
|
|
207
|
+
end
|
|
208
|
+
|
|
209
|
+
# Map request parent to symbol-keyed Hash.
|
|
210
|
+
# Hardcode playground_id to match Java SDK behavior.
|
|
211
|
+
# Also extracts generation from propagated_event for span_attributes.
|
|
212
|
+
def resolve_parent(parent)
|
|
213
|
+
return nil unless parent.is_a?(Hash)
|
|
214
|
+
object_id = parent["object_id"]
|
|
215
|
+
return nil unless object_id
|
|
216
|
+
|
|
217
|
+
generation = parent.dig("propagated_event", "span_attributes", "generation")
|
|
218
|
+
|
|
219
|
+
result = {object_type: "playground_id", object_id: object_id}
|
|
220
|
+
result[:generation] = generation if generation
|
|
221
|
+
result
|
|
222
|
+
end
|
|
223
|
+
end
|
|
224
|
+
end
|
|
225
|
+
end
|
|
226
|
+
end
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "json"
|
|
4
|
+
|
|
5
|
+
module Braintrust
|
|
6
|
+
module Server
|
|
7
|
+
module Services
|
|
8
|
+
# Framework-agnostic service for listing evaluators.
|
|
9
|
+
# Returns a plain Hash (not a Rack triplet) suitable for JSON.dump.
|
|
10
|
+
class List
|
|
11
|
+
def initialize(evaluators)
|
|
12
|
+
@evaluators = evaluators
|
|
13
|
+
end
|
|
14
|
+
|
|
15
|
+
def call
|
|
16
|
+
result = {}
|
|
17
|
+
current_evaluators.each do |name, evaluator|
|
|
18
|
+
scores = (evaluator.scorers || []).each_with_index.map do |scorer, i|
|
|
19
|
+
scorer_name = scorer.respond_to?(:name) ? scorer.name : "score_#{i}"
|
|
20
|
+
{"name" => scorer_name}
|
|
21
|
+
end
|
|
22
|
+
entry = {"scores" => scores}
|
|
23
|
+
params = serialize_parameters(evaluator.parameters)
|
|
24
|
+
entry["parameters"] = params if params
|
|
25
|
+
result[name] = entry
|
|
26
|
+
end
|
|
27
|
+
result
|
|
28
|
+
end
|
|
29
|
+
|
|
30
|
+
private
|
|
31
|
+
|
|
32
|
+
def current_evaluators
|
|
33
|
+
return @evaluators.call if @evaluators.respond_to?(:call)
|
|
34
|
+
@evaluators
|
|
35
|
+
end
|
|
36
|
+
|
|
37
|
+
# Convert user-defined parameters to the dev server protocol format.
|
|
38
|
+
# Wraps in a staticParameters container with "data" typed entries.
|
|
39
|
+
def serialize_parameters(parameters)
|
|
40
|
+
return nil unless parameters && !parameters.empty?
|
|
41
|
+
|
|
42
|
+
schema = {}
|
|
43
|
+
parameters.each do |name, spec|
|
|
44
|
+
spec = spec.transform_keys(&:to_s) if spec.is_a?(Hash)
|
|
45
|
+
if spec.is_a?(Hash)
|
|
46
|
+
schema[name.to_s] = {
|
|
47
|
+
"type" => "data",
|
|
48
|
+
"schema" => {"type" => spec["type"] || "string"},
|
|
49
|
+
"default" => spec["default"],
|
|
50
|
+
"description" => spec["description"]
|
|
51
|
+
}
|
|
52
|
+
end
|
|
53
|
+
end
|
|
54
|
+
|
|
55
|
+
{
|
|
56
|
+
"type" => "braintrust.staticParameters",
|
|
57
|
+
"schema" => schema,
|
|
58
|
+
"source" => nil
|
|
59
|
+
}
|
|
60
|
+
end
|
|
61
|
+
end
|
|
62
|
+
end
|
|
63
|
+
end
|
|
64
|
+
end
|
|
@@ -80,11 +80,6 @@ module Braintrust
|
|
|
80
80
|
# Determine if a span should be forwarded to the wrapped processor
|
|
81
81
|
# based on configured filters
|
|
82
82
|
def should_forward_span?(span)
|
|
83
|
-
# Always keep root spans (spans with no parent)
|
|
84
|
-
# Check if parent_span_id is the invalid/zero span ID
|
|
85
|
-
is_root = span.parent_span_id == OpenTelemetry::Trace::INVALID_SPAN_ID
|
|
86
|
-
return true if is_root
|
|
87
|
-
|
|
88
83
|
# If no filters, keep everything
|
|
89
84
|
return true if @filters.empty?
|
|
90
85
|
|
data/lib/braintrust/version.rb
CHANGED