braintrust 0.1.4 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +109 -13
- data/lib/braintrust/api/datasets.rb +10 -0
- data/lib/braintrust/api/internal/btql.rb +124 -0
- data/lib/braintrust/api/internal/experiments.rb +20 -1
- data/lib/braintrust/api/internal/projects.rb +19 -0
- data/lib/braintrust/dataset.rb +13 -6
- data/lib/braintrust/eval/context.rb +131 -0
- data/lib/braintrust/eval/evaluator.rb +78 -0
- data/lib/braintrust/eval/functions.rb +10 -132
- data/lib/braintrust/eval/runner.rb +119 -85
- data/lib/braintrust/eval/scorer.rb +24 -92
- data/lib/braintrust/eval/trace.rb +129 -0
- data/lib/braintrust/eval.rb +131 -156
- data/lib/braintrust/functions.rb +168 -0
- data/lib/braintrust/internal/callable.rb +83 -0
- data/lib/braintrust/logger.rb +9 -0
- data/lib/braintrust/scorer.rb +122 -0
- data/lib/braintrust/server/auth/clerk_token.rb +68 -0
- data/lib/braintrust/server/auth/no_auth.rb +14 -0
- data/lib/braintrust/server/handlers/eval.rb +217 -0
- data/lib/braintrust/server/handlers/health.rb +16 -0
- data/lib/braintrust/server/handlers/list.rb +74 -0
- data/lib/braintrust/server/middleware/auth.rb +29 -0
- data/lib/braintrust/server/middleware/cors.rb +87 -0
- data/lib/braintrust/server/rack/app.rb +38 -0
- data/lib/braintrust/server/rack.rb +36 -0
- data/lib/braintrust/server/router.rb +37 -0
- data/lib/braintrust/server/sse.rb +52 -0
- data/lib/braintrust/server.rb +8 -0
- data/lib/braintrust/task.rb +108 -0
- data/lib/braintrust/trace/span_exporter.rb +36 -0
- data/lib/braintrust/trace.rb +3 -4
- data/lib/braintrust/version.rb +1 -1
- metadata +22 -1
|
@@ -1,144 +1,22 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
-
require_relative "../
|
|
4
|
-
require_relative "scorer"
|
|
5
|
-
require "opentelemetry/sdk"
|
|
6
|
-
require "json"
|
|
3
|
+
require_relative "../functions"
|
|
7
4
|
|
|
8
5
|
module Braintrust
|
|
9
6
|
module Eval
|
|
10
|
-
#
|
|
11
|
-
# Allows calling prompts hosted on Braintrust servers as tasks or scorers
|
|
7
|
+
# @deprecated Use {Braintrust::Functions} instead.
|
|
12
8
|
module Functions
|
|
13
9
|
class << self
|
|
14
|
-
#
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
# @param tracer_provider [TracerProvider, nil] OpenTelemetry tracer provider
|
|
19
|
-
# @return [Proc] Callable that accepts input and returns output
|
|
20
|
-
def task(project:, slug:, state: nil, tracer_provider: nil)
|
|
21
|
-
state ||= Braintrust.current_state
|
|
22
|
-
raise Error, "No state available" unless state
|
|
23
|
-
|
|
24
|
-
# Resolve function ID from project + slug
|
|
25
|
-
api = API.new(state: state)
|
|
26
|
-
function_metadata = resolve_function(api, project, slug)
|
|
27
|
-
function_id = function_metadata["id"]
|
|
28
|
-
function_name = function_metadata["name"] || slug
|
|
29
|
-
|
|
30
|
-
# Get tracer for creating spans
|
|
31
|
-
tracer_provider ||= OpenTelemetry.tracer_provider
|
|
32
|
-
tracer = tracer_provider.tracer("braintrust.functions")
|
|
33
|
-
|
|
34
|
-
# Return a lambda that invokes the remote function with tracing
|
|
35
|
-
lambda do |input|
|
|
36
|
-
# Create a span for the function invocation
|
|
37
|
-
tracer.in_span("function: #{slug}") do |span|
|
|
38
|
-
span.set_attribute("braintrust.span_attributes", JSON.dump({type: "function"}))
|
|
39
|
-
span.set_attribute("braintrust.input_json", JSON.dump(input))
|
|
40
|
-
span.set_attribute("braintrust.function.name", function_name)
|
|
41
|
-
span.set_attribute("braintrust.function.id", function_id)
|
|
42
|
-
span.set_attribute("braintrust.function.slug", slug)
|
|
43
|
-
|
|
44
|
-
begin
|
|
45
|
-
# Invoke the function via API
|
|
46
|
-
output = api.functions.invoke(id: function_id, input: input)
|
|
47
|
-
span.set_attribute("braintrust.output_json", JSON.dump(output))
|
|
48
|
-
output
|
|
49
|
-
rescue => e
|
|
50
|
-
# Record exception and set error status
|
|
51
|
-
span.record_exception(e)
|
|
52
|
-
span.status = OpenTelemetry::Trace::Status.error(e.message)
|
|
53
|
-
raise
|
|
54
|
-
end
|
|
55
|
-
end
|
|
56
|
-
end
|
|
57
|
-
end
|
|
58
|
-
|
|
59
|
-
# Create a scorer that invokes a remote function
|
|
60
|
-
# @param project [String] Project name
|
|
61
|
-
# @param slug [String] Function slug
|
|
62
|
-
# @param state [State, nil] Braintrust state (defaults to global)
|
|
63
|
-
# @param tracer_provider [TracerProvider, nil] OpenTelemetry tracer provider
|
|
64
|
-
# @return [Scorer] Scorer object that invokes remote function
|
|
65
|
-
def scorer(project:, slug:, state: nil, tracer_provider: nil)
|
|
66
|
-
state ||= Braintrust.current_state
|
|
67
|
-
raise Error, "No state available" unless state
|
|
68
|
-
|
|
69
|
-
# Resolve function ID from project + slug
|
|
70
|
-
api = API.new(state: state)
|
|
71
|
-
function_metadata = resolve_function(api, project, slug)
|
|
72
|
-
function_id = function_metadata["id"]
|
|
73
|
-
function_name = function_metadata["name"] || slug
|
|
74
|
-
|
|
75
|
-
# Get tracer for creating spans
|
|
76
|
-
tracer_provider ||= OpenTelemetry.tracer_provider
|
|
77
|
-
tracer = tracer_provider.tracer("braintrust.functions")
|
|
78
|
-
|
|
79
|
-
# Create a scorer that invokes the remote function
|
|
80
|
-
Scorer.new(slug) do |input, expected, output, metadata|
|
|
81
|
-
# Create a span for the function invocation
|
|
82
|
-
tracer.in_span("function: #{slug}") do |span|
|
|
83
|
-
scorer_input = {
|
|
84
|
-
input: input,
|
|
85
|
-
expected: expected,
|
|
86
|
-
output: output,
|
|
87
|
-
metadata: metadata
|
|
88
|
-
}
|
|
89
|
-
|
|
90
|
-
span.set_attribute("braintrust.span_attributes", JSON.dump({type: "function"}))
|
|
91
|
-
span.set_attribute("braintrust.input_json", JSON.dump(scorer_input))
|
|
92
|
-
span.set_attribute("braintrust.function.name", function_name)
|
|
93
|
-
span.set_attribute("braintrust.function.id", function_id)
|
|
94
|
-
span.set_attribute("braintrust.function.slug", slug)
|
|
95
|
-
|
|
96
|
-
begin
|
|
97
|
-
# Invoke the function via API
|
|
98
|
-
# The remote scorer receives all scorer arguments
|
|
99
|
-
result = api.functions.invoke(id: function_id, input: scorer_input)
|
|
100
|
-
|
|
101
|
-
score = case result
|
|
102
|
-
when Hash
|
|
103
|
-
if result.key?("score")
|
|
104
|
-
result["score"].to_f
|
|
105
|
-
else
|
|
106
|
-
raise Error, "Hash result must contain 'score' key"
|
|
107
|
-
end
|
|
108
|
-
when String
|
|
109
|
-
result.to_f
|
|
110
|
-
else
|
|
111
|
-
raise Error, "Unsupported result type: #{result.class}"
|
|
112
|
-
end
|
|
113
|
-
|
|
114
|
-
span.set_attribute("braintrust.output_json", JSON.dump(score))
|
|
115
|
-
score
|
|
116
|
-
rescue => e
|
|
117
|
-
# Record exception and set error status
|
|
118
|
-
span.record_exception(e)
|
|
119
|
-
span.status = OpenTelemetry::Trace::Status.error(e.message)
|
|
120
|
-
raise
|
|
121
|
-
end
|
|
122
|
-
end
|
|
123
|
-
end
|
|
10
|
+
# @deprecated Use {Braintrust::Functions.task} instead.
|
|
11
|
+
def task(**kwargs)
|
|
12
|
+
Log.warn_once(:eval_functions_task, "Braintrust::Eval::Functions.task is deprecated: use Braintrust::Functions.task instead.")
|
|
13
|
+
Braintrust::Functions.task(**kwargs)
|
|
124
14
|
end
|
|
125
15
|
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
# @param project [String] Project name
|
|
131
|
-
# @param slug [String] Function slug
|
|
132
|
-
# @return [Hash] Function metadata
|
|
133
|
-
def resolve_function(api, project, slug)
|
|
134
|
-
result = api.functions.list(project_name: project, slug: slug)
|
|
135
|
-
functions = result["objects"]
|
|
136
|
-
|
|
137
|
-
if functions.nil? || functions.empty?
|
|
138
|
-
raise Error, "Function '#{slug}' not found in project '#{project}'"
|
|
139
|
-
end
|
|
140
|
-
|
|
141
|
-
functions.first
|
|
16
|
+
# @deprecated Use {Braintrust::Functions.scorer} instead.
|
|
17
|
+
def scorer(**kwargs)
|
|
18
|
+
Log.warn_once(:eval_functions_scorer, "Braintrust::Eval::Functions.scorer is deprecated: use Braintrust::Functions.scorer instead.")
|
|
19
|
+
Braintrust::Functions.scorer(**kwargs)
|
|
142
20
|
end
|
|
143
21
|
end
|
|
144
22
|
end
|
|
@@ -1,56 +1,52 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
require_relative "case"
|
|
4
|
-
require_relative "cases"
|
|
5
|
-
require_relative "scorer"
|
|
6
4
|
require_relative "result"
|
|
7
5
|
require_relative "summary"
|
|
6
|
+
require_relative "trace"
|
|
8
7
|
require_relative "../internal/thread_pool"
|
|
8
|
+
require_relative "../api/internal/btql"
|
|
9
9
|
|
|
10
10
|
require "opentelemetry/sdk"
|
|
11
11
|
require "json"
|
|
12
12
|
|
|
13
13
|
module Braintrust
|
|
14
14
|
module Eval
|
|
15
|
-
# Internal runner class that performs the execution of the Eval and returns the result
|
|
15
|
+
# Internal runner class that performs the execution of the Eval and returns the result.
|
|
16
|
+
# Receives a fully-normalized Context — all callables are already typed wrappers.
|
|
16
17
|
class Runner
|
|
17
18
|
# Maximum parallelism allowed (mirrors Internal::ThreadPool::MAX_PARALLELISM)
|
|
18
19
|
MAX_PARALLELISM = Internal::ThreadPool::MAX_PARALLELISM
|
|
19
20
|
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
@
|
|
26
|
-
|
|
27
|
-
@
|
|
28
|
-
@api = api
|
|
29
|
-
@tracer_provider = tracer_provider || OpenTelemetry.tracer_provider
|
|
30
|
-
@tracer = @tracer_provider.tracer("braintrust-eval")
|
|
31
|
-
@parent_attr = "experiment_id:#{experiment_id}"
|
|
21
|
+
# Per-case mutable accumulator. Built from Case, populated by task and scoring stages.
|
|
22
|
+
CaseContext = Struct.new(:input, :expected, :output, :metadata, :tags, :trace, :origin, keyword_init: true)
|
|
23
|
+
|
|
24
|
+
# @param eval_context [Context] Normalized eval context
|
|
25
|
+
def initialize(eval_context)
|
|
26
|
+
@eval_context = eval_context
|
|
27
|
+
tracer_provider = eval_context.tracer_provider || OpenTelemetry.tracer_provider
|
|
28
|
+
@tracer = tracer_provider.tracer("braintrust-eval")
|
|
32
29
|
|
|
33
30
|
# Mutex for thread-safe score collection
|
|
34
31
|
@score_mutex = Mutex.new
|
|
35
32
|
end
|
|
36
33
|
|
|
37
34
|
# Run evaluation and return Result
|
|
38
|
-
# @param cases [Array, Enumerable] Test cases
|
|
39
35
|
# @param parallelism [Integer] Number of parallel workers (default: 1)
|
|
40
36
|
# @return [Result]
|
|
41
|
-
def run(
|
|
37
|
+
def run(parallelism: 1)
|
|
42
38
|
start_time = Time.now
|
|
43
|
-
|
|
39
|
+
eval_cases = eval_context.cases
|
|
44
40
|
errors = Queue.new
|
|
45
41
|
@scores = {} # Reset for each run: { scorer_name => Array<Numeric> }
|
|
46
42
|
|
|
47
43
|
if parallelism && parallelism > 1
|
|
48
|
-
Internal::ThreadPool.each(
|
|
49
|
-
|
|
44
|
+
Internal::ThreadPool.each(eval_cases, parallelism: parallelism) do |eval_case|
|
|
45
|
+
run_eval_case(build_case_context(eval_case), errors)
|
|
50
46
|
end
|
|
51
47
|
else
|
|
52
|
-
|
|
53
|
-
|
|
48
|
+
eval_cases.each do |eval_case|
|
|
49
|
+
run_eval_case(build_case_context(eval_case), errors)
|
|
54
50
|
end
|
|
55
51
|
end
|
|
56
52
|
|
|
@@ -60,14 +56,16 @@ module Braintrust
|
|
|
60
56
|
# Calculate duration
|
|
61
57
|
duration = Time.now - start_time
|
|
62
58
|
|
|
63
|
-
# Generate permalink
|
|
64
|
-
permalink =
|
|
59
|
+
# Generate permalink (only when state and experiment are available)
|
|
60
|
+
permalink = if eval_context.state && eval_context.experiment_id
|
|
61
|
+
eval_context.state.object_permalink(object_type: "experiment", object_id: eval_context.experiment_id)
|
|
62
|
+
end
|
|
65
63
|
|
|
66
64
|
Result.new(
|
|
67
|
-
experiment_id: experiment_id,
|
|
68
|
-
experiment_name: experiment_name,
|
|
69
|
-
project_id: project_id,
|
|
70
|
-
project_name: project_name,
|
|
65
|
+
experiment_id: eval_context.experiment_id,
|
|
66
|
+
experiment_name: eval_context.experiment_name,
|
|
67
|
+
project_id: eval_context.project_id,
|
|
68
|
+
project_name: eval_context.project_name,
|
|
71
69
|
permalink: permalink,
|
|
72
70
|
errors: error_array,
|
|
73
71
|
duration: duration,
|
|
@@ -77,63 +75,71 @@ module Braintrust
|
|
|
77
75
|
|
|
78
76
|
private
|
|
79
77
|
|
|
80
|
-
attr_reader :
|
|
81
|
-
:task, :scorers, :tracer, :parent_attr
|
|
78
|
+
attr_reader :eval_context, :tracer
|
|
82
79
|
|
|
83
80
|
# Run a single test case with OpenTelemetry tracing
|
|
84
81
|
# Creates eval span (parent) with task and score as children
|
|
85
|
-
# @param
|
|
82
|
+
# @param case_context [CaseContext] The per-case accumulator
|
|
86
83
|
# @param errors [Queue] Thread-safe error collection queue
|
|
87
|
-
def
|
|
84
|
+
def run_eval_case(case_context, errors)
|
|
88
85
|
tracer.in_span("eval") do |eval_span|
|
|
89
|
-
eval_span.set_attribute("braintrust.parent",
|
|
86
|
+
eval_span.set_attribute("braintrust.parent", eval_context.parent_span_attr) if eval_context.parent_span_attr
|
|
90
87
|
|
|
91
88
|
# Set tags early so they're present even if task fails
|
|
92
|
-
eval_span.set_attribute("braintrust.tags",
|
|
89
|
+
eval_span.set_attribute("braintrust.tags", case_context.tags) if case_context.tags
|
|
93
90
|
|
|
94
91
|
# Run task
|
|
95
|
-
output = nil
|
|
96
92
|
begin
|
|
97
|
-
output = run_task(
|
|
93
|
+
case_context.output = run_task(case_context)
|
|
98
94
|
rescue => e
|
|
99
95
|
# Error already recorded on task span, set eval span status
|
|
100
96
|
eval_span.status = OpenTelemetry::Trace::Status.error(e.message)
|
|
101
|
-
errors << "Task failed for input '#{
|
|
97
|
+
errors << "Task failed for input '#{case_context.input}': #{e.message}"
|
|
98
|
+
report_progress(eval_span, case_context, error: e.message)
|
|
102
99
|
next
|
|
103
100
|
end
|
|
104
101
|
|
|
102
|
+
# Flush spans so they're queryable via BTQL, then build trace
|
|
103
|
+
eval_context.tracer_provider&.force_flush
|
|
104
|
+
case_context.trace = build_trace(eval_span)
|
|
105
|
+
|
|
105
106
|
# Run scorers
|
|
107
|
+
case_scores = nil
|
|
106
108
|
begin
|
|
107
|
-
run_scorers(
|
|
109
|
+
case_scores = run_scorers(case_context)
|
|
108
110
|
rescue => e
|
|
109
111
|
# Error already recorded on score span, set eval span status
|
|
110
112
|
eval_span.status = OpenTelemetry::Trace::Status.error(e.message)
|
|
111
|
-
errors << "Scorers failed for input '#{
|
|
113
|
+
errors << "Scorers failed for input '#{case_context.input}': #{e.message}"
|
|
112
114
|
end
|
|
113
115
|
|
|
114
116
|
# Set eval span attributes (after task and scorers complete)
|
|
115
|
-
set_json_attr(eval_span, "braintrust.span_attributes",
|
|
116
|
-
set_json_attr(eval_span, "braintrust.input_json",
|
|
117
|
-
set_json_attr(eval_span, "braintrust.output_json", output)
|
|
118
|
-
set_json_attr(eval_span, "braintrust.expected",
|
|
117
|
+
set_json_attr(eval_span, "braintrust.span_attributes", build_span_attributes("eval"))
|
|
118
|
+
set_json_attr(eval_span, "braintrust.input_json", case_context.input)
|
|
119
|
+
set_json_attr(eval_span, "braintrust.output_json", case_context.output)
|
|
120
|
+
set_json_attr(eval_span, "braintrust.expected", case_context.expected) if case_context.expected
|
|
119
121
|
|
|
120
122
|
# Set origin for cases from remote sources (already JSON-serialized)
|
|
121
|
-
eval_span.set_attribute("braintrust.origin",
|
|
123
|
+
eval_span.set_attribute("braintrust.origin", case_context.origin) if case_context.origin
|
|
124
|
+
|
|
125
|
+
report_progress(eval_span, case_context, data: case_context.output, scores: case_scores || {})
|
|
122
126
|
end
|
|
123
127
|
end
|
|
124
128
|
|
|
125
129
|
# Run task with OpenTelemetry tracing
|
|
126
130
|
# Creates task span with input and output
|
|
127
|
-
# @param
|
|
131
|
+
# @param case_context [CaseContext] The per-case context
|
|
128
132
|
# @return [Object] Task output
|
|
129
|
-
def run_task(
|
|
133
|
+
def run_task(case_context)
|
|
130
134
|
tracer.in_span("task") do |task_span|
|
|
131
|
-
task_span.set_attribute("braintrust.parent",
|
|
132
|
-
set_json_attr(task_span, "braintrust.span_attributes",
|
|
133
|
-
set_json_attr(task_span, "braintrust.input_json",
|
|
135
|
+
task_span.set_attribute("braintrust.parent", eval_context.parent_span_attr) if eval_context.parent_span_attr
|
|
136
|
+
set_json_attr(task_span, "braintrust.span_attributes", build_span_attributes("task"))
|
|
137
|
+
set_json_attr(task_span, "braintrust.input_json", case_context.input)
|
|
134
138
|
|
|
135
139
|
begin
|
|
136
|
-
output = task.call(
|
|
140
|
+
output = eval_context.task.call(
|
|
141
|
+
input: case_context.input
|
|
142
|
+
)
|
|
137
143
|
set_json_attr(task_span, "braintrust.output_json", output)
|
|
138
144
|
output
|
|
139
145
|
rescue => e
|
|
@@ -147,17 +153,24 @@ module Braintrust
|
|
|
147
153
|
|
|
148
154
|
# Run scorers with OpenTelemetry tracing
|
|
149
155
|
# Creates single score span for all scorers
|
|
150
|
-
# @param
|
|
151
|
-
# @
|
|
152
|
-
def run_scorers(
|
|
156
|
+
# @param case_context [CaseContext] The per-case context (output must be populated)
|
|
157
|
+
# @return [Hash] Scores hash { scorer_name => score_value }
|
|
158
|
+
def run_scorers(case_context)
|
|
153
159
|
tracer.in_span("score") do |score_span|
|
|
154
|
-
score_span.set_attribute("braintrust.parent",
|
|
155
|
-
set_json_attr(score_span, "braintrust.span_attributes",
|
|
156
|
-
|
|
160
|
+
score_span.set_attribute("braintrust.parent", eval_context.parent_span_attr) if eval_context.parent_span_attr
|
|
161
|
+
set_json_attr(score_span, "braintrust.span_attributes", build_span_attributes("score"))
|
|
162
|
+
|
|
163
|
+
scorer_kwargs = {
|
|
164
|
+
input: case_context.input,
|
|
165
|
+
expected: case_context.expected,
|
|
166
|
+
output: case_context.output,
|
|
167
|
+
metadata: case_context.metadata || {},
|
|
168
|
+
trace: case_context.trace
|
|
169
|
+
}
|
|
157
170
|
scores = {}
|
|
158
171
|
scorer_error = nil
|
|
159
|
-
scorers.each do |scorer|
|
|
160
|
-
score_value = scorer.call(
|
|
172
|
+
eval_context.scorers.each do |scorer|
|
|
173
|
+
score_value = scorer.call(**scorer_kwargs)
|
|
161
174
|
scores[scorer.name] = score_value
|
|
162
175
|
|
|
163
176
|
# Collect raw score for summary (thread-safe)
|
|
@@ -173,39 +186,49 @@ module Braintrust
|
|
|
173
186
|
|
|
174
187
|
# Raise after setting scores so we can see which scorers succeeded
|
|
175
188
|
raise scorer_error if scorer_error
|
|
189
|
+
|
|
190
|
+
scores
|
|
176
191
|
end
|
|
177
192
|
end
|
|
178
193
|
|
|
179
|
-
#
|
|
180
|
-
#
|
|
181
|
-
# @
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
end
|
|
194
|
+
# Build a lazy Trace for a case, backed by BTQL.
|
|
195
|
+
# Returns nil when state or experiment_id are unavailable (local-only mode).
|
|
196
|
+
# @param eval_span [OpenTelemetry::Trace::Span] The eval span for this case
|
|
197
|
+
# @return [Eval::Trace, nil]
|
|
198
|
+
def build_trace(eval_span)
|
|
199
|
+
return nil unless eval_context.state && eval_context.experiment_id
|
|
200
|
+
|
|
201
|
+
root_span_id = eval_span.context.hex_trace_id
|
|
202
|
+
object_type = "experiment"
|
|
203
|
+
object_id = eval_context.experiment_id
|
|
204
|
+
btql = API::Internal::BTQL.new(eval_context.state)
|
|
205
|
+
|
|
206
|
+
Eval::Trace.new(
|
|
207
|
+
spans: -> { btql.trace_spans(object_type: object_type, object_id: object_id, root_span_id: root_span_id) }
|
|
208
|
+
)
|
|
195
209
|
end
|
|
196
210
|
|
|
197
|
-
#
|
|
198
|
-
# @param
|
|
199
|
-
# @return [
|
|
200
|
-
def
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
211
|
+
# Build a CaseContext from a Case struct
|
|
212
|
+
# @param eval_case [Case] The eval case
|
|
213
|
+
# @return [CaseContext]
|
|
214
|
+
def build_case_context(eval_case)
|
|
215
|
+
CaseContext.new(
|
|
216
|
+
input: eval_case.input, expected: eval_case.expected,
|
|
217
|
+
metadata: eval_case.metadata, tags: eval_case.tags, origin: eval_case.origin
|
|
218
|
+
)
|
|
219
|
+
end
|
|
220
|
+
|
|
221
|
+
# Report progress for a case via on_progress callback.
|
|
222
|
+
# Rescues errors in the callback so a broken handler never crashes the eval.
|
|
223
|
+
def report_progress(eval_span, case_context, **fields)
|
|
224
|
+
return unless eval_context.on_progress
|
|
225
|
+
progress = {"id" => eval_span.context.hex_span_id}.merge(fields.transform_keys(&:to_s))
|
|
226
|
+
if case_context.origin
|
|
227
|
+
progress["origin"] = case_context.origin.is_a?(String) ? JSON.parse(case_context.origin) : case_context.origin
|
|
208
228
|
end
|
|
229
|
+
eval_context.on_progress.call(progress)
|
|
230
|
+
rescue => e
|
|
231
|
+
Braintrust.logger.warn("on_progress callback error: #{e.message}")
|
|
209
232
|
end
|
|
210
233
|
|
|
211
234
|
# Record error on span with exception event and error status
|
|
@@ -221,6 +244,17 @@ module Braintrust
|
|
|
221
244
|
span.status = OpenTelemetry::Trace::Status.error(error.message)
|
|
222
245
|
end
|
|
223
246
|
|
|
247
|
+
# Build span_attributes hash with type, and optionally name and generation.
|
|
248
|
+
# Matches Java SDK behavior of including these on every span.
|
|
249
|
+
# @param type [String] Span type ("eval", "task", or "score")
|
|
250
|
+
# @return [Hash]
|
|
251
|
+
def build_span_attributes(type)
|
|
252
|
+
attrs = {type: type}
|
|
253
|
+
attrs[:name] = eval_context.experiment_name if eval_context.experiment_name
|
|
254
|
+
attrs[:generation] = eval_context.generation if eval_context.generation
|
|
255
|
+
attrs
|
|
256
|
+
end
|
|
257
|
+
|
|
224
258
|
# Set a span attribute by JSON encoding the value
|
|
225
259
|
# @param span [OpenTelemetry::Trace::Span] The span
|
|
226
260
|
# @param key [String] The attribute key
|
|
@@ -1,106 +1,38 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
+
require_relative "../scorer"
|
|
4
|
+
|
|
3
5
|
module Braintrust
|
|
4
6
|
module Eval
|
|
5
|
-
#
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
# Create a new scorer
|
|
12
|
-
# @param name_or_callable [String, Symbol, #call] Name or callable (if callable, name is auto-detected)
|
|
13
|
-
# @param callable [#call, nil] Callable if name was provided separately
|
|
14
|
-
# @param block [Proc, nil] Block if no callable provided
|
|
15
|
-
def initialize(name_or_callable = nil, callable = nil, &block)
|
|
16
|
-
# Determine name and callable from arguments
|
|
17
|
-
if name_or_callable.nil? && callable.nil? && block.nil?
|
|
18
|
-
raise ArgumentError, "Must provide callable or block"
|
|
19
|
-
end
|
|
7
|
+
# @deprecated Use {Braintrust::Scorer} instead.
|
|
8
|
+
module Scorer
|
|
9
|
+
# @deprecated Use {Braintrust::Scorer.new} instead.
|
|
10
|
+
def self.new(name_or_callable = nil, callable = nil, &block)
|
|
11
|
+
Log.warn_once(:eval_scorer_class, "Braintrust::Eval::Scorer is deprecated: use Braintrust::Scorer.new instead.")
|
|
20
12
|
|
|
21
|
-
# If first arg is a string/symbol, it's the name
|
|
22
13
|
if name_or_callable.is_a?(String) || name_or_callable.is_a?(Symbol)
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
raise ArgumentError, "Must provide callable or block" unless @callable
|
|
14
|
+
name = name_or_callable.to_s
|
|
15
|
+
block = callable.method(:call) if callable && !block
|
|
26
16
|
else
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
end
|
|
31
|
-
|
|
32
|
-
# Validate callable
|
|
33
|
-
unless @callable.respond_to?(:call)
|
|
34
|
-
raise ArgumentError, "Scorer must be callable (respond to :call)"
|
|
35
|
-
end
|
|
36
|
-
|
|
37
|
-
# Detect arity and wrap callable if needed
|
|
38
|
-
@wrapped_callable = wrap_callable(@callable)
|
|
39
|
-
end
|
|
40
|
-
|
|
41
|
-
# Call the scorer
|
|
42
|
-
# @param input [Object] The input to the task
|
|
43
|
-
# @param expected [Object] The expected output
|
|
44
|
-
# @param output [Object] The actual output from the task
|
|
45
|
-
# @param metadata [Hash] Optional metadata
|
|
46
|
-
# @return [Float, Hash, Array] Score value(s)
|
|
47
|
-
def call(input, expected, output, metadata = {})
|
|
48
|
-
@wrapped_callable.call(input, expected, output, metadata)
|
|
49
|
-
end
|
|
50
|
-
|
|
51
|
-
private
|
|
52
|
-
|
|
53
|
-
# Detect the name from a callable object
|
|
54
|
-
# @param callable [#call] The callable
|
|
55
|
-
# @return [String] The detected name
|
|
56
|
-
def detect_name(callable)
|
|
57
|
-
# Method objects have .name
|
|
58
|
-
if callable.is_a?(Method)
|
|
59
|
-
return callable.name.to_s
|
|
17
|
+
resolved = name_or_callable || callable
|
|
18
|
+
block = resolved.method(:call) if resolved && !block
|
|
19
|
+
name = nil
|
|
60
20
|
end
|
|
61
21
|
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
end
|
|
66
|
-
|
|
67
|
-
# Fallback
|
|
68
|
-
"scorer"
|
|
69
|
-
end
|
|
70
|
-
|
|
71
|
-
# Wrap the callable to always accept 4 parameters
|
|
72
|
-
# @param callable [#call] The callable to wrap
|
|
73
|
-
# @return [Proc] Wrapped callable that accepts 4 params
|
|
74
|
-
def wrap_callable(callable)
|
|
75
|
-
arity = callable_arity(callable)
|
|
76
|
-
|
|
77
|
-
case arity
|
|
78
|
-
when 3
|
|
79
|
-
# Callable takes 3 params - wrap to ignore metadata
|
|
80
|
-
->(input, expected, output, metadata) {
|
|
81
|
-
callable.call(input, expected, output)
|
|
82
|
-
}
|
|
83
|
-
when 4, -4, -1
|
|
84
|
-
# Callable takes 4 params (or variadic with 4+)
|
|
85
|
-
# -4 means optional 4th param
|
|
86
|
-
# -1 means variadic (*args)
|
|
87
|
-
callable
|
|
88
|
-
else
|
|
89
|
-
raise ArgumentError, "Scorer must accept 3 or 4 parameters (got arity #{arity})"
|
|
90
|
-
end
|
|
22
|
+
scorer = Braintrust::Scorer.new(name, &block)
|
|
23
|
+
scorer.singleton_class.prepend(PositionalArgsRemapping)
|
|
24
|
+
scorer
|
|
91
25
|
end
|
|
92
26
|
|
|
93
|
-
#
|
|
94
|
-
#
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
# Assume 3 params if we can't detect
|
|
103
|
-
3
|
|
27
|
+
# @deprecated Maps positional #call(input, expected, output, metadata) to keyword args.
|
|
28
|
+
# Will be removed when the legacy Eval::Scorer API is removed.
|
|
29
|
+
module PositionalArgsRemapping
|
|
30
|
+
def call(*args, **kwargs)
|
|
31
|
+
if args.any?
|
|
32
|
+
Log.warn_once(:scorer_positional_call, "Calling a Scorer with positional args is deprecated: use keyword args (input:, expected:, output:, metadata:) instead.")
|
|
33
|
+
kwargs = {input: args[0], expected: args[1], output: args[2], metadata: args[3]}
|
|
34
|
+
end
|
|
35
|
+
super(**kwargs)
|
|
104
36
|
end
|
|
105
37
|
end
|
|
106
38
|
end
|