braintrust 0.1.4 → 0.2.1

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. checksums.yaml +4 -4
  2. data/README.md +109 -13
  3. data/lib/braintrust/api/datasets.rb +10 -0
  4. data/lib/braintrust/api/internal/btql.rb +124 -0
  5. data/lib/braintrust/api/internal/experiments.rb +20 -1
  6. data/lib/braintrust/api/internal/projects.rb +19 -0
  7. data/lib/braintrust/dataset.rb +13 -6
  8. data/lib/braintrust/eval/context.rb +131 -0
  9. data/lib/braintrust/eval/evaluator.rb +78 -0
  10. data/lib/braintrust/eval/functions.rb +10 -132
  11. data/lib/braintrust/eval/runner.rb +119 -85
  12. data/lib/braintrust/eval/scorer.rb +24 -92
  13. data/lib/braintrust/eval/trace.rb +129 -0
  14. data/lib/braintrust/eval.rb +131 -156
  15. data/lib/braintrust/functions.rb +168 -0
  16. data/lib/braintrust/internal/callable.rb +83 -0
  17. data/lib/braintrust/logger.rb +9 -0
  18. data/lib/braintrust/scorer.rb +122 -0
  19. data/lib/braintrust/server/auth/clerk_token.rb +68 -0
  20. data/lib/braintrust/server/auth/no_auth.rb +14 -0
  21. data/lib/braintrust/server/handlers/eval.rb +217 -0
  22. data/lib/braintrust/server/handlers/health.rb +16 -0
  23. data/lib/braintrust/server/handlers/list.rb +74 -0
  24. data/lib/braintrust/server/middleware/auth.rb +29 -0
  25. data/lib/braintrust/server/middleware/cors.rb +87 -0
  26. data/lib/braintrust/server/rack/app.rb +38 -0
  27. data/lib/braintrust/server/rack.rb +36 -0
  28. data/lib/braintrust/server/router.rb +37 -0
  29. data/lib/braintrust/server/sse.rb +52 -0
  30. data/lib/braintrust/server.rb +8 -0
  31. data/lib/braintrust/task.rb +108 -0
  32. data/lib/braintrust/trace/span_exporter.rb +36 -0
  33. data/lib/braintrust/trace.rb +3 -4
  34. data/lib/braintrust/version.rb +1 -1
  35. metadata +22 -1
@@ -1,144 +1,22 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- require_relative "../api"
4
- require_relative "scorer"
5
- require "opentelemetry/sdk"
6
- require "json"
3
+ require_relative "../functions"
7
4
 
8
5
  module Braintrust
9
6
  module Eval
10
- # Functions provides remote function execution capabilities
11
- # Allows calling prompts hosted on Braintrust servers as tasks or scorers
7
+ # @deprecated Use {Braintrust::Functions} instead.
12
8
  module Functions
13
9
  class << self
14
- # Create a task callable that invokes a remote function
15
- # @param project [String] Project name
16
- # @param slug [String] Function slug
17
- # @param state [State, nil] Braintrust state (defaults to global)
18
- # @param tracer_provider [TracerProvider, nil] OpenTelemetry tracer provider
19
- # @return [Proc] Callable that accepts input and returns output
20
- def task(project:, slug:, state: nil, tracer_provider: nil)
21
- state ||= Braintrust.current_state
22
- raise Error, "No state available" unless state
23
-
24
- # Resolve function ID from project + slug
25
- api = API.new(state: state)
26
- function_metadata = resolve_function(api, project, slug)
27
- function_id = function_metadata["id"]
28
- function_name = function_metadata["name"] || slug
29
-
30
- # Get tracer for creating spans
31
- tracer_provider ||= OpenTelemetry.tracer_provider
32
- tracer = tracer_provider.tracer("braintrust.functions")
33
-
34
- # Return a lambda that invokes the remote function with tracing
35
- lambda do |input|
36
- # Create a span for the function invocation
37
- tracer.in_span("function: #{slug}") do |span|
38
- span.set_attribute("braintrust.span_attributes", JSON.dump({type: "function"}))
39
- span.set_attribute("braintrust.input_json", JSON.dump(input))
40
- span.set_attribute("braintrust.function.name", function_name)
41
- span.set_attribute("braintrust.function.id", function_id)
42
- span.set_attribute("braintrust.function.slug", slug)
43
-
44
- begin
45
- # Invoke the function via API
46
- output = api.functions.invoke(id: function_id, input: input)
47
- span.set_attribute("braintrust.output_json", JSON.dump(output))
48
- output
49
- rescue => e
50
- # Record exception and set error status
51
- span.record_exception(e)
52
- span.status = OpenTelemetry::Trace::Status.error(e.message)
53
- raise
54
- end
55
- end
56
- end
57
- end
58
-
59
- # Create a scorer that invokes a remote function
60
- # @param project [String] Project name
61
- # @param slug [String] Function slug
62
- # @param state [State, nil] Braintrust state (defaults to global)
63
- # @param tracer_provider [TracerProvider, nil] OpenTelemetry tracer provider
64
- # @return [Scorer] Scorer object that invokes remote function
65
- def scorer(project:, slug:, state: nil, tracer_provider: nil)
66
- state ||= Braintrust.current_state
67
- raise Error, "No state available" unless state
68
-
69
- # Resolve function ID from project + slug
70
- api = API.new(state: state)
71
- function_metadata = resolve_function(api, project, slug)
72
- function_id = function_metadata["id"]
73
- function_name = function_metadata["name"] || slug
74
-
75
- # Get tracer for creating spans
76
- tracer_provider ||= OpenTelemetry.tracer_provider
77
- tracer = tracer_provider.tracer("braintrust.functions")
78
-
79
- # Create a scorer that invokes the remote function
80
- Scorer.new(slug) do |input, expected, output, metadata|
81
- # Create a span for the function invocation
82
- tracer.in_span("function: #{slug}") do |span|
83
- scorer_input = {
84
- input: input,
85
- expected: expected,
86
- output: output,
87
- metadata: metadata
88
- }
89
-
90
- span.set_attribute("braintrust.span_attributes", JSON.dump({type: "function"}))
91
- span.set_attribute("braintrust.input_json", JSON.dump(scorer_input))
92
- span.set_attribute("braintrust.function.name", function_name)
93
- span.set_attribute("braintrust.function.id", function_id)
94
- span.set_attribute("braintrust.function.slug", slug)
95
-
96
- begin
97
- # Invoke the function via API
98
- # The remote scorer receives all scorer arguments
99
- result = api.functions.invoke(id: function_id, input: scorer_input)
100
-
101
- score = case result
102
- when Hash
103
- if result.key?("score")
104
- result["score"].to_f
105
- else
106
- raise Error, "Hash result must contain 'score' key"
107
- end
108
- when String
109
- result.to_f
110
- else
111
- raise Error, "Unsupported result type: #{result.class}"
112
- end
113
-
114
- span.set_attribute("braintrust.output_json", JSON.dump(score))
115
- score
116
- rescue => e
117
- # Record exception and set error status
118
- span.record_exception(e)
119
- span.status = OpenTelemetry::Trace::Status.error(e.message)
120
- raise
121
- end
122
- end
123
- end
10
+ # @deprecated Use {Braintrust::Functions.task} instead.
11
+ def task(**kwargs)
12
+ Log.warn_once(:eval_functions_task, "Braintrust::Eval::Functions.task is deprecated: use Braintrust::Functions.task instead.")
13
+ Braintrust::Functions.task(**kwargs)
124
14
  end
125
15
 
126
- private
127
-
128
- # Resolve function ID from project name and slug
129
- # @param api [API] API client
130
- # @param project [String] Project name
131
- # @param slug [String] Function slug
132
- # @return [Hash] Function metadata
133
- def resolve_function(api, project, slug)
134
- result = api.functions.list(project_name: project, slug: slug)
135
- functions = result["objects"]
136
-
137
- if functions.nil? || functions.empty?
138
- raise Error, "Function '#{slug}' not found in project '#{project}'"
139
- end
140
-
141
- functions.first
16
+ # @deprecated Use {Braintrust::Functions.scorer} instead.
17
+ def scorer(**kwargs)
18
+ Log.warn_once(:eval_functions_scorer, "Braintrust::Eval::Functions.scorer is deprecated: use Braintrust::Functions.scorer instead.")
19
+ Braintrust::Functions.scorer(**kwargs)
142
20
  end
143
21
  end
144
22
  end
@@ -1,56 +1,52 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require_relative "case"
4
- require_relative "cases"
5
- require_relative "scorer"
6
4
  require_relative "result"
7
5
  require_relative "summary"
6
+ require_relative "trace"
8
7
  require_relative "../internal/thread_pool"
8
+ require_relative "../api/internal/btql"
9
9
 
10
10
  require "opentelemetry/sdk"
11
11
  require "json"
12
12
 
13
13
  module Braintrust
14
14
  module Eval
15
- # Internal runner class that performs the execution of the Eval and returns the result
15
+ # Internal runner class that performs the execution of the Eval and returns the result.
16
+ # Receives a fully-normalized Context — all callables are already typed wrappers.
16
17
  class Runner
17
18
  # Maximum parallelism allowed (mirrors Internal::ThreadPool::MAX_PARALLELISM)
18
19
  MAX_PARALLELISM = Internal::ThreadPool::MAX_PARALLELISM
19
20
 
20
- def initialize(experiment_id:, experiment_name:, project_id:, project_name:,
21
- task:, scorers:, api:, tracer_provider: nil)
22
- @experiment_id = experiment_id
23
- @experiment_name = experiment_name
24
- @project_id = project_id
25
- @project_name = project_name
26
- @task = task
27
- @scorers = normalize_scorers(scorers)
28
- @api = api
29
- @tracer_provider = tracer_provider || OpenTelemetry.tracer_provider
30
- @tracer = @tracer_provider.tracer("braintrust-eval")
31
- @parent_attr = "experiment_id:#{experiment_id}"
21
+ # Per-case mutable accumulator. Built from Case, populated by task and scoring stages.
22
+ CaseContext = Struct.new(:input, :expected, :output, :metadata, :tags, :trace, :origin, keyword_init: true)
23
+
24
+ # @param eval_context [Context] Normalized eval context
25
+ def initialize(eval_context)
26
+ @eval_context = eval_context
27
+ tracer_provider = eval_context.tracer_provider || OpenTelemetry.tracer_provider
28
+ @tracer = tracer_provider.tracer("braintrust-eval")
32
29
 
33
30
  # Mutex for thread-safe score collection
34
31
  @score_mutex = Mutex.new
35
32
  end
36
33
 
37
34
  # Run evaluation and return Result
38
- # @param cases [Array, Enumerable] Test cases
39
35
  # @param parallelism [Integer] Number of parallel workers (default: 1)
40
36
  # @return [Result]
41
- def run(cases, parallelism: 1)
37
+ def run(parallelism: 1)
42
38
  start_time = Time.now
43
- normalized_cases = normalize_cases(cases)
39
+ eval_cases = eval_context.cases
44
40
  errors = Queue.new
45
41
  @scores = {} # Reset for each run: { scorer_name => Array<Numeric> }
46
42
 
47
43
  if parallelism && parallelism > 1
48
- Internal::ThreadPool.each(normalized_cases, parallelism: parallelism) do |test_case|
49
- run_case(test_case, errors)
44
+ Internal::ThreadPool.each(eval_cases, parallelism: parallelism) do |eval_case|
45
+ run_eval_case(build_case_context(eval_case), errors)
50
46
  end
51
47
  else
52
- normalized_cases.each do |test_case|
53
- run_case(test_case, errors)
48
+ eval_cases.each do |eval_case|
49
+ run_eval_case(build_case_context(eval_case), errors)
54
50
  end
55
51
  end
56
52
 
@@ -60,14 +56,16 @@ module Braintrust
60
56
  # Calculate duration
61
57
  duration = Time.now - start_time
62
58
 
63
- # Generate permalink
64
- permalink = @api.object_permalink(object_type: "experiment", object_id: experiment_id)
59
+ # Generate permalink (only when state and experiment are available)
60
+ permalink = if eval_context.state && eval_context.experiment_id
61
+ eval_context.state.object_permalink(object_type: "experiment", object_id: eval_context.experiment_id)
62
+ end
65
63
 
66
64
  Result.new(
67
- experiment_id: experiment_id,
68
- experiment_name: experiment_name,
69
- project_id: project_id,
70
- project_name: project_name,
65
+ experiment_id: eval_context.experiment_id,
66
+ experiment_name: eval_context.experiment_name,
67
+ project_id: eval_context.project_id,
68
+ project_name: eval_context.project_name,
71
69
  permalink: permalink,
72
70
  errors: error_array,
73
71
  duration: duration,
@@ -77,63 +75,71 @@ module Braintrust
77
75
 
78
76
  private
79
77
 
80
- attr_reader :experiment_id, :experiment_name, :project_id, :project_name,
81
- :task, :scorers, :tracer, :parent_attr
78
+ attr_reader :eval_context, :tracer
82
79
 
83
80
  # Run a single test case with OpenTelemetry tracing
84
81
  # Creates eval span (parent) with task and score as children
85
- # @param test_case [Case] The test case
82
+ # @param case_context [CaseContext] The per-case accumulator
86
83
  # @param errors [Queue] Thread-safe error collection queue
87
- def run_case(test_case, errors)
84
+ def run_eval_case(case_context, errors)
88
85
  tracer.in_span("eval") do |eval_span|
89
- eval_span.set_attribute("braintrust.parent", parent_attr)
86
+ eval_span.set_attribute("braintrust.parent", eval_context.parent_span_attr) if eval_context.parent_span_attr
90
87
 
91
88
  # Set tags early so they're present even if task fails
92
- eval_span.set_attribute("braintrust.tags", test_case.tags) if test_case.tags
89
+ eval_span.set_attribute("braintrust.tags", case_context.tags) if case_context.tags
93
90
 
94
91
  # Run task
95
- output = nil
96
92
  begin
97
- output = run_task(test_case)
93
+ case_context.output = run_task(case_context)
98
94
  rescue => e
99
95
  # Error already recorded on task span, set eval span status
100
96
  eval_span.status = OpenTelemetry::Trace::Status.error(e.message)
101
- errors << "Task failed for input '#{test_case.input}': #{e.message}"
97
+ errors << "Task failed for input '#{case_context.input}': #{e.message}"
98
+ report_progress(eval_span, case_context, error: e.message)
102
99
  next
103
100
  end
104
101
 
102
+ # Flush spans so they're queryable via BTQL, then build trace
103
+ eval_context.tracer_provider&.force_flush
104
+ case_context.trace = build_trace(eval_span)
105
+
105
106
  # Run scorers
107
+ case_scores = nil
106
108
  begin
107
- run_scorers(test_case, output)
109
+ case_scores = run_scorers(case_context)
108
110
  rescue => e
109
111
  # Error already recorded on score span, set eval span status
110
112
  eval_span.status = OpenTelemetry::Trace::Status.error(e.message)
111
- errors << "Scorers failed for input '#{test_case.input}': #{e.message}"
113
+ errors << "Scorers failed for input '#{case_context.input}': #{e.message}"
112
114
  end
113
115
 
114
116
  # Set eval span attributes (after task and scorers complete)
115
- set_json_attr(eval_span, "braintrust.span_attributes", {type: "eval"})
116
- set_json_attr(eval_span, "braintrust.input_json", test_case.input)
117
- set_json_attr(eval_span, "braintrust.output_json", output)
118
- set_json_attr(eval_span, "braintrust.expected", test_case.expected) if test_case.expected
117
+ set_json_attr(eval_span, "braintrust.span_attributes", build_span_attributes("eval"))
118
+ set_json_attr(eval_span, "braintrust.input_json", case_context.input)
119
+ set_json_attr(eval_span, "braintrust.output_json", case_context.output)
120
+ set_json_attr(eval_span, "braintrust.expected", case_context.expected) if case_context.expected
119
121
 
120
122
  # Set origin for cases from remote sources (already JSON-serialized)
121
- eval_span.set_attribute("braintrust.origin", test_case.origin) if test_case.origin
123
+ eval_span.set_attribute("braintrust.origin", case_context.origin) if case_context.origin
124
+
125
+ report_progress(eval_span, case_context, data: case_context.output, scores: case_scores || {})
122
126
  end
123
127
  end
124
128
 
125
129
  # Run task with OpenTelemetry tracing
126
130
  # Creates task span with input and output
127
- # @param test_case [Case] The test case
131
+ # @param case_context [CaseContext] The per-case context
128
132
  # @return [Object] Task output
129
- def run_task(test_case)
133
+ def run_task(case_context)
130
134
  tracer.in_span("task") do |task_span|
131
- task_span.set_attribute("braintrust.parent", parent_attr)
132
- set_json_attr(task_span, "braintrust.span_attributes", {type: "task"})
133
- set_json_attr(task_span, "braintrust.input_json", test_case.input)
135
+ task_span.set_attribute("braintrust.parent", eval_context.parent_span_attr) if eval_context.parent_span_attr
136
+ set_json_attr(task_span, "braintrust.span_attributes", build_span_attributes("task"))
137
+ set_json_attr(task_span, "braintrust.input_json", case_context.input)
134
138
 
135
139
  begin
136
- output = task.call(test_case.input)
140
+ output = eval_context.task.call(
141
+ input: case_context.input
142
+ )
137
143
  set_json_attr(task_span, "braintrust.output_json", output)
138
144
  output
139
145
  rescue => e
@@ -147,17 +153,24 @@ module Braintrust
147
153
 
148
154
  # Run scorers with OpenTelemetry tracing
149
155
  # Creates single score span for all scorers
150
- # @param test_case [Case] The test case
151
- # @param output [Object] Task output
152
- def run_scorers(test_case, output)
156
+ # @param case_context [CaseContext] The per-case context (output must be populated)
157
+ # @return [Hash] Scores hash { scorer_name => score_value }
158
+ def run_scorers(case_context)
153
159
  tracer.in_span("score") do |score_span|
154
- score_span.set_attribute("braintrust.parent", parent_attr)
155
- set_json_attr(score_span, "braintrust.span_attributes", {type: "score"})
156
-
160
+ score_span.set_attribute("braintrust.parent", eval_context.parent_span_attr) if eval_context.parent_span_attr
161
+ set_json_attr(score_span, "braintrust.span_attributes", build_span_attributes("score"))
162
+
163
+ scorer_kwargs = {
164
+ input: case_context.input,
165
+ expected: case_context.expected,
166
+ output: case_context.output,
167
+ metadata: case_context.metadata || {},
168
+ trace: case_context.trace
169
+ }
157
170
  scores = {}
158
171
  scorer_error = nil
159
- scorers.each do |scorer|
160
- score_value = scorer.call(test_case.input, test_case.expected, output, test_case.metadata || {})
172
+ eval_context.scorers.each do |scorer|
173
+ score_value = scorer.call(**scorer_kwargs)
161
174
  scores[scorer.name] = score_value
162
175
 
163
176
  # Collect raw score for summary (thread-safe)
@@ -173,39 +186,49 @@ module Braintrust
173
186
 
174
187
  # Raise after setting scores so we can see which scorers succeeded
175
188
  raise scorer_error if scorer_error
189
+
190
+ scores
176
191
  end
177
192
  end
178
193
 
179
- # Normalize cases input to Cases wrapper
180
- # @param cases_input [Array, Enumerable, Cases] The cases input
181
- # @return [Cases]
182
- def normalize_cases(cases_input)
183
- case cases_input
184
- when Cases
185
- cases_input
186
- when Array, Enumerable
187
- Cases.new(cases_input)
188
- else
189
- if cases_input.respond_to?(:each)
190
- Cases.new(cases_input)
191
- else
192
- raise ArgumentError, "cases must be Array or Enumerable"
193
- end
194
- end
194
+ # Build a lazy Trace for a case, backed by BTQL.
195
+ # Returns nil when state or experiment_id are unavailable (local-only mode).
196
+ # @param eval_span [OpenTelemetry::Trace::Span] The eval span for this case
197
+ # @return [Eval::Trace, nil]
198
+ def build_trace(eval_span)
199
+ return nil unless eval_context.state && eval_context.experiment_id
200
+
201
+ root_span_id = eval_span.context.hex_trace_id
202
+ object_type = "experiment"
203
+ object_id = eval_context.experiment_id
204
+ btql = API::Internal::BTQL.new(eval_context.state)
205
+
206
+ Eval::Trace.new(
207
+ spans: -> { btql.trace_spans(object_type: object_type, object_id: object_id, root_span_id: root_span_id) }
208
+ )
195
209
  end
196
210
 
197
- # Normalize scorers to Scorer objects
198
- # @param scorers_input [Array] The scorers input (Scorer objects or callables)
199
- # @return [Array<Scorer>]
200
- def normalize_scorers(scorers_input)
201
- scorers_input.map do |scorer|
202
- case scorer
203
- when Scorer
204
- scorer
205
- else
206
- Scorer.new(scorer)
207
- end
211
+ # Build a CaseContext from a Case struct
212
+ # @param eval_case [Case] The eval case
213
+ # @return [CaseContext]
214
+ def build_case_context(eval_case)
215
+ CaseContext.new(
216
+ input: eval_case.input, expected: eval_case.expected,
217
+ metadata: eval_case.metadata, tags: eval_case.tags, origin: eval_case.origin
218
+ )
219
+ end
220
+
221
+ # Report progress for a case via on_progress callback.
222
+ # Rescues errors in the callback so a broken handler never crashes the eval.
223
+ def report_progress(eval_span, case_context, **fields)
224
+ return unless eval_context.on_progress
225
+ progress = {"id" => eval_span.context.hex_span_id}.merge(fields.transform_keys(&:to_s))
226
+ if case_context.origin
227
+ progress["origin"] = case_context.origin.is_a?(String) ? JSON.parse(case_context.origin) : case_context.origin
208
228
  end
229
+ eval_context.on_progress.call(progress)
230
+ rescue => e
231
+ Braintrust.logger.warn("on_progress callback error: #{e.message}")
209
232
  end
210
233
 
211
234
  # Record error on span with exception event and error status
@@ -221,6 +244,17 @@ module Braintrust
221
244
  span.status = OpenTelemetry::Trace::Status.error(error.message)
222
245
  end
223
246
 
247
+ # Build span_attributes hash with type, and optionally name and generation.
248
+ # Matches Java SDK behavior of including these on every span.
249
+ # @param type [String] Span type ("eval", "task", or "score")
250
+ # @return [Hash]
251
+ def build_span_attributes(type)
252
+ attrs = {type: type}
253
+ attrs[:name] = eval_context.experiment_name if eval_context.experiment_name
254
+ attrs[:generation] = eval_context.generation if eval_context.generation
255
+ attrs
256
+ end
257
+
224
258
  # Set a span attribute by JSON encoding the value
225
259
  # @param span [OpenTelemetry::Trace::Span] The span
226
260
  # @param key [String] The attribute key
@@ -1,106 +1,38 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require_relative "../scorer"
4
+
3
5
  module Braintrust
4
6
  module Eval
5
- # Scorer wraps a scoring function that evaluates task output against expected values
6
- # Scorers can accept 3 params (input, expected, output) or 4 params (input, expected, output, metadata)
7
- # They can return a float, hash, or array of hashes
8
- class Scorer
9
- attr_reader :name
10
-
11
- # Create a new scorer
12
- # @param name_or_callable [String, Symbol, #call] Name or callable (if callable, name is auto-detected)
13
- # @param callable [#call, nil] Callable if name was provided separately
14
- # @param block [Proc, nil] Block if no callable provided
15
- def initialize(name_or_callable = nil, callable = nil, &block)
16
- # Determine name and callable from arguments
17
- if name_or_callable.nil? && callable.nil? && block.nil?
18
- raise ArgumentError, "Must provide callable or block"
19
- end
7
+ # @deprecated Use {Braintrust::Scorer} instead.
8
+ module Scorer
9
+ # @deprecated Use {Braintrust::Scorer.new} instead.
10
+ def self.new(name_or_callable = nil, callable = nil, &block)
11
+ Log.warn_once(:eval_scorer_class, "Braintrust::Eval::Scorer is deprecated: use Braintrust::Scorer.new instead.")
20
12
 
21
- # If first arg is a string/symbol, it's the name
22
13
  if name_or_callable.is_a?(String) || name_or_callable.is_a?(Symbol)
23
- @name = name_or_callable.to_s
24
- @callable = callable || block
25
- raise ArgumentError, "Must provide callable or block" unless @callable
14
+ name = name_or_callable.to_s
15
+ block = callable.method(:call) if callable && !block
26
16
  else
27
- # First arg is the callable, try to auto-detect name
28
- @callable = name_or_callable || callable || block
29
- @name = detect_name(@callable)
30
- end
31
-
32
- # Validate callable
33
- unless @callable.respond_to?(:call)
34
- raise ArgumentError, "Scorer must be callable (respond to :call)"
35
- end
36
-
37
- # Detect arity and wrap callable if needed
38
- @wrapped_callable = wrap_callable(@callable)
39
- end
40
-
41
- # Call the scorer
42
- # @param input [Object] The input to the task
43
- # @param expected [Object] The expected output
44
- # @param output [Object] The actual output from the task
45
- # @param metadata [Hash] Optional metadata
46
- # @return [Float, Hash, Array] Score value(s)
47
- def call(input, expected, output, metadata = {})
48
- @wrapped_callable.call(input, expected, output, metadata)
49
- end
50
-
51
- private
52
-
53
- # Detect the name from a callable object
54
- # @param callable [#call] The callable
55
- # @return [String] The detected name
56
- def detect_name(callable)
57
- # Method objects have .name
58
- if callable.is_a?(Method)
59
- return callable.name.to_s
17
+ resolved = name_or_callable || callable
18
+ block = resolved.method(:call) if resolved && !block
19
+ name = nil
60
20
  end
61
21
 
62
- # Objects with .name method
63
- if callable.respond_to?(:name)
64
- return callable.name.to_s
65
- end
66
-
67
- # Fallback
68
- "scorer"
69
- end
70
-
71
- # Wrap the callable to always accept 4 parameters
72
- # @param callable [#call] The callable to wrap
73
- # @return [Proc] Wrapped callable that accepts 4 params
74
- def wrap_callable(callable)
75
- arity = callable_arity(callable)
76
-
77
- case arity
78
- when 3
79
- # Callable takes 3 params - wrap to ignore metadata
80
- ->(input, expected, output, metadata) {
81
- callable.call(input, expected, output)
82
- }
83
- when 4, -4, -1
84
- # Callable takes 4 params (or variadic with 4+)
85
- # -4 means optional 4th param
86
- # -1 means variadic (*args)
87
- callable
88
- else
89
- raise ArgumentError, "Scorer must accept 3 or 4 parameters (got arity #{arity})"
90
- end
22
+ scorer = Braintrust::Scorer.new(name, &block)
23
+ scorer.singleton_class.prepend(PositionalArgsRemapping)
24
+ scorer
91
25
  end
92
26
 
93
- # Get the arity of a callable
94
- # @param callable [#call] The callable
95
- # @return [Integer] The arity
96
- def callable_arity(callable)
97
- if callable.respond_to?(:arity)
98
- callable.arity
99
- elsif callable.respond_to?(:method)
100
- callable.method(:call).arity
101
- else
102
- # Assume 3 params if we can't detect
103
- 3
27
+ # @deprecated Maps positional #call(input, expected, output, metadata) to keyword args.
28
+ # Will be removed when the legacy Eval::Scorer API is removed.
29
+ module PositionalArgsRemapping
30
+ def call(*args, **kwargs)
31
+ if args.any?
32
+ Log.warn_once(:scorer_positional_call, "Calling a Scorer with positional args is deprecated: use keyword args (input:, expected:, output:, metadata:) instead.")
33
+ kwargs = {input: args[0], expected: args[1], output: args[2], metadata: args[3]}
34
+ end
35
+ super(**kwargs)
104
36
  end
105
37
  end
106
38
  end