braintrust 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. checksums.yaml +4 -4
  2. data/README.md +148 -24
  3. data/lib/braintrust/api/internal/btql.rb +124 -0
  4. data/lib/braintrust/api/internal/experiments.rb +19 -0
  5. data/lib/braintrust/api/internal/projects.rb +19 -0
  6. data/lib/braintrust/contrib/rails/server/application_controller.rb +34 -0
  7. data/lib/braintrust/contrib/rails/server/engine.rb +72 -0
  8. data/lib/braintrust/contrib/rails/server/eval_controller.rb +36 -0
  9. data/lib/braintrust/contrib/rails/server/generator.rb +43 -0
  10. data/lib/braintrust/contrib/rails/server/health_controller.rb +15 -0
  11. data/lib/braintrust/contrib/rails/server/list_controller.rb +16 -0
  12. data/lib/braintrust/contrib/rails/server/routes.rb +8 -0
  13. data/lib/braintrust/contrib/rails/server.rb +20 -0
  14. data/lib/braintrust/dataset.rb +6 -3
  15. data/lib/braintrust/eval/context.rb +131 -0
  16. data/lib/braintrust/eval/evaluator.rb +11 -5
  17. data/lib/braintrust/eval/functions.rb +10 -166
  18. data/lib/braintrust/eval/runner.rb +165 -145
  19. data/lib/braintrust/eval/scorer.rb +24 -96
  20. data/lib/braintrust/eval/trace.rb +129 -0
  21. data/lib/braintrust/eval.rb +60 -132
  22. data/lib/braintrust/functions.rb +168 -0
  23. data/lib/braintrust/internal/callable.rb +83 -0
  24. data/lib/braintrust/logger.rb +9 -0
  25. data/lib/braintrust/scorer.rb +173 -0
  26. data/lib/braintrust/server/handlers/eval.rb +8 -168
  27. data/lib/braintrust/server/handlers/list.rb +3 -41
  28. data/lib/braintrust/server/rack.rb +2 -0
  29. data/lib/braintrust/server/services/eval_service.rb +214 -0
  30. data/lib/braintrust/server/services/list_service.rb +64 -0
  31. data/lib/braintrust/task.rb +108 -0
  32. data/lib/braintrust/trace/span_processor.rb +0 -5
  33. data/lib/braintrust/version.rb +1 -1
  34. metadata +18 -1
@@ -1,59 +1,52 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require_relative "case"
4
- require_relative "cases"
5
- require_relative "scorer"
6
4
  require_relative "result"
7
5
  require_relative "summary"
6
+ require_relative "trace"
8
7
  require_relative "../internal/thread_pool"
8
+ require_relative "../api/internal/btql"
9
9
 
10
10
  require "opentelemetry/sdk"
11
11
  require "json"
12
12
 
13
13
  module Braintrust
14
14
  module Eval
15
- # Internal runner class that performs the execution of the Eval and returns the result
15
+ # Internal runner class that performs the execution of the Eval and returns the result.
16
+ # Receives a fully-normalized Context — all callables are already typed wrappers.
16
17
  class Runner
17
18
  # Maximum parallelism allowed (mirrors Internal::ThreadPool::MAX_PARALLELISM)
18
19
  MAX_PARALLELISM = Internal::ThreadPool::MAX_PARALLELISM
19
20
 
20
- def initialize(task:, scorers:, experiment_id: nil, experiment_name: nil,
21
- project_id: nil, project_name: nil, state: nil, tracer_provider: nil,
22
- on_progress: nil, parent: nil)
23
- @experiment_id = experiment_id
24
- @experiment_name = experiment_name
25
- @project_id = project_id
26
- @project_name = project_name
27
- @task = task
28
- @scorers = normalize_scorers(scorers)
29
- @state = state
30
- @tracer_provider = tracer_provider || OpenTelemetry.tracer_provider
31
- @tracer = @tracer_provider.tracer("braintrust-eval")
32
- @parent_attr = parent ? "#{parent[:object_type]}:#{parent[:object_id]}" : nil
33
- @generation = parent&.dig(:generation)
34
- @on_progress = on_progress
21
+ # Per-case mutable accumulator. Built from Case, populated by task and scoring stages.
22
+ CaseContext = Struct.new(:input, :expected, :output, :metadata, :tags, :trace, :origin, keyword_init: true)
23
+
24
+ # @param eval_context [Context] Normalized eval context
25
+ def initialize(eval_context)
26
+ @eval_context = eval_context
27
+ tracer_provider = eval_context.tracer_provider || OpenTelemetry.tracer_provider
28
+ @tracer = tracer_provider.tracer("braintrust-eval")
35
29
 
36
30
  # Mutex for thread-safe score collection
37
31
  @score_mutex = Mutex.new
38
32
  end
39
33
 
40
34
  # Run evaluation and return Result
41
- # @param cases [Array, Enumerable] Test cases
42
35
  # @param parallelism [Integer] Number of parallel workers (default: 1)
43
36
  # @return [Result]
44
- def run(cases, parallelism: 1)
37
+ def run(parallelism: 1)
45
38
  start_time = Time.now
46
- normalized_cases = normalize_cases(cases)
39
+ eval_cases = eval_context.cases
47
40
  errors = Queue.new
48
41
  @scores = {} # Reset for each run: { scorer_name => Array<Numeric> }
49
42
 
50
43
  if parallelism && parallelism > 1
51
- Internal::ThreadPool.each(normalized_cases, parallelism: parallelism) do |test_case|
52
- run_case(test_case, errors)
44
+ Internal::ThreadPool.each(eval_cases, parallelism: parallelism) do |eval_case|
45
+ run_eval_case(build_case_context(eval_case), errors)
53
46
  end
54
47
  else
55
- normalized_cases.each do |test_case|
56
- run_case(test_case, errors)
48
+ eval_cases.each do |eval_case|
49
+ run_eval_case(build_case_context(eval_case), errors)
57
50
  end
58
51
  end
59
52
 
@@ -64,15 +57,15 @@ module Braintrust
64
57
  duration = Time.now - start_time
65
58
 
66
59
  # Generate permalink (only when state and experiment are available)
67
- permalink = if @state && experiment_id
68
- @state.object_permalink(object_type: "experiment", object_id: experiment_id)
60
+ permalink = if eval_context.state && eval_context.experiment_id
61
+ eval_context.state.object_permalink(object_type: "experiment", object_id: eval_context.experiment_id)
69
62
  end
70
63
 
71
64
  Result.new(
72
- experiment_id: experiment_id,
73
- experiment_name: experiment_name,
74
- project_id: project_id,
75
- project_name: project_name,
65
+ experiment_id: eval_context.experiment_id,
66
+ experiment_name: eval_context.experiment_name,
67
+ project_id: eval_context.project_id,
68
+ project_name: eval_context.project_name,
76
69
  permalink: permalink,
77
70
  errors: error_array,
78
71
  duration: duration,
@@ -82,86 +75,73 @@ module Braintrust
82
75
 
83
76
  private
84
77
 
85
- attr_reader :experiment_id, :experiment_name, :project_id, :project_name,
86
- :task, :scorers, :tracer, :parent_attr
78
+ attr_reader :eval_context, :tracer
87
79
 
88
80
  # Run a single test case with OpenTelemetry tracing
89
81
  # Creates eval span (parent) with task and score as children
90
- # @param test_case [Case] The test case
82
+ # @param case_context [CaseContext] The per-case accumulator
91
83
  # @param errors [Queue] Thread-safe error collection queue
92
- def run_case(test_case, errors)
93
- tracer.in_span("eval") do |eval_span|
94
- eval_span.set_attribute("braintrust.parent", parent_attr) if parent_attr
95
-
96
- # Set tags early so they're present even if task fails
97
- eval_span.set_attribute("braintrust.tags", test_case.tags) if test_case.tags
84
+ def run_eval_case(case_context, errors)
85
+ # Each eval case starts its own trace — detach from any ambient span context
86
+ eval_span = tracer.start_root_span("eval")
87
+ OpenTelemetry::Trace.with_span(eval_span) do
88
+ # Set attributes known before task execution
89
+ eval_span.set_attribute("braintrust.parent", eval_context.parent_span_attr) if eval_context.parent_span_attr
90
+ set_json_attr(eval_span, "braintrust.span_attributes", build_span_attributes("eval"))
91
+ set_json_attr(eval_span, "braintrust.input_json", {input: case_context.input})
92
+ set_json_attr(eval_span, "braintrust.expected", case_context.expected) if case_context.expected
93
+ set_json_attr(eval_span, "braintrust.metadata", case_context.metadata) if case_context.metadata
94
+ eval_span.set_attribute("braintrust.tags", case_context.tags) if case_context.tags
95
+ eval_span.set_attribute("braintrust.origin", case_context.origin) if case_context.origin
98
96
 
99
97
  # Run task
100
- output = nil
101
98
  begin
102
- output = run_task(test_case)
99
+ case_context.output = run_task(case_context)
103
100
  rescue => e
104
101
  # Error already recorded on task span, set eval span status
105
102
  eval_span.status = OpenTelemetry::Trace::Status.error(e.message)
106
- errors << "Task failed for input '#{test_case.input}': #{e.message}"
107
- if @on_progress
108
- error_progress = {
109
- "id" => eval_span.context.hex_span_id,
110
- "error" => e.message
111
- }
112
- if test_case.origin
113
- error_progress["origin"] = test_case.origin.is_a?(String) ? JSON.parse(test_case.origin) : test_case.origin
114
- end
115
- @on_progress.call(error_progress)
116
- end
103
+ set_json_attr(eval_span, "braintrust.output_json", {output: nil})
104
+ errors << "Task failed for input '#{case_context.input}': #{e.message}"
105
+ report_progress(eval_span, case_context, error: e.message)
117
106
  next
118
107
  end
119
108
 
109
+ # Flush spans so they're queryable via BTQL, then build trace
110
+ eval_context.tracer_provider&.force_flush
111
+ case_context.trace = build_trace(eval_span)
112
+
120
113
  # Run scorers
121
- case_scores = nil
122
114
  begin
123
- case_scores = run_scorers(test_case, output)
115
+ run_scorers(case_context)
124
116
  rescue => e
125
117
  # Error already recorded on score span, set eval span status
126
118
  eval_span.status = OpenTelemetry::Trace::Status.error(e.message)
127
- errors << "Scorers failed for input '#{test_case.input}': #{e.message}"
119
+ errors << "Scorers failed for input '#{case_context.input}': #{e.message}"
128
120
  end
129
121
 
130
- # Set eval span attributes (after task and scorers complete)
131
- set_json_attr(eval_span, "braintrust.span_attributes", build_span_attributes("eval"))
132
- set_json_attr(eval_span, "braintrust.input_json", test_case.input)
133
- set_json_attr(eval_span, "braintrust.output_json", output)
134
- set_json_attr(eval_span, "braintrust.expected", test_case.expected) if test_case.expected
135
-
136
- # Set origin for cases from remote sources (already JSON-serialized)
137
- eval_span.set_attribute("braintrust.origin", test_case.origin) if test_case.origin
138
-
139
- if @on_progress
140
- progress = {
141
- "id" => eval_span.context.hex_span_id,
142
- "data" => output,
143
- "scores" => case_scores || {}
144
- }
145
- if test_case.origin
146
- progress["origin"] = test_case.origin.is_a?(String) ? JSON.parse(test_case.origin) : test_case.origin
147
- end
148
- @on_progress.call(progress)
149
- end
122
+ # Set output after task completes
123
+ set_json_attr(eval_span, "braintrust.output_json", {output: case_context.output})
124
+
125
+ report_progress(eval_span, case_context, data: case_context.output)
150
126
  end
127
+ ensure
128
+ eval_span&.finish
151
129
  end
152
130
 
153
131
  # Run task with OpenTelemetry tracing
154
132
  # Creates task span with input and output
155
- # @param test_case [Case] The test case
133
+ # @param case_context [CaseContext] The per-case context
156
134
  # @return [Object] Task output
157
- def run_task(test_case)
135
+ def run_task(case_context)
158
136
  tracer.in_span("task") do |task_span|
159
- task_span.set_attribute("braintrust.parent", parent_attr) if parent_attr
137
+ task_span.set_attribute("braintrust.parent", eval_context.parent_span_attr) if eval_context.parent_span_attr
160
138
  set_json_attr(task_span, "braintrust.span_attributes", build_span_attributes("task"))
161
- set_json_attr(task_span, "braintrust.input_json", test_case.input)
139
+ set_json_attr(task_span, "braintrust.input_json", case_context.input)
162
140
 
163
141
  begin
164
- output = task.call(test_case.input)
142
+ output = eval_context.task.call(
143
+ input: case_context.input
144
+ )
165
145
  set_json_attr(task_span, "braintrust.output_json", output)
166
146
  output
167
147
  rescue => e
@@ -173,70 +153,103 @@ module Braintrust
173
153
  end
174
154
  end
175
155
 
176
- # Run scorers with OpenTelemetry tracing
177
- # Creates single score span for all scorers
178
- # @param test_case [Case] The test case
179
- # @param output [Object] Task output
180
- # @return [Hash] Scores hash { scorer_name => score_value }
181
- def run_scorers(test_case, output)
182
- tracer.in_span("score") do |score_span|
183
- score_span.set_attribute("braintrust.parent", parent_attr) if parent_attr
184
- set_json_attr(score_span, "braintrust.span_attributes", build_span_attributes("score"))
185
-
186
- scores = {}
187
- scorer_error = nil
188
- scorers.each do |scorer|
189
- score_value = scorer.call(test_case.input, test_case.expected, output, test_case.metadata || {})
190
- scores[scorer.name] = score_value
191
-
192
- # Collect raw score for summary (thread-safe)
193
- collect_score(scorer.name, score_value)
194
- rescue => e
195
- # Record first error but continue processing other scorers
196
- scorer_error ||= e
197
- record_span_error(score_span, e, "ScorerError")
198
- end
156
+ # Run scorers with OpenTelemetry tracing.
157
+ # Creates one span per scorer, each a direct child of the current (eval) span.
158
+ # @param case_context [CaseContext] The per-case context (output must be populated)
159
+ def run_scorers(case_context)
160
+ scorer_kwargs = {
161
+ input: case_context.input,
162
+ expected: case_context.expected,
163
+ output: case_context.output,
164
+ metadata: case_context.metadata || {},
165
+ trace: case_context.trace
166
+ }
167
+ scorer_input = {
168
+ input: case_context.input,
169
+ expected: case_context.expected,
170
+ output: case_context.output,
171
+ metadata: case_context.metadata || {}
172
+ }
173
+
174
+ scorer_error = nil
175
+ eval_context.scorers.each do |scorer|
176
+ collect_scores(run_scorer(scorer, scorer_kwargs, scorer_input))
177
+ rescue => e
178
+ scorer_error ||= e
179
+ end
199
180
 
200
- # Always set scores attribute, even if some scorers failed
201
- set_json_attr(score_span, "braintrust.scores", scores)
181
+ raise scorer_error if scorer_error
182
+ end
202
183
 
203
- # Raise after setting scores so we can see which scorers succeeded
204
- raise scorer_error if scorer_error
184
+ # Run a single scorer inside its own span.
185
+ # @param scorer [Scorer] The scorer to run
186
+ # @param scorer_kwargs [Hash] Keyword arguments for the scorer
187
+ # @param scorer_input [Hash] Input to log on the span
188
+ # @return [Array<Hash>] Raw score results from the scorer
189
+ def run_scorer(scorer, scorer_kwargs, scorer_input)
190
+ tracer.in_span(scorer.name) do |score_span|
191
+ score_span.set_attribute("braintrust.parent", eval_context.parent_span_attr) if eval_context.parent_span_attr
192
+ set_json_attr(score_span, "braintrust.span_attributes", build_scorer_span_attributes(scorer.name))
193
+ set_json_attr(score_span, "braintrust.input_json", scorer_input)
194
+
195
+ score_results = scorer.call(**scorer_kwargs)
196
+
197
+ scorer_scores = {}
198
+ scorer_metadata = {}
199
+ score_results.each do |s|
200
+ scorer_scores[s[:name]] = s[:score]
201
+ scorer_metadata[s[:name]] = s[:metadata] if s[:metadata].is_a?(Hash)
202
+ end
203
+
204
+ set_json_attr(score_span, "braintrust.output_json", scorer_scores)
205
+ set_json_attr(score_span, "braintrust.scores", scorer_scores)
206
+ set_json_attr(score_span, "braintrust.metadata", scorer_metadata) unless scorer_metadata.empty?
205
207
 
206
- scores
208
+ score_results
209
+ rescue => e
210
+ record_span_error(score_span, e, "ScorerError")
211
+ raise
207
212
  end
208
213
  end
209
214
 
210
- # Normalize cases input to Cases wrapper
211
- # @param cases_input [Array, Enumerable, Cases] The cases input
212
- # @return [Cases]
213
- def normalize_cases(cases_input)
214
- case cases_input
215
- when Cases
216
- cases_input
217
- when Array, Enumerable
218
- Cases.new(cases_input)
219
- else
220
- if cases_input.respond_to?(:each)
221
- Cases.new(cases_input)
222
- else
223
- raise ArgumentError, "cases must be Array or Enumerable"
224
- end
225
- end
215
+ # Build a lazy Trace for a case, backed by BTQL.
216
+ # Returns nil when state or experiment_id are unavailable (local-only mode).
217
+ # @param eval_span [OpenTelemetry::Trace::Span] The eval span for this case
218
+ # @return [Eval::Trace, nil]
219
+ def build_trace(eval_span)
220
+ return nil unless eval_context.state && eval_context.experiment_id
221
+
222
+ root_span_id = eval_span.context.hex_trace_id
223
+ object_type = "experiment"
224
+ object_id = eval_context.experiment_id
225
+ btql = API::Internal::BTQL.new(eval_context.state)
226
+
227
+ Eval::Trace.new(
228
+ spans: -> { btql.trace_spans(object_type: object_type, object_id: object_id, root_span_id: root_span_id) }
229
+ )
226
230
  end
227
231
 
228
- # Normalize scorers to Scorer objects
229
- # @param scorers_input [Array] The scorers input (Scorer objects or callables)
230
- # @return [Array<Scorer>]
231
- def normalize_scorers(scorers_input)
232
- scorers_input.map do |scorer|
233
- case scorer
234
- when Scorer
235
- scorer
236
- else
237
- Scorer.new(scorer)
238
- end
232
+ # Build a CaseContext from a Case struct
233
+ # @param eval_case [Case] The eval case
234
+ # @return [CaseContext]
235
+ def build_case_context(eval_case)
236
+ CaseContext.new(
237
+ input: eval_case.input, expected: eval_case.expected,
238
+ metadata: eval_case.metadata, tags: eval_case.tags, origin: eval_case.origin
239
+ )
240
+ end
241
+
242
+ # Report progress for a case via on_progress callback.
243
+ # Rescues errors in the callback so a broken handler never crashes the eval.
244
+ def report_progress(eval_span, case_context, **fields)
245
+ return unless eval_context.on_progress
246
+ progress = {"id" => eval_span.context.hex_span_id}.merge(fields.transform_keys(&:to_s))
247
+ if case_context.origin
248
+ progress["origin"] = case_context.origin.is_a?(String) ? JSON.parse(case_context.origin) : case_context.origin
239
249
  end
250
+ eval_context.on_progress.call(progress)
251
+ rescue => e
252
+ Braintrust.logger.warn("on_progress callback error: #{e.message}")
240
253
  end
241
254
 
242
255
  # Record error on span with exception event and error status
@@ -258,8 +271,18 @@ module Braintrust
258
271
  # @return [Hash]
259
272
  def build_span_attributes(type)
260
273
  attrs = {type: type}
261
- attrs[:name] = experiment_name if experiment_name
262
- attrs[:generation] = @generation if @generation
274
+ attrs[:name] = eval_context.experiment_name if eval_context.experiment_name
275
+ attrs[:generation] = eval_context.generation if eval_context.generation
276
+ attrs
277
+ end
278
+
279
+ # Build span_attributes for a scorer span.
280
+ # Each scorer gets its own span with type "score", purpose "scorer", and the scorer's name.
281
+ # @param scorer_name [String] The scorer name
282
+ # @return [Hash]
283
+ def build_scorer_span_attributes(scorer_name)
284
+ attrs = {type: "score", name: scorer_name, purpose: "scorer"}
285
+ attrs[:generation] = eval_context.generation if eval_context.generation
263
286
  attrs
264
287
  end
265
288
 
@@ -271,14 +294,11 @@ module Braintrust
271
294
  span.set_attribute(key, JSON.dump(value))
272
295
  end
273
296
 
274
- # Collect a single score value for summary calculation
275
- # @param name [String] Scorer name
276
- # @param value [Object] Score value (only Numeric values are collected)
277
- def collect_score(name, value)
278
- return unless value.is_a?(Numeric)
279
-
297
+ # Collect score results into the summary accumulator (thread-safe).
298
+ # @param score_results [Array<Hash>] Score results from a scorer
299
+ def collect_scores(score_results)
280
300
  @score_mutex.synchronize do
281
- (@scores[name] ||= []) << value
301
+ score_results.each { |s| (@scores[s[:name]] ||= []) << s[:score] }
282
302
  end
283
303
  end
284
304
  end
@@ -1,112 +1,40 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require_relative "../scorer"
4
+
3
5
  module Braintrust
4
6
  module Eval
5
- # Scorer wraps a scoring function that evaluates task output against expected values
6
- # Scorers can accept 3 params (input, expected, output) or 4 params (input, expected, output, metadata)
7
- # They can return a float, hash, or array of hashes
8
- class Scorer
9
- attr_reader :name
10
-
11
- # Create a new scorer
12
- # @param name_or_callable [String, Symbol, #call] Name or callable (if callable, name is auto-detected)
13
- # @param callable [#call, nil] Callable if name was provided separately
14
- # @param block [Proc, nil] Block if no callable provided
15
- def initialize(name_or_callable = nil, callable = nil, &block)
16
- # Determine name and callable from arguments
17
- if name_or_callable.nil? && callable.nil? && block.nil?
18
- raise ArgumentError, "Must provide callable or block"
19
- end
7
+ # @deprecated Use {Braintrust::Scorer} instead.
8
+ module Scorer
9
+ # @deprecated Use {Braintrust::Scorer.new} instead.
10
+ def self.new(name_or_callable = nil, callable = nil, &block)
11
+ Log.warn_once(:eval_scorer_class, "Braintrust::Eval::Scorer is deprecated: use Braintrust::Scorer.new instead.")
20
12
 
21
- # If first arg is a string/symbol, it's the name
22
13
  if name_or_callable.is_a?(String) || name_or_callable.is_a?(Symbol)
23
- @name = name_or_callable.to_s
24
- @callable = callable || block
25
- raise ArgumentError, "Must provide callable or block" unless @callable
14
+ name = name_or_callable.to_s
15
+ block = callable.method(:call) if callable && !block
26
16
  else
27
- # First arg is the callable, try to auto-detect name
28
- @callable = name_or_callable || callable || block
29
- @name = detect_name(@callable)
17
+ resolved = name_or_callable || callable
18
+ block = resolved.method(:call) if resolved && !block
19
+ name = nil
30
20
  end
31
21
 
32
- # Validate callable
33
- unless @callable.respond_to?(:call)
34
- raise ArgumentError, "Scorer must be callable (respond to :call)"
35
- end
36
-
37
- # Detect arity and wrap callable if needed
38
- @wrapped_callable = wrap_callable(@callable)
22
+ scorer = Braintrust::Scorer.new(name, &block)
23
+ scorer.singleton_class.prepend(PositionalArgsRemapping)
24
+ scorer
39
25
  end
40
26
 
41
- # Call the scorer
42
- # @param input [Object] The input to the task
43
- # @param expected [Object] The expected output
44
- # @param output [Object] The actual output from the task
45
- # @param metadata [Hash] Optional metadata
46
- # @return [Float, Hash, Array] Score value(s)
47
- def call(input, expected, output, metadata = {})
48
- @wrapped_callable.call(input, expected, output, metadata)
49
- end
50
-
51
- private
52
-
53
- # Detect the name from a callable object
54
- # @param callable [#call] The callable
55
- # @return [String] The detected name
56
- def detect_name(callable)
57
- # Method objects have .name
58
- if callable.is_a?(Method)
59
- return callable.name.to_s
60
- end
61
-
62
- # Objects with .name method
63
- if callable.respond_to?(:name)
64
- return callable.name.to_s
65
- end
66
-
67
- # Fallback
68
- "scorer"
69
- end
70
-
71
- # Wrap the callable to always accept 4 parameters
72
- # @param callable [#call] The callable to wrap
73
- # @return [Proc] Wrapped callable that accepts 4 params
74
- def wrap_callable(callable)
75
- arity = callable_arity(callable)
76
-
77
- case arity
78
- when 3
79
- # Callable takes 3 params - wrap to ignore metadata
80
- ->(input, expected, output, metadata) {
81
- callable.call(input, expected, output)
82
- }
83
- when 4, -4, -1
84
- # Callable takes 4 params (or variadic with 4+)
85
- # -4 means optional 4th param
86
- # -1 means variadic (*args)
87
- callable
88
- else
89
- raise ArgumentError, "Scorer must accept 3 or 4 parameters (got arity #{arity})"
90
- end
91
- end
92
-
93
- # Get the arity of a callable
94
- # @param callable [#call] The callable
95
- # @return [Integer] The arity
96
- def callable_arity(callable)
97
- if callable.respond_to?(:arity)
98
- callable.arity
99
- elsif callable.respond_to?(:method)
100
- callable.method(:call).arity
101
- else
102
- # Assume 3 params if we can't detect
103
- 3
27
+ # @deprecated Maps positional #call(input, expected, output, metadata) to keyword args.
28
+ # Will be removed when the legacy Eval::Scorer API is removed.
29
+ module PositionalArgsRemapping
30
+ def call(*args, **kwargs)
31
+ if args.any?
32
+ Log.warn_once(:scorer_positional_call, "Calling a Scorer with positional args is deprecated: use keyword args (input:, expected:, output:, metadata:) instead.")
33
+ kwargs = {input: args[0], expected: args[1], output: args[2], metadata: args[3]}
34
+ end
35
+ super(**kwargs)
104
36
  end
105
37
  end
106
38
  end
107
39
  end
108
-
109
- # Value object wrapping a remote scorer function UUID.
110
- # Used by Eval.run to distinguish remote scorers from local callables.
111
- ScorerId = Struct.new(:function_id, :version, keyword_init: true)
112
40
  end