braintrust 0.0.7 → 0.0.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,10 +1,9 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- require_relative "eval/case"
4
- require_relative "eval/cases"
5
3
  require_relative "eval/scorer"
6
- require_relative "eval/result"
4
+ require_relative "eval/runner"
7
5
  require_relative "internal/experiments"
6
+
8
7
  require "opentelemetry/sdk"
9
8
  require "json"
10
9
 
@@ -193,7 +192,9 @@ module Braintrust
193
192
  # - Hash: {name:, id:, project:, version:, limit:}
194
193
  # @param task [#call] The task to evaluate (must be callable)
195
194
  # @param scorers [Array<Scorer, #call>] The scorers to use (Scorer objects or callables)
196
- # @param parallelism [Integer] Number of parallel workers (default: 1)
195
+ # @param parallelism [Integer] Number of parallel workers (default: 1).
196
+ # When parallelism > 1, test cases are executed concurrently using a thread pool.
197
+ # The task and scorers MUST be thread-safe when using parallelism > 1.
197
198
  # @param tags [Array<String>] Optional experiment tags
198
199
  # @param metadata [Hash] Optional experiment metadata
199
200
  # @param update [Boolean] If true, allow reusing existing experiment (default: false)
@@ -232,18 +233,18 @@ module Braintrust
232
233
  project_id = result[:project_id]
233
234
  project_name = result[:project_name]
234
235
 
235
- # Run the eval with resolved experiment info
236
- result = run_internal(
236
+ # Instantiate Runner and run evaluation
237
+ runner = Runner.new(
237
238
  experiment_id: experiment_id,
238
239
  experiment_name: experiment,
239
240
  project_id: project_id,
240
241
  project_name: project_name,
241
- cases: cases,
242
242
  task: task,
243
243
  scorers: scorers,
244
244
  state: state,
245
245
  tracer_provider: tracer_provider
246
246
  )
247
+ result = runner.run(cases, parallelism: parallelism)
247
248
 
248
249
  # Print result summary unless quiet
249
250
  print_result(result) unless quiet
@@ -253,66 +254,10 @@ module Braintrust
253
254
 
254
255
  private
255
256
 
256
- # Internal eval runner that doesn't touch the API
257
- # @param experiment_id [String] Resolved experiment ID
258
- # @param experiment_name [String] Experiment name
259
- # @param project_id [String] Resolved project ID
260
- # @param project_name [String] Project name
261
- # @param cases [Array, Enumerable, Cases] Test cases
262
- # @param task [#call] Task callable
263
- # @param scorers [Array] Scorers
264
- # @param state [State] Braintrust state
265
- # @param tracer_provider [TracerProvider, nil] OpenTelemetry tracer provider
266
- # @return [Result]
267
- def run_internal(experiment_id:, experiment_name:, project_id:, project_name:,
268
- cases:, task:, scorers:, state:, tracer_provider: nil)
269
- start_time = Time.now
270
-
271
- # Get tracer for creating spans
272
- tracer_provider ||= OpenTelemetry.tracer_provider
273
- tracer = tracer_provider.tracer("braintrust-eval")
274
-
275
- # Parent attribute for all eval spans
276
- parent_attr = "experiment_id:#{experiment_id}"
277
-
278
- # Normalize cases to Cases wrapper
279
- normalized_cases = normalize_cases(cases)
280
-
281
- # Normalize scorers to Scorer objects
282
- normalized_scorers = normalize_scorers(scorers)
283
-
284
- # Collect errors
285
- errors = []
286
-
287
- # Run each case with tracing
288
- normalized_cases.each do |test_case|
289
- run_case(test_case, task, normalized_scorers, errors,
290
- tracer, parent_attr)
291
- end
292
-
293
- # Calculate duration
294
- duration = Time.now - start_time
295
-
296
- # Generate permalink: {app_url}/app/{org}/object?object_type=experiment&object_id={experiment_id}
297
- permalink = "#{state.app_url}/app/#{state.org_name}/object?object_type=experiment&object_id=#{experiment_id}"
298
-
299
- # Return result
300
- Result.new(
301
- experiment_id: experiment_id,
302
- experiment_name: experiment_name,
303
- project_id: project_id,
304
- project_name: project_name,
305
- permalink: permalink,
306
- errors: errors,
307
- duration: duration
308
- )
309
- end
310
-
311
257
  # Print result summary to stdout
312
258
  # @param result [Result] The evaluation result
313
259
  def print_result(result)
314
- puts "=" * 60
315
- puts result
260
+ puts result.to_pretty
316
261
  end
317
262
 
318
263
  # Validate required parameters
@@ -419,166 +364,6 @@ module Braintrust
419
364
  filtered
420
365
  end
421
366
  end
422
-
423
- # Normalize cases input to Cases wrapper
424
- # @param cases_input [Array, Enumerable, Cases] The cases input
425
- # @return [Cases]
426
- def normalize_cases(cases_input)
427
- case cases_input
428
- when Cases
429
- cases_input
430
- when Array, Enumerable
431
- Cases.new(cases_input)
432
- else
433
- if cases_input.respond_to?(:each)
434
- Cases.new(cases_input)
435
- else
436
- raise ArgumentError, "cases must be Array or Enumerable"
437
- end
438
- end
439
- end
440
-
441
- # Normalize scorers to Scorer objects
442
- # @param scorers_input [Array] The scorers input (Scorer objects or callables)
443
- # @return [Array<Scorer>]
444
- def normalize_scorers(scorers_input)
445
- scorers_input.map do |scorer|
446
- case scorer
447
- when Scorer
448
- # Already a Scorer
449
- scorer
450
- else
451
- # Wrap callable in Scorer (auto-detects name)
452
- Scorer.new(scorer)
453
- end
454
- end
455
- end
456
-
457
- # Run a single test case with OpenTelemetry tracing
458
- # Creates eval span (parent) with task and score as children
459
- # @param test_case [Case] The test case
460
- # @param task [#call] The task
461
- # @param scorers [Array<Scorer>] The scorers
462
- # @param errors [Array<String>] Error collection array
463
- # @param tracer [Tracer] OpenTelemetry tracer
464
- # @param parent_attr [String] Parent attribute (experiment_id:exp_id)
465
- def run_case(test_case, task, scorers, errors, tracer, parent_attr)
466
- # Create eval span (parent)
467
- tracer.in_span("eval") do |eval_span|
468
- eval_span.set_attribute("braintrust.parent", parent_attr)
469
-
470
- # Set tags early so they're present even if task fails
471
- eval_span.set_attribute("braintrust.tags", test_case.tags) if test_case.tags
472
-
473
- # Run task
474
- output = nil
475
- begin
476
- output = run_task(test_case, task, tracer, parent_attr)
477
- rescue => e
478
- # Error already recorded on task span, set eval span status
479
- eval_span.status = OpenTelemetry::Trace::Status.error(e.message)
480
- errors << "Task failed for input '#{test_case.input}': #{e.message}"
481
- next
482
- end
483
-
484
- # Run scorers
485
- begin
486
- run_scorers(test_case, output, scorers, tracer, parent_attr)
487
- rescue => e
488
- # Error already recorded on score span, set eval span status
489
- eval_span.status = OpenTelemetry::Trace::Status.error(e.message)
490
- errors << "Scorers failed for input '#{test_case.input}': #{e.message}"
491
- end
492
-
493
- # Set eval span attributes (after task and scorers complete)
494
- set_json_attr(eval_span, "braintrust.span_attributes", {type: "eval"})
495
- set_json_attr(eval_span, "braintrust.input_json", test_case.input)
496
- set_json_attr(eval_span, "braintrust.output_json", output)
497
- set_json_attr(eval_span, "braintrust.expected", test_case.expected) if test_case.expected
498
- end
499
- end
500
-
501
- # Run task with OpenTelemetry tracing
502
- # Creates task span with input and output
503
- # @param test_case [Case] The test case
504
- # @param task [#call] The task
505
- # @param tracer [Tracer] OpenTelemetry tracer
506
- # @param parent_attr [String] Parent attribute
507
- # @return [Object] Task output
508
- def run_task(test_case, task, tracer, parent_attr)
509
- tracer.in_span("task") do |task_span|
510
- task_span.set_attribute("braintrust.parent", parent_attr)
511
- set_json_attr(task_span, "braintrust.span_attributes", {type: "task"})
512
- set_json_attr(task_span, "braintrust.input_json", test_case.input)
513
-
514
- begin
515
- output = task.call(test_case.input)
516
- set_json_attr(task_span, "braintrust.output_json", output)
517
- output
518
- rescue => e
519
- # Record exception event with stacktrace, then set error status
520
- task_span.record_exception(e)
521
- task_span.status = OpenTelemetry::Trace::Status.error(e.message)
522
- raise
523
- end
524
- end
525
- end
526
-
527
- # Run scorers with OpenTelemetry tracing
528
- # Creates single score span for all scorers
529
- # @param test_case [Case] The test case
530
- # @param output [Object] Task output
531
- # @param scorers [Array<Scorer>] The scorers
532
- # @param tracer [Tracer] OpenTelemetry tracer
533
- # @param parent_attr [String] Parent attribute
534
- def run_scorers(test_case, output, scorers, tracer, parent_attr)
535
- tracer.in_span("score") do |score_span|
536
- score_span.set_attribute("braintrust.parent", parent_attr)
537
- set_json_attr(score_span, "braintrust.span_attributes", {type: "score"})
538
-
539
- scores = {}
540
- scorer_error = nil
541
- scorers.each do |scorer|
542
- score_value = scorer.call(test_case.input, test_case.expected, output, test_case.metadata || {})
543
- scores[scorer.name] = score_value
544
- rescue => e
545
- # Record first error but continue processing other scorers
546
- scorer_error ||= "Scorer '#{scorer.name}' failed: #{e.message}"
547
- record_span_error(score_span, e, "ScorerError")
548
- end
549
-
550
- # Always set scores attribute, even if some scorers failed
551
- set_json_attr(score_span, "braintrust.scores", scores)
552
-
553
- # Raise after setting scores so we can see which scorers succeeded
554
- raise scorer_error if scorer_error
555
- end
556
- end
557
-
558
- # Record error on span with exception event and error status
559
- # @param span [OpenTelemetry::Trace::Span] The span to record error on
560
- # @param error [Exception] The error that occurred
561
- # @param error_type [String] The error type name (optional, used for custom error classification)
562
- def record_span_error(span, error, error_type = nil)
563
- # Record exception with stacktrace (OpenTelemetry standard)
564
- if error_type
565
- # For custom error types, add type override
566
- span.record_exception(error, attributes: {"exception.type" => error_type})
567
- else
568
- span.record_exception(error)
569
- end
570
-
571
- # Set span status to error
572
- span.status = OpenTelemetry::Trace::Status.error(error.message)
573
- end
574
-
575
- # Set a span attribute by JSON encoding the value
576
- # @param span [OpenTelemetry::Trace::Span] The span
577
- # @param key [String] The attribute key
578
- # @param value [Object] The value to JSON encode
579
- def set_json_attr(span, key, value)
580
- span.set_attribute(key, JSON.dump(value))
581
- end
582
367
  end
583
368
  end
584
369
  end
@@ -0,0 +1,167 @@
1
+ # frozen_string_literal: true
2
+
3
module Braintrust
  module Internal
    # Reusable thread pool for concurrent task execution.
    # Uses the strategy pattern to define result handling behavior.
    #
    # @example Iterate without collecting results (Eval use case)
    #   ThreadPool.each(items, parallelism: 4) do |item|
    #     process(item)
    #   end
    #
    # @example Collect results in order
    #   results = ThreadPool.collect(items, parallelism: 4) do |item|
    #     transform(item)
    #   end
    #
    # @note Thread limits are per-call, not global. If your application calls
    #   ThreadPool methods from multiple threads concurrently (e.g., web workers,
    #   background jobs), each call spawns its own worker threads. Plan your
    #   parallelism settings accordingly to avoid excessive thread creation.
    #
    # @note The block is invoked from worker threads when parallelism > 1, so it
    #   must be safe to call concurrently.
    class ThreadPool
      DEFAULT_PARALLELISM = 3
      MAX_PARALLELISM = 50

      # Unique end-of-work marker pushed onto the work queue once per worker.
      # It is a dedicated object compared by identity so that no caller-supplied
      # item (for example the symbol :done) can ever be mistaken for it.
      SENTINEL = Object.new.freeze
      private_constant :SENTINEL

      # Strategy for iteration without collecting results
      class Each
        # Load every item onto a fresh work queue.
        # @param items [Array] Items to process
        def prepare(items)
          @queue = Queue.new
          items.each { |item| @queue << item }
        end

        # Append one stop marker per worker so each worker terminates.
        # @param count [Integer] Number of workers that will consume the queue
        def enqueue_sentinel(count)
          count.times { @queue << SENTINEL }
        end

        # Worker body: pop items and yield them until a stop marker is seen.
        # @yield [item] Block to execute for each item
        def work_loop(&block)
          loop do
            item = @queue.pop
            # Identity check: an item that merely == some marker value is still work.
            break if item.equal?(SENTINEL)
            block.call(item)
          end
        end

        # @return [nil] This strategy discards block results
        def result
          nil
        end

        # @return [nil] Result used when there are no items
        def empty_result
          nil
        end

        # Run the block on the calling thread, in order.
        # @param items [Array] Items to process
        # @return [nil]
        def sequential_run(items, &block)
          items.each(&block)
          nil
        end
      end

      # Strategy for collecting results in input order
      class Collect
        # Pre-size the result array and queue each item with its index.
        # @param items [Array] Items to process
        def prepare(items)
          @results = Array.new(items.size)
          @queue = Queue.new
          items.each_with_index { |item, idx| @queue << [item, idx] }
        end

        # Append one stop marker per worker so each worker terminates.
        # @param count [Integer] Number of workers that will consume the queue
        def enqueue_sentinel(count)
          count.times { @queue << SENTINEL }
        end

        # Worker body: pop [item, index] pairs and store the block result at
        # that index, until a stop marker is seen.
        # @yield [item] Block to execute for each item
        def work_loop(&block)
          loop do
            work = @queue.pop
            break if work.equal?(SENTINEL)
            item, idx = work
            # Each index is written by exactly one worker, so slots never overlap.
            @results[idx] = block.call(item)
          end
        end

        # @return [Array] Block results, positioned by input index
        def result
          @results
        end

        # @return [Array] Result used when there are no items
        def empty_result
          []
        end

        # Run the block on the calling thread, in order.
        # @param items [Array] Items to process
        # @return [Array] Block results in input order
        def sequential_run(items, &block)
          items.map(&block)
        end
      end

      STRATEGIES = {
        each: Each,
        collect: Collect
      }.freeze

      # Execute block for each item concurrently, discarding results.
      # @param items [Array, Enumerable] Items to process
      # @param parallelism [Integer] Number of worker threads (default: 3)
      # @yield [item] Block to execute for each item
      # @return [nil]
      def self.each(items, parallelism: DEFAULT_PARALLELISM, &block)
        run(items, parallelism: parallelism, strategy: :each, &block)
      end

      # Execute block for each item concurrently, collecting results in order.
      # @param items [Array, Enumerable] Items to process
      # @param parallelism [Integer] Number of worker threads (default: 3)
      # @yield [item] Block to execute for each item
      # @return [Array] Results in same order as input items
      def self.collect(items, parallelism: DEFAULT_PARALLELISM, &block)
        run(items, parallelism: parallelism, strategy: :collect, &block)
      end

      # Execute block for each item concurrently using the specified strategy.
      # Prefer using .each or .collect convenience methods instead.
      # @param items [Array, Enumerable] Items to process
      # @param strategy [Symbol, #prepare] Strategy for result handling (required)
      # @param parallelism [Integer] Upper bound on worker threads (default: 3);
      #   never more threads than items are started
      # @yield [item] Block to execute for each item
      # @return [Object, nil] Strategy-dependent result
      # @raise [ArgumentError] If parallelism is not an Integer in 1..MAX_PARALLELISM
      #   or the strategy symbol is unknown
      # @raise [StandardError] The first error raised by the block in a worker,
      #   re-raised after all workers have finished
      def self.run(items, strategy:, parallelism: DEFAULT_PARALLELISM, &block)
        validate_parallelism!(parallelism)

        executor = strategy_instance(strategy)
        all_items = items.to_a

        return executor.sequential_run(all_items, &block) if parallelism == 1
        return executor.empty_result if all_items.empty?

        # Extra workers beyond the item count would only pop a stop marker and exit.
        worker_count = [parallelism, all_items.size].min

        executor.prepare(all_items)
        executor.enqueue_sentinel(worker_count)

        threads = Array.new(worker_count) do
          Thread.new { executor.work_loop(&block) }
        end

        join_all(threads)
        executor.result
      end

      # Resolve a strategy symbol to a fresh strategy object; any non-Symbol
      # value is returned unchanged and used as the strategy itself.
      # @param strategy [Symbol, Object]
      # @return [Object] Strategy instance
      # @raise [ArgumentError] If the symbol is not a key of STRATEGIES
      def self.strategy_instance(strategy)
        case strategy
        when Symbol
          STRATEGIES.fetch(strategy) {
            raise ArgumentError, "Unknown strategy: #{strategy}. Valid: #{STRATEGIES.keys.join(", ")}"
          }.new
        else
          strategy
        end
      end

      # @param parallelism [Object] Value to validate
      # @raise [ArgumentError] Unless it is an Integer in 1..MAX_PARALLELISM
      def self.validate_parallelism!(parallelism)
        unless parallelism.is_a?(Integer) && parallelism > 0
          raise ArgumentError, "parallelism must be a positive integer"
        end
        if parallelism > MAX_PARALLELISM
          raise ArgumentError, "parallelism cannot exceed #{MAX_PARALLELISM}"
        end
      end

      # Wait for every worker before surfacing a failure, so no worker is left
      # running in the background once the caller sees the error.
      # @param threads [Array<Thread>] Worker threads to wait for
      # @raise [StandardError] The first error (in thread order) that ended a worker
      def self.join_all(threads)
        first_error = nil
        threads.each do |thread|
          begin
            # Thread#join re-raises the exception that terminated the thread.
            thread.join
          rescue => e
            first_error ||= e
          end
        end
        raise first_error if first_error
      end

      private_class_method :strategy_instance, :validate_parallelism!, :join_all
    end
  end
end