braintrust 0.0.8 → 0.0.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: c6b2bcda06084f2e90d2602659ca71cf0ab574ac8c74c367890cbb2b04740529
-  data.tar.gz: 306b5a46660eae3d3e3811d021627883419a4dc4c114e51e40be64c590868c95
+  metadata.gz: be2efc651c8e685179541cf2ade46f86e3ca66c408ed00707d2b9890a4d5fa72
+  data.tar.gz: ba9c9f993abf5ea64290c5510361297a6b7dca21ad06583634d7e4080d4f7531
 SHA512:
-  metadata.gz: 1db7bf706b260762aa114eb5e8f844cb0567efd5a6f9d8cca03667111c0e89ff68f4e53b3a3adc6ad2192947602fc4b88a8e0057169ad7eff12ccb1c2ecb4951
-  data.tar.gz: bbc71c33bb28da124bd1cc61c8bf4f765ec2899a57bf604da624a04c42a7bfce508ed1eb78c4ec421da5038fbaf89086cee8d0b04998d223568df05e8640679f
+  metadata.gz: a40a56eae61148496ee7c96775b9c6fef63106a5c943dbe1a43f66572cb245e5ec3fd5a2e98452926c8a326a0f06f9dd68bfb56a4b7f04081e65ec03ef3d7939
+  data.tar.gz: c1bed8505efca929e688538d293e4f329c8dd8c2d1152858bb6967fa0d9f54b88f4120fb3d1f7d591fc51cc0e05ec1577b8345524ddc45863c1eb2ff6537a334
lib/braintrust/eval/formatter.rb ADDED
@@ -0,0 +1,220 @@
+# frozen_string_literal: true
+
+module Braintrust
+  module Eval
+    # Formatter for pretty CLI output of experiment results
+    # Uses ANSI colors and Unicode box drawing for terminal display
+    module Formatter
+      # ANSI color codes
+      COLORS = {
+        gray: "\e[90m",
+        red: "\e[31m",
+        green: "\e[32m",
+        blue: "\e[34m",
+        magenta: "\e[35m",
+        white: "\e[97m",
+        dim: "\e[2m",
+        reset: "\e[0m"
+      }.freeze
+
+      # Box drawing characters (Unicode)
+      BOX = {
+        top_left: "╭",
+        top_right: "╮",
+        bottom_left: "╰",
+        bottom_right: "╯",
+        horizontal: "─",
+        vertical: "│"
+      }.freeze
+
+      # Maximum length for error messages before truncation
+      MAX_ERROR_LENGTH = 150
+
+      class << self
+        # Format an experiment summary for CLI output
+        # @param summary [ExperimentSummary] The experiment summary
+        # @return [String] Formatted output with box drawing and colors
+        def format_experiment_summary(summary)
+          return "" unless summary
+
+          lines = []
+
+          # Metadata section
+          lines << format_metadata_row("Project", summary.project_name)
+          lines << format_metadata_row("Experiment", summary.experiment_name)
+          lines << format_metadata_row("ID", summary.experiment_id)
+          lines << format_metadata_row("Duration", format_duration(summary.duration))
+          lines << format_metadata_row("Errors", summary.error_count.to_s)
+
+          # Scores section (if any)
+          if summary.scores&.any?
+            lines << ""
+            lines << colorize("Scores", :white)
+
+            # Calculate max scorer name length for alignment
+            max_name_len = summary.scores.values.map { |s| s.name.length }.max || 0
+            name_width = [max_name_len + 2, 20].max # +2 for "◯ " prefix
+
+            summary.scores.each_value do |score|
+              lines << format_score_row(score, name_width)
+            end
+          end
+
+          # Errors section (if any)
+          if summary.errors&.any?
+            lines << ""
+            lines << colorize("Errors", :white)
+
+            summary.errors.each do |error|
+              lines << format_error_row(error)
+            end
+          end
+
+          # Footer link
+          if summary.experiment_url
+            lines << ""
+            lines << terminal_link("View results for #{summary.experiment_name}", summary.experiment_url)
+          end
+
+          wrap_in_box(lines, "Experiment summary")
+        end
+
+        # Format a metadata row (label: value)
+        # @param label [String] Row label
+        # @param value [String] Row value
+        # @return [String] Formatted row
+        def format_metadata_row(label, value)
+          "#{colorize(label + ":", :dim)} #{value}"
+        end
+
+        # Format duration for display
+        # @param duration [Float] Duration in seconds
+        # @return [String] Formatted duration (e.g., "1.2345s" or "123ms")
+        def format_duration(duration)
+          if duration < 1
+            "#{(duration * 1000).round(0)}ms"
+          else
+            "#{duration.round(4)}s"
+          end
+        end
+
+        # Format an error row for display
+        # @param error_message [String] The error message
+        # @return [String] Formatted row with red ✗
+        def format_error_row(error_message)
+          truncated = truncate_error(error_message, MAX_ERROR_LENGTH)
+          "#{colorize("✗", :red)} #{truncated}"
+        end
+
+        # Truncate error message to max length with ellipsis
+        # @param message [String] The error message
+        # @param max_length [Integer] Maximum length before truncation
+        # @return [String] Truncated message
+        def truncate_error(message, max_length)
+          return message if message.length <= max_length
+          "#{message[0, max_length - 3]}..."
+        end
+
+        # Apply ANSI color codes to text
+        # @param text [String] Text to colorize
+        # @param styles [Array<Symbol>] Color/style names (:gray, :red, :green, etc.)
+        # @return [String] Colorized text (or plain text if not a TTY)
+        def colorize(text, *styles)
+          return text unless $stdout.tty?
+          codes = styles.map { |s| COLORS[s] }.compact.join
+          "#{codes}#{text}#{COLORS[:reset]}"
+        end
+
+        # Format a score row for display
+        # @param score [ScorerStats] The scorer statistics
+        # @param name_width [Integer] Width for the name column
+        # @return [String] Formatted row
+        def format_score_row(score, name_width = 20)
+          name = "#{colorize("◯", :blue)} #{score.name}"
+          value = colorize("#{(score.score_mean * 100).round(2)}%", :white)
+          pad_cell(name, name_width, :left) + " " + pad_cell(value, 10, :right)
+        end
+
+        # Pad a cell to a given width, accounting for ANSI codes
+        # @param text [String] Cell text (may contain ANSI codes)
+        # @param width [Integer] Target width
+        # @param align [Symbol] :left or :right alignment
+        # @return [String] Padded cell
+        def pad_cell(text, width, align)
+          visible_length = visible_text_length(text)
+          padding = [width - visible_length, 0].max
+
+          case align
+          when :right
+            " " * padding + text
+          else
+            text + " " * padding
+          end
+        end
+
+        # Calculate visible text length, stripping ANSI codes and OSC 8 hyperlinks
+        # @param text [String] Text that may contain escape sequences
+        # @return [Integer] Visible character count
+        def visible_text_length(text)
+          # Strip ANSI color codes: \e[...m
+          # Strip OSC 8 hyperlinks: \e]8;;...\e\\ (the URL part is invisible)
+          text
+            .gsub(/\e\[[0-9;]*m/, "") # ANSI color codes
+            .gsub(/\e\]8;;[^\e]*\e\\/, "") # OSC 8 hyperlink sequences
+            .length
+        end
+
+        # Create a clickable terminal hyperlink (OSC 8)
+        # @param text [String] Display text
+        # @param url [String] Target URL
+        # @return [String] Hyperlinked text (or plain text with URL if not a TTY)
+        def terminal_link(text, url)
+          if $stdout.tty?
+            "\e]8;;#{url}\e\\#{text}\e]8;;\e\\"
+          else
+            "#{text}: #{url}"
+          end
+        end
+
+        # Wrap content lines in a Unicode box with title
+        # @param lines [Array<String>] Content lines
+        # @param title [String] Box title
+        # @return [String] Boxed content
+        def wrap_in_box(lines, title)
+          # Calculate width from content (strip escape sequences for measurement)
+          content_width = lines.map { |l| visible_text_length(l) }.max || 0
+          box_width = [content_width + 4, title.length + 6].max
+          inner_width = box_width - 2
+
+          result = []
+
+          # Top border with title
+          title_str = " #{title} "
+          remaining = inner_width - title_str.length - 1
+          top = colorize("#{BOX[:top_left]}#{BOX[:horizontal]}", :gray) +
+            colorize(title_str, :gray) +
+            colorize(BOX[:horizontal] * remaining + BOX[:top_right], :gray)
+          result << top
+
+          # Empty line for padding
+          result << colorize(BOX[:vertical], :gray) + " " * inner_width + colorize(BOX[:vertical], :gray)
+
+          # Content lines
+          lines.each do |line|
+            visible_len = visible_text_length(line)
+            padding = inner_width - visible_len - 2 # 1 space on each side
+            result << colorize(BOX[:vertical], :gray) + " " + line + " " * [padding, 0].max + " " + colorize(BOX[:vertical], :gray)
+          end
+
+          # Empty line for padding
+          result << colorize(BOX[:vertical], :gray) + " " * inner_width + colorize(BOX[:vertical], :gray)
+
+          # Bottom border
+          result << colorize("#{BOX[:bottom_left]}#{BOX[:horizontal] * inner_width}#{BOX[:bottom_right]}", :gray)
+
+          "\n" + result.join("\n")
+        end
+      end
+    end
+  end
+end
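For orientation, a minimal sketch of driving the new Formatter directly (assumes the gem loads via require "braintrust"; the names and numbers below are illustrative placeholders, not real experiment data):

  require "braintrust"

  stats = Braintrust::Eval::ScorerStats.new(name: "exact_match", score_mean: 0.75)
  summary = Braintrust::Eval::ExperimentSummary.new(
    project_name: "demo-project",        # placeholder values
    experiment_name: "exp-1",
    experiment_id: "1234",
    experiment_url: "https://example.test/experiments/1234",
    scores: {"exact_match" => stats},
    duration: 1.2345,
    error_count: 0,
    errors: []
  )

  # Prints the boxed, colorized summary; colors and OSC 8 links apply only on a TTY.
  puts Braintrust::Eval::Formatter.format_experiment_summary(summary)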
lib/braintrust/eval/result.rb CHANGED
@@ -1,12 +1,15 @@
 # frozen_string_literal: true
 
+require_relative "formatter"
+require_relative "summary"
+
 module Braintrust
   module Eval
     # Result represents the outcome of an evaluation run
-    # Contains experiment metadata, errors, and timing information
+    # Contains experiment metadata, errors, timing information, and raw score data
     class Result
       attr_reader :experiment_id, :experiment_name, :project_id, :project_name,
-        :permalink, :errors, :duration
+        :permalink, :errors, :duration, :scores
 
       # Create a new result
       # @param experiment_id [String] The experiment ID
@@ -16,8 +19,9 @@ module Braintrust
      # @param permalink [String] Link to view the experiment in Braintrust UI
      # @param errors [Array<String>] List of errors that occurred
      # @param duration [Float] Duration in seconds
+      # @param scores [Hash, nil] Raw score data { scorer_name => Array<Numeric> }
      def initialize(experiment_id:, experiment_name:, project_id:, project_name:,
-                     permalink:, errors:, duration:)
+                     permalink:, errors:, duration:, scores: nil)
        @experiment_id = experiment_id
        @experiment_name = experiment_name
        @project_id = project_id
@@ -25,6 +29,7 @@ module Braintrust
        @permalink = permalink
        @errors = errors
        @duration = duration
+        @scores = scores
      end
 
      # Check if the evaluation was successful (no errors)
@@ -39,6 +44,12 @@ module Braintrust
        !success?
      end
 
+      # Get the experiment summary (lazily computed)
+      # @return [ExperimentSummary] Summary view model for Formatter
+      def summary
+        @summary ||= build_summary
+      end
+
      # Format the result as a human-readable string (Go SDK format)
      # @return [String]
      def to_s
@@ -51,6 +62,49 @@ module Braintrust
          "Errors: #{errors.length}"
        ].join("\n")
      end
+
+      # Format the result as a pretty CLI output with box drawing and colors
+      # @return [String]
+      def to_pretty
+        Formatter.format_experiment_summary(summary)
+      end
+
+      # Get statistics for all scorers (lazily computed from scores)
+      # @return [Hash<String, ScorerStats>] Scorer stats keyed by scorer name
+      def scorer_stats
+        @scorer_stats ||= build_scorer_stats
+      end
+
+      private
+
+      # Build scorer statistics from raw score data
+      # @return [Hash<String, ScorerStats>] Scorer stats keyed by scorer name
+      def build_scorer_stats
+        return {} if scores.nil? || scores.empty?
+
+        stats = {}
+        scores.each do |name, score_values|
+          next if score_values.empty?
+          mean = score_values.sum.to_f / score_values.size
+          stats[name] = ScorerStats.new(name: name, score_mean: mean)
+        end
+        stats
+      end
+
+      # Build experiment summary view model
+      # @return [ExperimentSummary] Summary with all data for Formatter
+      def build_summary
+        ExperimentSummary.new(
+          project_name: project_name,
+          experiment_name: experiment_name,
+          experiment_id: experiment_id,
+          experiment_url: permalink,
+          scores: scorer_stats,
+          duration: duration,
+          error_count: errors.length,
+          errors: errors
+        )
+      end
    end
  end
 end
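A rough sketch of the new scores plumbing on Result (constructor values are placeholders): raw per-scorer arrays go in, scorer_stats lazily averages them, and to_pretty renders the same data through Formatter.

  result = Braintrust::Eval::Result.new(
    experiment_id: "1234",               # placeholder values
    experiment_name: "exp-1",
    project_id: "p-1",
    project_name: "demo-project",
    permalink: "https://example.test/experiments/1234",
    errors: [],
    duration: 0.42,
    scores: {"exact_match" => [1, 0, 1, 1]}
  )

  result.scorer_stats["exact_match"].score_mean  # => 0.75 (mean of the raw scores)
  puts result.to_pretty                          # boxed summary built from Result#summary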
lib/braintrust/eval/runner.rb ADDED
@@ -0,0 +1,241 @@
+# frozen_string_literal: true
+
+require_relative "case"
+require_relative "cases"
+require_relative "scorer"
+require_relative "result"
+require_relative "summary"
+require_relative "../internal/thread_pool"
+
+require "opentelemetry/sdk"
+require "json"
+
+module Braintrust
+  module Eval
+    # Internal runner class that performs the execution of the Eval and returns the result
+    class Runner
+      # Maximum parallelism allowed (mirrors Internal::ThreadPool::MAX_PARALLELISM)
+      MAX_PARALLELISM = Internal::ThreadPool::MAX_PARALLELISM
+
+      def initialize(experiment_id:, experiment_name:, project_id:, project_name:,
+                     task:, scorers:, state:, tracer_provider: nil)
+        @experiment_id = experiment_id
+        @experiment_name = experiment_name
+        @project_id = project_id
+        @project_name = project_name
+        @task = task
+        @scorers = normalize_scorers(scorers)
+        @state = state
+        @tracer_provider = tracer_provider || OpenTelemetry.tracer_provider
+        @tracer = @tracer_provider.tracer("braintrust-eval")
+        @parent_attr = "experiment_id:#{experiment_id}"
+
+        # Mutex for thread-safe score collection
+        @score_mutex = Mutex.new
+      end
+
+      # Run evaluation and return Result
+      # @param cases [Array, Enumerable] Test cases
+      # @param parallelism [Integer] Number of parallel workers (default: 1)
+      # @return [Result]
+      def run(cases, parallelism: 1)
+        start_time = Time.now
+        normalized_cases = normalize_cases(cases)
+        errors = Queue.new
+        @scores = {} # Reset for each run: { scorer_name => Array<Numeric> }
+
+        if parallelism && parallelism > 1
+          Internal::ThreadPool.each(normalized_cases, parallelism: parallelism) do |test_case|
+            run_case(test_case, errors)
+          end
+        else
+          normalized_cases.each do |test_case|
+            run_case(test_case, errors)
+          end
+        end
+
+        # Convert Queue to Array after all threads complete
+        error_array = [].tap { |a| a << errors.pop until errors.empty? }
+
+        # Calculate duration
+        duration = Time.now - start_time
+
+        # Generate permalink
+        permalink = "#{state.app_url}/app/#{state.org_name}/object?object_type=experiment&object_id=#{experiment_id}"
+
+        Result.new(
+          experiment_id: experiment_id,
+          experiment_name: experiment_name,
+          project_id: project_id,
+          project_name: project_name,
+          permalink: permalink,
+          errors: error_array,
+          duration: duration,
+          scores: @scores
+        )
+      end
+
+      private
+
+      attr_reader :experiment_id, :experiment_name, :project_id, :project_name,
+        :task, :scorers, :state, :tracer, :parent_attr
+
+      # Run a single test case with OpenTelemetry tracing
+      # Creates eval span (parent) with task and score as children
+      # @param test_case [Case] The test case
+      # @param errors [Queue] Thread-safe error collection queue
+      def run_case(test_case, errors)
+        tracer.in_span("eval") do |eval_span|
+          eval_span.set_attribute("braintrust.parent", parent_attr)
+
+          # Set tags early so they're present even if task fails
+          eval_span.set_attribute("braintrust.tags", test_case.tags) if test_case.tags
+
+          # Run task
+          output = nil
+          begin
+            output = run_task(test_case)
+          rescue => e
+            # Error already recorded on task span, set eval span status
+            eval_span.status = OpenTelemetry::Trace::Status.error(e.message)
+            errors << "Task failed for input '#{test_case.input}': #{e.message}"
+            next
+          end
+
+          # Run scorers
+          begin
+            run_scorers(test_case, output)
+          rescue => e
+            # Error already recorded on score span, set eval span status
+            eval_span.status = OpenTelemetry::Trace::Status.error(e.message)
+            errors << "Scorers failed for input '#{test_case.input}': #{e.message}"
+          end
+
+          # Set eval span attributes (after task and scorers complete)
+          set_json_attr(eval_span, "braintrust.span_attributes", {type: "eval"})
+          set_json_attr(eval_span, "braintrust.input_json", test_case.input)
+          set_json_attr(eval_span, "braintrust.output_json", output)
+          set_json_attr(eval_span, "braintrust.expected", test_case.expected) if test_case.expected
+        end
+      end
+
+      # Run task with OpenTelemetry tracing
+      # Creates task span with input and output
+      # @param test_case [Case] The test case
+      # @return [Object] Task output
+      def run_task(test_case)
+        tracer.in_span("task") do |task_span|
+          task_span.set_attribute("braintrust.parent", parent_attr)
+          set_json_attr(task_span, "braintrust.span_attributes", {type: "task"})
+          set_json_attr(task_span, "braintrust.input_json", test_case.input)
+
+          begin
+            output = task.call(test_case.input)
+            set_json_attr(task_span, "braintrust.output_json", output)
+            output
+          rescue => e
+            # Record exception event with stacktrace, then set error status
+            task_span.record_exception(e)
+            task_span.status = OpenTelemetry::Trace::Status.error(e.message)
+            raise
+          end
+        end
+      end
+
+      # Run scorers with OpenTelemetry tracing
+      # Creates single score span for all scorers
+      # @param test_case [Case] The test case
+      # @param output [Object] Task output
+      def run_scorers(test_case, output)
+        tracer.in_span("score") do |score_span|
+          score_span.set_attribute("braintrust.parent", parent_attr)
+          set_json_attr(score_span, "braintrust.span_attributes", {type: "score"})
+
+          scores = {}
+          scorer_error = nil
+          scorers.each do |scorer|
+            score_value = scorer.call(test_case.input, test_case.expected, output, test_case.metadata || {})
+            scores[scorer.name] = score_value
+
+            # Collect raw score for summary (thread-safe)
+            collect_score(scorer.name, score_value)
+          rescue => e
+            # Record first error but continue processing other scorers
+            scorer_error ||= e
+            record_span_error(score_span, e, "ScorerError")
+          end
+
+          # Always set scores attribute, even if some scorers failed
+          set_json_attr(score_span, "braintrust.scores", scores)
+
+          # Raise after setting scores so we can see which scorers succeeded
+          raise scorer_error if scorer_error
+        end
+      end
+
+      # Normalize cases input to Cases wrapper
+      # @param cases_input [Array, Enumerable, Cases] The cases input
+      # @return [Cases]
+      def normalize_cases(cases_input)
+        case cases_input
+        when Cases
+          cases_input
+        when Array, Enumerable
+          Cases.new(cases_input)
+        else
+          if cases_input.respond_to?(:each)
+            Cases.new(cases_input)
+          else
+            raise ArgumentError, "cases must be Array or Enumerable"
+          end
+        end
+      end
+
+      # Normalize scorers to Scorer objects
+      # @param scorers_input [Array] The scorers input (Scorer objects or callables)
+      # @return [Array<Scorer>]
+      def normalize_scorers(scorers_input)
+        scorers_input.map do |scorer|
+          case scorer
+          when Scorer
+            scorer
+          else
+            Scorer.new(scorer)
+          end
+        end
+      end
+
+      # Record error on span with exception event and error status
+      # @param span [OpenTelemetry::Trace::Span] The span to record error on
+      # @param error [Exception] The error that occurred
+      # @param error_type [String] The error type name (optional)
+      def record_span_error(span, error, error_type = nil)
+        if error_type
+          span.record_exception(error, attributes: {"exception.type" => error_type})
+        else
+          span.record_exception(error)
+        end
+        span.status = OpenTelemetry::Trace::Status.error(error.message)
+      end
+
+      # Set a span attribute by JSON encoding the value
+      # @param span [OpenTelemetry::Trace::Span] The span
+      # @param key [String] The attribute key
+      # @param value [Object] The value to JSON encode
+      def set_json_attr(span, key, value)
+        span.set_attribute(key, JSON.dump(value))
+      end
+
+      # Collect a single score value for summary calculation
+      # @param name [String] Scorer name
+      # @param value [Object] Score value (only Numeric values are collected)
+      def collect_score(name, value)
+        return unless value.is_a?(Numeric)
+
+        @score_mutex.synchronize do
+          (@scores[name] ||= []) << value
+        end
+      end
+    end
+  end
+end
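The part of Runner that has to hold up under parallelism is the shared score hash: collect_score keeps only Numeric values and serializes writes through a Mutex. A standalone illustration of that pattern (plain Ruby, not the SDK API):

  scores = {}
  mutex = Mutex.new

  threads = 4.times.map do |i|
    Thread.new do
      value = i.even? ? 1 : 0  # pretend scorer output; non-Numeric values would be skipped
      next unless value.is_a?(Numeric)
      mutex.synchronize { (scores["exact_match"] ||= []) << value }
    end
  end
  threads.each(&:join)

  scores["exact_match"].sum.to_f / scores["exact_match"].size  # => 0.5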
lib/braintrust/eval/summary.rb ADDED
@@ -0,0 +1,32 @@
+# frozen_string_literal: true
+
+module Braintrust
+  module Eval
+    # Aggregated statistics for a single scorer across test cases
+    # @attr name [String] Scorer name
+    # @attr score_mean [Float] Mean score (0.0 to 1.0)
+    ScorerStats = Struct.new(:name, :score_mean, keyword_init: true)
+
+    # Summary of results from an Experiment
+    # Typically used to generate experiment output
+    # @attr project_name [String] Project name
+    # @attr experiment_name [String] Experiment name
+    # @attr experiment_id [String] Experiment ID
+    # @attr experiment_url [String] URL to view experiment in Braintrust UI
+    # @attr scores [Hash<String, ScorerStats>] Scorer stats keyed by scorer name
+    # @attr duration [Float] Duration in seconds
+    # @attr error_count [Integer] Number of errors
+    # @attr errors [Array<String>] Error messages with locations
+    ExperimentSummary = Struct.new(
+      :project_name,
+      :experiment_name,
+      :experiment_id,
+      :experiment_url,
+      :scores,
+      :duration,
+      :error_count,
+      :errors,
+      keyword_init: true
+    )
+  end
+end
lib/braintrust/eval.rb CHANGED
@@ -1,10 +1,9 @@
 # frozen_string_literal: true
 
-require_relative "eval/case"
-require_relative "eval/cases"
 require_relative "eval/scorer"
-require_relative "eval/result"
+require_relative "eval/runner"
 require_relative "internal/experiments"
+
 require "opentelemetry/sdk"
 require "json"
 
@@ -193,7 +192,9 @@ module Braintrust
      # - Hash: {name:, id:, project:, version:, limit:}
      # @param task [#call] The task to evaluate (must be callable)
      # @param scorers [Array<Scorer, #call>] The scorers to use (Scorer objects or callables)
-      # @param parallelism [Integer] Number of parallel workers (default: 1)
+      # @param parallelism [Integer] Number of parallel workers (default: 1).
+      #   When parallelism > 1, test cases are executed concurrently using a thread pool.
+      #   The task and scorers MUST be thread-safe when using parallelism > 1.
      # @param tags [Array<String>] Optional experiment tags
      # @param metadata [Hash] Optional experiment metadata
      # @param update [Boolean] If true, allow reusing existing experiment (default: false)
@@ -232,18 +233,18 @@ module Braintrust
        project_id = result[:project_id]
        project_name = result[:project_name]
 
-        # Run the eval with resolved experiment info
-        result = run_internal(
+        # Instantiate Runner and run evaluation
+        runner = Runner.new(
          experiment_id: experiment_id,
          experiment_name: experiment,
          project_id: project_id,
          project_name: project_name,
-          cases: cases,
          task: task,
          scorers: scorers,
          state: state,
          tracer_provider: tracer_provider
        )
+        result = runner.run(cases, parallelism: parallelism)
 
        # Print result summary unless quiet
        print_result(result) unless quiet
@@ -253,66 +254,10 @@ module Braintrust
 
      private
 
-      # Internal eval runner that doesn't touch the API
-      # @param experiment_id [String] Resolved experiment ID
-      # @param experiment_name [String] Experiment name
-      # @param project_id [String] Resolved project ID
-      # @param project_name [String] Project name
-      # @param cases [Array, Enumerable, Cases] Test cases
-      # @param task [#call] Task callable
-      # @param scorers [Array] Scorers
-      # @param state [State] Braintrust state
-      # @param tracer_provider [TracerProvider, nil] OpenTelemetry tracer provider
-      # @return [Result]
-      def run_internal(experiment_id:, experiment_name:, project_id:, project_name:,
-                       cases:, task:, scorers:, state:, tracer_provider: nil)
-        start_time = Time.now
-
-        # Get tracer for creating spans
-        tracer_provider ||= OpenTelemetry.tracer_provider
-        tracer = tracer_provider.tracer("braintrust-eval")
-
-        # Parent attribute for all eval spans
-        parent_attr = "experiment_id:#{experiment_id}"
-
-        # Normalize cases to Cases wrapper
-        normalized_cases = normalize_cases(cases)
-
-        # Normalize scorers to Scorer objects
-        normalized_scorers = normalize_scorers(scorers)
-
-        # Collect errors
-        errors = []
-
-        # Run each case with tracing
-        normalized_cases.each do |test_case|
-          run_case(test_case, task, normalized_scorers, errors,
-                   tracer, parent_attr)
-        end
-
-        # Calculate duration
-        duration = Time.now - start_time
-
-        # Generate permalink: {app_url}/app/{org}/object?object_type=experiment&object_id={experiment_id}
-        permalink = "#{state.app_url}/app/#{state.org_name}/object?object_type=experiment&object_id=#{experiment_id}"
-
-        # Return result
-        Result.new(
-          experiment_id: experiment_id,
-          experiment_name: experiment_name,
-          project_id: project_id,
-          project_name: project_name,
-          permalink: permalink,
-          errors: errors,
-          duration: duration
-        )
-      end
-
      # Print result summary to stdout
      # @param result [Result] The evaluation result
      def print_result(result)
-        puts "=" * 60
-        puts result
+        puts result.to_pretty
      end
 
      # Validate required parameters
@@ -419,166 +364,6 @@ module Braintrust
          filtered
        end
      end
-
-      # Normalize cases input to Cases wrapper
-      # @param cases_input [Array, Enumerable, Cases] The cases input
-      # @return [Cases]
-      def normalize_cases(cases_input)
-        case cases_input
-        when Cases
-          cases_input
-        when Array, Enumerable
-          Cases.new(cases_input)
-        else
-          if cases_input.respond_to?(:each)
-            Cases.new(cases_input)
-          else
-            raise ArgumentError, "cases must be Array or Enumerable"
-          end
-        end
-      end
-
-      # Normalize scorers to Scorer objects
-      # @param scorers_input [Array] The scorers input (Scorer objects or callables)
-      # @return [Array<Scorer>]
-      def normalize_scorers(scorers_input)
-        scorers_input.map do |scorer|
-          case scorer
-          when Scorer
-            # Already a Scorer
-            scorer
-          else
-            # Wrap callable in Scorer (auto-detects name)
-            Scorer.new(scorer)
-          end
-        end
-      end
-
-      # Run a single test case with OpenTelemetry tracing
-      # Creates eval span (parent) with task and score as children
-      # @param test_case [Case] The test case
-      # @param task [#call] The task
-      # @param scorers [Array<Scorer>] The scorers
-      # @param errors [Array<String>] Error collection array
-      # @param tracer [Tracer] OpenTelemetry tracer
-      # @param parent_attr [String] Parent attribute (experiment_id:exp_id)
-      def run_case(test_case, task, scorers, errors, tracer, parent_attr)
-        # Create eval span (parent)
-        tracer.in_span("eval") do |eval_span|
-          eval_span.set_attribute("braintrust.parent", parent_attr)
-
-          # Set tags early so they're present even if task fails
-          eval_span.set_attribute("braintrust.tags", test_case.tags) if test_case.tags
-
-          # Run task
-          output = nil
-          begin
-            output = run_task(test_case, task, tracer, parent_attr)
-          rescue => e
-            # Error already recorded on task span, set eval span status
-            eval_span.status = OpenTelemetry::Trace::Status.error(e.message)
-            errors << "Task failed for input '#{test_case.input}': #{e.message}"
-            next
-          end
-
-          # Run scorers
-          begin
-            run_scorers(test_case, output, scorers, tracer, parent_attr)
-          rescue => e
-            # Error already recorded on score span, set eval span status
-            eval_span.status = OpenTelemetry::Trace::Status.error(e.message)
-            errors << "Scorers failed for input '#{test_case.input}': #{e.message}"
-          end
-
-          # Set eval span attributes (after task and scorers complete)
-          set_json_attr(eval_span, "braintrust.span_attributes", {type: "eval"})
-          set_json_attr(eval_span, "braintrust.input_json", test_case.input)
-          set_json_attr(eval_span, "braintrust.output_json", output)
-          set_json_attr(eval_span, "braintrust.expected", test_case.expected) if test_case.expected
-        end
-      end
-
-      # Run task with OpenTelemetry tracing
-      # Creates task span with input and output
-      # @param test_case [Case] The test case
-      # @param task [#call] The task
-      # @param tracer [Tracer] OpenTelemetry tracer
-      # @param parent_attr [String] Parent attribute
-      # @return [Object] Task output
-      def run_task(test_case, task, tracer, parent_attr)
-        tracer.in_span("task") do |task_span|
-          task_span.set_attribute("braintrust.parent", parent_attr)
-          set_json_attr(task_span, "braintrust.span_attributes", {type: "task"})
-          set_json_attr(task_span, "braintrust.input_json", test_case.input)
-
-          begin
-            output = task.call(test_case.input)
-            set_json_attr(task_span, "braintrust.output_json", output)
-            output
-          rescue => e
-            # Record exception event with stacktrace, then set error status
-            task_span.record_exception(e)
-            task_span.status = OpenTelemetry::Trace::Status.error(e.message)
-            raise
-          end
-        end
-      end
-
-      # Run scorers with OpenTelemetry tracing
-      # Creates single score span for all scorers
-      # @param test_case [Case] The test case
-      # @param output [Object] Task output
-      # @param scorers [Array<Scorer>] The scorers
-      # @param tracer [Tracer] OpenTelemetry tracer
-      # @param parent_attr [String] Parent attribute
-      def run_scorers(test_case, output, scorers, tracer, parent_attr)
-        tracer.in_span("score") do |score_span|
-          score_span.set_attribute("braintrust.parent", parent_attr)
-          set_json_attr(score_span, "braintrust.span_attributes", {type: "score"})
-
-          scores = {}
-          scorer_error = nil
-          scorers.each do |scorer|
-            score_value = scorer.call(test_case.input, test_case.expected, output, test_case.metadata || {})
-            scores[scorer.name] = score_value
-          rescue => e
-            # Record first error but continue processing other scorers
-            scorer_error ||= "Scorer '#{scorer.name}' failed: #{e.message}"
-            record_span_error(score_span, e, "ScorerError")
-          end
-
-          # Always set scores attribute, even if some scorers failed
-          set_json_attr(score_span, "braintrust.scores", scores)
-
-          # Raise after setting scores so we can see which scorers succeeded
-          raise scorer_error if scorer_error
-        end
-      end
-
-      # Record error on span with exception event and error status
-      # @param span [OpenTelemetry::Trace::Span] The span to record error on
-      # @param error [Exception] The error that occurred
-      # @param error_type [String] The error type name (optional, used for custom error classification)
-      def record_span_error(span, error, error_type = nil)
-        # Record exception with stacktrace (OpenTelemetry standard)
-        if error_type
-          # For custom error types, add type override
-          span.record_exception(error, attributes: {"exception.type" => error_type})
-        else
-          span.record_exception(error)
-        end
-
-        # Set span status to error
-        span.status = OpenTelemetry::Trace::Status.error(error.message)
-      end
-
-      # Set a span attribute by JSON encoding the value
-      # @param span [OpenTelemetry::Trace::Span] The span
-      # @param key [String] The attribute key
-      # @param value [Object] The value to JSON encode
-      def set_json_attr(span, key, value)
-        span.set_attribute(key, JSON.dump(value))
-      end
    end
  end
 end
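The expanded parallelism doc above is the behavioral caveat of this release: with parallelism > 1 the same task and scorer objects are invoked from several threads at once. A generic sketch of the difference (plain Ruby, not the SDK API):

  # NOT thread-safe: appends to a shared array from worker threads without a lock.
  history = []
  unsafe_task = ->(input) { history << input; input.to_s.upcase }

  # Thread-safe: guard shared state with a Mutex (or avoid shared state entirely).
  lock = Mutex.new
  safe_history = []
  safe_task = lambda do |input|
    lock.synchronize { safe_history << input }
    input.to_s.upcase
  end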
lib/braintrust/internal/thread_pool.rb ADDED
@@ -0,0 +1,167 @@
+# frozen_string_literal: true
+
+module Braintrust
+  module Internal
+    # Reusable thread pool for concurrent task execution.
+    # Uses the strategy pattern to define result handling behavior.
+    #
+    # @example Iterate without collecting results (Eval use case)
+    #   ThreadPool.each(items, parallelism: 4) do |item|
+    #     process(item)
+    #   end
+    #
+    # @example Collect results in order
+    #   results = ThreadPool.collect(items, parallelism: 4) do |item|
+    #     transform(item)
+    #   end
+    #
+    # @note Thread limits are per-call, not global. If your application calls
+    #   ThreadPool methods from multiple threads concurrently (e.g., web workers,
+    #   background jobs), each call spawns its own worker threads. Plan your
+    #   parallelism settings accordingly to avoid excessive thread creation.
+    #
+    class ThreadPool
+      DEFAULT_PARALLELISM = 3
+      MAX_PARALLELISM = 50
+
+      # Strategy for iteration without collecting results
+      class Each
+        def prepare(items)
+          @queue = Queue.new
+          items.each { |item| @queue << item }
+        end
+
+        def enqueue_sentinel(count)
+          count.times { @queue << :done }
+        end
+
+        def work_loop(&block)
+          loop do
+            item = @queue.pop
+            break if item == :done
+            block.call(item)
+          end
+        end
+
+        def result
+          nil
+        end
+
+        def empty_result
+          nil
+        end
+
+        def sequential_run(items, &block)
+          items.each(&block)
+          nil
+        end
+      end
+
+      # Strategy for collecting results in input order
+      class Collect
+        def prepare(items)
+          @results = Array.new(items.size)
+          @queue = Queue.new
+          items.each_with_index { |item, idx| @queue << [item, idx] }
+        end
+
+        def enqueue_sentinel(count)
+          count.times { @queue << :done }
+        end
+
+        def work_loop(&block)
+          loop do
+            work = @queue.pop
+            break if work == :done
+            item, idx = work
+            @results[idx] = block.call(item)
+          end
+        end
+
+        def result
+          @results
+        end
+
+        def empty_result
+          []
+        end
+
+        def sequential_run(items, &block)
+          items.map(&block)
+        end
+      end
+
+      STRATEGIES = {
+        each: Each,
+        collect: Collect
+      }.freeze
+
+      # Execute block for each item concurrently, discarding results.
+      # @param items [Array, Enumerable] Items to process
+      # @param parallelism [Integer] Number of worker threads (default: 3)
+      # @yield [item] Block to execute for each item
+      # @return [nil]
+      def self.each(items, parallelism: DEFAULT_PARALLELISM, &block)
+        run(items, parallelism: parallelism, strategy: :each, &block)
+      end
+
+      # Execute block for each item concurrently, collecting results in order.
+      # @param items [Array, Enumerable] Items to process
+      # @param parallelism [Integer] Number of worker threads (default: 3)
+      # @yield [item] Block to execute for each item
+      # @return [Array] Results in same order as input items
+      def self.collect(items, parallelism: DEFAULT_PARALLELISM, &block)
+        run(items, parallelism: parallelism, strategy: :collect, &block)
+      end
+
+      # Execute block for each item concurrently using the specified strategy.
+      # Prefer using .each or .collect convenience methods instead.
+      # @param items [Array, Enumerable] Items to process
+      # @param strategy [Symbol, #prepare] Strategy for result handling (required)
+      # @param parallelism [Integer] Number of worker threads (default: 3)
+      # @yield [item] Block to execute for each item
+      # @return [Object, nil] Strategy-dependent result
+      def self.run(items, strategy:, parallelism: DEFAULT_PARALLELISM, &block)
+        validate_parallelism!(parallelism)
+
+        executor = strategy_instance(strategy)
+        all_items = items.to_a
+
+        return executor.sequential_run(all_items, &block) if parallelism == 1
+        return executor.empty_result if all_items.empty?
+
+        executor.prepare(all_items)
+        executor.enqueue_sentinel(parallelism)
+
+        threads = parallelism.times.map do
+          Thread.new { executor.work_loop(&block) }
+        end
+
+        threads.each(&:join)
+        executor.result
+      end
+
+      def self.strategy_instance(strategy)
+        case strategy
+        when Symbol
+          STRATEGIES.fetch(strategy) {
+            raise ArgumentError, "Unknown strategy: #{strategy}. Valid: #{STRATEGIES.keys.join(", ")}"
+          }.new
+        else
+          strategy
+        end
+      end
+
+      def self.validate_parallelism!(parallelism)
+        unless parallelism.is_a?(Integer) && parallelism > 0
+          raise ArgumentError, "parallelism must be a positive integer"
+        end
+        if parallelism > MAX_PARALLELISM
+          raise ArgumentError, "parallelism cannot exceed #{MAX_PARALLELISM}"
+        end
+      end
+
+      private_class_method :strategy_instance, :validate_parallelism!
+    end
+  end
+end
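A quick usage sketch of the new ThreadPool, matching its own @example blocks (the sleep is only there to make workers finish out of order):

  require "braintrust/internal/thread_pool"

  items = [3, 1, 2]

  # collect returns results in input order even when workers finish out of order.
  doubled = Braintrust::Internal::ThreadPool.collect(items, parallelism: 3) do |n|
    sleep(n * 0.01)
    n * 2
  end
  doubled  # => [6, 2, 4]

  # parallelism: 1 short-circuits to a plain sequential each/map, with no threads spawned.
  Braintrust::Internal::ThreadPool.each(items, parallelism: 1) { |n| puts n }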
lib/braintrust/version.rb CHANGED
@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 
 module Braintrust
-  VERSION = "0.0.8"
+  VERSION = "0.0.9"
 end
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: braintrust
 version: !ruby/object:Gem::Version
-  version: 0.0.8
+  version: 0.0.9
 platform: ruby
 authors:
 - Braintrust
@@ -195,10 +195,14 @@ files:
 - lib/braintrust/eval.rb
 - lib/braintrust/eval/case.rb
 - lib/braintrust/eval/cases.rb
+- lib/braintrust/eval/formatter.rb
 - lib/braintrust/eval/functions.rb
 - lib/braintrust/eval/result.rb
+- lib/braintrust/eval/runner.rb
 - lib/braintrust/eval/scorer.rb
+- lib/braintrust/eval/summary.rb
 - lib/braintrust/internal/experiments.rb
+- lib/braintrust/internal/thread_pool.rb
 - lib/braintrust/logger.rb
 - lib/braintrust/state.rb
 - lib/braintrust/trace.rb