braintrust 0.0.8 → 0.0.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: c6b2bcda06084f2e90d2602659ca71cf0ab574ac8c74c367890cbb2b04740529
-  data.tar.gz: 306b5a46660eae3d3e3811d021627883419a4dc4c114e51e40be64c590868c95
+  metadata.gz: be2efc651c8e685179541cf2ade46f86e3ca66c408ed00707d2b9890a4d5fa72
+  data.tar.gz: ba9c9f993abf5ea64290c5510361297a6b7dca21ad06583634d7e4080d4f7531
 SHA512:
-  metadata.gz: 1db7bf706b260762aa114eb5e8f844cb0567efd5a6f9d8cca03667111c0e89ff68f4e53b3a3adc6ad2192947602fc4b88a8e0057169ad7eff12ccb1c2ecb4951
-  data.tar.gz: bbc71c33bb28da124bd1cc61c8bf4f765ec2899a57bf604da624a04c42a7bfce508ed1eb78c4ec421da5038fbaf89086cee8d0b04998d223568df05e8640679f
+  metadata.gz: a40a56eae61148496ee7c96775b9c6fef63106a5c943dbe1a43f66572cb245e5ec3fd5a2e98452926c8a326a0f06f9dd68bfb56a4b7f04081e65ec03ef3d7939
+  data.tar.gz: c1bed8505efca929e688538d293e4f329c8dd8c2d1152858bb6967fa0d9f54b88f4120fb3d1f7d591fc51cc0e05ec1577b8345524ddc45863c1eb2ff6537a334
lib/braintrust/eval/formatter.rb ADDED
@@ -0,0 +1,220 @@
+# frozen_string_literal: true
+
+module Braintrust
+  module Eval
+    # Formatter for pretty CLI output of experiment results
+    # Uses ANSI colors and Unicode box drawing for terminal display
+    module Formatter
+      # ANSI color codes
+      COLORS = {
+        gray: "\e[90m",
+        red: "\e[31m",
+        green: "\e[32m",
+        blue: "\e[34m",
+        magenta: "\e[35m",
+        white: "\e[97m",
+        dim: "\e[2m",
+        reset: "\e[0m"
+      }.freeze
+
+      # Box drawing characters (Unicode)
+      BOX = {
+        top_left: "╭",
+        top_right: "╮",
+        bottom_left: "╰",
+        bottom_right: "╯",
+        horizontal: "─",
+        vertical: "│"
+      }.freeze
+
+      # Maximum length for error messages before truncation
+      MAX_ERROR_LENGTH = 150
+
+      class << self
+        # Format an experiment summary for CLI output
+        # @param summary [ExperimentSummary] The experiment summary
+        # @return [String] Formatted output with box drawing and colors
+        def format_experiment_summary(summary)
+          return "" unless summary
+
+          lines = []
+
+          # Metadata section
+          lines << format_metadata_row("Project", summary.project_name)
+          lines << format_metadata_row("Experiment", summary.experiment_name)
+          lines << format_metadata_row("ID", summary.experiment_id)
+          lines << format_metadata_row("Duration", format_duration(summary.duration))
+          lines << format_metadata_row("Errors", summary.error_count.to_s)
+
+          # Scores section (if any)
+          if summary.scores&.any?
+            lines << ""
+            lines << colorize("Scores", :white)
+
+            # Calculate max scorer name length for alignment
+            max_name_len = summary.scores.values.map { |s| s.name.length }.max || 0
+            name_width = [max_name_len + 2, 20].max # +2 for "◯ " prefix
+
+            summary.scores.each_value do |score|
+              lines << format_score_row(score, name_width)
+            end
+          end
+
+          # Errors section (if any)
+          if summary.errors&.any?
+            lines << ""
+            lines << colorize("Errors", :white)
+
+            summary.errors.each do |error|
+              lines << format_error_row(error)
+            end
+          end
+
+          # Footer link
+          if summary.experiment_url
+            lines << ""
+            lines << terminal_link("View results for #{summary.experiment_name}", summary.experiment_url)
+          end
+
+          wrap_in_box(lines, "Experiment summary")
+        end
+
+        # Format a metadata row (label: value)
+        # @param label [String] Row label
+        # @param value [String] Row value
+        # @return [String] Formatted row
+        def format_metadata_row(label, value)
+          "#{colorize(label + ":", :dim)} #{value}"
+        end
+
+        # Format duration for display
+        # @param duration [Float] Duration in seconds
+        # @return [String] Formatted duration (e.g., "1.2345s" or "123ms")
+        def format_duration(duration)
+          if duration < 1
+            "#{(duration * 1000).round(0)}ms"
+          else
+            "#{duration.round(4)}s"
+          end
+        end
+
+        # Format an error row for display
+        # @param error_message [String] The error message
+        # @return [String] Formatted row with red ✗
+        def format_error_row(error_message)
+          truncated = truncate_error(error_message, MAX_ERROR_LENGTH)
+          "#{colorize("✗", :red)} #{truncated}"
+        end
+
+        # Truncate error message to max length with ellipsis
+        # @param message [String] The error message
+        # @param max_length [Integer] Maximum length before truncation
+        # @return [String] Truncated message
+        def truncate_error(message, max_length)
+          return message if message.length <= max_length
+          "#{message[0, max_length - 3]}..."
+        end
+
+        # Apply ANSI color codes to text
+        # @param text [String] Text to colorize
+        # @param styles [Array<Symbol>] Color/style names (:gray, :red, :green, etc.)
+        # @return [String] Colorized text (or plain text if not a TTY)
+        def colorize(text, *styles)
+          return text unless $stdout.tty?
+          codes = styles.map { |s| COLORS[s] }.compact.join
+          "#{codes}#{text}#{COLORS[:reset]}"
+        end
+
+        # Format a score row for display
+        # @param score [ScorerStats] The scorer statistics
+        # @param name_width [Integer] Width for the name column
+        # @return [String] Formatted row
+        def format_score_row(score, name_width = 20)
+          name = "#{colorize("◯", :blue)} #{score.name}"
+          value = colorize("#{(score.score_mean * 100).round(2)}%", :white)
+          pad_cell(name, name_width, :left) + " " + pad_cell(value, 10, :right)
+        end
+
+        # Pad a cell to a given width, accounting for ANSI codes
+        # @param text [String] Cell text (may contain ANSI codes)
+        # @param width [Integer] Target width
+        # @param align [Symbol] :left or :right alignment
+        # @return [String] Padded cell
+        def pad_cell(text, width, align)
+          visible_length = visible_text_length(text)
+          padding = [width - visible_length, 0].max
+
+          case align
+          when :right
+            " " * padding + text
+          else
+            text + " " * padding
+          end
+        end
+
+        # Calculate visible text length, stripping ANSI codes and OSC 8 hyperlinks
+        # @param text [String] Text that may contain escape sequences
+        # @return [Integer] Visible character count
+        def visible_text_length(text)
+          # Strip ANSI color codes: \e[...m
+          # Strip OSC 8 hyperlinks: \e]8;;...\e\\ (the URL part is invisible)
+          text
+            .gsub(/\e\[[0-9;]*m/, "") # ANSI color codes
+            .gsub(/\e\]8;;[^\e]*\e\\/, "") # OSC 8 hyperlink sequences
+            .length
+        end
+
+        # Create a clickable terminal hyperlink (OSC 8)
+        # @param text [String] Display text
+        # @param url [String] Target URL
+        # @return [String] Hyperlinked text (or plain text with URL if not a TTY)
+        def terminal_link(text, url)
+          if $stdout.tty?
+            "\e]8;;#{url}\e\\#{text}\e]8;;\e\\"
+          else
+            "#{text}: #{url}"
+          end
+        end
+
+        # Wrap content lines in a Unicode box with title
+        # @param lines [Array<String>] Content lines
+        # @param title [String] Box title
+        # @return [String] Boxed content
+        def wrap_in_box(lines, title)
+          # Calculate width from content (strip escape sequences for measurement)
+          content_width = lines.map { |l| visible_text_length(l) }.max || 0
+          box_width = [content_width + 4, title.length + 6].max
+          inner_width = box_width - 2
+
+          result = []
+
+          # Top border with title
+          title_str = " #{title} "
+          remaining = inner_width - title_str.length - 1
+          top = colorize("#{BOX[:top_left]}#{BOX[:horizontal]}", :gray) +
+            colorize(title_str, :gray) +
+            colorize(BOX[:horizontal] * remaining + BOX[:top_right], :gray)
+          result << top
+
+          # Empty line for padding
+          result << colorize(BOX[:vertical], :gray) + " " * inner_width + colorize(BOX[:vertical], :gray)
+
+          # Content lines
+          lines.each do |line|
+            visible_len = visible_text_length(line)
+            padding = inner_width - visible_len - 2 # 1 space on each side
+            result << colorize(BOX[:vertical], :gray) + " " + line + " " * [padding, 0].max + " " + colorize(BOX[:vertical], :gray)
+          end
+
+          # Empty line for padding
+          result << colorize(BOX[:vertical], :gray) + " " * inner_width + colorize(BOX[:vertical], :gray)
+
+          # Bottom border
+          result << colorize("#{BOX[:bottom_left]}#{BOX[:horizontal] * inner_width}#{BOX[:bottom_right]}", :gray)
+
+          "\n" + result.join("\n")
+        end
+      end
+    end
+  end
+end
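For orientation, a minimal sketch of driving the new Formatter directly (assumes the gem loads via require "braintrust"; the names and numbers below are illustrative placeholders, not real experiment data):

  require "braintrust"

  stats = Braintrust::Eval::ScorerStats.new(name: "exact_match", score_mean: 0.75)
  summary = Braintrust::Eval::ExperimentSummary.new(
    project_name: "demo-project",        # placeholder values
    experiment_name: "exp-1",
    experiment_id: "1234",
    experiment_url: "https://example.test/experiments/1234",
    scores: {"exact_match" => stats},
    duration: 1.2345,
    error_count: 0,
    errors: []
  )

  # Prints the boxed, colorized summary; colors and OSC 8 links apply only on a TTY.
  puts Braintrust::Eval::Formatter.format_experiment_summary(summary)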
lib/braintrust/eval/result.rb CHANGED
@@ -1,12 +1,15 @@
 # frozen_string_literal: true
 
+require_relative "formatter"
+require_relative "summary"
+
 module Braintrust
   module Eval
     # Result represents the outcome of an evaluation run
-    # Contains experiment metadata, errors, and timing information
+    # Contains experiment metadata, errors, timing information, and raw score data
     class Result
       attr_reader :experiment_id, :experiment_name, :project_id, :project_name,
-        :permalink, :errors, :duration
+        :permalink, :errors, :duration, :scores
 
       # Create a new result
       # @param experiment_id [String] The experiment ID
@@ -16,8 +19,9 @@ module Braintrust
      # @param permalink [String] Link to view the experiment in Braintrust UI
      # @param errors [Array<String>] List of errors that occurred
      # @param duration [Float] Duration in seconds
+      # @param scores [Hash, nil] Raw score data { scorer_name => Array<Numeric> }
      def initialize(experiment_id:, experiment_name:, project_id:, project_name:,
-                     permalink:, errors:, duration:)
+                     permalink:, errors:, duration:, scores: nil)
        @experiment_id = experiment_id
        @experiment_name = experiment_name
        @project_id = project_id
@@ -25,6 +29,7 @@ module Braintrust
        @permalink = permalink
        @errors = errors
        @duration = duration
+        @scores = scores
      end
 
      # Check if the evaluation was successful (no errors)
@@ -39,6 +44,12 @@ module Braintrust
        !success?
      end
 
+      # Get the experiment summary (lazily computed)
+      # @return [ExperimentSummary] Summary view model for Formatter
+      def summary
+        @summary ||= build_summary
+      end
+
      # Format the result as a human-readable string (Go SDK format)
      # @return [String]
      def to_s
@@ -51,6 +62,49 @@ module Braintrust
          "Errors: #{errors.length}"
        ].join("\n")
      end
+
+      # Format the result as a pretty CLI output with box drawing and colors
+      # @return [String]
+      def to_pretty
+        Formatter.format_experiment_summary(summary)
+      end
+
+      # Get statistics for all scorers (lazily computed from scores)
+      # @return [Hash<String, ScorerStats>] Scorer stats keyed by scorer name
+      def scorer_stats
+        @scorer_stats ||= build_scorer_stats
+      end
+
+      private
+
+      # Build scorer statistics from raw score data
+      # @return [Hash<String, ScorerStats>] Scorer stats keyed by scorer name
+      def build_scorer_stats
+        return {} if scores.nil? || scores.empty?
+
+        stats = {}
+        scores.each do |name, score_values|
+          next if score_values.empty?
+          mean = score_values.sum.to_f / score_values.size
+          stats[name] = ScorerStats.new(name: name, score_mean: mean)
+        end
+        stats
+      end
+
+      # Build experiment summary view model
+      # @return [ExperimentSummary] Summary with all data for Formatter
+      def build_summary
+        ExperimentSummary.new(
+          project_name: project_name,
+          experiment_name: experiment_name,
+          experiment_id: experiment_id,
+          experiment_url: permalink,
+          scores: scorer_stats,
+          duration: duration,
+          error_count: errors.length,
+          errors: errors
+        )
+      end
    end
  end
 end
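A rough sketch of the new scores plumbing on Result (constructor values are placeholders): raw per-scorer arrays go in, scorer_stats lazily averages them, and to_pretty renders the same data through Formatter.

  result = Braintrust::Eval::Result.new(
    experiment_id: "1234",               # placeholder values
    experiment_name: "exp-1",
    project_id: "p-1",
    project_name: "demo-project",
    permalink: "https://example.test/experiments/1234",
    errors: [],
    duration: 0.42,
    scores: {"exact_match" => [1, 0, 1, 1]}
  )

  result.scorer_stats["exact_match"].score_mean  # => 0.75 (mean of the raw scores)
  puts result.to_pretty                          # boxed summary built from Result#summary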
lib/braintrust/eval/runner.rb ADDED
@@ -0,0 +1,241 @@
+# frozen_string_literal: true
+
+require_relative "case"
+require_relative "cases"
+require_relative "scorer"
+require_relative "result"
+require_relative "summary"
+require_relative "../internal/thread_pool"
+
+require "opentelemetry/sdk"
+require "json"
+
+module Braintrust
+  module Eval
+    # Internal runner class that performs the execution of the Eval and returns the result
+    class Runner
+      # Maximum parallelism allowed (mirrors Internal::ThreadPool::MAX_PARALLELISM)
+      MAX_PARALLELISM = Internal::ThreadPool::MAX_PARALLELISM
+
+      def initialize(experiment_id:, experiment_name:, project_id:, project_name:,
+                     task:, scorers:, state:, tracer_provider: nil)
+        @experiment_id = experiment_id
+        @experiment_name = experiment_name
+        @project_id = project_id
+        @project_name = project_name
+        @task = task
+        @scorers = normalize_scorers(scorers)
+        @state = state
+        @tracer_provider = tracer_provider || OpenTelemetry.tracer_provider
+        @tracer = @tracer_provider.tracer("braintrust-eval")
+        @parent_attr = "experiment_id:#{experiment_id}"
+
+        # Mutex for thread-safe score collection
+        @score_mutex = Mutex.new
+      end
+
+      # Run evaluation and return Result
+      # @param cases [Array, Enumerable] Test cases
+      # @param parallelism [Integer] Number of parallel workers (default: 1)
+      # @return [Result]
+      def run(cases, parallelism: 1)
+        start_time = Time.now
+        normalized_cases = normalize_cases(cases)
+        errors = Queue.new
+        @scores = {} # Reset for each run: { scorer_name => Array<Numeric> }
+
+        if parallelism && parallelism > 1
+          Internal::ThreadPool.each(normalized_cases, parallelism: parallelism) do |test_case|
+            run_case(test_case, errors)
+          end
+        else
+          normalized_cases.each do |test_case|
+            run_case(test_case, errors)
+          end
+        end
+
+        # Convert Queue to Array after all threads complete
+        error_array = [].tap { |a| a << errors.pop until errors.empty? }
+
+        # Calculate duration
+        duration = Time.now - start_time
+
+        # Generate permalink
+        permalink = "#{state.app_url}/app/#{state.org_name}/object?object_type=experiment&object_id=#{experiment_id}"
+
+        Result.new(
+          experiment_id: experiment_id,
+          experiment_name: experiment_name,
+          project_id: project_id,
+          project_name: project_name,
+          permalink: permalink,
+          errors: error_array,
+          duration: duration,
+          scores: @scores
+        )
+      end
+
+      private
+
+      attr_reader :experiment_id, :experiment_name, :project_id, :project_name,
+        :task, :scorers, :state, :tracer, :parent_attr
+
+      # Run a single test case with OpenTelemetry tracing
+      # Creates eval span (parent) with task and score as children
+      # @param test_case [Case] The test case
+      # @param errors [Queue] Thread-safe error collection queue
+      def run_case(test_case, errors)
+        tracer.in_span("eval") do |eval_span|
+          eval_span.set_attribute("braintrust.parent", parent_attr)
+
+          # Set tags early so they're present even if task fails
+          eval_span.set_attribute("braintrust.tags", test_case.tags) if test_case.tags
+
+          # Run task
+          output = nil
+          begin
+            output = run_task(test_case)
+          rescue => e
+            # Error already recorded on task span, set eval span status
+            eval_span.status = OpenTelemetry::Trace::Status.error(e.message)
+            errors << "Task failed for input '#{test_case.input}': #{e.message}"
+            next
+          end
+
+          # Run scorers
+          begin
+            run_scorers(test_case, output)
+          rescue => e
+            # Error already recorded on score span, set eval span status
+            eval_span.status = OpenTelemetry::Trace::Status.error(e.message)
+            errors << "Scorers failed for input '#{test_case.input}': #{e.message}"
+          end
+
+          # Set eval span attributes (after task and scorers complete)
+          set_json_attr(eval_span, "braintrust.span_attributes", {type: "eval"})
+          set_json_attr(eval_span, "braintrust.input_json", test_case.input)
+          set_json_attr(eval_span, "braintrust.output_json", output)
+          set_json_attr(eval_span, "braintrust.expected", test_case.expected) if test_case.expected
+        end
+      end
+
+      # Run task with OpenTelemetry tracing
+      # Creates task span with input and output
+      # @param test_case [Case] The test case
+      # @return [Object] Task output
+      def run_task(test_case)
+        tracer.in_span("task") do |task_span|
+          task_span.set_attribute("braintrust.parent", parent_attr)
+          set_json_attr(task_span, "braintrust.span_attributes", {type: "task"})
+          set_json_attr(task_span, "braintrust.input_json", test_case.input)
+
+          begin
+            output = task.call(test_case.input)
+            set_json_attr(task_span, "braintrust.output_json", output)
+            output
+          rescue => e
+            # Record exception event with stacktrace, then set error status
+            task_span.record_exception(e)
+            task_span.status = OpenTelemetry::Trace::Status.error(e.message)
+            raise
+          end
+        end
+      end
+
+      # Run scorers with OpenTelemetry tracing
+      # Creates single score span for all scorers
+      # @param test_case [Case] The test case
+      # @param output [Object] Task output
+      def run_scorers(test_case, output)
+        tracer.in_span("score") do |score_span|
+          score_span.set_attribute("braintrust.parent", parent_attr)
+          set_json_attr(score_span, "braintrust.span_attributes", {type: "score"})
+
+          scores = {}
+          scorer_error = nil
+          scorers.each do |scorer|
+            score_value = scorer.call(test_case.input, test_case.expected, output, test_case.metadata || {})
+            scores[scorer.name] = score_value
+
+            # Collect raw score for summary (thread-safe)
+            collect_score(scorer.name, score_value)
+          rescue => e
+            # Record first error but continue processing other scorers
+            scorer_error ||= e
+            record_span_error(score_span, e, "ScorerError")
+          end
+
+          # Always set scores attribute, even if some scorers failed
+          set_json_attr(score_span, "braintrust.scores", scores)
+
+          # Raise after setting scores so we can see which scorers succeeded
+          raise scorer_error if scorer_error
+        end
+      end
+
+      # Normalize cases input to Cases wrapper
+      # @param cases_input [Array, Enumerable, Cases] The cases input
+      # @return [Cases]
+      def normalize_cases(cases_input)
+        case cases_input
+        when Cases
+          cases_input
+        when Array, Enumerable
+          Cases.new(cases_input)
+        else
+          if cases_input.respond_to?(:each)
+            Cases.new(cases_input)
+          else
+            raise ArgumentError, "cases must be Array or Enumerable"
+          end
+        end
+      end
+
+      # Normalize scorers to Scorer objects
+      # @param scorers_input [Array] The scorers input (Scorer objects or callables)
+      # @return [Array<Scorer>]
+      def normalize_scorers(scorers_input)
+        scorers_input.map do |scorer|
+          case scorer
+          when Scorer
+            scorer
+          else
+            Scorer.new(scorer)
+          end
+        end
+      end
+
+      # Record error on span with exception event and error status
+      # @param span [OpenTelemetry::Trace::Span] The span to record error on
+      # @param error [Exception] The error that occurred
+      # @param error_type [String] The error type name (optional)
+      def record_span_error(span, error, error_type = nil)
+        if error_type
+          span.record_exception(error, attributes: {"exception.type" => error_type})
+        else
+          span.record_exception(error)
+        end
+        span.status = OpenTelemetry::Trace::Status.error(error.message)
+      end
+
+      # Set a span attribute by JSON encoding the value
+      # @param span [OpenTelemetry::Trace::Span] The span
+      # @param key [String] The attribute key
+      # @param value [Object] The value to JSON encode
+      def set_json_attr(span, key, value)
+        span.set_attribute(key, JSON.dump(value))
+      end
+
+      # Collect a single score value for summary calculation
+      # @param name [String] Scorer name
+      # @param value [Object] Score value (only Numeric values are collected)
+      def collect_score(name, value)
+        return unless value.is_a?(Numeric)
+
+        @score_mutex.synchronize do
+          (@scores[name] ||= []) << value
+        end
+      end
+    end
+  end
+end
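The part of Runner that has to hold up under parallelism is the shared score hash: collect_score keeps only Numeric values and serializes writes through a Mutex. A standalone illustration of that pattern (plain Ruby, not the SDK API):

  scores = {}
  mutex = Mutex.new

  threads = 4.times.map do |i|
    Thread.new do
      value = i.even? ? 1 : 0  # pretend scorer output; non-Numeric values would be skipped
      next unless value.is_a?(Numeric)
      mutex.synchronize { (scores["exact_match"] ||= []) << value }
    end
  end
  threads.each(&:join)

  scores["exact_match"].sum.to_f / scores["exact_match"].size  # => 0.5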
lib/braintrust/eval/summary.rb ADDED
@@ -0,0 +1,32 @@
+# frozen_string_literal: true
+
+module Braintrust
+  module Eval
+    # Aggregated statistics for a single scorer across test cases
+    # @attr name [String] Scorer name
+    # @attr score_mean [Float] Mean score (0.0 to 1.0)
+    ScorerStats = Struct.new(:name, :score_mean, keyword_init: true)
+
+    # Summary of results from an Experiment
+    # Typically used to generate experiment output
+    # @attr project_name [String] Project name
+    # @attr experiment_name [String] Experiment name
+    # @attr experiment_id [String] Experiment ID
+    # @attr experiment_url [String] URL to view experiment in Braintrust UI
+    # @attr scores [Hash<String, ScorerStats>] Scorer stats keyed by scorer name
+    # @attr duration [Float] Duration in seconds
+    # @attr error_count [Integer] Number of errors
+    # @attr errors [Array<String>] Error messages with locations
+    ExperimentSummary = Struct.new(
+      :project_name,
+      :experiment_name,
+      :experiment_id,
+      :experiment_url,
+      :scores,
+      :duration,
+      :error_count,
+      :errors,
+      keyword_init: true
+    )
+  end
+end
lib/braintrust/eval.rb CHANGED
@@ -1,10 +1,9 @@
 # frozen_string_literal: true
 
-require_relative "eval/case"
-require_relative "eval/cases"
 require_relative "eval/scorer"
-require_relative "eval/result"
+require_relative "eval/runner"
 require_relative "internal/experiments"
+
 require "opentelemetry/sdk"
 require "json"
 
@@ -193,7 +192,9 @@ module Braintrust
      # - Hash: {name:, id:, project:, version:, limit:}
      # @param task [#call] The task to evaluate (must be callable)
      # @param scorers [Array<Scorer, #call>] The scorers to use (Scorer objects or callables)
-      # @param parallelism [Integer] Number of parallel workers (default: 1)
+      # @param parallelism [Integer] Number of parallel workers (default: 1).
+      #   When parallelism > 1, test cases are executed concurrently using a thread pool.
+      #   The task and scorers MUST be thread-safe when using parallelism > 1.
      # @param tags [Array<String>] Optional experiment tags
      # @param metadata [Hash] Optional experiment metadata
      # @param update [Boolean] If true, allow reusing existing experiment (default: false)
@@ -232,18 +233,18 @@ module Braintrust
        project_id = result[:project_id]
        project_name = result[:project_name]
 
-        # Run the eval with resolved experiment info
-        result = run_internal(
+        # Instantiate Runner and run evaluation
+        runner = Runner.new(
          experiment_id: experiment_id,
          experiment_name: experiment,
          project_id: project_id,
          project_name: project_name,
-          cases: cases,
          task: task,
          scorers: scorers,
          state: state,
          tracer_provider: tracer_provider
        )
+        result = runner.run(cases, parallelism: parallelism)
 
        # Print result summary unless quiet
        print_result(result) unless quiet
@@ -253,66 +254,10 @@ module Braintrust
 
      private
 
-      # Internal eval runner that doesn't touch the API
-      # @param experiment_id [String] Resolved experiment ID
-      # @param experiment_name [String] Experiment name
-      # @param project_id [String] Resolved project ID
-      # @param project_name [String] Project name
-      # @param cases [Array, Enumerable, Cases] Test cases
-      # @param task [#call] Task callable
-      # @param scorers [Array] Scorers
-      # @param state [State] Braintrust state
-      # @param tracer_provider [TracerProvider, nil] OpenTelemetry tracer provider
-      # @return [Result]
-      def run_internal(experiment_id:, experiment_name:, project_id:, project_name:,
-                       cases:, task:, scorers:, state:, tracer_provider: nil)
-        start_time = Time.now
-
-        # Get tracer for creating spans
-        tracer_provider ||= OpenTelemetry.tracer_provider
-        tracer = tracer_provider.tracer("braintrust-eval")
-
-        # Parent attribute for all eval spans
-        parent_attr = "experiment_id:#{experiment_id}"
-
-        # Normalize cases to Cases wrapper
-        normalized_cases = normalize_cases(cases)
-
-        # Normalize scorers to Scorer objects
-        normalized_scorers = normalize_scorers(scorers)
-
-        # Collect errors
-        errors = []
-
-        # Run each case with tracing
-        normalized_cases.each do |test_case|
-          run_case(test_case, task, normalized_scorers, errors,
-                   tracer, parent_attr)
-        end
-
-        # Calculate duration
-        duration = Time.now - start_time
-
-        # Generate permalink: {app_url}/app/{org}/object?object_type=experiment&object_id={experiment_id}
-        permalink = "#{state.app_url}/app/#{state.org_name}/object?object_type=experiment&object_id=#{experiment_id}"
-
-        # Return result
-        Result.new(
-          experiment_id: experiment_id,
-          experiment_name: experiment_name,
-          project_id: project_id,
-          project_name: project_name,
-          permalink: permalink,
-          errors: errors,
-          duration: duration
-        )
-      end
-
      # Print result summary to stdout
      # @param result [Result] The evaluation result
      def print_result(result)
-        puts "=" * 60
-        puts result
+        puts result.to_pretty
      end
 
      # Validate required parameters
@@ -419,166 +364,6 @@ module Braintrust
          filtered
        end
      end
-
-      # Normalize cases input to Cases wrapper
-      # @param cases_input [Array, Enumerable, Cases] The cases input
-      # @return [Cases]
-      def normalize_cases(cases_input)
-        case cases_input
-        when Cases
-          cases_input
-        when Array, Enumerable
-          Cases.new(cases_input)
-        else
-          if cases_input.respond_to?(:each)
-            Cases.new(cases_input)
-          else
-            raise ArgumentError, "cases must be Array or Enumerable"
-          end
-        end
-      end
-
-      # Normalize scorers to Scorer objects
-      # @param scorers_input [Array] The scorers input (Scorer objects or callables)
-      # @return [Array<Scorer>]
-      def normalize_scorers(scorers_input)
-        scorers_input.map do |scorer|
-          case scorer
-          when Scorer
-            # Already a Scorer
-            scorer
-          else
-            # Wrap callable in Scorer (auto-detects name)
-            Scorer.new(scorer)
-          end
-        end
-      end
-
-      # Run a single test case with OpenTelemetry tracing
-      # Creates eval span (parent) with task and score as children
-      # @param test_case [Case] The test case
-      # @param task [#call] The task
-      # @param scorers [Array<Scorer>] The scorers
-      # @param errors [Array<String>] Error collection array
-      # @param tracer [Tracer] OpenTelemetry tracer
-      # @param parent_attr [String] Parent attribute (experiment_id:exp_id)
-      def run_case(test_case, task, scorers, errors, tracer, parent_attr)
-        # Create eval span (parent)
-        tracer.in_span("eval") do |eval_span|
-          eval_span.set_attribute("braintrust.parent", parent_attr)
-
-          # Set tags early so they're present even if task fails
-          eval_span.set_attribute("braintrust.tags", test_case.tags) if test_case.tags
-
-          # Run task
-          output = nil
-          begin
-            output = run_task(test_case, task, tracer, parent_attr)
-          rescue => e
-            # Error already recorded on task span, set eval span status
-            eval_span.status = OpenTelemetry::Trace::Status.error(e.message)
-            errors << "Task failed for input '#{test_case.input}': #{e.message}"
-            next
-          end
-
-          # Run scorers
-          begin
-            run_scorers(test_case, output, scorers, tracer, parent_attr)
-          rescue => e
-            # Error already recorded on score span, set eval span status
-            eval_span.status = OpenTelemetry::Trace::Status.error(e.message)
-            errors << "Scorers failed for input '#{test_case.input}': #{e.message}"
-          end
-
-          # Set eval span attributes (after task and scorers complete)
-          set_json_attr(eval_span, "braintrust.span_attributes", {type: "eval"})
-          set_json_attr(eval_span, "braintrust.input_json", test_case.input)
-          set_json_attr(eval_span, "braintrust.output_json", output)
-          set_json_attr(eval_span, "braintrust.expected", test_case.expected) if test_case.expected
-        end
-      end
-
-      # Run task with OpenTelemetry tracing
-      # Creates task span with input and output
-      # @param test_case [Case] The test case
-      # @param task [#call] The task
-      # @param tracer [Tracer] OpenTelemetry tracer
-      # @param parent_attr [String] Parent attribute
-      # @return [Object] Task output
-      def run_task(test_case, task, tracer, parent_attr)
-        tracer.in_span("task") do |task_span|
-          task_span.set_attribute("braintrust.parent", parent_attr)
-          set_json_attr(task_span, "braintrust.span_attributes", {type: "task"})
-          set_json_attr(task_span, "braintrust.input_json", test_case.input)
-
-          begin
-            output = task.call(test_case.input)
-            set_json_attr(task_span, "braintrust.output_json", output)
-            output
-          rescue => e
-            # Record exception event with stacktrace, then set error status
-            task_span.record_exception(e)
-            task_span.status = OpenTelemetry::Trace::Status.error(e.message)
-            raise
-          end
-        end
-      end
-
-      # Run scorers with OpenTelemetry tracing
-      # Creates single score span for all scorers
-      # @param test_case [Case] The test case
-      # @param output [Object] Task output
-      # @param scorers [Array<Scorer>] The scorers
-      # @param tracer [Tracer] OpenTelemetry tracer
-      # @param parent_attr [String] Parent attribute
-      def run_scorers(test_case, output, scorers, tracer, parent_attr)
-        tracer.in_span("score") do |score_span|
-          score_span.set_attribute("braintrust.parent", parent_attr)
-          set_json_attr(score_span, "braintrust.span_attributes", {type: "score"})
-
-          scores = {}
-          scorer_error = nil
-          scorers.each do |scorer|
-            score_value = scorer.call(test_case.input, test_case.expected, output, test_case.metadata || {})
-            scores[scorer.name] = score_value
-          rescue => e
-            # Record first error but continue processing other scorers
-            scorer_error ||= "Scorer '#{scorer.name}' failed: #{e.message}"
-            record_span_error(score_span, e, "ScorerError")
-          end
-
-          # Always set scores attribute, even if some scorers failed
-          set_json_attr(score_span, "braintrust.scores", scores)
-
-          # Raise after setting scores so we can see which scorers succeeded
-          raise scorer_error if scorer_error
-        end
-      end
-
-      # Record error on span with exception event and error status
-      # @param span [OpenTelemetry::Trace::Span] The span to record error on
-      # @param error [Exception] The error that occurred
-      # @param error_type [String] The error type name (optional, used for custom error classification)
-      def record_span_error(span, error, error_type = nil)
-        # Record exception with stacktrace (OpenTelemetry standard)
-        if error_type
-          # For custom error types, add type override
-          span.record_exception(error, attributes: {"exception.type" => error_type})
-        else
-          span.record_exception(error)
-        end
-
-        # Set span status to error
-        span.status = OpenTelemetry::Trace::Status.error(error.message)
-      end
-
-      # Set a span attribute by JSON encoding the value
-      # @param span [OpenTelemetry::Trace::Span] The span
-      # @param key [String] The attribute key
-      # @param value [Object] The value to JSON encode
-      def set_json_attr(span, key, value)
-        span.set_attribute(key, JSON.dump(value))
-      end
    end
  end
 end
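The expanded parallelism doc above is the behavioral caveat of this release: with parallelism > 1 the same task and scorer objects are invoked from several threads at once. A generic sketch of the difference (plain Ruby, not the SDK API):

  # NOT thread-safe: appends to a shared array from worker threads without a lock.
  history = []
  unsafe_task = ->(input) { history << input; input.to_s.upcase }

  # Thread-safe: guard shared state with a Mutex (or avoid shared state entirely).
  lock = Mutex.new
  safe_history = []
  safe_task = lambda do |input|
    lock.synchronize { safe_history << input }
    input.to_s.upcase
  end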
lib/braintrust/internal/thread_pool.rb ADDED
@@ -0,0 +1,167 @@
+# frozen_string_literal: true
+
+module Braintrust
+  module Internal
+    # Reusable thread pool for concurrent task execution.
+    # Uses the strategy pattern to define result handling behavior.
+    #
+    # @example Iterate without collecting results (Eval use case)
+    #   ThreadPool.each(items, parallelism: 4) do |item|
+    #     process(item)
+    #   end
+    #
+    # @example Collect results in order
+    #   results = ThreadPool.collect(items, parallelism: 4) do |item|
+    #     transform(item)
+    #   end
+    #
+    # @note Thread limits are per-call, not global. If your application calls
+    #   ThreadPool methods from multiple threads concurrently (e.g., web workers,
+    #   background jobs), each call spawns its own worker threads. Plan your
+    #   parallelism settings accordingly to avoid excessive thread creation.
+    #
+    class ThreadPool
+      DEFAULT_PARALLELISM = 3
+      MAX_PARALLELISM = 50
+
+      # Strategy for iteration without collecting results
+      class Each
+        def prepare(items)
+          @queue = Queue.new
+          items.each { |item| @queue << item }
+        end
+
+        def enqueue_sentinel(count)
+          count.times { @queue << :done }
+        end
+
+        def work_loop(&block)
+          loop do
+            item = @queue.pop
+            break if item == :done
+            block.call(item)
+          end
+        end
+
+        def result
+          nil
+        end
+
+        def empty_result
+          nil
+        end
+
+        def sequential_run(items, &block)
+          items.each(&block)
+          nil
+        end
+      end
+
+      # Strategy for collecting results in input order
+      class Collect
+        def prepare(items)
+          @results = Array.new(items.size)
+          @queue = Queue.new
+          items.each_with_index { |item, idx| @queue << [item, idx] }
+        end
+
+        def enqueue_sentinel(count)
+          count.times { @queue << :done }
+        end
+
+        def work_loop(&block)
+          loop do
+            work = @queue.pop
+            break if work == :done
+            item, idx = work
+            @results[idx] = block.call(item)
+          end
+        end
+
+        def result
+          @results
+        end
+
+        def empty_result
+          []
+        end
+
+        def sequential_run(items, &block)
+          items.map(&block)
+        end
+      end
+
+      STRATEGIES = {
+        each: Each,
+        collect: Collect
+      }.freeze
+
+      # Execute block for each item concurrently, discarding results.
+      # @param items [Array, Enumerable] Items to process
+      # @param parallelism [Integer] Number of worker threads (default: 3)
+      # @yield [item] Block to execute for each item
+      # @return [nil]
+      def self.each(items, parallelism: DEFAULT_PARALLELISM, &block)
+        run(items, parallelism: parallelism, strategy: :each, &block)
+      end
+
+      # Execute block for each item concurrently, collecting results in order.
+      # @param items [Array, Enumerable] Items to process
+      # @param parallelism [Integer] Number of worker threads (default: 3)
+      # @yield [item] Block to execute for each item
+      # @return [Array] Results in same order as input items
+      def self.collect(items, parallelism: DEFAULT_PARALLELISM, &block)
+        run(items, parallelism: parallelism, strategy: :collect, &block)
+      end
+
+      # Execute block for each item concurrently using the specified strategy.
+      # Prefer using .each or .collect convenience methods instead.
+      # @param items [Array, Enumerable] Items to process
+      # @param strategy [Symbol, #prepare] Strategy for result handling (required)
+      # @param parallelism [Integer] Number of worker threads (default: 3)
+      # @yield [item] Block to execute for each item
+      # @return [Object, nil] Strategy-dependent result
+      def self.run(items, strategy:, parallelism: DEFAULT_PARALLELISM, &block)
+        validate_parallelism!(parallelism)
+
+        executor = strategy_instance(strategy)
+        all_items = items.to_a
+
+        return executor.sequential_run(all_items, &block) if parallelism == 1
+        return executor.empty_result if all_items.empty?
+
+        executor.prepare(all_items)
+        executor.enqueue_sentinel(parallelism)
+
+        threads = parallelism.times.map do
+          Thread.new { executor.work_loop(&block) }
+        end
+
+        threads.each(&:join)
+        executor.result
+      end
+
+      def self.strategy_instance(strategy)
+        case strategy
+        when Symbol
+          STRATEGIES.fetch(strategy) {
+            raise ArgumentError, "Unknown strategy: #{strategy}. Valid: #{STRATEGIES.keys.join(", ")}"
+          }.new
+        else
+          strategy
+        end
+      end
+
+      def self.validate_parallelism!(parallelism)
+        unless parallelism.is_a?(Integer) && parallelism > 0
+          raise ArgumentError, "parallelism must be a positive integer"
+        end
+        if parallelism > MAX_PARALLELISM
+          raise ArgumentError, "parallelism cannot exceed #{MAX_PARALLELISM}"
+        end
+      end
+
+      private_class_method :strategy_instance, :validate_parallelism!
+    end
+  end
+end
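A quick usage sketch of the new ThreadPool, matching its own @example blocks (the sleep is only there to make workers finish out of order):

  require "braintrust/internal/thread_pool"

  items = [3, 1, 2]

  # collect returns results in input order even when workers finish out of order.
  doubled = Braintrust::Internal::ThreadPool.collect(items, parallelism: 3) do |n|
    sleep(n * 0.01)
    n * 2
  end
  doubled  # => [6, 2, 4]

  # parallelism: 1 short-circuits to a plain sequential each/map, with no threads spawned.
  Braintrust::Internal::ThreadPool.each(items, parallelism: 1) { |n| puts n }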
lib/braintrust/version.rb CHANGED
@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 
 module Braintrust
-  VERSION = "0.0.8"
+  VERSION = "0.0.9"
 end
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: braintrust
 version: !ruby/object:Gem::Version
-  version: 0.0.8
+  version: 0.0.9
 platform: ruby
 authors:
 - Braintrust
@@ -195,10 +195,14 @@ files:
 - lib/braintrust/eval.rb
 - lib/braintrust/eval/case.rb
 - lib/braintrust/eval/cases.rb
+- lib/braintrust/eval/formatter.rb
 - lib/braintrust/eval/functions.rb
 - lib/braintrust/eval/result.rb
+- lib/braintrust/eval/runner.rb
 - lib/braintrust/eval/scorer.rb
+- lib/braintrust/eval/summary.rb
 - lib/braintrust/internal/experiments.rb
+- lib/braintrust/internal/thread_pool.rb
 - lib/braintrust/logger.rb
 - lib/braintrust/state.rb
 - lib/braintrust/trace.rb