RubyGems - braintrust - Versions diffs - 0.0.7 → 0.0.9 - Mend

braintrust 0.0.7 → 0.0.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12) hide show

checksums.yaml +4 -4
data/lib/braintrust/eval/formatter.rb +220 -0
data/lib/braintrust/eval/result.rb +57 -3
data/lib/braintrust/eval/runner.rb +241 -0
data/lib/braintrust/eval/summary.rb +32 -0
data/lib/braintrust/eval.rb +9 -224
data/lib/braintrust/internal/thread_pool.rb +167 -0
data/lib/braintrust/trace/contrib/github.com/alexrudall/ruby-openai/ruby-openai.rb +304 -62
data/lib/braintrust/trace/contrib/openai.rb +38 -8
data/lib/braintrust/trace/tokens.rb +10 -2
data/lib/braintrust/version.rb +1 -1
metadata +5 -1

data/lib/braintrust/eval.rb CHANGED Viewed

@@ -1,10 +1,9 @@
 # frozen_string_literal: true
-require_relative "eval/case"
-require_relative "eval/cases"
 require_relative "eval/scorer"
-require_relative "eval/result"
+require_relative "eval/runner"
 require_relative "internal/experiments"
 require "opentelemetry/sdk"
 require "json"
@@ -193,7 +192,9 @@ module Braintrust
       #   - Hash: {name:, id:, project:, version:, limit:}
       # @param task [#call] The task to evaluate (must be callable)
       # @param scorers [Array<Scorer, #call>] The scorers to use (Scorer objects or callables)
-      # @param parallelism [Integer] Number of parallel workers (default: 1)
+      # @param parallelism [Integer] Number of parallel workers (default: 1).
+      #   When parallelism > 1, test cases are executed concurrently using a thread pool.
+      #   The task and scorers MUST be thread-safe when using parallelism > 1.
       # @param tags [Array<String>] Optional experiment tags
       # @param metadata [Hash] Optional experiment metadata
       # @param update [Boolean] If true, allow reusing existing experiment (default: false)
@@ -232,18 +233,18 @@ module Braintrust
         project_id = result[:project_id]
         project_name = result[:project_name]
-        # Run the eval with resolved experiment info
-        result = run_internal(
+        # Instantiate Runner and run evaluation
+        runner = Runner.new(
           experiment_id: experiment_id,
           experiment_name: experiment,
           project_id: project_id,
           project_name: project_name,
-          cases: cases,
           task: task,
           scorers: scorers,
           state: state,
           tracer_provider: tracer_provider
         )
+        result = runner.run(cases, parallelism: parallelism)
         # Print result summary unless quiet
         print_result(result) unless quiet
@@ -253,66 +254,10 @@ module Braintrust
       private
-      # Internal eval runner that doesn't touch the API
-      # @param experiment_id [String] Resolved experiment ID
-      # @param experiment_name [String] Experiment name
-      # @param project_id [String] Resolved project ID
-      # @param project_name [String] Project name
-      # @param cases [Array, Enumerable, Cases] Test cases
-      # @param task [#call] Task callable
-      # @param scorers [Array] Scorers
-      # @param state [State] Braintrust state
-      # @param tracer_provider [TracerProvider, nil] OpenTelemetry tracer provider
-      # @return [Result]
-      def run_internal(experiment_id:, experiment_name:, project_id:, project_name:,
-        cases:, task:, scorers:, state:, tracer_provider: nil)
-        start_time = Time.now
-        # Get tracer for creating spans
-        tracer_provider ||= OpenTelemetry.tracer_provider
-        tracer = tracer_provider.tracer("braintrust-eval")
-        # Parent attribute for all eval spans
-        parent_attr = "experiment_id:#{experiment_id}"
-        # Normalize cases to Cases wrapper
-        normalized_cases = normalize_cases(cases)
-        # Normalize scorers to Scorer objects
-        normalized_scorers = normalize_scorers(scorers)
-        # Collect errors
-        errors = []
-        # Run each case with tracing
-        normalized_cases.each do |test_case|
-          run_case(test_case, task, normalized_scorers, errors,
-            tracer, parent_attr)
-        end
-        # Calculate duration
-        duration = Time.now - start_time
-        # Generate permalink: {app_url}/app/{org}/object?object_type=experiment&object_id={experiment_id}
-        permalink = "#{state.app_url}/app/#{state.org_name}/object?object_type=experiment&object_id=#{experiment_id}"
-        # Return result
-        Result.new(
-          experiment_id: experiment_id,
-          experiment_name: experiment_name,
-          project_id: project_id,
-          project_name: project_name,
-          permalink: permalink,
-          errors: errors,
-          duration: duration
-        )
-      end
       # Print result summary to stdout
       # @param result [Result] The evaluation result
       def print_result(result)
-        puts "=" * 60
-        puts result
+        puts result.to_pretty
       end
       # Validate required parameters
@@ -419,166 +364,6 @@ module Braintrust
           filtered
         end
       end
-      # Normalize cases input to Cases wrapper
-      # @param cases_input [Array, Enumerable, Cases] The cases input
-      # @return [Cases]
-      def normalize_cases(cases_input)
-        case cases_input
-        when Cases
-          cases_input
-        when Array, Enumerable
-          Cases.new(cases_input)
-        else
-          if cases_input.respond_to?(:each)
-            Cases.new(cases_input)
-          else
-            raise ArgumentError, "cases must be Array or Enumerable"
-          end
-        end
-      end
-      # Normalize scorers to Scorer objects
-      # @param scorers_input [Array] The scorers input (Scorer objects or callables)
-      # @return [Array<Scorer>]
-      def normalize_scorers(scorers_input)
-        scorers_input.map do |scorer|
-          case scorer
-          when Scorer
-            # Already a Scorer
-            scorer
-          else
-            # Wrap callable in Scorer (auto-detects name)
-            Scorer.new(scorer)
-          end
-        end
-      end
-      # Run a single test case with OpenTelemetry tracing
-      # Creates eval span (parent) with task and score as children
-      # @param test_case [Case] The test case
-      # @param task [#call] The task
-      # @param scorers [Array<Scorer>] The scorers
-      # @param errors [Array<String>] Error collection array
-      # @param tracer [Tracer] OpenTelemetry tracer
-      # @param parent_attr [String] Parent attribute (experiment_id:exp_id)
-      def run_case(test_case, task, scorers, errors, tracer, parent_attr)
-        # Create eval span (parent)
-        tracer.in_span("eval") do |eval_span|
-          eval_span.set_attribute("braintrust.parent", parent_attr)
-          # Set tags early so they're present even if task fails
-          eval_span.set_attribute("braintrust.tags", test_case.tags) if test_case.tags
-          # Run task
-          output = nil
-          begin
-            output = run_task(test_case, task, tracer, parent_attr)
-          rescue => e
-            # Error already recorded on task span, set eval span status
-            eval_span.status = OpenTelemetry::Trace::Status.error(e.message)
-            errors << "Task failed for input '#{test_case.input}': #{e.message}"
-            next
-          end
-          # Run scorers
-          begin
-            run_scorers(test_case, output, scorers, tracer, parent_attr)
-          rescue => e
-            # Error already recorded on score span, set eval span status
-            eval_span.status = OpenTelemetry::Trace::Status.error(e.message)
-            errors << "Scorers failed for input '#{test_case.input}': #{e.message}"
-          end
-          # Set eval span attributes (after task and scorers complete)
-          set_json_attr(eval_span, "braintrust.span_attributes", {type: "eval"})
-          set_json_attr(eval_span, "braintrust.input_json", test_case.input)
-          set_json_attr(eval_span, "braintrust.output_json", output)
-          set_json_attr(eval_span, "braintrust.expected", test_case.expected) if test_case.expected
-        end
-      end
-      # Run task with OpenTelemetry tracing
-      # Creates task span with input and output
-      # @param test_case [Case] The test case
-      # @param task [#call] The task
-      # @param tracer [Tracer] OpenTelemetry tracer
-      # @param parent_attr [String] Parent attribute
-      # @return [Object] Task output
-      def run_task(test_case, task, tracer, parent_attr)
-        tracer.in_span("task") do |task_span|
-          task_span.set_attribute("braintrust.parent", parent_attr)
-          set_json_attr(task_span, "braintrust.span_attributes", {type: "task"})
-          set_json_attr(task_span, "braintrust.input_json", test_case.input)
-          begin
-            output = task.call(test_case.input)
-            set_json_attr(task_span, "braintrust.output_json", output)
-            output
-          rescue => e
-            # Record exception event with stacktrace, then set error status
-            task_span.record_exception(e)
-            task_span.status = OpenTelemetry::Trace::Status.error(e.message)
-            raise
-          end
-        end
-      end
-      # Run scorers with OpenTelemetry tracing
-      # Creates single score span for all scorers
-      # @param test_case [Case] The test case
-      # @param output [Object] Task output
-      # @param scorers [Array<Scorer>] The scorers
-      # @param tracer [Tracer] OpenTelemetry tracer
-      # @param parent_attr [String] Parent attribute
-      def run_scorers(test_case, output, scorers, tracer, parent_attr)
-        tracer.in_span("score") do |score_span|
-          score_span.set_attribute("braintrust.parent", parent_attr)
-          set_json_attr(score_span, "braintrust.span_attributes", {type: "score"})
-          scores = {}
-          scorer_error = nil
-          scorers.each do |scorer|
-            score_value = scorer.call(test_case.input, test_case.expected, output, test_case.metadata || {})
-            scores[scorer.name] = score_value
-          rescue => e
-            # Record first error but continue processing other scorers
-            scorer_error ||= "Scorer '#{scorer.name}' failed: #{e.message}"
-            record_span_error(score_span, e, "ScorerError")
-          end
-          # Always set scores attribute, even if some scorers failed
-          set_json_attr(score_span, "braintrust.scores", scores)
-          # Raise after setting scores so we can see which scorers succeeded
-          raise scorer_error if scorer_error
-        end
-      end
-      # Record error on span with exception event and error status
-      # @param span [OpenTelemetry::Trace::Span] The span to record error on
-      # @param error [Exception] The error that occurred
-      # @param error_type [String] The error type name (optional, used for custom error classification)
-      def record_span_error(span, error, error_type = nil)
-        # Record exception with stacktrace (OpenTelemetry standard)
-        if error_type
-          # For custom error types, add type override
-          span.record_exception(error, attributes: {"exception.type" => error_type})
-        else
-          span.record_exception(error)
-        end
-        # Set span status to error
-        span.status = OpenTelemetry::Trace::Status.error(error.message)
-      end
-      # Set a span attribute by JSON encoding the value
-      # @param span [OpenTelemetry::Trace::Span] The span
-      # @param key [String] The attribute key
-      # @param value [Object] The value to JSON encode
-      def set_json_attr(span, key, value)
-        span.set_attribute(key, JSON.dump(value))
-      end
     end
   end
 end

data/lib/braintrust/internal/thread_pool.rb ADDED Viewed

@@ -0,0 +1,167 @@
+# frozen_string_literal: true
module Braintrust
  module Internal
    # Reusable thread pool for concurrent task execution.
    # Uses the strategy pattern to define result handling behavior.
    #
    # @example Iterate without collecting results (Eval use case)
    #   ThreadPool.each(items, parallelism: 4) do |item|
    #     process(item)
    #   end
    #
    # @example Collect results in order
    #   results = ThreadPool.collect(items, parallelism: 4) do |item|
    #     transform(item)
    #   end
    #
    # @note Thread limits are per-call, not global. If your application calls
    #   ThreadPool methods from multiple threads concurrently (e.g., web workers,
    #   background jobs), each call spawns its own worker threads. Plan your
    #   parallelism settings accordingly to avoid excessive thread creation.
    #
    # @note If the block raises a StandardError inside a worker, that worker
    #   stops while the other workers keep draining the queue. Once every worker
    #   has finished, the error of the first failed worker (in creation order)
    #   is re-raised to the caller.
    #
    class ThreadPool
      DEFAULT_PARALLELISM = 3
      MAX_PARALLELISM = 50

      # End-of-queue marker pushed once per worker. It is a unique object
      # compared by identity, so no caller-supplied item (not even the symbol
      # :done) can be mistaken for it.
      STOP = Object.new.freeze
      private_constant :STOP

      # Strategy for iteration without collecting results
      class Each
        # Load every item into a fresh work queue.
        # @param items [Array] Items to process
        def prepare(items)
          @queue = Queue.new
          items.each { |item| @queue << item }
        end

        # Push one stop marker per worker so each worker terminates.
        # @param count [Integer] Number of workers
        def enqueue_sentinel(count)
          count.times { @queue << STOP }
        end

        # Pop and process items until a stop marker is seen.
        # A plain `until` is used instead of Kernel#loop because `loop`
        # rescues StopIteration, which would silently end the worker if the
        # block raised it.
        # @yield [item] Block to execute for each item
        def work_loop(&block)
          until STOP.equal?(item = @queue.pop)
            block.call(item)
          end
        end

        # @return [nil] This strategy discards block results
        def result
          nil
        end

        # @return [nil] Result used when there are no items
        def empty_result
          nil
        end

        # Run the block for each item on the calling thread.
        # @param items [Array] Items to process
        # @return [nil]
        def sequential_run(items, &block)
          items.each(&block)
          nil
        end
      end

      # Strategy for collecting results in input order
      class Collect
        # Pre-size the result array and queue each item with its index.
        # @param items [Array] Items to process
        def prepare(items)
          @results = Array.new(items.size)
          @queue = Queue.new
          items.each_with_index { |item, index| @queue << [item, index] }
        end

        # Push one stop marker per worker so each worker terminates.
        # @param count [Integer] Number of workers
        def enqueue_sentinel(count)
          count.times { @queue << STOP }
        end

        # Pop [item, index] pairs until a stop marker is seen, storing each
        # block result at the item's original index.
        # @yield [item] Block to execute for each item
        def work_loop(&block)
          until STOP.equal?(work = @queue.pop)
            item, index = work
            @results[index] = block.call(item)
          end
        end

        # @return [Array] Block results, indexed like the input items
        def result
          @results
        end

        # @return [Array] Empty array, used when there are no items
        def empty_result
          []
        end

        # Map the block over the items on the calling thread.
        # @param items [Array] Items to process
        # @return [Array] Block results in input order
        def sequential_run(items, &block)
          items.map(&block)
        end
      end

      STRATEGIES = {
        each: Each,
        collect: Collect
      }.freeze

      # Execute block for each item concurrently, discarding results.
      # @param items [Array, Enumerable] Items to process
      # @param parallelism [Integer] Number of worker threads (default: 3)
      # @yield [item] Block to execute for each item
      # @return [nil]
      # @raise [ArgumentError] If parallelism is not an Integer in 1..MAX_PARALLELISM
      def self.each(items, parallelism: DEFAULT_PARALLELISM, &block)
        run(items, parallelism: parallelism, strategy: :each, &block)
      end

      # Execute block for each item concurrently, collecting results in order.
      # @param items [Array, Enumerable] Items to process
      # @param parallelism [Integer] Number of worker threads (default: 3)
      # @yield [item] Block to execute for each item
      # @return [Array] Results in same order as input items
      # @raise [ArgumentError] If parallelism is not an Integer in 1..MAX_PARALLELISM
      def self.collect(items, parallelism: DEFAULT_PARALLELISM, &block)
        run(items, parallelism: parallelism, strategy: :collect, &block)
      end

      # Execute block for each item concurrently using the specified strategy.
      # Prefer using .each or .collect convenience methods instead.
      # @param items [Array, Enumerable] Items to process
      # @param strategy [Symbol, #prepare] Strategy for result handling (required)
      # @param parallelism [Integer] Upper bound on worker threads (default: 3);
      #   no more workers are started than there are items
      # @yield [item] Block to execute for each item
      # @return [Object, nil] Strategy-dependent result
      # @raise [ArgumentError] If parallelism is invalid or the strategy symbol is unknown
      def self.run(items, strategy:, parallelism: DEFAULT_PARALLELISM, &block)
        validate_parallelism!(parallelism)
        executor = strategy_instance(strategy)
        all_items = items.to_a

        # parallelism == 1 runs on the calling thread; no workers are spawned.
        return executor.sequential_run(all_items, &block) if parallelism == 1
        return executor.empty_result if all_items.empty?

        # Extra workers beyond the item count would only pop a stop marker.
        worker_count = [parallelism, all_items.size].min
        executor.prepare(all_items)
        executor.enqueue_sentinel(worker_count)

        workers = Array.new(worker_count) do
          Thread.new { executor.work_loop(&block) }
        end
        join_all(workers)
        executor.result
      end

      # Resolve a strategy symbol to a new strategy instance; any non-Symbol
      # value is returned unchanged and used as the strategy object.
      # @param strategy [Symbol, Object]
      # @return [Object] Strategy instance
      # @raise [ArgumentError] If the symbol is not a key of STRATEGIES
      def self.strategy_instance(strategy)
        case strategy
        when Symbol
          STRATEGIES.fetch(strategy) {
            raise ArgumentError, "Unknown strategy: #{strategy}. Valid: #{STRATEGIES.keys.join(", ")}"
          }.new
        else
          strategy
        end
      end

      # Ensure parallelism is an Integer between 1 and MAX_PARALLELISM.
      # @param parallelism [Object] Value to validate
      # @raise [ArgumentError] If the value is not a positive Integer or exceeds the maximum
      def self.validate_parallelism!(parallelism)
        unless parallelism.is_a?(Integer) && parallelism > 0
          raise ArgumentError, "parallelism must be a positive integer"
        end
        if parallelism > MAX_PARALLELISM
          raise ArgumentError, "parallelism cannot exceed #{MAX_PARALLELISM}"
        end
      end

      # Wait for every worker, then re-raise the error of the first failed one.
      # Joining all threads before raising guarantees that no worker is still
      # running the caller's block after this method returns or raises.
      # @param threads [Array<Thread>] Worker threads to wait for
      # @raise [StandardError] The error raised by the first failed worker
      def self.join_all(threads)
        first_error = nil
        threads.each do |thread|
          begin
            thread.join
          rescue => e
            first_error ||= e
          end
        end
        raise first_error if first_error
      end

      private_class_method :strategy_instance, :validate_parallelism!, :join_all
    end
  end
end