RubyGems - braintrust - Versions diffs - 0.2.0 → 0.3.0 - Mend

braintrust 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (34) hide show

checksums.yaml +4 -4
data/README.md +148 -24
data/lib/braintrust/api/internal/btql.rb +124 -0
data/lib/braintrust/api/internal/experiments.rb +19 -0
data/lib/braintrust/api/internal/projects.rb +19 -0
data/lib/braintrust/contrib/rails/server/application_controller.rb +34 -0
data/lib/braintrust/contrib/rails/server/engine.rb +72 -0
data/lib/braintrust/contrib/rails/server/eval_controller.rb +36 -0
data/lib/braintrust/contrib/rails/server/generator.rb +43 -0
data/lib/braintrust/contrib/rails/server/health_controller.rb +15 -0
data/lib/braintrust/contrib/rails/server/list_controller.rb +16 -0
data/lib/braintrust/contrib/rails/server/routes.rb +8 -0
data/lib/braintrust/contrib/rails/server.rb +20 -0
data/lib/braintrust/dataset.rb +6 -3
data/lib/braintrust/eval/context.rb +131 -0
data/lib/braintrust/eval/evaluator.rb +11 -5
data/lib/braintrust/eval/functions.rb +10 -166
data/lib/braintrust/eval/runner.rb +165 -145
data/lib/braintrust/eval/scorer.rb +24 -96
data/lib/braintrust/eval/trace.rb +129 -0
data/lib/braintrust/eval.rb +60 -132
data/lib/braintrust/functions.rb +168 -0
data/lib/braintrust/internal/callable.rb +83 -0
data/lib/braintrust/logger.rb +9 -0
data/lib/braintrust/scorer.rb +173 -0
data/lib/braintrust/server/handlers/eval.rb +8 -168
data/lib/braintrust/server/handlers/list.rb +3 -41
data/lib/braintrust/server/rack.rb +2 -0
data/lib/braintrust/server/services/eval_service.rb +214 -0
data/lib/braintrust/server/services/list_service.rb +64 -0
data/lib/braintrust/task.rb +108 -0
data/lib/braintrust/trace/span_processor.rb +0 -5
data/lib/braintrust/version.rb +1 -1
metadata +18 -1

data/lib/braintrust/server/services/eval_service.rb ADDED Viewed

@@ -0,0 +1,214 @@
+# frozen_string_literal: true
+require "json"
+module Braintrust
+  module Server
+    module Services
+      # Framework-agnostic service for running evaluations and streaming SSE results.
+      # Must be long-lived (not per-request) to preserve the @state_cache across requests.
+      class Eval
+        def initialize(evaluators)
+          @evaluators = evaluators
+          @state_mutex = Mutex.new
+          @state_cache = {}
+        end
+        # Validates request body. Returns:
+        #   {error: String, status: Integer} on failure
+        #   {evaluator:, name:, cases:, dataset:, ...} on success
+        def validate(body)
+          name = body["name"]
+          return {error: "Missing required field: name", status: 400} unless name
+          evaluator = current_evaluators[name]
+          return {error: "Evaluator '#{name}' not found", status: 404} unless evaluator
+          data = body["data"]
+          return {error: "Missing required field: data", status: 400} unless data
+          data_sources = ["data", "dataset_name", "dataset_id"].count { |k| data.key?(k) }
+          return {error: "Exactly one data source required", status: 400} if data_sources != 1
+          cases, dataset = resolve_data_source(data)
+          {
+            evaluator: evaluator,
+            name: name,
+            cases: cases,
+            dataset: dataset,
+            experiment_name: body["experiment_name"],
+            remote_scorer_ids: resolve_remote_scorers(body["scores"]),
+            parent: resolve_parent(body["parent"]),
+            project_id: body["project_id"]
+          }
+        end
+        # Runs the validated eval and streams SSE events via the sse writer.
+        # +validated+ is the hash returned by #validate.
+        # +auth+ is the auth context hash (or nil/true for no-auth).
+        # +sse+ is an SSEWriter instance.
+        def stream(validated, auth:, sse:)
+          name = validated[:name]
+          evaluator = validated[:evaluator]
+          cases = validated[:cases]
+          dataset = validated[:dataset]
+          experiment_name = validated[:experiment_name]
+          remote_scorer_ids = validated[:remote_scorer_ids]
+          parent = validated[:parent]
+          project_id = validated[:project_id]
+          state = build_state(auth)
+          # Only pass project/experiment params when state is available
+          run_opts = {
+            on_progress: ->(progress_data) {
+              # Build remote eval protocol events from generic progress data.
+              # Runner provides: id, data/error, scores (optional), origin (optional).
+              # Protocol requires: id, object_type, origin, name, format, output_type, event, data.
+              base = {
+                "object_type" => "task",
+                "name" => name,
+                "format" => "code",
+                "output_type" => "completion"
+              }
+              base["id"] = progress_data["id"] if progress_data["id"]
+              base["origin"] = progress_data["origin"] if progress_data["origin"]
+              if progress_data.key?("error")
+                sse.event("progress", JSON.dump(base.merge("event" => "error", "data" => progress_data["error"])))
+              else
+                sse.event("progress", JSON.dump(base.merge("event" => "json_delta", "data" => JSON.dump(progress_data["data"]))))
+              end
+              # Signal per-cell completion so the UI exits "Streaming..." state
+              # and updates the progress bar immediately.
+              sse.event("progress", JSON.dump(base.merge("event" => "done", "data" => "")))
+            },
+            quiet: true
+          }
+          run_opts[:parent] = parent if parent
+          run_opts[:scorers] = remote_scorer_ids if remote_scorer_ids
+          run_opts[:dataset] = dataset if dataset
+          if state
+            run_opts[:state] = state
+            run_opts[:experiment] = experiment_name if experiment_name
+            run_opts[:project_id] = project_id if project_id
+          end
+          result = evaluator.run(cases, **run_opts)
+          # Flush buffered OTLP spans before sending completion events.
+          # The BatchSpanProcessor exports every ~5s; fast evals can finish
+          # before a single export fires, causing the UI to see no results.
+          Braintrust::Trace.flush_spans
+          # Build summary from result scores
+          averaged_scores = {}
+          result.scorer_stats.each do |scorer_name, stats|
+            averaged_scores[scorer_name] = stats.score_mean
+          end
+          sse.event("summary", JSON.dump({
+            "scores" => averaged_scores,
+            "experiment_name" => experiment_name,
+            "experiment_id" => result.experiment_id,
+            "project_id" => result.project_id
+          }))
+          sse.event("done", "")
+        end
+        # Build State from auth context hash.
+        # Returns nil when auth is not a Hash (e.g. NoAuth returns true).
+        # Uses an LRU-style cache (max 64 entries) keyed by [api_key, app_url, org_name].
+        def build_state(auth)
+          return nil unless auth.is_a?(Hash)
+          cache_key = [auth["api_key"], auth["app_url"], auth["org_name"]]
+          @state_mutex ||= Mutex.new
+          @state_cache ||= {}
+          @state_mutex.synchronize do
+            cached = @state_cache[cache_key]
+            return cached if cached
+            state = Braintrust::State.new(
+              api_key: auth["api_key"],
+              org_id: auth["org_id"],
+              org_name: auth["org_name"],
+              app_url: auth["app_url"],
+              api_url: auth["api_url"],
+              enable_tracing: false
+            )
+            if @state_cache.size >= 64
+              oldest_key = @state_cache.keys.first
+              @state_cache.delete(oldest_key)
+            end
+            @state_cache[cache_key] = state
+            state
+          end
+        end
+        private
+        def current_evaluators
+          return @evaluators.call if @evaluators.respond_to?(:call)
+          @evaluators
+        end
+        # Resolve data source from the data field.
+        # Returns [cases, dataset] where exactly one is non-nil.
+        def resolve_data_source(data)
+          if data.key?("data")
+            cases = data["data"].map do |d|
+              {input: d["input"], expected: d["expected"]}
+            end
+            [cases, nil]
+          elsif data.key?("dataset_id")
+            [nil, Braintrust::Dataset::ID.new(id: data["dataset_id"])]
+          elsif data.key?("dataset_name")
+            dataset_opts = {name: data["dataset_name"]}
+            dataset_opts[:project] = data["project_name"] if data["project_name"]
+            [nil, dataset_opts]
+          else
+            [nil, nil]
+          end
+        end
+        # Map request scores array to Scorer::ID structs.
+        # The UI sends function_id as a nested object: {"function_id": "uuid"}.
+        def resolve_remote_scorers(scores)
+          return nil if scores.nil? || scores.empty?
+          scores.map do |s|
+            func_id = s["function_id"]
+            func_id = func_id["function_id"] if func_id.is_a?(Hash)
+            Braintrust::Scorer::ID.new(
+              function_id: func_id,
+              version: s["version"]
+            )
+          end
+        end
+        # Map request parent to symbol-keyed Hash.
+        # Hardcode playground_id to match Java SDK behavior.
+        # Also extracts generation from propagated_event for span_attributes.
+        def resolve_parent(parent)
+          return nil unless parent.is_a?(Hash)
+          object_id = parent["object_id"]
+          return nil unless object_id
+          generation = parent.dig("propagated_event", "span_attributes", "generation")
+          result = {object_type: "playground_id", object_id: object_id}
+          result[:generation] = generation if generation
+          result
+        end
+      end
+    end
+  end
+end

data/lib/braintrust/server/services/list_service.rb ADDED Viewed

@@ -0,0 +1,64 @@
+# frozen_string_literal: true
+require "json"
+module Braintrust
+  module Server
+    module Services
+      # Framework-agnostic service for listing evaluators.
+      # Returns a plain Hash (not a Rack triplet) suitable for JSON.dump.
+      class List
+        def initialize(evaluators)
+          @evaluators = evaluators
+        end
+        def call
+          result = {}
+          current_evaluators.each do |name, evaluator|
+            scores = (evaluator.scorers || []).each_with_index.map do |scorer, i|
+              scorer_name = scorer.respond_to?(:name) ? scorer.name : "score_#{i}"
+              {"name" => scorer_name}
+            end
+            entry = {"scores" => scores}
+            params = serialize_parameters(evaluator.parameters)
+            entry["parameters"] = params if params
+            result[name] = entry
+          end
+          result
+        end
+        private
+        def current_evaluators
+          return @evaluators.call if @evaluators.respond_to?(:call)
+          @evaluators
+        end
+        # Convert user-defined parameters to the dev server protocol format.
+        # Wraps in a staticParameters container with "data" typed entries.
+        def serialize_parameters(parameters)
+          return nil unless parameters && !parameters.empty?
+          schema = {}
+          parameters.each do |name, spec|
+            spec = spec.transform_keys(&:to_s) if spec.is_a?(Hash)
+            if spec.is_a?(Hash)
+              schema[name.to_s] = {
+                "type" => "data",
+                "schema" => {"type" => spec["type"] || "string"},
+                "default" => spec["default"],
+                "description" => spec["description"]
+              }
+            end
+          end
+          {
+            "type" => "braintrust.staticParameters",
+            "schema" => schema,
+            "source" => nil
+          }
+        end
+      end
+    end
+  end
+end

data/lib/braintrust/task.rb ADDED Viewed

@@ -0,0 +1,108 @@
+# frozen_string_literal: true
+require_relative "internal/callable"
+module Braintrust
+  # Task wraps a callable that processes inputs.
+  #
+  # Use inline with a block (keyword args):
+  #   task = Task.new("my_task") { |input:| process(input) }
+  #
+  # Or include in a class and define #call with keyword args:
+  #   class MyTask
+  #     include Braintrust::Task
+  #
+  #     def call(input:)
+  #       process(input)
+  #     end
+  #   end
+  #
+  # Legacy callables with 1 positional param are auto-wrapped for
+  # backwards compatibility but emit a deprecation warning.
+  module Task
+    DEFAULT_NAME = "task"
+    # @param base [Class] the class including Task
+    def self.included(base)
+      base.include(Callable)
+    end
+    # Create a block-based task.
+    #
+    # @param name [String, nil] optional name (defaults to "task")
+    # @param block [Proc] the task implementation; declare only the keyword
+    #   args you need (e.g. +|input:|+). Extra kwargs passed by the caller
+    #   are filtered out automatically.
+    # @return [Task::Block]
+    # @raise [ArgumentError] if the block has unsupported arity
+    def self.new(name = nil, &block)
+      Block.new(name: name || DEFAULT_NAME, &block)
+    end
+    # Included into classes that +include Task+. Prepends KeywordFilter
+    # so #call receives only its declared kwargs, and provides a default #name.
+    module Callable
+      # @param base [Class] the class including Callable
+      def self.included(base)
+        base.prepend(Internal::Callable::KeywordFilter)
+      end
+      # Default name derived from the class name (e.g. MyTask -> "my_task").
+      # @return [String]
+      def name
+        klass = self.class.name&.split("::")&.last
+        return Task::DEFAULT_NAME unless klass
+        klass.gsub(/([a-z])([A-Z])/, '\1_\2').downcase
+      end
+    end
+    # Block-based task. Stores a Proc and delegates #call to it.
+    # Includes Task so it satisfies +Task ===+ checks (e.g. in Context::Factory).
+    # Exposes #call_parameters so KeywordFilter can introspect the block's
+    # declared kwargs rather than Block#call's **kwargs signature.
+    class Block
+      include Task
+      # @return [String]
+      attr_reader :name
+      # @param name [String] task name
+      # @param block [Proc] task implementation
+      def initialize(name: DEFAULT_NAME, &block)
+        @name = name
+        @block = wrap_block(block)
+      end
+      # @param kwargs [Hash] keyword arguments (filtered by KeywordFilter)
+      # @return [Object] result of the block
+      def call(**kwargs)
+        @block.call(**kwargs)
+      end
+      # Exposes the block's parameter list so KeywordFilter can filter
+      # kwargs to match the block's declared keywords.
+      # @return [Array<Array>] parameter list from Proc#parameters
+      def call_parameters
+        @block.parameters
+      end
+      private
+      # Legacy positional wrapping: arity 1/-1 gets :input extracted.
+      # Keyword and zero-arity blocks are stored raw; KeywordFilter handles filtering at call time.
+      # @param block [Proc]
+      # @return [Proc]
+      def wrap_block(block)
+        params = block.parameters
+        if Internal::Callable::KeywordFilter.has_any_keywords?(params) || block.arity == 0
+          block
+        elsif block.arity == 1 || block.arity == -1
+          Log.warn_once(:task_positional, "Task with positional param (input) is deprecated. Use keyword args: ->(input:) { ... } instead.")
+          ->(**kw) { block.call(kw[:input]) }
+        else
+          raise ArgumentError, "Task must accept keyword args or 1 positional param (got arity #{block.arity})"
+        end
+      end
+    end
+  end
+end

data/lib/braintrust/trace/span_processor.rb CHANGED Viewed

@@ -80,11 +80,6 @@ module Braintrust
       # Determine if a span should be forwarded to the wrapped processor
       # based on configured filters
       def should_forward_span?(span)
-        # Always keep root spans (spans with no parent)
-        # Check if parent_span_id is the invalid/zero span ID
-        is_root = span.parent_span_id == OpenTelemetry::Trace::INVALID_SPAN_ID
-        return true if is_root
         # If no filters, keep everything
         return true if @filters.empty?

data/lib/braintrust/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module Braintrust
-  VERSION = "0.2.0"
+  VERSION = "0.3.0"
 end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: braintrust
 version: !ruby/object:Gem::Version
-  version: 0.2.0
+  version: 0.3.0
 platform: ruby
 authors:
 - Braintrust
@@ -193,6 +193,7 @@ files:
 - lib/braintrust/api/datasets.rb
 - lib/braintrust/api/functions.rb
 - lib/braintrust/api/internal/auth.rb
+- lib/braintrust/api/internal/btql.rb
 - lib/braintrust/api/internal/experiments.rb
 - lib/braintrust/api/internal/projects.rb
 - lib/braintrust/config.rb
@@ -214,6 +215,14 @@ files:
 - lib/braintrust/contrib/openai/patcher.rb
 - lib/braintrust/contrib/patcher.rb
 - lib/braintrust/contrib/rails/railtie.rb
+- lib/braintrust/contrib/rails/server.rb
+- lib/braintrust/contrib/rails/server/application_controller.rb
+- lib/braintrust/contrib/rails/server/engine.rb
+- lib/braintrust/contrib/rails/server/eval_controller.rb
+- lib/braintrust/contrib/rails/server/generator.rb
+- lib/braintrust/contrib/rails/server/health_controller.rb
+- lib/braintrust/contrib/rails/server/list_controller.rb
+- lib/braintrust/contrib/rails/server/routes.rb
 - lib/braintrust/contrib/registry.rb
 - lib/braintrust/contrib/ruby_llm/deprecated.rb
 - lib/braintrust/contrib/ruby_llm/instrumentation/chat.rb
@@ -234,6 +243,7 @@ files:
 - lib/braintrust/eval.rb
 - lib/braintrust/eval/case.rb
 - lib/braintrust/eval/cases.rb
+- lib/braintrust/eval/context.rb
 - lib/braintrust/eval/evaluator.rb
 - lib/braintrust/eval/formatter.rb
 - lib/braintrust/eval/functions.rb
@@ -241,6 +251,9 @@ files:
 - lib/braintrust/eval/runner.rb
 - lib/braintrust/eval/scorer.rb
 - lib/braintrust/eval/summary.rb
+- lib/braintrust/eval/trace.rb
+- lib/braintrust/functions.rb
+- lib/braintrust/internal/callable.rb
 - lib/braintrust/internal/encoding.rb
 - lib/braintrust/internal/env.rb
 - lib/braintrust/internal/http.rb
@@ -250,6 +263,7 @@ files:
 - lib/braintrust/internal/time.rb
 - lib/braintrust/logger.rb
 - lib/braintrust/prompt.rb
+- lib/braintrust/scorer.rb
 - lib/braintrust/server.rb
 - lib/braintrust/server/auth/clerk_token.rb
 - lib/braintrust/server/auth/no_auth.rb
@@ -261,9 +275,12 @@ files:
 - lib/braintrust/server/rack.rb
 - lib/braintrust/server/rack/app.rb
 - lib/braintrust/server/router.rb
+- lib/braintrust/server/services/eval_service.rb
+- lib/braintrust/server/services/list_service.rb
 - lib/braintrust/server/sse.rb
 - lib/braintrust/setup.rb
 - lib/braintrust/state.rb
+- lib/braintrust/task.rb
 - lib/braintrust/trace.rb
 - lib/braintrust/trace/attachment.rb
 - lib/braintrust/trace/span_exporter.rb