RubyGems - braintrust - Versions diffs - 0.2.0 → 0.3.0 - Mend

braintrust 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (34)

checksums.yaml +4 -4
data/README.md +148 -24
data/lib/braintrust/api/internal/btql.rb +124 -0
data/lib/braintrust/api/internal/experiments.rb +19 -0
data/lib/braintrust/api/internal/projects.rb +19 -0
data/lib/braintrust/contrib/rails/server/application_controller.rb +34 -0
data/lib/braintrust/contrib/rails/server/engine.rb +72 -0
data/lib/braintrust/contrib/rails/server/eval_controller.rb +36 -0
data/lib/braintrust/contrib/rails/server/generator.rb +43 -0
data/lib/braintrust/contrib/rails/server/health_controller.rb +15 -0
data/lib/braintrust/contrib/rails/server/list_controller.rb +16 -0
data/lib/braintrust/contrib/rails/server/routes.rb +8 -0
data/lib/braintrust/contrib/rails/server.rb +20 -0
data/lib/braintrust/dataset.rb +6 -3
data/lib/braintrust/eval/context.rb +131 -0
data/lib/braintrust/eval/evaluator.rb +11 -5
data/lib/braintrust/eval/functions.rb +10 -166
data/lib/braintrust/eval/runner.rb +165 -145
data/lib/braintrust/eval/scorer.rb +24 -96
data/lib/braintrust/eval/trace.rb +129 -0
data/lib/braintrust/eval.rb +60 -132
data/lib/braintrust/functions.rb +168 -0
data/lib/braintrust/internal/callable.rb +83 -0
data/lib/braintrust/logger.rb +9 -0
data/lib/braintrust/scorer.rb +173 -0
data/lib/braintrust/server/handlers/eval.rb +8 -168
data/lib/braintrust/server/handlers/list.rb +3 -41
data/lib/braintrust/server/rack.rb +2 -0
data/lib/braintrust/server/services/eval_service.rb +214 -0
data/lib/braintrust/server/services/list_service.rb +64 -0
data/lib/braintrust/task.rb +108 -0
data/lib/braintrust/trace/span_processor.rb +0 -5
data/lib/braintrust/version.rb +1 -1
metadata +18 -1

data/lib/braintrust/internal/callable.rb ADDED

@@ -0,0 +1,83 @@
+# frozen_string_literal: true
+module Braintrust
+  module Internal
+    module Callable
+      # Filters keyword arguments so callers can pass a superset of kwargs
+      # and the receiver only gets the ones it declared. This avoids Ruby 3.2+
+      # ArgumentError for unknown keywords without requiring ** on every definition.
+      #
+      # When prepended on a class, intercepts #call and slices kwargs to match
+      # the declared parameters before forwarding. Methods with **keyrest
+      # receive all kwargs unfiltered.
+      #
+      # @example
+      #   class Greeter
+      #     prepend Internal::Callable::KeywordFilter
+      #     def call(name:)
+      #       "hello #{name}"
+      #     end
+      #   end
+      #   Greeter.new.call(name: "world", extra: "ignored")  # => "hello world"
+      module KeywordFilter
+        # Filter kwargs to only the keyword params declared by the given parameters list.
+        # Returns kwargs unchanged if parameters include **keyrest.
+        #
+        # @param params [Array<Array>] parameter list from Proc#parameters or Method#parameters
+        # @param kwargs [Hash] keyword arguments to filter
+        # @return [Hash] filtered keyword arguments
+        def self.filter(params, kwargs)
+          return kwargs if has_keyword_splat?(params)
+          declared_keys = params
+            .select { |type, _| type == :keyreq || type == :key }
+            .map(&:last)
+          kwargs.slice(*declared_keys)
+        end
+        # Wrap a Proc to filter kwargs to only its declared keyword params.
+        # Returns the block unchanged if it accepts **keyrest.
+        #
+        # @param block [Proc] the block to wrap
+        # @return [Proc] a wrapper that filters kwargs, or the original block
+        def self.wrap_block(block)
+          return block if has_keyword_splat?(block.parameters)
+          ->(**kw) { block.call(**filter(block.parameters, kw)) }
+        end
+        # Whether params include ** (keyword splat / keyrest).
+        #
+        # @param params [Array<Array>] parameter list
+        # @return [Boolean]
+        def self.has_keyword_splat?(params)
+          params.any? { |type, _| type == :keyrest }
+        end
+        # Whether params include any keyword parameters (key, keyreq, or keyrest).
+        #
+        # @param params [Array<Array>] parameter list
+        # @return [Boolean]
+        def self.has_any_keywords?(params)
+          params.any? { |type, _| type == :keyreq || type == :key || type == :keyrest }
+        end
+        # When prepended, filters kwargs before the next #call in the ancestor chain.
+        # If the instance defines #call_parameters, uses those.
+        # Otherwise introspects super_method.
+        #
+        # @param kwargs [Hash] keyword arguments
+        # @return [Object] result of the filtered #call
+        def call(**kwargs)
+          params = if respond_to?(:call_parameters)
+            call_parameters
+          else
+            impl = method(:call).super_method
+            return super unless impl
+            impl.parameters
+          end
+          super(**KeywordFilter.filter(params, kwargs))
+        end
+      end
+    end
+  end
+end

data/lib/braintrust/logger.rb CHANGED

@@ -8,6 +8,7 @@ module Braintrust
     # Default to WARN unless BRAINTRUST_DEBUG is set
     level = ENV["BRAINTRUST_DEBUG"] ? Logger::DEBUG : Logger::WARN
     @logger = Logger.new($stderr, level: level)
+    @warned = Set.new
     class << self
       attr_accessor :logger
@@ -24,6 +25,14 @@ module Braintrust
         @logger.warn(message)
       end
+      # Emit a warning only once per unique key.
+      # Subsequent calls with the same key are silently ignored.
+      def warn_once(key, message)
+        return if @warned.include?(key)
+        @warned.add(key)
+        @logger.warn(message)
+      end
       def error(message)
         @logger.error(message)
       end

data/lib/braintrust/scorer.rb ADDED

@@ -0,0 +1,173 @@
+# frozen_string_literal: true
+require_relative "internal/callable"
+module Braintrust
+  # Scorer wraps a scoring function that evaluates task output against expected values.
+  #
+  # Use inline with a block (keyword args):
+  #   scorer = Scorer.new("my_scorer") { |expected:, output:| output == expected ? 1.0 : 0.0 }
+  #
+  # Or include in a class and define #call with keyword args:
+  #   class FuzzyMatch
+  #     include Braintrust::Scorer
+  #
+  #     def call(expected:, output:)
+  #       output == expected ? 1.0 : 0.0
+  #     end
+  #   end
+  #
+  # Legacy callables with 3 or 4 positional params are auto-wrapped for
+  # backwards compatibility but emit a deprecation warning.
+  module Scorer
+    DEFAULT_NAME = "scorer"
+    # @param base [Class] the class including Scorer
+    def self.included(base)
+      base.include(Callable)
+    end
+    # Create a block-based scorer.
+    #
+    # @param name [String, nil] optional name (defaults to "scorer")
+    # @param block [Proc] the scoring implementation; declare only the keyword
+    #   args you need. Extra kwargs are filtered out automatically.
+    #
+    #   Supported kwargs: +input:+, +expected:+, +output:+, +metadata:+, +trace:+
+    # @return [Scorer::Block]
+    # @raise [ArgumentError] if the block has unsupported arity
+    def self.new(name = nil, &block)
+      Block.new(name: name || DEFAULT_NAME, &block)
+    end
+    # Included into classes that +include Scorer+. Prepends KeywordFilter and
+    # ResultNormalizer so #call receives only declared kwargs and always returns
+    # Array<Hash>. Also provides a default #name and #call_parameters.
+    module Callable
+      # Normalizes the raw return value of #call into Array<Hash>.
+      # Nested inside Callable because it depends on #name which Callable provides.
+      module ResultNormalizer
+        # @return [Array<Hash>] normalized score hashes with :score, :metadata, :name keys
+        def call(**kwargs)
+          normalize_score_result(super)
+        end
+        private
+        # @param result [Numeric, Hash, Array<Hash>] raw return value from #call
+        # @return [Array<Hash>] one or more score hashes with :score, :metadata, :name keys
+        # @raise [ArgumentError] if any score value is not Numeric
+        def normalize_score_result(result)
+          case result
+          when Array then result.map { |item| normalize_score_item(item) }
+          when Hash then [normalize_score_item(result)]
+          else
+            raise ArgumentError, "#{name}: score must be Numeric, got #{result.inspect}" unless result.is_a?(Numeric)
+            [{score: result, metadata: nil, name: name}]
+          end
+        end
+        # Fills in missing :name from the scorer and validates :score.
+        # @param item [Hash] a score hash with at least a :score key
+        # @return [Hash] the same hash with :name set
+        # @raise [ArgumentError] if :score is not Numeric
+        def normalize_score_item(item)
+          item[:name] ||= name
+          raise ArgumentError, "#{item[:name]}: score must be Numeric, got #{item[:score].inspect}" unless item[:score].is_a?(Numeric)
+          item
+        end
+      end
+      # Infrastructure modules prepended onto every scorer class.
+      # Used both to set up the ancestor chain and to skip past them in
+      # #call_parameters so KeywordFilter sees the real call signature.
+      PREPENDED = [Internal::Callable::KeywordFilter, ResultNormalizer].freeze
+      # @param base [Class] the class including Callable
+      def self.included(base)
+        PREPENDED.each { |mod| base.prepend(mod) }
+      end
+      # Default name derived from the class name (e.g. FuzzyMatch -> "fuzzy_match").
+      # @return [String]
+      def name
+        klass = self.class.name&.split("::")&.last
+        return Scorer::DEFAULT_NAME unless klass
+        klass.gsub(/([a-z])([A-Z])/, '\1_\2').downcase
+      end
+      # Provides KeywordFilter with the actual call signature of the subclass.
+      # Walks past PREPENDED modules in the ancestor chain so that user-defined
+      # #call keyword params are correctly introspected.
+      # Block overrides this to point directly at @block.parameters.
+      # @return [Array<Array>] parameter list
+      def call_parameters
+        meth = method(:call)
+        meth = meth.super_method while meth.super_method && PREPENDED.include?(meth.owner)
+        meth.parameters
+      end
+    end
+    # Block-based scorer. Stores a Proc and delegates #call to it.
+    # Includes Scorer so it satisfies +Scorer ===+ checks (e.g. in Context::Factory).
+    # Exposes #call_parameters so KeywordFilter can introspect the block's
+    # declared kwargs rather than Block#call's **kwargs signature.
+    class Block
+      include Scorer
+      # @return [String]
+      attr_reader :name
+      # @param name [String] scorer name
+      # @param block [Proc] scoring implementation
+      def initialize(name: DEFAULT_NAME, &block)
+        @name = name
+        @block = wrap_block(block)
+      end
+      # @param kwargs [Hash] keyword arguments (filtered by KeywordFilter)
+      # @return [Array<Hash>] normalized score results
+      def call(**kwargs)
+        @block.call(**kwargs)
+      end
+      # Exposes the block's parameter list so KeywordFilter can filter
+      # kwargs to match the block's declared keywords.
+      # @return [Array<Array>] parameter list from Proc#parameters
+      def call_parameters
+        @block.parameters
+      end
+      private
+      # Legacy positional wrapping: arity 3/4/-4/-1 maps to (input, expected, output[, metadata]).
+      # Keyword and zero-arity blocks are stored raw; KeywordFilter handles filtering at call time.
+      # @param block [Proc]
+      # @return [Proc]
+      def wrap_block(block)
+        params = block.parameters
+        if Internal::Callable::KeywordFilter.has_any_keywords?(params) || block.arity == 0
+          block
+        else
+          case block.arity
+          when 3
+            Log.warn_once(:scorer_positional_3, "Scorer with positional params (input, expected, output) is deprecated. Use keyword args: |input:, expected:, output:| instead.")
+            ->(**kw) { block.call(kw[:input], kw[:expected], kw[:output]) }
+          when 4, -4, -1
+            Log.warn_once(:scorer_positional_4, "Scorer with positional params (input, expected, output, metadata) is deprecated. Use keyword args: |input:, expected:, output:, metadata:| instead.")
+            ->(**kw) { block.call(kw[:input], kw[:expected], kw[:output], kw[:metadata]) }
+          else
+            raise ArgumentError, "Scorer must accept keyword args or 3-4 positional params (got arity #{block.arity})"
+          end
+        end
+      end
+    end
+    # Value object wrapping a remote scorer function UUID.
+    # Used by Eval.run to distinguish remote scorers from local callables.
+    ID = Struct.new(:function_id, :version, keyword_init: true)
+  end
+  # @deprecated Use {Braintrust::Scorer::ID} instead.
+  ScorerId = Scorer::ID
+end

data/lib/braintrust/server/handlers/eval.rb CHANGED

@@ -10,38 +10,15 @@ module Braintrust
       class Eval
         def initialize(evaluators)
           @evaluators = evaluators
+          @service = Services::Eval.new(evaluators)
         end
         def call(env)
           body = parse_body(env)
           return error_response(400, "Invalid JSON body") unless body
-          name = body["name"]
-          return error_response(400, "Missing required field: name") unless name
-          evaluator = @evaluators[name]
-          return error_response(404, "Evaluator '#{name}' not found") unless evaluator
-          data = body["data"]
-          return error_response(400, "Missing required field: data") unless data
-          # Validate exactly one data source
-          data_sources = ["data", "dataset_name", "dataset_id"].count { |k| data.key?(k) }
-          return error_response(400, "Exactly one data source required") if data_sources != 1
-          experiment_name = body["experiment_name"]
-          # Resolve data source
-          cases, dataset = resolve_data_source(data)
-          # Resolve remote scorers from request
-          remote_scorer_ids = resolve_remote_scorers(body["scores"])
-          # Resolve parent span context
-          parent = resolve_parent(body["parent"])
-          # Build state from auth context (if present)
-          state = build_state(env)
+          result = @service.validate(body)
+          return error_response(result[:status], result[:error]) if result[:error]
           # The protocol-rack adapter (used by Falcon and any server built on
           # protocol-http) buffers `each`-based bodies through an Enumerable path.
@@ -50,64 +27,7 @@ module Braintrust
           body_class = env.key?("protocol.http.request") ? SSEStreamBody : SSEBody
           sse_body = body_class.new do |sse|
-            # Only pass project/experiment params when state is available
-            run_opts = {
-              on_progress: ->(progress_data) {
-                # Build remote eval protocol events from generic progress data.
-                # Runner provides: id, data/error, scores (optional), origin (optional).
-                # Protocol requires: id, object_type, origin, name, format, output_type, event, data.
-                base = {
-                  "object_type" => "task",
-                  "name" => name,
-                  "format" => "code",
-                  "output_type" => "completion"
-                }
-                base["id"] = progress_data["id"] if progress_data["id"]
-                base["origin"] = progress_data["origin"] if progress_data["origin"]
-                if progress_data.key?("error")
-                  sse.event("progress", JSON.dump(base.merge("event" => "error", "data" => progress_data["error"])))
-                else
-                  sse.event("progress", JSON.dump(base.merge("event" => "json_delta", "data" => JSON.dump(progress_data["data"]))))
-                end
-                # Signal per-cell completion so the UI exits "Streaming..." state
-                # and updates the progress bar immediately.
-                sse.event("progress", JSON.dump(base.merge("event" => "done", "data" => "")))
-              },
-              quiet: true
-            }
-            run_opts[:parent] = parent if parent
-            run_opts[:scorers] = remote_scorer_ids if remote_scorer_ids
-            run_opts[:dataset] = dataset if dataset
-            if state
-              run_opts[:state] = state
-              run_opts[:experiment] = experiment_name if experiment_name
-              run_opts[:project_id] = body["project_id"] if body["project_id"]
-            end
-            result = evaluator.run(cases, **run_opts)
-            # Flush buffered OTLP spans before sending completion events.
-            # The BatchSpanProcessor exports every ~5s; fast evals can finish
-            # before a single export fires, causing the UI to see no results.
-            Braintrust::Trace.flush_spans
-            # Build summary from result scores
-            averaged_scores = {}
-            result.scorer_stats.each do |scorer_name, stats|
-              averaged_scores[scorer_name] = stats.score_mean
-            end
-            sse.event("summary", JSON.dump({
-              "scores" => averaged_scores,
-              "experiment_name" => experiment_name,
-              "experiment_id" => result.experiment_id,
-              "project_id" => result.project_id
-            }))
-            sse.event("done", "")
+            @service.stream(result, auth: env["braintrust.auth"], sse: sse)
           end
           [200, {"content-type" => "text/event-stream", "cache-control" => "no-cache", "connection" => "keep-alive"}, sse_body]
@@ -115,90 +35,6 @@ module Braintrust
         private
-        # Resolve data source from the data field.
-        # Returns [cases, dataset] where exactly one is non-nil.
-        def resolve_data_source(data)
-          if data.key?("data")
-            cases = data["data"].map do |d|
-              {input: d["input"], expected: d["expected"]}
-            end
-            [cases, nil]
-          elsif data.key?("dataset_id")
-            [nil, Braintrust::DatasetId.new(id: data["dataset_id"])]
-          elsif data.key?("dataset_name")
-            dataset_opts = {name: data["dataset_name"]}
-            dataset_opts[:project] = data["project_name"] if data["project_name"]
-            [nil, dataset_opts]
-          else
-            [nil, nil]
-          end
-        end
-        # Map request scores array to ScorerId structs.
-        # The UI sends function_id as a nested object: {"function_id": "uuid"}.
-        def resolve_remote_scorers(scores)
-          return nil if scores.nil? || scores.empty?
-          scores.map do |s|
-            func_id = s["function_id"]
-            func_id = func_id["function_id"] if func_id.is_a?(Hash)
-            Braintrust::ScorerId.new(
-              function_id: func_id,
-              version: s["version"]
-            )
-          end
-        end
-        # Map request parent to symbol-keyed Hash.
-        # Hardcode playground_id to match Java SDK behavior.
-        # Also extracts generation from propagated_event for span_attributes.
-        def resolve_parent(parent)
-          return nil unless parent.is_a?(Hash)
-          object_id = parent["object_id"]
-          return nil unless object_id
-          generation = parent.dig("propagated_event", "span_attributes", "generation")
-          result = {object_type: "playground_id", object_id: object_id}
-          result[:generation] = generation if generation
-          result
-        end
-        # Build State from auth context set by Auth middleware.
-        # Returns nil when no auth context is present (e.g. NoAuth strategy).
-        # Uses an LRU-style cache (max 64 entries) keyed by [api_key, app_url, org_name].
-        def build_state(env)
-          auth = env["braintrust.auth"]
-          return nil unless auth.is_a?(Hash)
-          cache_key = [auth["api_key"], auth["app_url"], auth["org_name"]]
-          @state_mutex ||= Mutex.new
-          @state_cache ||= {}
-          @state_mutex.synchronize do
-            cached = @state_cache[cache_key]
-            return cached if cached
-            state = Braintrust::State.new(
-              api_key: auth["api_key"],
-              org_id: auth["org_id"],
-              org_name: auth["org_name"],
-              app_url: auth["app_url"],
-              api_url: auth["api_url"],
-              enable_tracing: false
-            )
-            # Evict oldest entry if cache is full
-            if @state_cache.size >= 64
-              oldest_key = @state_cache.keys.first
-              @state_cache.delete(oldest_key)
-            end
-            @state_cache[cache_key] = state
-            state
-          end
-        end
         def parse_body(env)
           body = env["rack.input"]&.read
           return nil if body.nil? || body.empty?
@@ -211,6 +47,10 @@ module Braintrust
           [status, {"content-type" => "application/json"},
             [JSON.dump({"error" => message})]]
         end
+        def build_state(env)
+          @service.build_state(env["braintrust.auth"])
+        end
       end
     end
   end

data/lib/braintrust/server/handlers/list.rb CHANGED

@@ -23,50 +23,12 @@ module Braintrust
       class List
         def initialize(evaluators)
           @evaluators = evaluators
+          @service = Services::List.new(evaluators)
         end
         def call(_env)
-          result = {}
-          @evaluators.each do |name, evaluator|
-            scores = (evaluator.scorers || []).each_with_index.map do |scorer, i|
-              scorer_name = scorer.respond_to?(:name) ? scorer.name : "score_#{i}"
-              {"name" => scorer_name}
-            end
-            entry = {"scores" => scores}
-            params = serialize_parameters(evaluator.parameters)
-            entry["parameters"] = params if params
-            result[name] = entry
-          end
-          [200, {"content-type" => "application/json"},
-            [JSON.dump(result)]]
-        end
-        private
-        # Convert user-defined parameters to the dev server protocol format.
-        # Wraps in a staticParameters container with "data" typed entries.
-        def serialize_parameters(parameters)
-          return nil unless parameters && !parameters.empty?
-          schema = {}
-          parameters.each do |name, spec|
-            spec = spec.transform_keys(&:to_s) if spec.is_a?(Hash)
-            if spec.is_a?(Hash)
-              schema[name.to_s] = {
-                "type" => "data",
-                "schema" => {"type" => spec["type"] || "string"},
-                "default" => spec["default"],
-                "description" => spec["description"]
-              }
-            end
-          end
-          {
-            "type" => "braintrust.staticParameters",
-            "schema" => schema,
-            "source" => nil
-          }
+          result = @service.call
+          [200, {"content-type" => "application/json"}, [JSON.dump(result)]]
         end
       end
     end

data/lib/braintrust/server/rack.rb CHANGED

@@ -15,6 +15,8 @@ require_relative "auth/no_auth"
 require_relative "auth/clerk_token"
 require_relative "middleware/cors"
 require_relative "middleware/auth"
+require_relative "services/list_service"
+require_relative "services/eval_service"
 require_relative "handlers/health"
 require_relative "handlers/list"
 require_relative "handlers/eval"