RubyGems - braintrust - Versions diffs - 0.2.0 → 0.2.1 - Mend

braintrust 0.2.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (21)

checksums.yaml +4 -4
data/README.md +42 -15
data/lib/braintrust/api/internal/btql.rb +124 -0
data/lib/braintrust/api/internal/experiments.rb +19 -0
data/lib/braintrust/api/internal/projects.rb +19 -0
data/lib/braintrust/dataset.rb +6 -3
data/lib/braintrust/eval/context.rb +131 -0
data/lib/braintrust/eval/evaluator.rb +11 -5
data/lib/braintrust/eval/functions.rb +10 -166
data/lib/braintrust/eval/runner.rb +100 -108
data/lib/braintrust/eval/scorer.rb +24 -96
data/lib/braintrust/eval/trace.rb +129 -0
data/lib/braintrust/eval.rb +60 -132
data/lib/braintrust/functions.rb +168 -0
data/lib/braintrust/internal/callable.rb +83 -0
data/lib/braintrust/logger.rb +9 -0
data/lib/braintrust/scorer.rb +122 -0
data/lib/braintrust/server/handlers/eval.rb +3 -3
data/lib/braintrust/task.rb +108 -0
data/lib/braintrust/version.rb +1 -1
metadata +8 -1

data/lib/braintrust/eval/trace.rb ADDED Viewed

@@ -0,0 +1,129 @@
+# frozen_string_literal: true
+module Braintrust
+  module Eval
+    # Read-only trace data accessor for scorers.
+    #
+    # Per-case throwaway object — no global cache, no shared state.
+    # Accepts lazy (lambda) or eager (Array) span sources.
+    #
+    # BTQL span shape (string keys from JSON):
+    #   "span_attributes" => {"type" => "llm", "name" => "Chat Completion"}
+    #   "input"  => [{"role" => "user", "content" => "..."}]        # flat message array
+    #   "output" => [{"message" => {"role" => "assistant", ...}}]    # flat choices array
+    #
+    # @example Lazy loading from BTQL
+    #   trace = Trace.new(spans: -> { btql.trace_spans(...) })
+    #   trace.spans              # triggers BTQL query on first access
+    #   trace.spans              # returns memoized result
+    #
+    # @example Eager loading
+    #   trace = Trace.new(spans: [span1, span2])
+    #   trace.spans              # returns array directly
+    class Trace
+      # @param spans [Proc, Array] Span source — a lambda (lazy) or Array (eager).
+      def initialize(spans:)
+        @spans_source = spans
+        @spans_resolved = false
+        @spans_memo = nil
+      end
+      # Resolve and return spans, optionally filtered by type.
+      #
+      # The type lives at span_attributes.type in BTQL rows (e.g. "llm", "eval", "task").
+      #
+      # @param span_type [String, Array<String>, nil] Filter to spans matching this type.
+      #   Accepts a single type string or an array of types (returns the union).
+      #   Returns all spans when nil.
+      # @return [Array<Hash>] Matching spans.
+      def spans(span_type: nil)
+        resolved = resolve_spans
+        if span_type
+          types = Array(span_type)
+          resolved.select { |s| types.include?(span_type_for(s)) }
+        else
+          resolved
+        end
+      end
+      # Convenience method: extract a chronological message thread from LLM spans.
+      #
+      # Walks LLM spans, collects input messages (deduplicated) and output messages
+      # (always included). Returns a flat chronological array.
+      #
+      # BTQL LLM span format:
+      #   input:  flat array of messages  [{"role" => "user", "content" => "..."}]
+      #   output: flat array of choices   [{"message" => {"role" => "assistant", ...}}]
+      #
+      # @return [Array<Hash>] Ordered message list.
+      def thread
+        llm_spans = spans(span_type: "llm")
+        return [] if llm_spans.empty?
+        seen = Set.new
+        messages = []
+        llm_spans.each do |span|
+          # Input: flat message array or {messages: [...]} wrapper
+          input = span["input"] || span[:input]
+          input_messages = extract_input_messages(input)
+          input_messages&.each do |msg|
+            key = msg.hash
+            unless seen.include?(key)
+              seen.add(key)
+              messages << msg
+            end
+          end
+          # Output: flat choices array or {choices: [...]} wrapper
+          output = span["output"] || span[:output]
+          extract_output_messages(output)&.each do |msg|
+            messages << msg
+          end
+        end
+        messages
+      end
+      private
+      # Extract the span type from a span hash.
+      # Handles both string and symbol keys for span_attributes.type.
+      def span_type_for(span)
+        attrs = span["span_attributes"] || span[:span_attributes]
+        return nil unless attrs
+        attrs["type"] || attrs[:type]
+      end
+      # Extract input messages from a span's input field.
+      # Handles both flat array format (BTQL) and {messages: [...]} wrapper.
+      def extract_input_messages(input)
+        return nil unless input
+        return input if input.is_a?(Array)
+        input["messages"] || input[:messages]
+      end
+      # Extract output messages from a span's output field.
+      # Handles both flat choices array (BTQL) and {choices: [...]} wrapper.
+      def extract_output_messages(output)
+        return nil unless output
+        choices = output.is_a?(Array) ? output : (output["choices"] || output[:choices])
+        return nil unless choices
+        choices.filter_map { |c| c["message"] || c[:message] }
+      end
+      def resolve_spans
+        unless @spans_resolved
+          @spans_memo = if @spans_source.respond_to?(:call)
+            @spans_source.call
+          else
+            @spans_source
+          end
+          @spans_memo ||= []
+          @spans_resolved = true
+        end
+        @spans_memo
+      end
+    end
+  end
+end

data/lib/braintrust/eval.rb CHANGED Viewed

@@ -1,9 +1,13 @@
 # frozen_string_literal: true
-require_relative "eval/scorer"
+require_relative "scorer"
+require_relative "task"
+require_relative "functions"
+require_relative "eval/context"
 require_relative "eval/evaluator"
-require_relative "eval/runner"
 require_relative "eval/functions"
+require_relative "eval/runner"
+require_relative "eval/scorer"
 require_relative "api/internal/projects"
 require_relative "api/internal/experiments"
 require_relative "dataset"
@@ -17,18 +21,21 @@ module Braintrust
   # The Eval module provides tools for running systematic evaluations of your AI systems. An
   # evaluation consists of:
   # - **Cases**: Test inputs with optional expected outputs
-  # - **Task**: The code/model being evaluated
-  # - **Scorers**: Functions that judge the quality of outputs
+  # - **Task**: The code/model being evaluated (a {Braintrust::Task} or callable)
+  # - **Scorers**: Functions that judge the quality of outputs (String name, {Braintrust::Scorer}, or callable)
+  #
+  # Tasks and scorers use keyword arguments. Only declare the keywords you need —
+  # extra kwargs are automatically filtered out.
+  #
+  # When using multiple scorers, each must have a unique name — scores are keyed
+  # by name, so duplicates overwrite each other. Use +Scorer.new("name")+ or a
+  # Scorer subclass to assign names. Anonymous lambdas default to "scorer".
   #
   # @example Basic evaluation with inline cases
   #   require "braintrust"
   #
   #   Braintrust.init
   #
-  #   # Define a simple task (the code being evaluated)
-  #   task = ->(input) { input.include?("a") ? "fruit" : "vegetable" }
-  #
-  #   # Run evaluation with inline cases
   #   Braintrust::Eval.run(
   #     project: "my-project",
   #     experiment: "food-classifier",
@@ -37,114 +44,65 @@ module Braintrust
   #       {input: "carrot", expected: "vegetable"},
   #       {input: "banana", expected: "fruit"}
   #     ],
-  #     task: task,
+  #     task: ->(input:) { input.include?("a") ? "fruit" : "vegetable" },
   #     scorers: [
-  #       # Named scorer with Eval.scorer
-  #       Braintrust::Eval.scorer("exact_match") do |input, expected, output|
-  #         output == expected ? 1.0 : 0.0
-  #       end
+  #       ->(expected:, output:) { output == expected ? 1.0 : 0.0 }
   #     ]
   #   )
   #
-  # @example Different ways to define scorers (recommended patterns)
-  #   # Method reference (auto-uses method name as scorer name)
-  #   def exact_match(input, expected, output)
-  #     output == expected ? 1.0 : 0.0
-  #   end
+  # @example Different ways to define scorers
+  #   # String — references a scorer defined in your Braintrust project
+  #   scorers: ["accuracy-scorer", "relevance-scorer"]
   #
-  #   # Named scorer with Eval.scorer
-  #   case_insensitive = Braintrust::Eval.scorer("case_insensitive") do |input, expected, output|
-  #     output.downcase == expected.downcase ? 1.0 : 0.0
-  #   end
+  #   # Lambda — declare only the kwargs you need (input:, expected:, output:, metadata:, tags:)
+  #   exact = ->(expected:, output:) { output == expected ? 1.0 : 0.0 }
   #
-  #   # Callable class with name method
-  #   class FuzzyMatch
-  #     def name
-  #       "fuzzy_match"
-  #     end
+  #   # Named scorer with Scorer.new
+  #   named = Braintrust::Scorer.new("case_insensitive") { |expected:, output:| output.downcase == expected.downcase ? 1.0 : 0.0 }
   #
-  #     def call(input, expected, output, metadata = {})
-  #       threshold = metadata[:threshold] || 0.8
+  #   # Class-based pattern (auto-derives name from class: "fuzzy_match")
+  #   class FuzzyMatch
+  #     include Braintrust::Scorer
+  #     def call(expected:, output:)
   #       # scoring logic here
   #       1.0
   #     end
   #   end
   #
-  #   # Anonymous lambda that returns named score object
-  #   multi_score = ->(input, expected, output) {
-  #     [
-  #       {name: "exact_match", score: output == expected ? 1.0 : 0.0},
-  #       {name: "length_match", score: output.length == expected.length ? 1.0 : 0.0}
-  #     ]
-  #   }
-  #
-  #   # All can be used together
-  #   Braintrust::Eval.run(
-  #     project: "my-project",
-  #     experiment: "scorer-examples",
-  #     cases: [{input: "test", expected: "test"}],
-  #     task: ->(input) { input },
-  #     scorers: [method(:exact_match), case_insensitive, FuzzyMatch.new, multi_score]
-  #   )
-  #
   # @example Different ways to define tasks
-  #   # Lambda
-  #   task_lambda = ->(input) { "result" }
+  #   # Lambda with keyword args
+  #   task = ->(input:) { process(input) }
   #
-  #   # Proc
-  #   task_proc = proc { |input| "result" }
-  #
-  #   # Method reference
-  #   def my_task(input)
-  #     "result"
-  #   end
-  #   task_method = method(:my_task)
+  #   # Named task with Task.new
+  #   task = Braintrust::Task.new("my_task") { |input:| process(input) }
   #
-  #   # Callable class
+  #   # Class-based pattern
   #   class MyTask
-  #     def call(input)
-  #       "result"
+  #     include Braintrust::Task
+  #     def call(input:)
+  #       process(input)
   #     end
   #   end
-  #   task_class = MyTask.new
   #
-  #   # All of these can be used as the task parameter
-  #   Braintrust::Eval.run(
-  #     project: "my-project",
-  #     experiment: "task-examples",
-  #     cases: [{input: "test"}],
-  #     task: task_lambda, # or task_proc, task_method, task_class
-  #     scorers: [
-  #       Braintrust::Eval.scorer("my_scorer") { |input, expected, output| 1.0 }
-  #     ]
-  #   )
+  #   # Legacy lambdas (positional args) are also accepted for backwards compatibility
+  #   legacy_task = ->(input) { process(input) }
   #
   # @example Using datasets instead of inline cases
-  #   # Fetch cases from a dataset stored in Braintrust
   #   Braintrust::Eval.run(
   #     project: "my-project",
   #     experiment: "with-dataset",
   #     dataset: "my-dataset-name", # fetches from same project
-  #     task: ->(input) { "result" },
-  #     scorers: [
-  #       Braintrust::Eval.scorer("my_scorer") { |input, expected, output| 1.0 }
-  #     ]
+  #     task: ->(input:) { input.upcase },
+  #     scorers: [->(expected:, output:) { output == expected ? 1.0 : 0.0 }]
   #   )
   #
   #   # Or with more options
   #   Braintrust::Eval.run(
   #     project: "my-project",
   #     experiment: "with-dataset-options",
-  #     dataset: {
-  #       name: "my-dataset",
-  #       project: "other-project",
-  #       version: "1.0",
-  #       limit: 100
-  #     },
-  #     task: ->(input) { "result" },
-  #     scorers: [
-  #       Braintrust::Eval.scorer("my_scorer") { |input, expected, output| 1.0 }
-  #     ]
+  #     dataset: { name: "my-dataset", project: "other-project", version: "1.0", limit: 100 },
+  #     task: ->(input:) { input.upcase },
+  #     scorers: [->(expected:, output:) { output == expected ? 1.0 : 0.0 }]
   #   )
   #
   # @example Using metadata and tags
@@ -159,32 +117,24 @@ module Braintrust
   #         metadata: {threshold: 0.9, category: "produce"}
   #       }
   #     ],
-  #     task: ->(input) { "fruit" },
+  #     task: ->(input:) { "fruit" },
   #     scorers: [
-  #       # Scorer can access case metadata
-  #       Braintrust::Eval.scorer("threshold_match") do |input, expected, output, metadata|
+  #       ->(expected:, output:, metadata:) {
   #         threshold = metadata[:threshold] || 0.5
   #         # scoring logic using threshold
   #         1.0
-  #       end
+  #       }
   #     ],
-  #     # Experiment-level tags and metadata
   #     tags: ["v1", "production"],
-  #     metadata: {
-  #       model: "gpt-4",
-  #       temperature: 0.7,
-  #       version: "1.0.0"
-  #     }
+  #     metadata: { model: "gpt-4", temperature: 0.7, version: "1.0.0" }
   #   )
   module Eval
     class << self
-      # Create a scorer with a name and callable
-      # @param name [String] The scorer name
-      # @param callable [#call, nil] Optional callable (if not using block)
-      # @param block [Proc] The scorer block
-      # @return [Scorer]
+      # @deprecated Use {Braintrust::Scorer.new} instead
       def scorer(name, callable = nil, &block)
-        Scorer.new(name, callable, &block)
+        Log.warn_once(:eval_scorer, "Braintrust::Eval.scorer is deprecated: use Braintrust::Scorer.new instead.")
+        block = callable.method(:call) if callable && !block
+        Scorer.new(name, &block)
       end
       # Run an evaluation
@@ -195,7 +145,7 @@ module Braintrust
       #   - String: dataset name (fetches from same project)
       #   - Hash: {name:, id:, project:, version:, limit:}
       # @param task [#call] The task to evaluate (must be callable)
-      # @param scorers [Array<Scorer, #call>] The scorers to use (Scorer objects or callables)
+      # @param scorers [Array<String, Scorer, #call>] The scorers to use (String names, Scorer objects, or callables)
       # @param on_progress [#call, nil] Optional callback fired after each test case.
       #   Receives a Hash: {"data" => output, "scores" => {name => value}} on success,
       #   or {"error" => message} on failure.
@@ -216,9 +166,6 @@ module Braintrust
         # Validate required parameters
         validate_params!(task: task, scorers: scorers, cases: cases, dataset: dataset)
-        # Resolve any ScorerId entries to real Scorer objects
-        scorers = resolve_scorers(scorers, state: state, tracer_provider: tracer_provider)
         experiment_id = nil
         project_name = project
@@ -246,20 +193,21 @@ module Braintrust
           end
         end
-        # Instantiate Runner and run evaluation
-        runner = Runner.new(
+        # Build normalized context and run
+        context = Context.build(
+          task: task,
+          scorers: scorers,
+          cases: cases,
           experiment_id: experiment_id,
           experiment_name: experiment,
           project_id: project_id,
           project_name: project_name,
-          task: task,
-          scorers: scorers,
           state: state,
           tracer_provider: tracer_provider,
           on_progress: on_progress,
           parent: parent
         )
-        result = runner.run(cases, parallelism: parallelism)
+        result = Runner.new(context).run(parallelism: parallelism)
         # Print result summary unless quiet
         print_result(result) unless quiet
@@ -275,26 +223,6 @@ module Braintrust
         puts result.to_pretty
       end
-      # Resolve scorers array: ScorerId entries become real Scorer objects, others pass through
-      # @param scorers [Array] Scorers (Scorer, callable, or ScorerId)
-      # @param state [State, nil] Braintrust state (required for ScorerId resolution)
-      # @param tracer_provider [TracerProvider, nil] OpenTelemetry tracer provider
-      # @return [Array<Scorer, #call>] Resolved scorers
-      def resolve_scorers(scorers, state: nil, tracer_provider: nil)
-        scorers.map do |scorer|
-          if scorer.is_a?(ScorerId)
-            Functions.scorer_by_id(
-              id: scorer.function_id,
-              version: scorer.version,
-              state: state,
-              tracer_provider: tracer_provider
-            )
-          else
-            scorer
-          end
-        end
-      end
       # Validate required parameters
       # @raise [ArgumentError] if validation fails
       def validate_params!(task:, scorers:, cases:, dataset:)
@@ -356,7 +284,7 @@ module Braintrust
         dataset_obj = case dataset
         when Dataset
           dataset
-        when DatasetId
+        when Dataset::ID
           Dataset.new(id: dataset.id, state: state)
         when String
           Dataset.new(name: dataset, project: project, state: state)
@@ -367,7 +295,7 @@ module Braintrust
           opts[:state] = state
           Dataset.new(**opts)
         else
-          raise ArgumentError, "dataset must be String, Hash, Dataset, or DatasetId, got #{dataset.class}"
+          raise ArgumentError, "dataset must be String, Hash, Dataset, or Dataset::ID, got #{dataset.class}"
         end
         cases = dataset_obj.fetch_all(limit: limit)

data/lib/braintrust/functions.rb ADDED Viewed

@@ -0,0 +1,168 @@
+# frozen_string_literal: true
+require_relative "api"
+require_relative "scorer"
+require_relative "task"
+require "opentelemetry/sdk"
+require "json"
+module Braintrust
+  # Functions provides remote function execution capabilities.
+  # Allows calling prompts hosted on Braintrust servers as tasks or scorers.
+  module Functions
+    class << self
+      # Create a Task that invokes a remote function
+      # @param project [String] Project name
+      # @param slug [String] Function slug
+      # @param state [State, nil] Braintrust state (defaults to global)
+      # @param tracer_provider [TracerProvider, nil] OpenTelemetry tracer provider
+      # @return [Task] Task object that invokes remote function
+      def task(project:, slug:, state: nil, tracer_provider: nil)
+        state ||= Braintrust.current_state
+        raise Error, "No state available" unless state
+        # Resolve function ID from project + slug
+        api = API.new(state: state)
+        function_metadata = resolve_function(api, project, slug)
+        function_id = function_metadata["id"]
+        function_name = function_metadata["name"] || slug
+        # Get tracer for creating spans
+        tracer_provider ||= OpenTelemetry.tracer_provider
+        tracer = tracer_provider.tracer("braintrust.functions")
+        Task.new(function_name) do |input:|
+          tracer.in_span("function: #{slug}") do |span|
+            span.set_attribute("braintrust.span_attributes", JSON.dump({type: "function"}))
+            span.set_attribute("braintrust.input_json", JSON.dump(input))
+            span.set_attribute("braintrust.function.name", function_name)
+            span.set_attribute("braintrust.function.id", function_id)
+            span.set_attribute("braintrust.function.slug", slug)
+            begin
+              output = api.functions.invoke(id: function_id, input: input)
+              span.set_attribute("braintrust.output_json", JSON.dump(output))
+              output
+            rescue => e
+              span.record_exception(e)
+              span.status = OpenTelemetry::Trace::Status.error(e.message)
+              raise
+            end
+          end
+        end
+      end
+      # Create a scorer that invokes a remote function.
+      # Resolve by project + slug, or by function UUID (id).
+      # @param project [String, nil] Project name (used with slug)
+      # @param slug [String, nil] Function slug (used with project)
+      # @param id [String, nil] Function UUID (alternative to project + slug)
+      # @param version [String, nil] Optional version to pin to (used with id)
+      # @param state [State, nil] Braintrust state (defaults to global)
+      # @param tracer_provider [TracerProvider, nil] OpenTelemetry tracer provider
+      # @return [Scorer] Scorer object that invokes remote function
+      def scorer(project: nil, slug: nil, id: nil, version: nil, state: nil, tracer_provider: nil)
+        has_id = !id.nil?
+        has_project_slug = !project.nil? && !slug.nil?
+        unless has_id || has_project_slug
+          raise ArgumentError, "scorer requires either id: or both project: and slug:"
+        end
+        state ||= Braintrust.current_state
+        raise Error, "No state available" unless state
+        api = API.new(state: state)
+        function_metadata = if id
+          api.login
+          api.functions.get(id: id, version: version)
+        else
+          resolve_function(api, project, slug)
+        end
+        function_id = function_metadata["id"]
+        function_name = function_metadata["name"] || id || slug
+        tracer_provider ||= OpenTelemetry.tracer_provider
+        tracer = tracer_provider.tracer("braintrust.functions")
+        build_scorer(function_id: function_id, function_name: function_name, api: api, tracer: tracer)
+      end
+      private
+      # Build a Scorer that invokes a remote function
+      # @param function_id [String] Function UUID
+      # @param function_name [String] Function display name
+      # @param api [API] Braintrust API client
+      # @param tracer [OpenTelemetry::Trace::Tracer] Tracer instance
+      # @return [Scorer]
+      def build_scorer(function_id:, function_name:, api:, tracer:)
+        Scorer.new(function_name) do |input:, expected:, output:, metadata:|
+          tracer.in_span("function: #{function_name}") do |span|
+            scorer_input = {
+              input: input,
+              expected: expected,
+              output: output,
+              metadata: metadata
+            }
+            span.set_attribute("braintrust.span_attributes", JSON.dump({type: "function"}))
+            span.set_attribute("braintrust.input_json", JSON.dump(scorer_input))
+            span.set_attribute("braintrust.function.name", function_name)
+            span.set_attribute("braintrust.function.id", function_id)
+            begin
+              result = api.functions.invoke(id: function_id, input: scorer_input)
+              score = case result
+              when Numeric
+                result.to_f
+              when true
+                1.0
+              when false
+                0.0
+              when Hash
+                if result.key?("score")
+                  result["score"].to_f
+                else
+                  raise Error, "Hash result must contain 'score' key"
+                end
+              when String
+                result.to_f
+              when nil
+                nil
+              else
+                raise Error, "Unsupported result type: #{result.class}"
+              end
+              span.set_attribute("braintrust.output_json", JSON.dump(score))
+              score
+            rescue => e
+              span.record_exception(e)
+              span.status = OpenTelemetry::Trace::Status.error(e.message)
+              raise
+            end
+          end
+        end
+      end
+      # Resolve function ID from project name and slug
+      # @param api [API] API client
+      # @param project [String] Project name
+      # @param slug [String] Function slug
+      # @return [Hash] Function metadata
+      def resolve_function(api, project, slug)
+        result = api.functions.list(project_name: project, slug: slug)
+        functions = result["objects"]
+        if functions.nil? || functions.empty?
+          raise Error, "Function '#{slug}' not found in project '#{project}'"
+        end
+        functions.first
+      end
+    end
+  end
+end