RubyGems - braintrust - Versions diffs - 0.2.0 → 0.2.1 - Mend

braintrust 0.2.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (21)

checksums.yaml +4 -4
data/README.md +42 -15
data/lib/braintrust/api/internal/btql.rb +124 -0
data/lib/braintrust/api/internal/experiments.rb +19 -0
data/lib/braintrust/api/internal/projects.rb +19 -0
data/lib/braintrust/dataset.rb +6 -3
data/lib/braintrust/eval/context.rb +131 -0
data/lib/braintrust/eval/evaluator.rb +11 -5
data/lib/braintrust/eval/functions.rb +10 -166
data/lib/braintrust/eval/runner.rb +100 -108
data/lib/braintrust/eval/scorer.rb +24 -96
data/lib/braintrust/eval/trace.rb +129 -0
data/lib/braintrust/eval.rb +60 -132
data/lib/braintrust/functions.rb +168 -0
data/lib/braintrust/internal/callable.rb +83 -0
data/lib/braintrust/logger.rb +9 -0
data/lib/braintrust/scorer.rb +122 -0
data/lib/braintrust/server/handlers/eval.rb +3 -3
data/lib/braintrust/task.rb +108 -0
data/lib/braintrust/version.rb +1 -1
metadata +8 -1

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: d67e6d0faeb24297af8a5f43ac1bd1ceacff1f37df2610244ae5f81e34c4ae5f
-  data.tar.gz: 489ec68fee424aa8aa1880b73b58f1f26529493d8898cd0ae5876d3b919fcb7c
+  metadata.gz: 747b190f21c7de342f85390f8a51b17628e23fa2436776989a3ebe637bf9d596
+  data.tar.gz: 1e6c0c59c9ce56d499a04d8424506c56e2c2ad359506a6d5175c7173dc4ab238
 SHA512:
-  metadata.gz: cd876122ad92c5439ff45e975fd84418bfcc7d72d6f9398e48b1ac4c60f09fb96c2b85b46ee1c8de6a75291c0b7d2754ee2fa069f77f8a2f8a4c069132c59d94
-  data.tar.gz: 45d3f80f69ac9725d93aa0db24815da093bfd992b5418f8551c8d25e8caef9299f270a92fa922a4bc4bf3190d9f823a35c7203f9a74bd58daee31869b987f103
+  metadata.gz: 3f652583ec04f5b874e3417db4cc0dff7f43341eeffe686466b8caad5614ed336e8580ac7533ef100726f09cdb264900e0f454edd11328e611513ffc8f77d3cb
+  data.tar.gz: 3316d0cb4ccc77e2d0c0ae48c033b6f5c026237d85c85e75e139434023f713820c3790a98090dac636ae6d44127692279404bcd3ab88b0d50a3de3127d38e3a6

data/README.md CHANGED Viewed

@@ -252,9 +252,9 @@ Braintrust::Eval.run(
     {input: "apple", expected: "fruit"},
     {input: "carrot", expected: "vegetable"}
   ],
-  task: ->(input) { classify(input) },
+  task: ->(input:) { classify(input) },
   scorers: [
-    ->(input, expected, output) { output == expected ? 1.0 : 0.0 }
+    ->(expected:, output:) { output == expected ? 1.0 : 0.0 }
   ]
 )
 ```
@@ -267,7 +267,7 @@ Use test cases from a Braintrust dataset:
 Braintrust::Eval.run(
   project: "my-project",
   dataset: "my-dataset",
-  task: ->(input) { classify(input) },
+  task: ->(input:) { classify(input) },
   scorers: [...]
 )
 ```
@@ -282,7 +282,7 @@ Braintrust::Eval.run(
     {input: "apple", expected: "fruit", tags: ["produce"], metadata: {difficulty: "easy"}},
     {input: "salmon", expected: "protein", tags: ["seafood"], metadata: {difficulty: "medium"}}
   ],
-  task: ->(input) { classify(input) },
+  task: ->(input:) { classify(input) },
   scorers: [...]
 )
 ```
@@ -295,29 +295,56 @@ Use scoring functions defined in Braintrust:
 Braintrust::Eval.run(
   project: "my-project",
   cases: [...],
-  task: ->(input) { ... },
+  task: ->(input:) { ... },
+  scorers: ["accuracy-scorer"]
+)
+```
+Or define scorers inline with `Scorer.new`:
+```ruby
+Braintrust::Eval.run(
+  project: "my-project",
+  cases: [...],
+  task: ->(input:) { ... },
   scorers: [
-    Braintrust::Eval::Functions.scorer(project: "my-project", slug: "accuracy-scorer")
+    Braintrust::Scorer.new("exact_match") do |expected:, output:|
+      output == expected ? 1.0 : 0.0
+    end
   ]
 )
 ```
-Or define scorers inline with `Eval.scorer`:
+#### Trace scoring
+Scorers can access the full evaluation trace (all spans generated by the task) by declaring a `trace:` keyword parameter. This is useful for inspecting intermediate LLM calls, validating tool usage, or checking the message thread:
 ```ruby
 Braintrust::Eval.run(
   project: "my-project",
-  cases: [...],
-  task: ->(input) { ... },
+  cases: [{input: "What is 2+2?", expected: "4"}],
+  task: Braintrust::Task.new { |input:| my_llm_pipeline(input) },
   scorers: [
-    Braintrust::Eval.scorer("exact_match") do |input, expected, output|
+    # Access the full trace to inspect LLM spans
+    Braintrust::Scorer.new("uses_system_prompt") do |output:, trace:|
+      messages = trace.thread  # reconstructed message thread from LLM spans
+      messages.any? { |m| m["role"] == "system" } ? 1.0 : 0.0
+    end,
+    # Filter spans by type
+    Braintrust::Scorer.new("single_llm_call") do |output:, trace:|
+      trace.spans(span_type: "llm").length == 1 ? 1.0 : 0.0
+    end,
+    # Scorers without trace: still work — the parameter is filtered out automatically
+    Braintrust::Scorer.new("exact_match") do |output:, expected:|
       output == expected ? 1.0 : 0.0
     end
   ]
 )
 ```
-See examples: [eval.rb](./examples/eval.rb), [dataset.rb](./examples/eval/dataset.rb), [remote_functions.rb](./examples/eval/remote_functions.rb)
+See examples: [eval.rb](./examples/eval.rb), [dataset.rb](./examples/eval/dataset.rb), [remote_functions.rb](./examples/eval/remote_functions.rb), [trace_scoring.rb](./examples/eval/trace_scoring.rb)
 ### Dev Server
@@ -330,9 +357,9 @@ require "braintrust/server"
 # Define evaluators — these can reference your application code (models, services, etc.)
 food_classifier = Braintrust::Eval::Evaluator.new(
-  task: ->(input) { FoodClassifier.classify(input) },
+  task: ->(input:) { FoodClassifier.classify(input) },
   scorers: [
-    Braintrust::Eval.scorer("exact_match") { |input, expected, output| output == expected ? 1.0 : 0.0 }
+    Braintrust::Scorer.new("exact_match") { |expected:, output:| output == expected ? 1.0 : 0.0 }
   ]
 )
@@ -358,11 +385,11 @@ Evaluators can also be defined as subclasses:
 ```ruby
 class FoodClassifier < Braintrust::Eval::Evaluator
   def task
-    ->(input) { classify(input) }
+    ->(input:) { classify(input) }
   end
   def scorers
-    [Braintrust::Eval.scorer("exact_match") { |i, e, o| o == e ? 1.0 : 0.0 }]
+    [Braintrust::Scorer.new("exact_match") { |expected:, output:| output == expected ? 1.0 : 0.0 }]
   end
 end
 ```

data/lib/braintrust/api/internal/btql.rb ADDED Viewed

@@ -0,0 +1,124 @@
+# frozen_string_literal: true
+require "net/http"
+require "json"
+require "uri"
+require_relative "../../internal/http"
+module Braintrust
+  class API
+    module Internal
+      # Internal BTQL client for querying spans.
+      # Not part of the public API — instantiated directly where needed.
+      class BTQL
+        # Maximum number of retries before returning partial results.
+        # Covers both freshness lag (partially indexed) and ingestion lag
+        # (spans not yet visible to BTQL after OTel flush).
+        MAX_FRESHNESS_RETRIES = 7
+        # Base delay (seconds) between retries (doubles each attempt, capped).
+        FRESHNESS_BASE_DELAY = 1.0
+        # Maximum delay (seconds) between retries. Caps exponential growth
+        # so we keep polling at a reasonable rate in the later window.
+        # Schedule: 1, 2, 4, 8, 8, 8, 8 = ~39s total worst-case.
+        MAX_FRESHNESS_DELAY = 8.0
+        def initialize(state)
+          @state = state
+        end
+        # Query spans belonging to a specific trace within an object.
+        #
+        # Builds a BTQL SQL query that matches the root_span_id and excludes scorer spans.
+        # Retries with exponential backoff if the response indicates data is not yet fresh.
+        #
+        # @param object_type [String] e.g. "experiment"
+        # @param object_id [String] Object UUID
+        # @param root_span_id [String] Hex trace ID of the root span
+        # @return [Array<Hash>] Parsed span data
+        def trace_spans(object_type:, object_id:, root_span_id:)
+          query = build_trace_query(
+            object_type: object_type,
+            object_id: object_id,
+            root_span_id: root_span_id
+          )
+          payload = {query: query, fmt: "jsonl"}
+          retries = 0
+          loop do
+            rows, freshness = execute_query(payload)
+            # Return when data is fresh AND non-empty, or we've exhausted retries.
+            # We retry on empty even when "complete" because there is ingestion lag
+            # between OTel flush and BTQL indexing — the server may report "complete"
+            # before it knows about newly-flushed spans.
+            return rows if (freshness == "complete" && !rows.empty?) || retries >= MAX_FRESHNESS_RETRIES
+            retries += 1
+            delay = [FRESHNESS_BASE_DELAY * (2**(retries - 1)), MAX_FRESHNESS_DELAY].min
+            sleep(delay)
+          end
+        rescue => e
+          Braintrust::Log.warn("[BTQL] Query failed: #{e.message}")
+          []
+        end
+        private
+        # Build a BTQL SQL query string for fetching trace spans.
+        #
+        # Selects all spans for a given root_span_id, excluding scorer spans
+        # (span_attributes.type = 'score').
+        #
+        # @param object_type [String] e.g. "experiment"
+        # @param object_id [String] Object UUID
+        # @param root_span_id [String] Hex trace ID
+        # @return [String] BTQL SQL query
+        def build_trace_query(object_type:, object_id:, root_span_id:)
+          escaped_root = root_span_id.gsub("'", "''")
+          escaped_id = object_id.gsub("'", "''")
+          "SELECT * FROM #{object_type}('#{escaped_id}') " \
+            "WHERE root_span_id = '#{escaped_root}' " \
+            "AND span_attributes.type != 'score' " \
+            "LIMIT 1000"
+        end
+        # Execute a BTQL query and parse the JSONL response.
+        #
+        # @param payload [Hash] BTQL request payload
+        # @return [Array(Array<Hash>, String)] [parsed_rows, freshness_state]
+        def execute_query(payload)
+          uri = URI("#{@state.api_url}/btql")
+          request = Net::HTTP::Post.new(uri)
+          request["Content-Type"] = "application/json"
+          request["Authorization"] = "Bearer #{@state.api_key}"
+          request["Accept"] = "application/x-jsonlines"
+          request.body = JSON.dump(payload)
+          response = Braintrust::Internal::Http.with_redirects(uri, request)
+          unless response.is_a?(Net::HTTPSuccess)
+            raise Braintrust::Error, "HTTP #{response.code} for POST #{uri}: #{response.body}"
+          end
+          freshness = response["x-bt-freshness-state"] || "complete"
+          [parse_jsonl(response.body), freshness]
+        end
+        # Parse a JSONL response body into an array of hashes.
+        #
+        # @param body [String] JSONL response body
+        # @return [Array<Hash>]
+        def parse_jsonl(body)
+          body.each_line.filter_map do |line|
+            line = line.strip
+            next if line.empty?
+            JSON.parse(line)
+          end
+        end
+      end
+    end
+  end
+end

data/lib/braintrust/api/internal/experiments.rb CHANGED Viewed

@@ -50,6 +50,25 @@ module Braintrust
           JSON.parse(response.body)
         end
+        # Delete an experiment
+        # DELETE /v1/experiment/:id
+        # @param id [String] Experiment ID
+        # @return [Hash] Deleted experiment data
+        def delete(id:)
+          uri = URI("#{@state.api_url}/v1/experiment/#{id}")
+          request = Net::HTTP::Delete.new(uri)
+          request["Authorization"] = "Bearer #{@state.api_key}"
+          response = Braintrust::Internal::Http.with_redirects(uri, request)
+          unless response.is_a?(Net::HTTPSuccess)
+            raise Error, "HTTP #{response.code} for DELETE #{uri}: #{response.body}"
+          end
+          JSON.parse(response.body)
+        end
       end
     end
   end

data/lib/braintrust/api/internal/projects.rb CHANGED Viewed

@@ -35,6 +35,25 @@ module Braintrust
           JSON.parse(response.body)
         end
+        # Delete a project
+        # DELETE /v1/project/:id
+        # @param id [String] Project UUID
+        # @return [Hash] Deleted project data
+        def delete(id:)
+          uri = URI("#{@state.api_url}/v1/project/#{id}")
+          request = Net::HTTP::Delete.new(uri)
+          request["Authorization"] = "Bearer #{@state.api_key}"
+          response = Braintrust::Internal::Http.with_redirects(uri, request)
+          unless response.is_a?(Net::HTTPSuccess)
+            raise Error, "HTTP #{response.code} for DELETE #{uri}: #{response.body}"
+          end
+          JSON.parse(response.body)
+        end
       end
     end
   end

data/lib/braintrust/dataset.rb CHANGED Viewed

@@ -181,9 +181,12 @@ module Braintrust
         created: raw["created"]
       )
     end
+    # Value object wrapping a dataset UUID for resolution by ID.
+    # Used by Eval.run to distinguish dataset-by-ID from dataset-by-name.
+    ID = Struct.new(:id, keyword_init: true)
   end
-  # Value object wrapping a dataset UUID for resolution by ID.
-  # Used by Eval.run to distinguish dataset-by-ID from dataset-by-name.
-  DatasetId = Struct.new(:id, keyword_init: true)
+  # @deprecated Use {Braintrust::Dataset::ID} instead.
+  DatasetId = Dataset::ID
 end

data/lib/braintrust/eval/context.rb ADDED Viewed

@@ -0,0 +1,131 @@
+# frozen_string_literal: true
+require_relative "cases"
+module Braintrust
+  module Eval
+    # Holds all normalized, ready-to-execute eval components.
+    # Use Context.build to construct from raw user inputs.
+    class Context
+      attr_reader :task, :scorers, :cases, :experiment_id, :experiment_name,
+        :project_id, :project_name, :state, :tracer_provider,
+        :on_progress, :parent_span_attr, :generation
+      def initialize(task:, scorers:, cases:, experiment_id: nil, experiment_name: nil,
+        project_id: nil, project_name: nil, state: nil, tracer_provider: nil,
+        on_progress: nil, parent_span_attr: nil, generation: nil)
+        @task = task
+        @scorers = scorers
+        @cases = cases
+        @experiment_id = experiment_id
+        @experiment_name = experiment_name
+        @project_id = project_id
+        @project_name = project_name
+        @state = state
+        @tracer_provider = tracer_provider
+        @on_progress = on_progress
+        @parent_span_attr = parent_span_attr
+        @generation = generation
+      end
+      # Build a Context from raw user inputs.
+      # Factory normalizes task, scorers, and cases into typed wrappers.
+      # Parent is resolved into parent_span_attr and generation.
+      def self.build(task:, scorers:, cases:, experiment_id: nil, experiment_name: nil,
+        project_id: nil, project_name: nil, state: nil, tracer_provider: nil,
+        on_progress: nil, parent: nil)
+        factory = Factory.new(state: state, tracer_provider: tracer_provider, project_name: project_name)
+        Context.new(
+          task: factory.normalize_task(task),
+          scorers: factory.normalize_scorers(scorers),
+          cases: factory.normalize_cases(cases),
+          experiment_id: experiment_id,
+          experiment_name: experiment_name,
+          project_id: project_id,
+          project_name: project_name,
+          state: state,
+          tracer_provider: tracer_provider,
+          on_progress: on_progress,
+          parent_span_attr: factory.resolve_parent_span_attr(parent),
+          generation: parent&.dig(:generation)
+        )
+      end
+      # Encapsulates normalization of raw user inputs into typed wrappers.
+      class Factory
+        def initialize(state: nil, tracer_provider: nil, project_name: nil)
+          @state = state
+          @tracer_provider = tracer_provider
+          @project_name = project_name
+        end
+        def normalize_cases(raw)
+          case raw
+          when Cases
+            raw
+          when Array, Enumerable
+            Cases.new(raw)
+          else
+            if raw.respond_to?(:each)
+              Cases.new(raw)
+            else
+              raise ArgumentError, "cases must be Array or Enumerable"
+            end
+          end
+        end
+        def resolve_parent_span_attr(parent)
+          return nil unless parent
+          "#{parent[:object_type]}:#{parent[:object_id]}"
+        end
+        def normalize_task(raw)
+          case raw
+          when Task
+            raw
+          when Proc
+            # Pass Proc/Lambda directly to preserve keyword arg info.
+            # Legacy positional lambdas (arity 1) are auto-wrapped by Task#wrap_block.
+            Task.new(&raw)
+          else
+            # Callable class: wrap via method(:call) to preserve keyword arg info
+            name = raw.respond_to?(:name) ? raw.name : nil
+            Task.new(name, &raw.method(:call))
+          end
+        end
+        def normalize_scorers(raw)
+          raw.map do |scorer|
+            case scorer
+            when String
+              raise ArgumentError, "project is required to resolve scorer slug '#{scorer}'" unless @project_name
+              Braintrust::Functions.scorer(
+                project: @project_name,
+                slug: scorer,
+                state: @state,
+                tracer_provider: @tracer_provider
+              )
+            when Braintrust::Scorer::ID
+              Braintrust::Functions.scorer(
+                id: scorer.function_id,
+                version: scorer.version,
+                state: @state,
+                tracer_provider: @tracer_provider
+              )
+            when Braintrust::Scorer
+              scorer
+            when Proc
+              # Pass Proc/Lambda directly to preserve keyword arg info
+              # (method(:call) loses parameter metadata)
+              Braintrust::Scorer.new(&scorer)
+            else
+              name = scorer.respond_to?(:name) ? scorer.name : nil
+              Braintrust::Scorer.new(name, &scorer.method(:call))
+            end
+          end
+        end
+      end
+    end
+  end
+end

data/lib/braintrust/eval/evaluator.rb CHANGED Viewed

@@ -5,21 +5,27 @@ module Braintrust
     # Base class for evaluators. Subclass and override #task and #scorers,
     # or instantiate directly with keyword arguments.
     #
+    # Evaluators are used with the dev server, which reports scorer names
+    # to the Braintrust UI. Always use named scorers (via Scorer.new or
+    # subclass) so they display meaningfully.
+    #
     # @example Subclass pattern
     #   class FoodClassifier < Braintrust::Eval::Evaluator
     #     def task
-    #       ->(input) { classify(input) }
+    #       ->(input:) { classify(input) }
     #     end
     #
     #     def scorers
-    #       [Braintrust::Eval.scorer("exact_match") { |i, e, o| o == e ? 1.0 : 0.0 }]
+    #       [Braintrust::Scorer.new("exact_match") { |expected:, output:| output == expected ? 1.0 : 0.0 }]
     #     end
     #   end
     #
     # @example Inline pattern
     #   Braintrust::Eval::Evaluator.new(
-    #     task: ->(input) { input.upcase },
-    #     scorers: [my_scorer]
+    #     task: ->(input:) { input.upcase },
+    #     scorers: [
+    #       Braintrust::Scorer.new("exact_match") { |expected:, output:| output == expected ? 1.0 : 0.0 }
+    #     ]
     #   )
     class Evaluator
       attr_accessor :task, :scorers, :parameters
@@ -48,7 +54,7 @@ module Braintrust
       # @param project [String, nil] Project name
       # @param experiment [String, nil] Experiment name
       # @param project_id [String, nil] Project UUID (skips project creation)
-      # @param dataset [String, Hash, Dataset, DatasetId, nil] Dataset to fetch
+      # @param dataset [String, Hash, Dataset, Dataset::ID, nil] Dataset to fetch
       # @param scorers [Array, nil] Additional scorers (merged with evaluator's own)
       # @param parent [Hash, nil] Parent span context
       # @param state [State, nil] Braintrust state