braintrust 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +148 -24
  3. data/lib/braintrust/api/internal/btql.rb +124 -0
  4. data/lib/braintrust/api/internal/experiments.rb +19 -0
  5. data/lib/braintrust/api/internal/projects.rb +19 -0
  6. data/lib/braintrust/contrib/rails/server/application_controller.rb +34 -0
  7. data/lib/braintrust/contrib/rails/server/engine.rb +72 -0
  8. data/lib/braintrust/contrib/rails/server/eval_controller.rb +36 -0
  9. data/lib/braintrust/contrib/rails/server/generator.rb +43 -0
  10. data/lib/braintrust/contrib/rails/server/health_controller.rb +15 -0
  11. data/lib/braintrust/contrib/rails/server/list_controller.rb +16 -0
  12. data/lib/braintrust/contrib/rails/server/routes.rb +8 -0
  13. data/lib/braintrust/contrib/rails/server.rb +20 -0
  14. data/lib/braintrust/dataset.rb +6 -3
  15. data/lib/braintrust/eval/context.rb +131 -0
  16. data/lib/braintrust/eval/evaluator.rb +11 -5
  17. data/lib/braintrust/eval/functions.rb +10 -166
  18. data/lib/braintrust/eval/runner.rb +165 -145
  19. data/lib/braintrust/eval/scorer.rb +24 -96
  20. data/lib/braintrust/eval/trace.rb +129 -0
  21. data/lib/braintrust/eval.rb +60 -132
  22. data/lib/braintrust/functions.rb +168 -0
  23. data/lib/braintrust/internal/callable.rb +83 -0
  24. data/lib/braintrust/logger.rb +9 -0
  25. data/lib/braintrust/scorer.rb +173 -0
  26. data/lib/braintrust/server/handlers/eval.rb +8 -168
  27. data/lib/braintrust/server/handlers/list.rb +3 -41
  28. data/lib/braintrust/server/rack.rb +2 -0
  29. data/lib/braintrust/server/services/eval_service.rb +214 -0
  30. data/lib/braintrust/server/services/list_service.rb +64 -0
  31. data/lib/braintrust/task.rb +108 -0
  32. data/lib/braintrust/trace/span_processor.rb +0 -5
  33. data/lib/braintrust/version.rb +1 -1
  34. metadata +18 -1
@@ -0,0 +1,214 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "json"
4
+
5
+ module Braintrust
6
+ module Server
7
+ module Services
8
# Framework-agnostic service for running evaluations and streaming SSE results.
# Must be long-lived (not per-request) to preserve the @state_cache across requests.
class Eval
  # Keys of the request's "data" object that each select a data source.
  DATA_SOURCE_KEYS = %w[data dataset_name dataset_id].freeze
  # Maximum number of State objects retained by #build_state.
  STATE_CACHE_LIMIT = 64
  private_constant :DATA_SOURCE_KEYS, :STATE_CACHE_LIMIT

  # @param evaluators [#[], #call] a name => evaluator mapping, or a callable
  #   returning one (re-invoked on every lookup)
  def initialize(evaluators)
    @evaluators = evaluators
    @state_mutex = Mutex.new
    @state_cache = {}
  end

  # Validates request body. Returns:
  #   {error: String, status: Integer} on failure
  #   {evaluator:, name:, cases:, dataset:, ...} on success
  #
  # Malformed input (non-object body/data, inline data that is not an array
  # of objects, scores that is not an array of objects) is reported as a 400
  # instead of surfacing as NoMethodError/TypeError.
  #
  # @param body [Hash] parsed request body with String keys
  # @return [Hash]
  def validate(body)
    return failure("Request body must be a JSON object") unless body.respond_to?(:key?)

    name = body["name"]
    return failure("Missing required field: name") unless name

    evaluator = current_evaluators[name]
    return failure("Evaluator '#{name}' not found", 404) unless evaluator

    data = body["data"]
    return failure("Missing required field: data") unless data
    return failure("Field 'data' must be an object") unless data.respond_to?(:key?)

    source_count = DATA_SOURCE_KEYS.count { |key| data.key?(key) }
    return failure("Exactly one data source required") if source_count != 1

    if data.key?("data") && !list_of_objects?(data["data"])
      return failure("Field 'data.data' must be an array of objects")
    end

    scores = body["scores"]
    return failure("Field 'scores' must be an array of objects") unless valid_scores?(scores)

    cases, dataset = resolve_data_source(data)

    {
      evaluator: evaluator,
      name: name,
      cases: cases,
      dataset: dataset,
      experiment_name: body["experiment_name"],
      remote_scorer_ids: resolve_remote_scorers(scores),
      parent: resolve_parent(body["parent"]),
      project_id: body["project_id"]
    }
  end

  # Runs the validated eval and streams SSE events via the sse writer.
  # Emits "progress" events while the evaluator runs, then one "summary"
  # event and a final "done" event.
  #
  # +validated+ is the hash returned by #validate.
  # +auth+ is the auth context hash (or nil/true for no-auth).
  # +sse+ is an SSEWriter instance.
  def stream(validated, auth:, sse:)
    experiment_name = validated[:experiment_name]
    run_opts = build_run_opts(validated, build_state(auth), sse)

    result = validated[:evaluator].run(validated[:cases], **run_opts)

    # Flush buffered OTLP spans before sending completion events.
    # The BatchSpanProcessor exports every ~5s; fast evals can finish
    # before a single export fires, causing the UI to see no results.
    Braintrust::Trace.flush_spans

    # Build summary from result scores
    averaged_scores = {}
    result.scorer_stats.each do |scorer_name, stats|
      averaged_scores[scorer_name] = stats.score_mean
    end

    sse.event("summary", JSON.dump({
      "scores" => averaged_scores,
      "experiment_name" => experiment_name,
      "experiment_id" => result.experiment_id,
      "project_id" => result.project_id
    }))

    sse.event("done", "")
  end

  # Build State from auth context hash.
  # Returns nil when auth is not a Hash (e.g. NoAuth returns true).
  # Uses an LRU cache (max 64 entries) keyed by [api_key, app_url, org_name]:
  # a hit moves the entry to the most-recently-used position, and inserting
  # into a full cache evicts the least-recently-used entry.
  #
  # @param auth [Hash, Object] auth context with String keys
  # @return [Braintrust::State, nil]
  def build_state(auth)
    return nil unless auth.is_a?(Hash)

    cache_key = [auth["api_key"], auth["app_url"], auth["org_name"]]

    # Normally set in #initialize; kept as a fallback for instances that
    # were built without running it.
    @state_mutex ||= Mutex.new
    @state_cache ||= {}

    @state_mutex.synchronize do
      cached = @state_cache.delete(cache_key)
      if cached
        # Hashes keep insertion order, so re-inserting marks this entry as
        # the most recently used one.
        @state_cache[cache_key] = cached
        next cached
      end

      state = Braintrust::State.new(
        api_key: auth["api_key"],
        org_id: auth["org_id"],
        org_name: auth["org_name"],
        app_url: auth["app_url"],
        api_url: auth["api_url"],
        enable_tracing: false
      )

      # The first key is the least recently used one.
      @state_cache.shift while @state_cache.size >= STATE_CACHE_LIMIT

      @state_cache[cache_key] = state
      state
    end
  end

  private

  # Returns the evaluator mapping, invoking @evaluators when it is callable.
  def current_evaluators
    return @evaluators.call if @evaluators.respond_to?(:call)
    @evaluators
  end

  # Builds the failure Hash returned by #validate.
  def failure(message, status = 400)
    {error: message, status: status}
  end

  # True when +value+ is an Array whose entries all support key lookup.
  def list_of_objects?(value)
    value.is_a?(Array) && value.all? { |item| item.respond_to?(:key?) }
  end

  # True when +scores+ is absent, empty, or an array of objects.
  def valid_scores?(scores)
    return true if scores.nil?
    return true if scores.respond_to?(:empty?) && scores.empty?
    list_of_objects?(scores)
  end

  # Assembles the keyword options passed to the evaluator's #run.
  def build_run_opts(validated, state, sse)
    run_opts = {
      on_progress: progress_callback(validated[:name], sse),
      quiet: true
    }
    run_opts[:parent] = validated[:parent] if validated[:parent]
    run_opts[:scorers] = validated[:remote_scorer_ids] if validated[:remote_scorer_ids]
    run_opts[:dataset] = validated[:dataset] if validated[:dataset]

    # Only pass project/experiment params when state is available
    if state
      run_opts[:state] = state
      run_opts[:experiment] = validated[:experiment_name] if validated[:experiment_name]
      run_opts[:project_id] = validated[:project_id] if validated[:project_id]
    end

    run_opts
  end

  # Build remote eval protocol events from generic progress data.
  # Runner provides: id, data/error, scores (optional), origin (optional).
  # Protocol requires: id, object_type, origin, name, format, output_type, event, data.
  def progress_callback(name, sse)
    lambda do |progress_data|
      base = {
        "object_type" => "task",
        "name" => name,
        "format" => "code",
        "output_type" => "completion"
      }
      base["id"] = progress_data["id"] if progress_data["id"]
      base["origin"] = progress_data["origin"] if progress_data["origin"]

      payload =
        if progress_data.key?("error")
          base.merge("event" => "error", "data" => progress_data["error"])
        else
          base.merge("event" => "json_delta", "data" => JSON.dump(progress_data["data"]))
        end
      sse.event("progress", JSON.dump(payload))

      # Signal per-cell completion so the UI exits "Streaming..." state
      # and updates the progress bar immediately.
      sse.event("progress", JSON.dump(base.merge("event" => "done", "data" => "")))
    end
  end

  # Resolve data source from the data field.
  # Returns [cases, dataset] where exactly one is non-nil.
  def resolve_data_source(data)
    if data.key?("data")
      cases = data["data"].map do |row|
        {input: row["input"], expected: row["expected"]}
      end
      [cases, nil]
    elsif data.key?("dataset_id")
      [nil, Braintrust::Dataset::ID.new(id: data["dataset_id"])]
    elsif data.key?("dataset_name")
      dataset_opts = {name: data["dataset_name"]}
      dataset_opts[:project] = data["project_name"] if data["project_name"]
      [nil, dataset_opts]
    else
      [nil, nil]
    end
  end

  # Map request scores array to Scorer::ID structs.
  # The UI sends function_id as a nested object: {"function_id": "uuid"}.
  def resolve_remote_scorers(scores)
    return nil if scores.nil? || scores.empty?
    scores.map do |score|
      func_id = score["function_id"]
      func_id = func_id["function_id"] if func_id.is_a?(Hash)
      Braintrust::Scorer::ID.new(
        function_id: func_id,
        version: score["version"]
      )
    end
  end

  # Map request parent to symbol-keyed Hash.
  # Hardcode playground_id to match Java SDK behavior.
  # Also extracts generation from propagated_event for span_attributes.
  def resolve_parent(parent)
    return nil unless parent.is_a?(Hash)
    # Named parent_id (not object_id) to avoid shadowing Object#object_id.
    parent_id = parent["object_id"]
    return nil unless parent_id

    generation = parent.dig("propagated_event", "span_attributes", "generation")

    result = {object_type: "playground_id", object_id: parent_id}
    result[:generation] = generation if generation
    result
  end
end
212
+ end
213
+ end
214
+ end
@@ -0,0 +1,64 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "json"
4
+
5
+ module Braintrust
6
+ module Server
7
+ module Services
8
# Framework-agnostic service that lists the registered evaluators.
# The result is a plain Hash (not a Rack triplet) suitable for JSON.dump.
class List
  # @param evaluators [#each, #call] a name => evaluator mapping, or a
  #   callable returning one (re-invoked on every #call)
  def initialize(evaluators)
    @evaluators = evaluators
  end

  # Builds the listing: one entry per evaluator, keyed by evaluator name.
  # Each entry has "scores" and, when the evaluator declares any,
  # "parameters".
  #
  # @return [Hash]
  def call
    current_evaluators.each_with_object({}) do |(evaluator_name, evaluator), listing|
      listing[evaluator_name] = describe(evaluator)
    end
  end

  private

  # Returns the evaluator mapping, invoking @evaluators when it is callable.
  def current_evaluators
    @evaluators.respond_to?(:call) ? @evaluators.call : @evaluators
  end

  # Builds the listing entry for a single evaluator.
  def describe(evaluator)
    scorers = evaluator.scorers || []
    score_entries = scorers.each_with_index.map do |scorer, index|
      # Scorers without a #name get a positional fallback name.
      label = scorer.respond_to?(:name) ? scorer.name : "score_#{index}"
      {"name" => label}
    end

    entry = {"scores" => score_entries}
    serialized = serialize_parameters(evaluator.parameters)
    entry["parameters"] = serialized if serialized
    entry
  end

  # Convert user-defined parameters to the dev server protocol format.
  # Wraps in a staticParameters container with "data" typed entries.
  # Returns nil when there are no parameters; specs that are not Hashes
  # are left out of the schema.
  def serialize_parameters(parameters)
    return nil if !parameters || parameters.empty?

    schema = parameters.each_with_object({}) do |(param_name, spec), collected|
      next unless spec.is_a?(Hash)

      # Accept both symbol- and string-keyed specs.
      normalized = spec.transform_keys(&:to_s)
      collected[param_name.to_s] = {
        "type" => "data",
        "schema" => {"type" => normalized["type"] || "string"},
        "default" => normalized["default"],
        "description" => normalized["description"]
      }
    end

    {
      "type" => "braintrust.staticParameters",
      "schema" => schema,
      "source" => nil
    }
  end
end
62
+ end
63
+ end
64
+ end
@@ -0,0 +1,108 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "internal/callable"
4
+
5
+ module Braintrust
6
# Task wraps a callable that processes inputs.
#
# Use inline with a block (keyword args):
#   task = Task.new("my_task") { |input:| process(input) }
#
# Or include in a class and define #call with keyword args:
#   class MyTask
#     include Braintrust::Task
#
#     def call(input:)
#       process(input)
#     end
#   end
#
# Legacy callables with 1 positional param are auto-wrapped for
# backwards compatibility but emit a deprecation warning.
module Task
  DEFAULT_NAME = "task"

  # @param base [Class] the class including Task
  def self.included(base)
    base.include(Callable)
  end

  # Create a block-based task.
  #
  # @param name [String, nil] optional name (defaults to "task")
  # @param block [Proc] the task implementation; declare only the keyword
  #   args you need (e.g. +|input:|+). Extra kwargs passed by the caller
  #   are filtered out automatically.
  # @return [Task::Block]
  # @raise [ArgumentError] if no block is given or the block has
  #   unsupported arity
  def self.new(name = nil, &block)
    Block.new(name: name || DEFAULT_NAME, &block)
  end

  # Included into classes that +include Task+. Prepends KeywordFilter
  # so #call receives only its declared kwargs, and provides a default #name.
  module Callable
    # @param base [Class] the class including Callable
    def self.included(base)
      base.prepend(Internal::Callable::KeywordFilter)
    end

    # Default name derived from the class name (e.g. MyTask -> "my_task").
    # Falls back to Task::DEFAULT_NAME for anonymous classes.
    # @return [String]
    def name
      klass = self.class.name&.split("::")&.last
      return Task::DEFAULT_NAME unless klass
      klass.gsub(/([a-z])([A-Z])/, '\1_\2').downcase
    end
  end

  # Block-based task. Stores a Proc and delegates #call to it.
  # Includes Task so it satisfies +Task ===+ checks (e.g. in Context::Factory).
  # Exposes #call_parameters so KeywordFilter can introspect the block's
  # declared kwargs rather than Block#call's **kwargs signature.
  class Block
    include Task

    # @return [String]
    attr_reader :name

    # @param name [String] task name
    # @param block [Proc] task implementation
    # @raise [ArgumentError] if no block is given or the block has
    #   unsupported arity
    def initialize(name: DEFAULT_NAME, &block)
      # Fail with a clear message instead of a NoMethodError on nil
      # when the block is inspected in #wrap_block.
      raise ArgumentError, "Task requires a block" unless block

      @name = name
      @block = wrap_block(block)
    end

    # @param kwargs [Hash] keyword arguments (filtered by KeywordFilter)
    # @return [Object] result of the block
    def call(**kwargs)
      @block.call(**kwargs)
    end

    # Exposes the block's parameter list so KeywordFilter can filter
    # kwargs to match the block's declared keywords.
    # @return [Array<Array>] parameter list from Proc#parameters
    def call_parameters
      @block.parameters
    end

    private

    # Legacy positional wrapping: arity 1/-1 gets :input extracted.
    # Keyword and zero-arity blocks are stored raw; KeywordFilter handles filtering at call time.
    # @param block [Proc]
    # @return [Proc]
    # @raise [ArgumentError] if the block takes neither keywords, no
    #   arguments, nor a single positional argument
    def wrap_block(block)
      params = block.parameters
      if Internal::Callable::KeywordFilter.has_any_keywords?(params) || block.arity == 0
        block
      elsif block.arity == 1 || block.arity == -1
        Log.warn_once(:task_positional, "Task with positional param (input) is deprecated. Use keyword args: ->(input:) { ... } instead.")
        # Adapt the legacy positional block to the keyword calling convention.
        ->(**kw) { block.call(kw[:input]) }
      else
        raise ArgumentError, "Task must accept keyword args or 1 positional param (got arity #{block.arity})"
      end
    end
  end
end
108
+ end
@@ -80,11 +80,6 @@ module Braintrust
80
80
  # Determine if a span should be forwarded to the wrapped processor
81
81
  # based on configured filters
82
82
  def should_forward_span?(span)
83
- # Always keep root spans (spans with no parent)
84
- # Check if parent_span_id is the invalid/zero span ID
85
- is_root = span.parent_span_id == OpenTelemetry::Trace::INVALID_SPAN_ID
86
- return true if is_root
87
-
88
83
  # If no filters, keep everything
89
84
  return true if @filters.empty?
90
85
 
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Braintrust
4
- VERSION = "0.2.0"
4
+ VERSION = "0.3.0"
5
5
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: braintrust
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Braintrust
@@ -193,6 +193,7 @@ files:
193
193
  - lib/braintrust/api/datasets.rb
194
194
  - lib/braintrust/api/functions.rb
195
195
  - lib/braintrust/api/internal/auth.rb
196
+ - lib/braintrust/api/internal/btql.rb
196
197
  - lib/braintrust/api/internal/experiments.rb
197
198
  - lib/braintrust/api/internal/projects.rb
198
199
  - lib/braintrust/config.rb
@@ -214,6 +215,14 @@ files:
214
215
  - lib/braintrust/contrib/openai/patcher.rb
215
216
  - lib/braintrust/contrib/patcher.rb
216
217
  - lib/braintrust/contrib/rails/railtie.rb
218
+ - lib/braintrust/contrib/rails/server.rb
219
+ - lib/braintrust/contrib/rails/server/application_controller.rb
220
+ - lib/braintrust/contrib/rails/server/engine.rb
221
+ - lib/braintrust/contrib/rails/server/eval_controller.rb
222
+ - lib/braintrust/contrib/rails/server/generator.rb
223
+ - lib/braintrust/contrib/rails/server/health_controller.rb
224
+ - lib/braintrust/contrib/rails/server/list_controller.rb
225
+ - lib/braintrust/contrib/rails/server/routes.rb
217
226
  - lib/braintrust/contrib/registry.rb
218
227
  - lib/braintrust/contrib/ruby_llm/deprecated.rb
219
228
  - lib/braintrust/contrib/ruby_llm/instrumentation/chat.rb
@@ -234,6 +243,7 @@ files:
234
243
  - lib/braintrust/eval.rb
235
244
  - lib/braintrust/eval/case.rb
236
245
  - lib/braintrust/eval/cases.rb
246
+ - lib/braintrust/eval/context.rb
237
247
  - lib/braintrust/eval/evaluator.rb
238
248
  - lib/braintrust/eval/formatter.rb
239
249
  - lib/braintrust/eval/functions.rb
@@ -241,6 +251,9 @@ files:
241
251
  - lib/braintrust/eval/runner.rb
242
252
  - lib/braintrust/eval/scorer.rb
243
253
  - lib/braintrust/eval/summary.rb
254
+ - lib/braintrust/eval/trace.rb
255
+ - lib/braintrust/functions.rb
256
+ - lib/braintrust/internal/callable.rb
244
257
  - lib/braintrust/internal/encoding.rb
245
258
  - lib/braintrust/internal/env.rb
246
259
  - lib/braintrust/internal/http.rb
@@ -250,6 +263,7 @@ files:
250
263
  - lib/braintrust/internal/time.rb
251
264
  - lib/braintrust/logger.rb
252
265
  - lib/braintrust/prompt.rb
266
+ - lib/braintrust/scorer.rb
253
267
  - lib/braintrust/server.rb
254
268
  - lib/braintrust/server/auth/clerk_token.rb
255
269
  - lib/braintrust/server/auth/no_auth.rb
@@ -261,9 +275,12 @@ files:
261
275
  - lib/braintrust/server/rack.rb
262
276
  - lib/braintrust/server/rack/app.rb
263
277
  - lib/braintrust/server/router.rb
278
+ - lib/braintrust/server/services/eval_service.rb
279
+ - lib/braintrust/server/services/list_service.rb
264
280
  - lib/braintrust/server/sse.rb
265
281
  - lib/braintrust/setup.rb
266
282
  - lib/braintrust/state.rb
283
+ - lib/braintrust/task.rb
267
284
  - lib/braintrust/trace.rb
268
285
  - lib/braintrust/trace/attachment.rb
269
286
  - lib/braintrust/trace/span_exporter.rb