braintrust 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +148 -24
  3. data/lib/braintrust/api/internal/btql.rb +124 -0
  4. data/lib/braintrust/api/internal/experiments.rb +19 -0
  5. data/lib/braintrust/api/internal/projects.rb +19 -0
  6. data/lib/braintrust/contrib/rails/server/application_controller.rb +34 -0
  7. data/lib/braintrust/contrib/rails/server/engine.rb +72 -0
  8. data/lib/braintrust/contrib/rails/server/eval_controller.rb +36 -0
  9. data/lib/braintrust/contrib/rails/server/generator.rb +43 -0
  10. data/lib/braintrust/contrib/rails/server/health_controller.rb +15 -0
  11. data/lib/braintrust/contrib/rails/server/list_controller.rb +16 -0
  12. data/lib/braintrust/contrib/rails/server/routes.rb +8 -0
  13. data/lib/braintrust/contrib/rails/server.rb +20 -0
  14. data/lib/braintrust/dataset.rb +6 -3
  15. data/lib/braintrust/eval/context.rb +131 -0
  16. data/lib/braintrust/eval/evaluator.rb +11 -5
  17. data/lib/braintrust/eval/functions.rb +10 -166
  18. data/lib/braintrust/eval/runner.rb +165 -145
  19. data/lib/braintrust/eval/scorer.rb +24 -96
  20. data/lib/braintrust/eval/trace.rb +129 -0
  21. data/lib/braintrust/eval.rb +60 -132
  22. data/lib/braintrust/functions.rb +168 -0
  23. data/lib/braintrust/internal/callable.rb +83 -0
  24. data/lib/braintrust/logger.rb +9 -0
  25. data/lib/braintrust/scorer.rb +173 -0
  26. data/lib/braintrust/server/handlers/eval.rb +8 -168
  27. data/lib/braintrust/server/handlers/list.rb +3 -41
  28. data/lib/braintrust/server/rack.rb +2 -0
  29. data/lib/braintrust/server/services/eval_service.rb +214 -0
  30. data/lib/braintrust/server/services/list_service.rb +64 -0
  31. data/lib/braintrust/task.rb +108 -0
  32. data/lib/braintrust/trace/span_processor.rb +0 -5
  33. data/lib/braintrust/version.rb +1 -1
  34. metadata +18 -1
@@ -0,0 +1,83 @@
1
+ # frozen_string_literal: true
2
+
3
module Braintrust
  module Internal
    module Callable
      # Filters keyword arguments so callers can pass a superset of kwargs
      # and the receiver only gets the ones it declared. This avoids Ruby 3.2+
      # ArgumentError for unknown keywords without requiring ** on every definition.
      #
      # When prepended on a class, intercepts #call and slices kwargs to match
      # the declared parameters before forwarding. Methods with **keyrest
      # receive all kwargs unfiltered.
      #
      # @example
      #   class Greeter
      #     prepend Internal::Callable::KeywordFilter
      #     def call(name:)
      #       "hello #{name}"
      #     end
      #   end
      #   Greeter.new.call(name: "world", extra: "ignored") # => "hello world"
      module KeywordFilter
        # Filter kwargs to only the keyword params declared by the given parameters list.
        # Returns kwargs unchanged if parameters include **keyrest.
        #
        # @param params [Array<Array>] parameter list from Proc#parameters or Method#parameters
        # @param kwargs [Hash] keyword arguments to filter
        # @return [Hash] filtered keyword arguments
        def self.filter(params, kwargs)
          return kwargs if has_keyword_splat?(params)

          declared_keys = params
            .select { |type, _| type == :keyreq || type == :key }
            .map(&:last)
          kwargs.slice(*declared_keys)
        end

        # Wrap a Proc to filter kwargs to only its declared keyword params.
        # Returns the block unchanged if it accepts **keyrest.
        #
        # @param block [Proc] the block to wrap
        # @return [Proc] a wrapper that filters kwargs, or the original block
        def self.wrap_block(block)
          # Read the parameter list once here instead of on every call of the wrapper.
          params = block.parameters
          return block if has_keyword_splat?(params)

          ->(**kw) { block.call(**filter(params, kw)) }
        end

        # Whether params include ** (keyword splat / keyrest).
        #
        # @param params [Array<Array>] parameter list
        # @return [Boolean]
        def self.has_keyword_splat?(params)
          params.any? { |type, _| type == :keyrest }
        end

        # Whether params include any keyword parameters (key, keyreq, or keyrest).
        #
        # @param params [Array<Array>] parameter list
        # @return [Boolean]
        def self.has_any_keywords?(params)
          params.any? { |type, _| type == :keyreq || type == :key || type == :keyrest }
        end

        # When prepended, filters kwargs before the next #call in the ancestor chain.
        # If the instance defines #call_parameters, uses those.
        # Otherwise introspects the #call that sits directly after this module
        # in the method resolution order.
        #
        # @param kwargs [Hash] keyword arguments
        # @return [Object] result of the filtered #call
        def call(**kwargs)
          params = if respond_to?(:call_parameters)
            call_parameters
          else
            # method(:call) is the outermost #call. When another module is
            # prepended outside KeywordFilter, its super_method is this very
            # method (**kwargs), which would disable filtering. Walk down to
            # this module's own entry and take the method after it instead.
            impl = method(:call)
            impl = impl.super_method until impl.nil? || impl.owner == KeywordFilter
            impl = impl&.super_method
            return super unless impl
            impl.parameters
          end
          super(**KeywordFilter.filter(params, kwargs))
        end
      end
    end
  end
end
@@ -8,6 +8,7 @@ module Braintrust
8
8
  # Default to WARN unless BRAINTRUST_DEBUG is set
9
9
  level = ENV["BRAINTRUST_DEBUG"] ? Logger::DEBUG : Logger::WARN
10
10
  @logger = Logger.new($stderr, level: level)
11
+ @warned = Set.new
11
12
 
12
13
  class << self
13
14
  attr_accessor :logger
@@ -24,6 +25,14 @@ module Braintrust
24
25
  @logger.warn(message)
25
26
  end
26
27
 
28
# Log +message+ at WARN level the first time +key+ is seen.
# Later calls with a key that was already recorded do nothing and return nil.
#
# @param key [Object] identifier used to de-duplicate warnings
# @param message [String] text passed to the logger's #warn
def warn_once(key, message)
  # Set#add? returns nil when the key is already present, so the
  # membership test and the insert happen in a single call.
  return unless @warned.add?(key)
  @logger.warn(message)
end
35
+
27
36
  def error(message)
28
37
  @logger.error(message)
29
38
  end
@@ -0,0 +1,173 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "internal/callable"
4
+
5
+ module Braintrust
6
+ # Scorer wraps a scoring function that evaluates task output against expected values.
7
+ #
8
+ # Use inline with a block (keyword args):
9
+ # scorer = Scorer.new("my_scorer") { |expected:, output:| output == expected ? 1.0 : 0.0 }
10
+ #
11
+ # Or include in a class and define #call with keyword args:
12
+ # class FuzzyMatch
13
+ # include Braintrust::Scorer
14
+ #
15
+ # def call(expected:, output:)
16
+ # output == expected ? 1.0 : 0.0
17
+ # end
18
+ # end
19
+ #
20
+ # Legacy callables with 3 or 4 positional params are auto-wrapped for
21
+ # backwards compatibility but emit a deprecation warning.
22
+ module Scorer
23
+ DEFAULT_NAME = "scorer"
24
+
25
# Hook run when a class does +include Scorer+; also mixes Callable
# into that class.
# @param base [Class] the class including Scorer
def self.included(base)
  base.send(:include, Callable)
end
29
+
30
# Build a scorer from a block.
#
# @param name [String, nil] scorer name; DEFAULT_NAME ("scorer") is used when nil
# @param block [Proc] the scoring implementation; declare only the keyword
#   args you need. Extra kwargs are filtered out automatically.
#
#   Supported kwargs: +input:+, +expected:+, +output:+, +metadata:+, +trace:+
# @return [Scorer::Block]
# @raise [ArgumentError] if the block has unsupported arity
def self.new(name = nil, &block)
  scorer_name = name || DEFAULT_NAME
  Block.new(name: scorer_name, &block)
end
42
+
43
+ # Included into classes that +include Scorer+. Prepends KeywordFilter and
44
+ # ResultNormalizer so #call receives only declared kwargs and always returns
45
+ # Array<Hash>. Also provides a default #name and #call_parameters.
46
+ module Callable
47
# Normalizes the raw return value of #call into Array<Hash>.
# Nested inside Callable because it depends on #name which Callable provides.
module ResultNormalizer
  # Runs the next #call in the ancestor chain and normalizes its result.
  # @return [Array<Hash>] normalized score hashes with :score, :metadata, :name keys
  # @raise [ArgumentError] if the result is not a Numeric, a Hash with a
  #   Numeric :score, or an Array of such Hashes
  def call(**kwargs)
    normalize_score_result(super)
  end

  private

  # @param result [Numeric, Hash, Array<Hash>] raw return value from #call
  # @return [Array<Hash>] one or more score hashes with :score, :metadata, :name keys
  # @raise [ArgumentError] if any score value is not Numeric
  def normalize_score_result(result)
    case result
    when Array then result.map { |item| normalize_score_item(item) }
    when Hash then [normalize_score_item(result)]
    when Numeric then [{score: result, metadata: nil, name: name}]
    else
      raise ArgumentError, "#{name}: score must be Numeric, got #{result.inspect}"
    end
  end

  # Fills in missing :name from the scorer and validates :score.
  #
  # The hash returned by the scorer is never modified: when :name has to be
  # filled in, a copy is returned. This keeps frozen hashes usable and stops
  # a hash shared between scorers from keeping the first scorer's name.
  #
  # @param item [Hash] a score hash with at least a :score key
  # @return [Hash] +item+ itself when it already has a :name, otherwise a copy with :name set
  # @raise [ArgumentError] if +item+ is not a Hash or :score is not Numeric
  def normalize_score_item(item)
    unless item.is_a?(Hash)
      raise ArgumentError, "#{name}: score must be a Hash with a Numeric :score, got #{item.inspect}"
    end

    item = item.merge(name: name) unless item[:name]
    raise ArgumentError, "#{item[:name]}: score must be Numeric, got #{item[:score].inspect}" unless item[:score].is_a?(Numeric)
    item
  end
end
80
+
81
+ # Infrastructure modules prepended onto every scorer class.
82
+ # Used both to set up the ancestor chain and to skip past them in
83
+ # #call_parameters so KeywordFilter sees the real call signature.
84
+ PREPENDED = [Internal::Callable::KeywordFilter, ResultNormalizer].freeze
85
+
86
# Hook run when a class mixes in Callable: prepends every module listed in
# PREPENDED, in list order, onto that class.
# @param base [Class] the class including Callable
def self.included(base)
  PREPENDED.each do |infrastructure|
    base.prepend(infrastructure)
  end
end
90
+
91
# Default scorer name: the unqualified class name in snake_case
# (e.g. FuzzyMatch -> "fuzzy_match"). Falls back to Scorer::DEFAULT_NAME
# when no class name is available.
# @return [String]
def name
  short_name = self.class.name&.split("::")&.last
  if short_name
    # Insert "_" at every lowercase-to-uppercase boundary, then lowercase all.
    short_name.gsub(/([a-z])([A-Z])/, '\1_\2').downcase
  else
    Scorer::DEFAULT_NAME
  end
end
98
+
99
# Supplies KeywordFilter with the parameter list of the real #call.
# Starting at the outermost #call, steps past every method owned by a
# PREPENDED module (while a further super method exists) so the signature
# reported is the one declared by the including class.
# Block overrides this to point directly at @block.parameters.
# @return [Array<Array>] parameter list
def call_parameters
  current = method(:call)
  loop do
    parent = current.super_method
    break unless parent && PREPENDED.include?(current.owner)
    current = parent
  end
  current.parameters
end
109
+ end
110
+
111
# Block-based scorer. Stores a Proc and delegates #call to it.
# Includes Scorer so it satisfies +Scorer ===+ checks (e.g. in Context::Factory).
# Exposes #call_parameters so KeywordFilter can introspect the block's
# declared kwargs rather than Block#call's **kwargs signature.
class Block
  include Scorer

  # @return [String]
  attr_reader :name

  # @param name [String] scorer name
  # @param block [Proc] scoring implementation
  # @raise [ArgumentError] if no block is given or the block has unsupported arity
  def initialize(name: DEFAULT_NAME, &block)
    # Without this guard a missing block fails later with NoMethodError on nil.
    raise ArgumentError, "Scorer requires a block" unless block

    @name = name
    @block = wrap_block(block)
  end

  # @param kwargs [Hash] keyword arguments (filtered by KeywordFilter)
  # @return [Array<Hash>] normalized score results
  def call(**kwargs)
    @block.call(**kwargs)
  end

  # Exposes the stored block's parameter list so KeywordFilter can filter
  # kwargs to match the block's declared keywords.
  # @return [Array<Array>] parameter list from Proc#parameters
  def call_parameters
    @block.parameters
  end

  private

  # Legacy positional wrapping: arity 3/4/-4/-1 maps to (input, expected, output[, metadata]).
  # Keyword and zero-arity blocks are stored raw; KeywordFilter handles filtering at call time.
  # @param block [Proc]
  # @return [Proc]
  # @raise [ArgumentError] if the block takes neither keywords nor 3-4 positional params
  def wrap_block(block)
    return block if block.arity == 0 || Internal::Callable::KeywordFilter.has_any_keywords?(block.parameters)

    case block.arity
    when 3
      Log.warn_once(:scorer_positional_3, "Scorer with positional params (input, expected, output) is deprecated. Use keyword args: |input:, expected:, output:| instead.")
      ->(**kw) { block.call(kw[:input], kw[:expected], kw[:output]) }
    when 4, -4, -1
      Log.warn_once(:scorer_positional_4, "Scorer with positional params (input, expected, output, metadata) is deprecated. Use keyword args: |input:, expected:, output:, metadata:| instead.")
      ->(**kw) { block.call(kw[:input], kw[:expected], kw[:output], kw[:metadata]) }
    else
      raise ArgumentError, "Scorer must accept keyword args or 3-4 positional params (got arity #{block.arity})"
    end
  end
end
165
+
166
+ # Value object wrapping a remote scorer function UUID.
167
+ # Used by Eval.run to distinguish remote scorers from local callables.
168
+ ID = Struct.new(:function_id, :version, keyword_init: true)
169
+ end
170
+
171
+ # @deprecated Use {Braintrust::Scorer::ID} instead.
172
+ ScorerId = Scorer::ID
173
+ end
@@ -10,38 +10,15 @@ module Braintrust
10
10
  class Eval
11
11
  def initialize(evaluators)
12
12
  @evaluators = evaluators
13
+ @service = Services::Eval.new(evaluators)
13
14
  end
14
15
 
15
16
  def call(env)
16
17
  body = parse_body(env)
17
18
  return error_response(400, "Invalid JSON body") unless body
18
19
 
19
- name = body["name"]
20
- return error_response(400, "Missing required field: name") unless name
21
-
22
- evaluator = @evaluators[name]
23
- return error_response(404, "Evaluator '#{name}' not found") unless evaluator
24
-
25
- data = body["data"]
26
- return error_response(400, "Missing required field: data") unless data
27
-
28
- # Validate exactly one data source
29
- data_sources = ["data", "dataset_name", "dataset_id"].count { |k| data.key?(k) }
30
- return error_response(400, "Exactly one data source required") if data_sources != 1
31
-
32
- experiment_name = body["experiment_name"]
33
-
34
- # Resolve data source
35
- cases, dataset = resolve_data_source(data)
36
-
37
- # Resolve remote scorers from request
38
- remote_scorer_ids = resolve_remote_scorers(body["scores"])
39
-
40
- # Resolve parent span context
41
- parent = resolve_parent(body["parent"])
42
-
43
- # Build state from auth context (if present)
44
- state = build_state(env)
20
+ result = @service.validate(body)
21
+ return error_response(result[:status], result[:error]) if result[:error]
45
22
 
46
23
  # The protocol-rack adapter (used by Falcon and any server built on
47
24
  # protocol-http) buffers `each`-based bodies through an Enumerable path.
@@ -50,64 +27,7 @@ module Braintrust
50
27
  body_class = env.key?("protocol.http.request") ? SSEStreamBody : SSEBody
51
28
 
52
29
  sse_body = body_class.new do |sse|
53
- # Only pass project/experiment params when state is available
54
- run_opts = {
55
- on_progress: ->(progress_data) {
56
- # Build remote eval protocol events from generic progress data.
57
- # Runner provides: id, data/error, scores (optional), origin (optional).
58
- # Protocol requires: id, object_type, origin, name, format, output_type, event, data.
59
- base = {
60
- "object_type" => "task",
61
- "name" => name,
62
- "format" => "code",
63
- "output_type" => "completion"
64
- }
65
- base["id"] = progress_data["id"] if progress_data["id"]
66
- base["origin"] = progress_data["origin"] if progress_data["origin"]
67
-
68
- if progress_data.key?("error")
69
- sse.event("progress", JSON.dump(base.merge("event" => "error", "data" => progress_data["error"])))
70
- else
71
- sse.event("progress", JSON.dump(base.merge("event" => "json_delta", "data" => JSON.dump(progress_data["data"]))))
72
- end
73
-
74
- # Signal per-cell completion so the UI exits "Streaming..." state
75
- # and updates the progress bar immediately.
76
- sse.event("progress", JSON.dump(base.merge("event" => "done", "data" => "")))
77
- },
78
- quiet: true
79
- }
80
- run_opts[:parent] = parent if parent
81
- run_opts[:scorers] = remote_scorer_ids if remote_scorer_ids
82
- run_opts[:dataset] = dataset if dataset
83
-
84
- if state
85
- run_opts[:state] = state
86
- run_opts[:experiment] = experiment_name if experiment_name
87
- run_opts[:project_id] = body["project_id"] if body["project_id"]
88
- end
89
-
90
- result = evaluator.run(cases, **run_opts)
91
-
92
- # Flush buffered OTLP spans before sending completion events.
93
- # The BatchSpanProcessor exports every ~5s; fast evals can finish
94
- # before a single export fires, causing the UI to see no results.
95
- Braintrust::Trace.flush_spans
96
-
97
- # Build summary from result scores
98
- averaged_scores = {}
99
- result.scorer_stats.each do |scorer_name, stats|
100
- averaged_scores[scorer_name] = stats.score_mean
101
- end
102
-
103
- sse.event("summary", JSON.dump({
104
- "scores" => averaged_scores,
105
- "experiment_name" => experiment_name,
106
- "experiment_id" => result.experiment_id,
107
- "project_id" => result.project_id
108
- }))
109
-
110
- sse.event("done", "")
30
+ @service.stream(result, auth: env["braintrust.auth"], sse: sse)
111
31
  end
112
32
 
113
33
  [200, {"content-type" => "text/event-stream", "cache-control" => "no-cache", "connection" => "keep-alive"}, sse_body]
@@ -115,90 +35,6 @@ module Braintrust
115
35
 
116
36
  private
117
37
 
118
- # Resolve data source from the data field.
119
- # Returns [cases, dataset] where exactly one is non-nil.
120
- def resolve_data_source(data)
121
- if data.key?("data")
122
- cases = data["data"].map do |d|
123
- {input: d["input"], expected: d["expected"]}
124
- end
125
- [cases, nil]
126
- elsif data.key?("dataset_id")
127
- [nil, Braintrust::DatasetId.new(id: data["dataset_id"])]
128
- elsif data.key?("dataset_name")
129
- dataset_opts = {name: data["dataset_name"]}
130
- dataset_opts[:project] = data["project_name"] if data["project_name"]
131
- [nil, dataset_opts]
132
- else
133
- [nil, nil]
134
- end
135
- end
136
-
137
- # Map request scores array to ScorerId structs.
138
- # The UI sends function_id as a nested object: {"function_id": "uuid"}.
139
- def resolve_remote_scorers(scores)
140
- return nil if scores.nil? || scores.empty?
141
- scores.map do |s|
142
- func_id = s["function_id"]
143
- func_id = func_id["function_id"] if func_id.is_a?(Hash)
144
- Braintrust::ScorerId.new(
145
- function_id: func_id,
146
- version: s["version"]
147
- )
148
- end
149
- end
150
-
151
- # Map request parent to symbol-keyed Hash.
152
- # Hardcode playground_id to match Java SDK behavior.
153
- # Also extracts generation from propagated_event for span_attributes.
154
- def resolve_parent(parent)
155
- return nil unless parent.is_a?(Hash)
156
- object_id = parent["object_id"]
157
- return nil unless object_id
158
-
159
- generation = parent.dig("propagated_event", "span_attributes", "generation")
160
-
161
- result = {object_type: "playground_id", object_id: object_id}
162
- result[:generation] = generation if generation
163
- result
164
- end
165
-
166
- # Build State from auth context set by Auth middleware.
167
- # Returns nil when no auth context is present (e.g. NoAuth strategy).
168
- # Uses an LRU-style cache (max 64 entries) keyed by [api_key, app_url, org_name].
169
- def build_state(env)
170
- auth = env["braintrust.auth"]
171
- return nil unless auth.is_a?(Hash)
172
-
173
- cache_key = [auth["api_key"], auth["app_url"], auth["org_name"]]
174
-
175
- @state_mutex ||= Mutex.new
176
- @state_cache ||= {}
177
-
178
- @state_mutex.synchronize do
179
- cached = @state_cache[cache_key]
180
- return cached if cached
181
-
182
- state = Braintrust::State.new(
183
- api_key: auth["api_key"],
184
- org_id: auth["org_id"],
185
- org_name: auth["org_name"],
186
- app_url: auth["app_url"],
187
- api_url: auth["api_url"],
188
- enable_tracing: false
189
- )
190
-
191
- # Evict oldest entry if cache is full
192
- if @state_cache.size >= 64
193
- oldest_key = @state_cache.keys.first
194
- @state_cache.delete(oldest_key)
195
- end
196
-
197
- @state_cache[cache_key] = state
198
- state
199
- end
200
- end
201
-
202
38
  def parse_body(env)
203
39
  body = env["rack.input"]&.read
204
40
  return nil if body.nil? || body.empty?
@@ -211,6 +47,10 @@ module Braintrust
211
47
  [status, {"content-type" => "application/json"},
212
48
  [JSON.dump({"error" => message})]]
213
49
  end
50
+
51
# Hands the "braintrust.auth" entry of +env+ to the eval service's
# #build_state and returns whatever the service returns.
# @param env [Hash] request environment passed to the handler
def build_state(env)
  auth = env["braintrust.auth"]
  @service.build_state(auth)
end
214
54
  end
215
55
  end
216
56
  end
@@ -23,50 +23,12 @@ module Braintrust
23
23
  class List
24
24
  def initialize(evaluators)
25
25
  @evaluators = evaluators
26
+ @service = Services::List.new(evaluators)
26
27
  end
27
28
 
28
29
  def call(_env)
29
- result = {}
30
- @evaluators.each do |name, evaluator|
31
- scores = (evaluator.scorers || []).each_with_index.map do |scorer, i|
32
- scorer_name = scorer.respond_to?(:name) ? scorer.name : "score_#{i}"
33
- {"name" => scorer_name}
34
- end
35
- entry = {"scores" => scores}
36
- params = serialize_parameters(evaluator.parameters)
37
- entry["parameters"] = params if params
38
- result[name] = entry
39
- end
40
-
41
- [200, {"content-type" => "application/json"},
42
- [JSON.dump(result)]]
43
- end
44
-
45
- private
46
-
47
- # Convert user-defined parameters to the dev server protocol format.
48
- # Wraps in a staticParameters container with "data" typed entries.
49
- def serialize_parameters(parameters)
50
- return nil unless parameters && !parameters.empty?
51
-
52
- schema = {}
53
- parameters.each do |name, spec|
54
- spec = spec.transform_keys(&:to_s) if spec.is_a?(Hash)
55
- if spec.is_a?(Hash)
56
- schema[name.to_s] = {
57
- "type" => "data",
58
- "schema" => {"type" => spec["type"] || "string"},
59
- "default" => spec["default"],
60
- "description" => spec["description"]
61
- }
62
- end
63
- end
64
-
65
- {
66
- "type" => "braintrust.staticParameters",
67
- "schema" => schema,
68
- "source" => nil
69
- }
30
+ result = @service.call
31
+ [200, {"content-type" => "application/json"}, [JSON.dump(result)]]
70
32
  end
71
33
  end
72
34
  end
@@ -15,6 +15,8 @@ require_relative "auth/no_auth"
15
15
  require_relative "auth/clerk_token"
16
16
  require_relative "middleware/cors"
17
17
  require_relative "middleware/auth"
18
+ require_relative "services/list_service"
19
+ require_relative "services/eval_service"
18
20
  require_relative "handlers/health"
19
21
  require_relative "handlers/list"
20
22
  require_relative "handlers/eval"