braintrust 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +148 -24
  3. data/lib/braintrust/api/internal/btql.rb +124 -0
  4. data/lib/braintrust/api/internal/experiments.rb +19 -0
  5. data/lib/braintrust/api/internal/projects.rb +19 -0
  6. data/lib/braintrust/contrib/rails/server/application_controller.rb +34 -0
  7. data/lib/braintrust/contrib/rails/server/engine.rb +72 -0
  8. data/lib/braintrust/contrib/rails/server/eval_controller.rb +36 -0
  9. data/lib/braintrust/contrib/rails/server/generator.rb +43 -0
  10. data/lib/braintrust/contrib/rails/server/health_controller.rb +15 -0
  11. data/lib/braintrust/contrib/rails/server/list_controller.rb +16 -0
  12. data/lib/braintrust/contrib/rails/server/routes.rb +8 -0
  13. data/lib/braintrust/contrib/rails/server.rb +20 -0
  14. data/lib/braintrust/dataset.rb +6 -3
  15. data/lib/braintrust/eval/context.rb +131 -0
  16. data/lib/braintrust/eval/evaluator.rb +11 -5
  17. data/lib/braintrust/eval/functions.rb +10 -166
  18. data/lib/braintrust/eval/runner.rb +165 -145
  19. data/lib/braintrust/eval/scorer.rb +24 -96
  20. data/lib/braintrust/eval/trace.rb +129 -0
  21. data/lib/braintrust/eval.rb +60 -132
  22. data/lib/braintrust/functions.rb +168 -0
  23. data/lib/braintrust/internal/callable.rb +83 -0
  24. data/lib/braintrust/logger.rb +9 -0
  25. data/lib/braintrust/scorer.rb +173 -0
  26. data/lib/braintrust/server/handlers/eval.rb +8 -168
  27. data/lib/braintrust/server/handlers/list.rb +3 -41
  28. data/lib/braintrust/server/rack.rb +2 -0
  29. data/lib/braintrust/server/services/eval_service.rb +214 -0
  30. data/lib/braintrust/server/services/list_service.rb +64 -0
  31. data/lib/braintrust/task.rb +108 -0
  32. data/lib/braintrust/trace/span_processor.rb +0 -5
  33. data/lib/braintrust/version.rb +1 -1
  34. metadata +18 -1
@@ -0,0 +1,20 @@
1
+ # frozen_string_literal: true
2
+
3
+ begin
4
+ require "action_controller"
5
+ require "rails/engine"
6
+ rescue LoadError
7
+ raise LoadError,
8
+ "Rails (actionpack + railties) is required for the Braintrust Rails server engine. " \
9
+ "Add `gem 'rails'` or `gem 'actionpack'` and `gem 'railties'` to your Gemfile."
10
+ end
11
+
12
+ require "json"
13
+ require_relative "../../eval"
14
+ require_relative "../../server/sse"
15
+ require_relative "../../server/auth/no_auth"
16
+ require_relative "../../server/auth/clerk_token"
17
+ require_relative "../../server/middleware/cors"
18
+ require_relative "../../server/services/list_service"
19
+ require_relative "../../server/services/eval_service"
20
+ require_relative "server/engine"
@@ -181,9 +181,12 @@ module Braintrust
181
181
  created: raw["created"]
182
182
  )
183
183
  end
184
+
185
+ # Value object wrapping a dataset UUID for resolution by ID.
186
+ # Used by Eval.run to distinguish dataset-by-ID from dataset-by-name.
187
+ ID = Struct.new(:id, keyword_init: true)
184
188
  end
185
189
 
186
- # Value object wrapping a dataset UUID for resolution by ID.
187
- # Used by Eval.run to distinguish dataset-by-ID from dataset-by-name.
188
- DatasetId = Struct.new(:id, keyword_init: true)
190
+ # @deprecated Use {Braintrust::Dataset::ID} instead.
191
+ DatasetId = Dataset::ID
189
192
  end
@@ -0,0 +1,131 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "cases"
4
+
5
+ module Braintrust
6
+ module Eval
7
+ # Holds all normalized, ready-to-execute eval components.
8
+ # Use Context.build to construct from raw user inputs.
9
+ class Context
10
+ attr_reader :task, :scorers, :cases, :experiment_id, :experiment_name,
11
+ :project_id, :project_name, :state, :tracer_provider,
12
+ :on_progress, :parent_span_attr, :generation
13
+
14
+ def initialize(task:, scorers:, cases:, experiment_id: nil, experiment_name: nil,
15
+ project_id: nil, project_name: nil, state: nil, tracer_provider: nil,
16
+ on_progress: nil, parent_span_attr: nil, generation: nil)
17
+ @task = task
18
+ @scorers = scorers
19
+ @cases = cases
20
+ @experiment_id = experiment_id
21
+ @experiment_name = experiment_name
22
+ @project_id = project_id
23
+ @project_name = project_name
24
+ @state = state
25
+ @tracer_provider = tracer_provider
26
+ @on_progress = on_progress
27
+ @parent_span_attr = parent_span_attr
28
+ @generation = generation
29
+ end
30
+
31
+ # Build a Context from raw user inputs.
32
+ # Factory normalizes task, scorers, and cases into typed wrappers.
33
+ # Parent is resolved into parent_span_attr and generation.
34
+ def self.build(task:, scorers:, cases:, experiment_id: nil, experiment_name: nil,
35
+ project_id: nil, project_name: nil, state: nil, tracer_provider: nil,
36
+ on_progress: nil, parent: nil)
37
+ factory = Factory.new(state: state, tracer_provider: tracer_provider, project_name: project_name)
38
+
39
+ Context.new(
40
+ task: factory.normalize_task(task),
41
+ scorers: factory.normalize_scorers(scorers),
42
+ cases: factory.normalize_cases(cases),
43
+ experiment_id: experiment_id,
44
+ experiment_name: experiment_name,
45
+ project_id: project_id,
46
+ project_name: project_name,
47
+ state: state,
48
+ tracer_provider: tracer_provider,
49
+ on_progress: on_progress,
50
+ parent_span_attr: factory.resolve_parent_span_attr(parent),
51
+ generation: parent&.dig(:generation)
52
+ )
53
+ end
54
+
55
+ # Encapsulates normalization of raw user inputs into typed wrappers.
56
+ class Factory
57
+ def initialize(state: nil, tracer_provider: nil, project_name: nil)
58
+ @state = state
59
+ @tracer_provider = tracer_provider
60
+ @project_name = project_name
61
+ end
62
+
63
+ def normalize_cases(raw)
64
+ case raw
65
+ when Cases
66
+ raw
67
+ when Array, Enumerable
68
+ Cases.new(raw)
69
+ else
70
+ if raw.respond_to?(:each)
71
+ Cases.new(raw)
72
+ else
73
+ raise ArgumentError, "cases must be Array or Enumerable"
74
+ end
75
+ end
76
+ end
77
+
78
+ def resolve_parent_span_attr(parent)
79
+ return nil unless parent
80
+ "#{parent[:object_type]}:#{parent[:object_id]}"
81
+ end
82
+
83
+ def normalize_task(raw)
84
+ case raw
85
+ when Task
86
+ raw
87
+ when Proc
88
+ # Pass Proc/Lambda directly to preserve keyword arg info.
89
+ # Legacy positional lambdas (arity 1) are auto-wrapped by Task#wrap_block.
90
+ Task.new(&raw)
91
+ else
92
+ # Callable class: wrap via method(:call) to preserve keyword arg info
93
+ name = raw.respond_to?(:name) ? raw.name : nil
94
+ Task.new(name, &raw.method(:call))
95
+ end
96
+ end
97
+
98
+ def normalize_scorers(raw)
99
+ raw.map do |scorer|
100
+ case scorer
101
+ when String
102
+ raise ArgumentError, "project is required to resolve scorer slug '#{scorer}'" unless @project_name
103
+ Braintrust::Functions.scorer(
104
+ project: @project_name,
105
+ slug: scorer,
106
+ state: @state,
107
+ tracer_provider: @tracer_provider
108
+ )
109
+ when Braintrust::Scorer::ID
110
+ Braintrust::Functions.scorer(
111
+ id: scorer.function_id,
112
+ version: scorer.version,
113
+ state: @state,
114
+ tracer_provider: @tracer_provider
115
+ )
116
+ when Braintrust::Scorer
117
+ scorer
118
+ when Proc
119
+ # Pass Proc/Lambda directly to preserve keyword arg info
120
+ # (method(:call) loses parameter metadata)
121
+ Braintrust::Scorer.new(&scorer)
122
+ else
123
+ name = scorer.respond_to?(:name) ? scorer.name : nil
124
+ Braintrust::Scorer.new(name, &scorer.method(:call))
125
+ end
126
+ end
127
+ end
128
+ end
129
+ end
130
+ end
131
+ end
@@ -5,21 +5,27 @@ module Braintrust
5
5
  # Base class for evaluators. Subclass and override #task and #scorers,
6
6
  # or instantiate directly with keyword arguments.
7
7
  #
8
+ # Evaluators are used with the dev server, which reports scorer names
9
+ # to the Braintrust UI. Always use named scorers (via Scorer.new or
10
+ # subclass) so they display meaningfully.
11
+ #
8
12
  # @example Subclass pattern
9
13
  # class FoodClassifier < Braintrust::Eval::Evaluator
10
14
  # def task
11
- # ->(input) { classify(input) }
15
+ # ->(input:) { classify(input) }
12
16
  # end
13
17
  #
14
18
  # def scorers
15
- # [Braintrust::Eval.scorer("exact_match") { |i, e, o| o == e ? 1.0 : 0.0 }]
19
+ # [Braintrust::Scorer.new("exact_match") { |expected:, output:| output == expected ? 1.0 : 0.0 }]
16
20
  # end
17
21
  # end
18
22
  #
19
23
  # @example Inline pattern
20
24
  # Braintrust::Eval::Evaluator.new(
21
- # task: ->(input) { input.upcase },
22
- # scorers: [my_scorer]
25
+ # task: ->(input:) { input.upcase },
26
+ # scorers: [
27
+ # Braintrust::Scorer.new("exact_match") { |expected:, output:| output == expected ? 1.0 : 0.0 }
28
+ # ]
23
29
  # )
24
30
  class Evaluator
25
31
  attr_accessor :task, :scorers, :parameters
@@ -48,7 +54,7 @@ module Braintrust
48
54
  # @param project [String, nil] Project name
49
55
  # @param experiment [String, nil] Experiment name
50
56
  # @param project_id [String, nil] Project UUID (skips project creation)
51
- # @param dataset [String, Hash, Dataset, DatasetId, nil] Dataset to fetch
57
+ # @param dataset [String, Hash, Dataset, Dataset::ID, nil] Dataset to fetch
52
58
  # @param scorers [Array, nil] Additional scorers (merged with evaluator's own)
53
59
  # @param parent [Hash, nil] Parent span context
54
60
  # @param state [State, nil] Braintrust state
@@ -1,178 +1,22 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- require_relative "../api"
4
- require_relative "scorer"
5
- require "opentelemetry/sdk"
6
- require "json"
3
+ require_relative "../functions"
7
4
 
8
5
  module Braintrust
9
6
  module Eval
10
- # Functions provides remote function execution capabilities
11
- # Allows calling prompts hosted on Braintrust servers as tasks or scorers
7
+ # @deprecated Use {Braintrust::Functions} instead.
12
8
  module Functions
13
9
  class << self
14
- # Create a task callable that invokes a remote function
15
- # @param project [String] Project name
16
- # @param slug [String] Function slug
17
- # @param state [State, nil] Braintrust state (defaults to global)
18
- # @param tracer_provider [TracerProvider, nil] OpenTelemetry tracer provider
19
- # @return [Proc] Callable that accepts input and returns output
20
- def task(project:, slug:, state: nil, tracer_provider: nil)
21
- state ||= Braintrust.current_state
22
- raise Error, "No state available" unless state
23
-
24
- # Resolve function ID from project + slug
25
- api = API.new(state: state)
26
- function_metadata = resolve_function(api, project, slug)
27
- function_id = function_metadata["id"]
28
- function_name = function_metadata["name"] || slug
29
-
30
- # Get tracer for creating spans
31
- tracer_provider ||= OpenTelemetry.tracer_provider
32
- tracer = tracer_provider.tracer("braintrust.functions")
33
-
34
- # Return a lambda that invokes the remote function with tracing
35
- lambda do |input|
36
- # Create a span for the function invocation
37
- tracer.in_span("function: #{slug}") do |span|
38
- span.set_attribute("braintrust.span_attributes", JSON.dump({type: "function"}))
39
- span.set_attribute("braintrust.input_json", JSON.dump(input))
40
- span.set_attribute("braintrust.function.name", function_name)
41
- span.set_attribute("braintrust.function.id", function_id)
42
- span.set_attribute("braintrust.function.slug", slug)
43
-
44
- begin
45
- # Invoke the function via API
46
- output = api.functions.invoke(id: function_id, input: input)
47
- span.set_attribute("braintrust.output_json", JSON.dump(output))
48
- output
49
- rescue => e
50
- # Record exception and set error status
51
- span.record_exception(e)
52
- span.status = OpenTelemetry::Trace::Status.error(e.message)
53
- raise
54
- end
55
- end
56
- end
57
- end
58
-
59
- # Create a scorer that invokes a remote function by ID
60
- # @param id [String] Function UUID
61
- # @param version [String, nil] Optional version to pin to
62
- # @param state [State, nil] Braintrust state (defaults to global)
63
- # @param tracer_provider [TracerProvider, nil] OpenTelemetry tracer provider
64
- # @return [Scorer] Scorer object that invokes remote function
65
- def scorer_by_id(id:, state: nil, version: nil, tracer_provider: nil)
66
- state ||= Braintrust.current_state
67
- api = API.new(state: state)
68
- api.login
69
-
70
- function_metadata = api.functions.get(id: id, version: version)
71
- function_id = function_metadata["id"]
72
- function_name = function_metadata["name"] || id
73
-
74
- tracer_provider ||= OpenTelemetry.tracer_provider
75
- tracer = tracer_provider.tracer("braintrust.functions")
76
-
77
- build_scorer(function_id: function_id, function_name: function_name, api: api, tracer: tracer)
10
+ # @deprecated Use {Braintrust::Functions.task} instead.
11
+ def task(**kwargs)
12
+ Log.warn_once(:eval_functions_task, "Braintrust::Eval::Functions.task is deprecated: use Braintrust::Functions.task instead.")
13
+ Braintrust::Functions.task(**kwargs)
78
14
  end
79
15
 
80
- # Create a scorer that invokes a remote function
81
- # @param project [String] Project name
82
- # @param slug [String] Function slug
83
- # @param state [State, nil] Braintrust state (defaults to global)
84
- # @param tracer_provider [TracerProvider, nil] OpenTelemetry tracer provider
85
- # @return [Scorer] Scorer object that invokes remote function
86
- def scorer(project:, slug:, state: nil, tracer_provider: nil)
87
- state ||= Braintrust.current_state
88
- raise Error, "No state available" unless state
89
-
90
- # Resolve function ID from project + slug
91
- api = API.new(state: state)
92
- function_metadata = resolve_function(api, project, slug)
93
- function_id = function_metadata["id"]
94
- function_name = function_metadata["name"] || slug
95
-
96
- # Get tracer for creating spans
97
- tracer_provider ||= OpenTelemetry.tracer_provider
98
- tracer = tracer_provider.tracer("braintrust.functions")
99
-
100
- build_scorer(function_id: function_id, function_name: function_name, api: api, tracer: tracer)
101
- end
102
-
103
- private
104
-
105
- # Build a Scorer that invokes a remote function
106
- # Shared implementation used by both scorer and scorer_by_id
107
- # @param function_id [String] Function UUID
108
- # @param function_name [String] Function display name
109
- # @param api [API] Braintrust API client
110
- # @param tracer [OpenTelemetry::Trace::Tracer] Tracer instance
111
- # @return [Scorer]
112
- def build_scorer(function_id:, function_name:, api:, tracer:)
113
- Scorer.new(function_name) do |input, expected, output, metadata|
114
- tracer.in_span("function: #{function_name}") do |span|
115
- scorer_input = {
116
- input: input,
117
- expected: expected,
118
- output: output,
119
- metadata: metadata
120
- }
121
-
122
- span.set_attribute("braintrust.span_attributes", JSON.dump({type: "function"}))
123
- span.set_attribute("braintrust.input_json", JSON.dump(scorer_input))
124
- span.set_attribute("braintrust.function.name", function_name)
125
- span.set_attribute("braintrust.function.id", function_id)
126
-
127
- begin
128
- result = api.functions.invoke(id: function_id, input: scorer_input)
129
-
130
- score = case result
131
- when Numeric
132
- result.to_f
133
- when true
134
- 1.0
135
- when false
136
- 0.0
137
- when Hash
138
- if result.key?("score")
139
- result["score"].to_f
140
- else
141
- raise Error, "Hash result must contain 'score' key"
142
- end
143
- when String
144
- result.to_f
145
- when nil
146
- nil
147
- else
148
- raise Error, "Unsupported result type: #{result.class}"
149
- end
150
-
151
- span.set_attribute("braintrust.output_json", JSON.dump(score))
152
- score
153
- rescue => e
154
- span.record_exception(e)
155
- span.status = OpenTelemetry::Trace::Status.error(e.message)
156
- raise
157
- end
158
- end
159
- end
160
- end
161
-
162
- # Resolve function ID from project name and slug
163
- # @param api [API] API client
164
- # @param project [String] Project name
165
- # @param slug [String] Function slug
166
- # @return [Hash] Function metadata
167
- def resolve_function(api, project, slug)
168
- result = api.functions.list(project_name: project, slug: slug)
169
- functions = result["objects"]
170
-
171
- if functions.nil? || functions.empty?
172
- raise Error, "Function '#{slug}' not found in project '#{project}'"
173
- end
174
-
175
- functions.first
16
+ # @deprecated Use {Braintrust::Functions.scorer} instead.
17
+ def scorer(**kwargs)
18
+ Log.warn_once(:eval_functions_scorer, "Braintrust::Eval::Functions.scorer is deprecated: use Braintrust::Functions.scorer instead.")
19
+ Braintrust::Functions.scorer(**kwargs)
176
20
  end
177
21
  end
178
22
  end