langsmith-sdk 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +1 -1
- data/CHANGELOG.md +21 -1
- data/lib/langsmith/batch_processor.rb +4 -4
- data/lib/langsmith/client.rb +94 -0
- data/lib/langsmith/context.rb +45 -2
- data/lib/langsmith/evaluation/experiment_runner.rb +131 -0
- data/lib/langsmith/evaluation.rb +38 -0
- data/lib/langsmith/run.rb +15 -1
- data/lib/langsmith/run_tree.rb +17 -12
- data/lib/langsmith/version.rb +1 -1
- data/lib/langsmith.rb +1 -0
- metadata +5 -4
- data/langsmith.gemspec +0 -38
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 011ff7c7fef392b6ad9e53c8f0e281eceb63eb28fa74ceac651e5b313a0cd81e
|
|
4
|
+
data.tar.gz: aa7714d5f8c12b85dc139cf8ee53f54c14337acea77dec108ee71a5239bf368f
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 9e4e254230a934ed50aa5acfccc89624074cc136cb5e9013e3f778f2c00ef3eaf6a84504b40ecada07b2bba5f7eb1a08da8946f076cc2ab4f747fb6047c57708
|
|
7
|
+
data.tar.gz: a041143b6706394415bc9e14c2fdfc92a328bb69d20068f381eb0dd9a2dfa7adc2e29ebd63f9b4b99d15bb80f18dedad345371b05da4d2400eee306f47ec89eb
|
data/.rubocop.yml
CHANGED
data/CHANGELOG.md
CHANGED
|
@@ -7,6 +7,25 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|
|
7
7
|
|
|
8
8
|
## [Unreleased]
|
|
9
9
|
|
|
10
|
+
## [0.3.0] - 2026-02-11
|
|
11
|
+
|
|
12
|
+
### Added
|
|
13
|
+
|
|
14
|
+
- Evaluation module and `ExperimentRunner` for running experiments against datasets
|
|
15
|
+
- Evaluator protocol with support for custom evaluators in `ExperimentRunner`
|
|
16
|
+
- `Client#create_feedback` for submitting evaluation feedback (`POST /api/v1/feedback`)
|
|
17
|
+
- `Client#read_run` for fetching run details (`GET /api/v1/runs/:run_id`)
|
|
18
|
+
- Evaluation API methods: `list_examples`, `create_experiment`, `close_experiment`
|
|
19
|
+
- Evaluation context wired into run creation
|
|
20
|
+
- Root run ID tracking in evaluation context and `RunTree`
|
|
21
|
+
- `reference_example_id` and `session_id` attributes on `Run`
|
|
22
|
+
|
|
23
|
+
### Fixed
|
|
24
|
+
|
|
25
|
+
- Retry with delay for `read_run` after flush for reliable evaluation reads
|
|
26
|
+
- Graceful error handling separating user block errors from evaluator errors
|
|
27
|
+
- Root run ID read before `with_evaluation` clears it
|
|
28
|
+
|
|
10
29
|
## [0.2.0] - 2025-12-21
|
|
11
30
|
|
|
12
31
|
### Added
|
|
@@ -58,7 +77,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|
|
58
77
|
- `prompt` - Prompt template rendering
|
|
59
78
|
- `parser` - Output parsing operations
|
|
60
79
|
|
|
61
|
-
[Unreleased]: https://github.com/felipekb/langsmith-ruby-sdk/compare/v0.2.0...HEAD
|
|
80
|
+
[Unreleased]: https://github.com/felipekb/langsmith-ruby-sdk/compare/v0.3.0...HEAD
|
|
81
|
+
[0.3.0]: https://github.com/felipekb/langsmith-ruby-sdk/compare/v0.2.0...v0.3.0
|
|
62
82
|
[0.2.0]: https://github.com/felipekb/langsmith-ruby-sdk/compare/v0.1.1...v0.2.0
|
|
63
83
|
[0.1.1]: https://github.com/felipekb/langsmith-ruby-sdk/compare/v0.1.0...v0.1.1
|
|
64
84
|
[0.1.0]: https://github.com/felipekb/langsmith-ruby-sdk/releases/tag/v0.1.0
|
|
data/lib/langsmith/batch_processor.rb
CHANGED
|
@@ -312,9 +312,7 @@ module Langsmith
|
|
|
312
312
|
def trim_buffer_if_needed
|
|
313
313
|
return unless @max_pending_entries
|
|
314
314
|
|
|
315
|
-
while current_buffer_size > @max_pending_entries
|
|
316
|
-
drop_one_entry
|
|
317
|
-
end
|
|
315
|
+
drop_one_entry while current_buffer_size > @max_pending_entries
|
|
318
316
|
end
|
|
319
317
|
|
|
320
318
|
def current_buffer_size
|
|
@@ -346,7 +344,9 @@ module Langsmith
|
|
|
346
344
|
def log_dropped(entry)
|
|
347
345
|
return unless ENV["LANGSMITH_DEBUG"]
|
|
348
346
|
|
|
349
|
-
log_error(
|
|
347
|
+
log_error(
|
|
348
|
+
"Dropped run entry due to max_pending_entries cap (type: #{entry[:type]}, tenant: #{entry[:tenant_id]})"
|
|
349
|
+
)
|
|
350
350
|
end
|
|
351
351
|
|
|
352
352
|
def log_error(message, force: false)
|
data/lib/langsmith/client.rb
CHANGED
|
@@ -84,8 +84,102 @@ module Langsmith
|
|
|
84
84
|
post("/runs/batch", payload, tenant_id: tenant_id)
|
|
85
85
|
end
|
|
86
86
|
|
|
87
|
+
# List examples from a LangSmith dataset.
|
|
88
|
+
#
|
|
89
|
+
# @param dataset_id [String] the dataset ID to fetch examples from
|
|
90
|
+
# @param limit [Integer, nil] max number of examples to return (API max: 100)
|
|
91
|
+
# @param offset [Integer, nil] number of examples to skip
|
|
92
|
+
# @param tenant_id [String, nil] tenant ID (falls back to configured tenant_id)
|
|
93
|
+
# @return [Array<Hash>] array of example objects
|
|
94
|
+
# @raise [APIError] if the request fails
|
|
95
|
+
def list_examples(dataset_id:, limit: nil, offset: nil, tenant_id: nil)
|
|
96
|
+
params = { dataset: dataset_id }
|
|
97
|
+
params[:limit] = limit if limit
|
|
98
|
+
params[:offset] = offset if offset
|
|
99
|
+
|
|
100
|
+
get("/api/v1/examples", params: params, tenant_id: resolve_tenant_id(tenant_id))
|
|
101
|
+
end
|
|
102
|
+
|
|
103
|
+
# Create a new experiment (tracer session) linked to a dataset.
|
|
104
|
+
#
|
|
105
|
+
# @param name [String] experiment name
|
|
106
|
+
# @param dataset_id [String] reference dataset ID
|
|
107
|
+
# @param description [String, nil] optional experiment description
|
|
108
|
+
# @param metadata [Hash, nil] optional metadata (stored as `extra`)
|
|
109
|
+
# @param tenant_id [String, nil] tenant ID (falls back to configured tenant_id)
|
|
110
|
+
# @return [Hash] the created experiment object
|
|
111
|
+
# @raise [APIError] if the request fails
|
|
112
|
+
def create_experiment(name:, dataset_id:, description: nil, metadata: nil, tenant_id: nil)
|
|
113
|
+
payload = {
|
|
114
|
+
name: name,
|
|
115
|
+
reference_dataset_id: dataset_id,
|
|
116
|
+
start_time: Time.now.utc.iso8601
|
|
117
|
+
}
|
|
118
|
+
payload[:description] = description if description
|
|
119
|
+
payload[:extra] = metadata if metadata
|
|
120
|
+
|
|
121
|
+
post("/api/v1/sessions", payload, tenant_id: resolve_tenant_id(tenant_id))
|
|
122
|
+
end
|
|
123
|
+
|
|
124
|
+
# Close an experiment by setting its end time.
|
|
125
|
+
#
|
|
126
|
+
# @param experiment_id [String] the experiment (session) ID
|
|
127
|
+
# @param end_time [String] ISO-8601 end time
|
|
128
|
+
# @param tenant_id [String, nil] tenant ID (falls back to configured tenant_id)
|
|
129
|
+
# @return [Hash] the updated experiment object
|
|
130
|
+
# @raise [APIError] if the request fails
|
|
131
|
+
def close_experiment(experiment_id:, end_time:, tenant_id: nil)
|
|
132
|
+
patch("/api/v1/sessions/#{experiment_id}", { end_time: end_time }, tenant_id: resolve_tenant_id(tenant_id))
|
|
133
|
+
end
|
|
134
|
+
|
|
135
|
+
# Create feedback (a score/annotation) on a run.
|
|
136
|
+
#
|
|
137
|
+
# @param run_id [String] UUID of the run to attach feedback to
|
|
138
|
+
# @param key [String] metric name (e.g. "correctness")
|
|
139
|
+
# @param score [Numeric, nil] numeric score (typically 0.0-1.0)
|
|
140
|
+
# @param value [String, nil] categorical value (alternative to score)
|
|
141
|
+
# @param comment [String, nil] explanation or reasoning
|
|
142
|
+
# @param tenant_id [String, nil] tenant ID (falls back to configured tenant_id)
|
|
143
|
+
# @return [Hash] the created feedback object
|
|
144
|
+
# @raise [APIError] if the request fails
|
|
145
|
+
def create_feedback(run_id:, key:, score: nil, value: nil, comment: nil, tenant_id: nil)
|
|
146
|
+
payload = { run_id: run_id, key: key }
|
|
147
|
+
payload[:score] = score if score
|
|
148
|
+
payload[:value] = value if value
|
|
149
|
+
payload[:comment] = comment if comment
|
|
150
|
+
|
|
151
|
+
post("/api/v1/feedback", payload, tenant_id: resolve_tenant_id(tenant_id))
|
|
152
|
+
end
|
|
153
|
+
|
|
154
|
+
# Read a single run by ID.
|
|
155
|
+
#
|
|
156
|
+
# @param run_id [String] UUID of the run to fetch
|
|
157
|
+
# @param tenant_id [String, nil] tenant ID (falls back to configured tenant_id)
|
|
158
|
+
# @return [Hash] the run object (inputs, outputs, child_run_ids, tokens, etc.)
|
|
159
|
+
# @raise [APIError] if the request fails
|
|
160
|
+
def read_run(run_id:, tenant_id: nil)
|
|
161
|
+
get("/api/v1/runs/#{run_id}", tenant_id: resolve_tenant_id(tenant_id))
|
|
162
|
+
end
|
|
163
|
+
|
|
87
164
|
private
|
|
88
165
|
|
|
166
|
+
def resolve_tenant_id(tenant_id)
|
|
167
|
+
tenant_id || Langsmith.configuration.tenant_id
|
|
168
|
+
end
|
|
169
|
+
|
|
170
|
+
def get(path, params: {}, tenant_id: nil)
|
|
171
|
+
response = connection.get(path, params) do |req|
|
|
172
|
+
req.headers["X-Tenant-Id"] = tenant_id if tenant_id
|
|
173
|
+
end
|
|
174
|
+
handle_response(response)
|
|
175
|
+
rescue Faraday::ConnectionFailed, Faraday::TimeoutError => e
|
|
176
|
+
raise APIError, "Network error: #{e.message}"
|
|
177
|
+
rescue Faraday::Error => e
|
|
178
|
+
raise APIError, "Request failed: #{e.message}" unless e.respond_to?(:response) && e.response
|
|
179
|
+
|
|
180
|
+
handle_response(e.response)
|
|
181
|
+
end
|
|
182
|
+
|
|
89
183
|
def connection
|
|
90
184
|
@connection ||= Faraday.new(url: @endpoint) do |f|
|
|
91
185
|
f.request :json
|
data/lib/langsmith/context.rb
CHANGED
|
@@ -12,7 +12,9 @@ module Langsmith
|
|
|
12
12
|
# and caused test failures on Ruby 3.2.
|
|
13
13
|
module Context
|
|
14
14
|
CONTEXT_KEY = :langsmith_run_stack
|
|
15
|
-
|
|
15
|
+
EVALUATION_CONTEXT_KEY = :langsmith_evaluation_context
|
|
16
|
+
EVALUATION_ROOT_RUN_ID_KEY = :langsmith_evaluation_root_run_id
|
|
17
|
+
private_constant :CONTEXT_KEY, :EVALUATION_CONTEXT_KEY, :EVALUATION_ROOT_RUN_ID_KEY
|
|
16
18
|
|
|
17
19
|
class << self
|
|
18
20
|
# Returns the current run stack for this thread.
|
|
@@ -49,9 +51,11 @@ module Langsmith
|
|
|
49
51
|
pop
|
|
50
52
|
end
|
|
51
53
|
|
|
52
|
-
# Clear the entire run stack (useful for testing)
|
|
54
|
+
# Clear the entire run stack and evaluation context (useful for testing)
|
|
53
55
|
def clear!
|
|
54
56
|
Thread.current[CONTEXT_KEY] = []
|
|
57
|
+
Thread.current[EVALUATION_CONTEXT_KEY] = nil
|
|
58
|
+
Thread.current[EVALUATION_ROOT_RUN_ID_KEY] = nil
|
|
55
59
|
end
|
|
56
60
|
|
|
57
61
|
# Check if there's an active trace context
|
|
@@ -68,6 +72,45 @@ module Langsmith
|
|
|
68
72
|
def root_run
|
|
69
73
|
run_stack.first
|
|
70
74
|
end
|
|
75
|
+
|
|
76
|
+
# Returns the current evaluation context, or nil when not in evaluation.
|
|
77
|
+
# @return [Hash, nil] hash with :experiment_id and :example_id, or nil
|
|
78
|
+
def evaluation_context
|
|
79
|
+
Thread.current[EVALUATION_CONTEXT_KEY]
|
|
80
|
+
end
|
|
81
|
+
|
|
82
|
+
# Returns true when evaluation context is set.
|
|
83
|
+
# @return [Boolean]
|
|
84
|
+
def evaluating?
|
|
85
|
+
!evaluation_context.nil?
|
|
86
|
+
end
|
|
87
|
+
|
|
88
|
+
# Stores the root run ID for the current evaluation example.
|
|
89
|
+
# Called by RunTree when creating the first root run inside an evaluation block.
|
|
90
|
+
#
|
|
91
|
+
# @param run_id [String] the root run's ID
|
|
92
|
+
def set_evaluation_root_run_id(run_id)
|
|
93
|
+
Thread.current[EVALUATION_ROOT_RUN_ID_KEY] = run_id
|
|
94
|
+
end
|
|
95
|
+
|
|
96
|
+
# Returns the root run ID for the current evaluation example, or nil.
|
|
97
|
+
# @return [String, nil]
|
|
98
|
+
def evaluation_root_run_id
|
|
99
|
+
Thread.current[EVALUATION_ROOT_RUN_ID_KEY]
|
|
100
|
+
end
|
|
101
|
+
|
|
102
|
+
# Execute a block with evaluation context set.
|
|
103
|
+
# Context is cleared in ensure block even if the block raises.
|
|
104
|
+
#
|
|
105
|
+
# @param experiment_id [String] the experiment session ID
|
|
106
|
+
# @param example_id [String] the dataset example ID
|
|
107
|
+
def with_evaluation(experiment_id:, example_id:)
|
|
108
|
+
Thread.current[EVALUATION_CONTEXT_KEY] = { experiment_id: experiment_id, example_id: example_id }
|
|
109
|
+
yield
|
|
110
|
+
ensure
|
|
111
|
+
Thread.current[EVALUATION_CONTEXT_KEY] = nil
|
|
112
|
+
Thread.current[EVALUATION_ROOT_RUN_ID_KEY] = nil
|
|
113
|
+
end
|
|
71
114
|
end
|
|
72
115
|
end
|
|
73
116
|
end
|
|
data/lib/langsmith/evaluation/experiment_runner.rb
ADDED
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Langsmith
|
|
4
|
+
module Evaluation
|
|
5
|
+
# Orchestrates running an evaluation experiment against a dataset.
|
|
6
|
+
#
|
|
7
|
+
# Fetches examples, creates an experiment, runs each example through
|
|
8
|
+
# the user-provided block with evaluation context set, scores outputs
|
|
9
|
+
# with evaluators, and returns a summary of results.
|
|
10
|
+
class ExperimentRunner
|
|
11
|
+
# @param dataset_id [String] the dataset to evaluate against
|
|
12
|
+
# @param experiment_name [String] name for the experiment
|
|
13
|
+
# @param description [String, nil] optional experiment description
|
|
14
|
+
# @param metadata [Hash, nil] optional experiment metadata
|
|
15
|
+
# @param evaluators [Hash] map of evaluator key to callable
|
|
16
|
+
# @param block [Proc] block that receives each example and produces a result
|
|
17
|
+
def initialize(dataset_id:, experiment_name:, description: nil, metadata: nil, evaluators: {}, &block)
|
|
18
|
+
@dataset_id = dataset_id
|
|
19
|
+
@experiment_name = experiment_name
|
|
20
|
+
@description = description
|
|
21
|
+
@metadata = metadata
|
|
22
|
+
@evaluators = evaluators
|
|
23
|
+
@block = block
|
|
24
|
+
end
|
|
25
|
+
|
|
26
|
+
# Run the evaluation experiment.
|
|
27
|
+
#
|
|
28
|
+
# @return [Hash] summary with :experiment_id, :total, :succeeded, :failed, :results
|
|
29
|
+
def run
|
|
30
|
+
examples = client.list_examples(dataset_id: @dataset_id)
|
|
31
|
+
|
|
32
|
+
experiment = client.create_experiment(
|
|
33
|
+
name: @experiment_name,
|
|
34
|
+
dataset_id: @dataset_id,
|
|
35
|
+
description: @description,
|
|
36
|
+
metadata: @metadata
|
|
37
|
+
)
|
|
38
|
+
experiment_id = experiment[:id]
|
|
39
|
+
|
|
40
|
+
results = examples.map { |example| run_example(example, experiment_id) }
|
|
41
|
+
|
|
42
|
+
Langsmith.flush
|
|
43
|
+
client.close_experiment(experiment_id: experiment_id, end_time: Time.now.utc.iso8601)
|
|
44
|
+
|
|
45
|
+
build_summary(experiment_id, results)
|
|
46
|
+
end
|
|
47
|
+
|
|
48
|
+
private
|
|
49
|
+
|
|
50
|
+
def client
|
|
51
|
+
Langsmith.client
|
|
52
|
+
end
|
|
53
|
+
|
|
54
|
+
def run_example(example, experiment_id)
|
|
55
|
+
outputs = nil
|
|
56
|
+
run_id = nil
|
|
57
|
+
|
|
58
|
+
begin
|
|
59
|
+
Context.with_evaluation(experiment_id: experiment_id, example_id: example[:id]) do
|
|
60
|
+
outputs = @block.call(example)
|
|
61
|
+
run_id = Context.evaluation_root_run_id
|
|
62
|
+
end
|
|
63
|
+
rescue StandardError => e
|
|
64
|
+
return { example_id: example[:id], run_id: nil, status: :error, error: e.message, feedback: nil }
|
|
65
|
+
end
|
|
66
|
+
|
|
67
|
+
feedback = run_evaluators(example, outputs, run_id)
|
|
68
|
+
{ example_id: example[:id], run_id: run_id, status: :success, error: nil, feedback: feedback }
|
|
69
|
+
rescue StandardError => e
|
|
70
|
+
{ example_id: example[:id], run_id: run_id, status: :success, error: e.message, feedback: nil }
|
|
71
|
+
end
|
|
72
|
+
|
|
73
|
+
def run_evaluators(example, outputs, run_id)
|
|
74
|
+
return nil if @evaluators.empty? || run_id.nil?
|
|
75
|
+
|
|
76
|
+
Langsmith.flush
|
|
77
|
+
run = fetch_run_with_retry(run_id)
|
|
78
|
+
|
|
79
|
+
@evaluators.each_with_object({}) do |(key, evaluator), feedback|
|
|
80
|
+
feedback[key] = execute_evaluator(key, evaluator, example, outputs, run_id, run)
|
|
81
|
+
end
|
|
82
|
+
end
|
|
83
|
+
|
|
84
|
+
# LangSmith has indexing lag after batch ingest — the run may not be
|
|
85
|
+
# queryable immediately. Retry a few times with a short delay.
|
|
86
|
+
def fetch_run_with_retry(run_id, retries: 3, delay: 1)
|
|
87
|
+
client.read_run(run_id: run_id)
|
|
88
|
+
rescue Client::APIError => e
|
|
89
|
+
raise unless e.status_code == 404 && retries.positive?
|
|
90
|
+
|
|
91
|
+
sleep(delay)
|
|
92
|
+
fetch_run_with_retry(run_id, retries: retries - 1, delay: delay)
|
|
93
|
+
end
|
|
94
|
+
|
|
95
|
+
def execute_evaluator(key, evaluator, example, outputs, run_id, run)
|
|
96
|
+
result = evaluator.call(
|
|
97
|
+
outputs: outputs,
|
|
98
|
+
reference_outputs: example[:outputs],
|
|
99
|
+
inputs: example[:inputs],
|
|
100
|
+
run: run
|
|
101
|
+
)
|
|
102
|
+
return { score: nil, success: true, skipped: true } if result.nil?
|
|
103
|
+
|
|
104
|
+
normalized = normalize_result(result)
|
|
105
|
+
client.create_feedback(run_id: run_id, key: key.to_s, **normalized)
|
|
106
|
+
normalized.merge(success: true)
|
|
107
|
+
rescue StandardError => e
|
|
108
|
+
{ score: nil, success: false, error: e.message }
|
|
109
|
+
end
|
|
110
|
+
|
|
111
|
+
def normalize_result(result)
|
|
112
|
+
case result
|
|
113
|
+
when true then { score: 1.0, value: nil, comment: nil }
|
|
114
|
+
when false then { score: 0.0, value: nil, comment: nil }
|
|
115
|
+
when Hash then { score: result[:score], value: result[:value], comment: result[:comment] }
|
|
116
|
+
else { score: result, value: nil, comment: nil }
|
|
117
|
+
end
|
|
118
|
+
end
|
|
119
|
+
|
|
120
|
+
def build_summary(experiment_id, results)
|
|
121
|
+
{
|
|
122
|
+
experiment_id: experiment_id,
|
|
123
|
+
total: results.size,
|
|
124
|
+
succeeded: results.count { |r| r[:status] == :success },
|
|
125
|
+
failed: results.count { |r| r[:status] == :error },
|
|
126
|
+
results: results
|
|
127
|
+
}
|
|
128
|
+
end
|
|
129
|
+
end
|
|
130
|
+
end
|
|
131
|
+
end
|
|
data/lib/langsmith/evaluation.rb
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "evaluation/experiment_runner"
|
|
4
|
+
|
|
5
|
+
module Langsmith
|
|
6
|
+
# Public API for running evaluations against LangSmith datasets.
|
|
7
|
+
#
|
|
8
|
+
# @example
|
|
9
|
+
# Langsmith::Evaluation.run(
|
|
10
|
+
# dataset_id: "dataset-uuid",
|
|
11
|
+
# experiment_name: "my-experiment"
|
|
12
|
+
# ) do |example|
|
|
13
|
+
# Langsmith.trace("eval", run_type: "chain", inputs: example[:inputs]) do
|
|
14
|
+
# my_app.call(example[:inputs])
|
|
15
|
+
# end
|
|
16
|
+
# end
|
|
17
|
+
module Evaluation
|
|
18
|
+
# Run an evaluation experiment against a dataset.
|
|
19
|
+
#
|
|
20
|
+
# @param dataset_id [String] the dataset to evaluate against
|
|
21
|
+
# @param experiment_name [String] name for the experiment
|
|
22
|
+
# @param description [String, nil] optional experiment description
|
|
23
|
+
# @param metadata [Hash, nil] optional experiment metadata
|
|
24
|
+
# @param evaluators [Hash] map of evaluator key to callable (see ExperimentRunner)
|
|
25
|
+
# @yield [Hash] each dataset example
|
|
26
|
+
# @return [Hash] summary with :experiment_id, :total, :succeeded, :failed, :results
|
|
27
|
+
def self.run(dataset_id:, experiment_name:, description: nil, metadata: nil, evaluators: {}, &block)
|
|
28
|
+
ExperimentRunner.new(
|
|
29
|
+
dataset_id: dataset_id,
|
|
30
|
+
experiment_name: experiment_name,
|
|
31
|
+
description: description,
|
|
32
|
+
metadata: metadata,
|
|
33
|
+
evaluators: evaluators,
|
|
34
|
+
&block
|
|
35
|
+
).run
|
|
36
|
+
end
|
|
37
|
+
end
|
|
38
|
+
end
|
data/lib/langsmith/run.rb
CHANGED
|
@@ -37,6 +37,12 @@ module Langsmith
|
|
|
37
37
|
# @return [String, nil] tenant ID for multi-tenant scenarios
|
|
38
38
|
attr_reader :tenant_id
|
|
39
39
|
|
|
40
|
+
# @return [String, nil] links this run to a dataset example (for evaluations)
|
|
41
|
+
attr_reader :reference_example_id
|
|
42
|
+
|
|
43
|
+
# @return [String, nil] links this run to an experiment session (overrides project routing)
|
|
44
|
+
attr_reader :session_id
|
|
45
|
+
|
|
40
46
|
# @return [String] trace ID (root run's ID)
|
|
41
47
|
attr_reader :trace_id
|
|
42
48
|
|
|
@@ -81,6 +87,8 @@ module Langsmith
|
|
|
81
87
|
# @param tenant_id [String, nil] tenant ID for multi-tenant scenarios
|
|
82
88
|
# @param trace_id [String, nil] trace ID (defaults to own ID for root runs)
|
|
83
89
|
# @param parent_dotted_order [String, nil] parent's dotted order for tree ordering
|
|
90
|
+
# @param reference_example_id [String, nil] dataset example ID (for evaluations)
|
|
91
|
+
# @param session_id [String, nil] experiment session ID (overrides project routing)
|
|
84
92
|
#
|
|
85
93
|
# @raise [ArgumentError] if run_type is invalid
|
|
86
94
|
def initialize(
|
|
@@ -95,7 +103,9 @@ module Langsmith
|
|
|
95
103
|
id: nil,
|
|
96
104
|
tenant_id: nil,
|
|
97
105
|
trace_id: nil,
|
|
98
|
-
parent_dotted_order: nil
|
|
106
|
+
parent_dotted_order: nil,
|
|
107
|
+
reference_example_id: nil,
|
|
108
|
+
session_id: nil
|
|
99
109
|
)
|
|
100
110
|
@id = id || SecureRandom.uuid
|
|
101
111
|
@name = name
|
|
@@ -114,6 +124,8 @@ module Langsmith
|
|
|
114
124
|
@tags = tags || []
|
|
115
125
|
@extra = extra || {}
|
|
116
126
|
@events = []
|
|
127
|
+
@reference_example_id = reference_example_id
|
|
128
|
+
@session_id = session_id
|
|
117
129
|
# dotted_order is used for ordering runs in the trace tree
|
|
118
130
|
@dotted_order = build_dotted_order(parent_dotted_order)
|
|
119
131
|
end
|
|
@@ -242,6 +254,8 @@ module Langsmith
|
|
|
242
254
|
outputs:,
|
|
243
255
|
error:,
|
|
244
256
|
parent_run_id:,
|
|
257
|
+
reference_example_id:,
|
|
258
|
+
session_id:,
|
|
245
259
|
trace_id:,
|
|
246
260
|
dotted_order:,
|
|
247
261
|
session_name:,
|
data/lib/langsmith/run_tree.rb
CHANGED
|
@@ -20,23 +20,19 @@ module Langsmith
|
|
|
20
20
|
tenant_id: nil,
|
|
21
21
|
project: nil
|
|
22
22
|
)
|
|
23
|
-
# If no explicit parent, check context for current parent
|
|
24
23
|
effective_parent_id = parent_run_id || Context.current_parent_run_id
|
|
25
|
-
|
|
26
|
-
# Inherit tenant_id from parent run if not explicitly set
|
|
27
24
|
effective_tenant_id = tenant_id || Context.current_run&.tenant_id
|
|
28
25
|
|
|
29
26
|
# Child traces must use the same project as their parent to keep the trace tree together.
|
|
30
|
-
# Only root traces can set the project; children always inherit from parent.
|
|
31
27
|
effective_project = Context.current_run&.session_name || project
|
|
32
|
-
|
|
33
|
-
# Inherit trace_id from root run (parent's trace_id)
|
|
34
|
-
# For root runs, trace_id will default to the run's own ID
|
|
35
28
|
effective_trace_id = Context.current_run&.trace_id
|
|
36
|
-
|
|
37
|
-
# Inherit dotted_order from parent for proper trace ordering
|
|
38
29
|
parent_dotted_order = Context.current_run&.dotted_order
|
|
39
30
|
|
|
31
|
+
# Inject evaluation context: session_id on ALL runs, reference_example_id only on root runs.
|
|
32
|
+
eval_ctx = Context.evaluation_context
|
|
33
|
+
effective_session_id = eval_ctx&.dig(:experiment_id)
|
|
34
|
+
effective_ref_example_id = eval_ctx&.dig(:example_id) unless effective_parent_id
|
|
35
|
+
|
|
40
36
|
@run = Run.new(
|
|
41
37
|
name: name,
|
|
42
38
|
run_type: run_type,
|
|
@@ -48,11 +44,13 @@ module Langsmith
|
|
|
48
44
|
tenant_id: effective_tenant_id,
|
|
49
45
|
session_name: effective_project,
|
|
50
46
|
trace_id: effective_trace_id,
|
|
51
|
-
parent_dotted_order: parent_dotted_order
|
|
47
|
+
parent_dotted_order: parent_dotted_order,
|
|
48
|
+
reference_example_id: effective_ref_example_id,
|
|
49
|
+
session_id: effective_session_id
|
|
52
50
|
)
|
|
53
51
|
|
|
54
|
-
|
|
55
|
-
@posted_end = false
|
|
52
|
+
register_evaluation_root_run(effective_parent_id)
|
|
53
|
+
@posted_start = @posted_end = false
|
|
56
54
|
end
|
|
57
55
|
|
|
58
56
|
# Post the run start to LangSmith
|
|
@@ -138,6 +136,13 @@ module Langsmith
|
|
|
138
136
|
|
|
139
137
|
private
|
|
140
138
|
|
|
139
|
+
# Register the root run ID in evaluation context so the runner can
|
|
140
|
+
# attach feedback to it later. Only root runs (no parent) register;
|
|
141
|
+
# child runs must not overwrite.
|
|
142
|
+
def register_evaluation_root_run(effective_parent_id)
|
|
143
|
+
Context.set_evaluation_root_run_id(@run.id) if effective_parent_id.nil? && Context.evaluating?
|
|
144
|
+
end
|
|
145
|
+
|
|
141
146
|
# Sanitize block results to prevent circular references.
|
|
142
147
|
# When users call methods like `run.add_metadata(...)` as the last line,
|
|
143
148
|
# the Run object itself becomes the result, creating a circular reference.
|
data/lib/langsmith/version.rb
CHANGED
data/lib/langsmith.rb
CHANGED
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: langsmith-sdk
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.2.0
|
|
4
|
+
version: 0.3.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Felipe Cabezudo
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date:
|
|
11
|
+
date: 2026-02-11 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: concurrent-ruby
|
|
@@ -91,13 +91,14 @@ files:
|
|
|
91
91
|
- examples/complex_agent.rb
|
|
92
92
|
- examples/llm_tracing.rb
|
|
93
93
|
- examples/openai_integration.rb
|
|
94
|
-
- langsmith.gemspec
|
|
95
94
|
- lib/langsmith.rb
|
|
96
95
|
- lib/langsmith/batch_processor.rb
|
|
97
96
|
- lib/langsmith/client.rb
|
|
98
97
|
- lib/langsmith/configuration.rb
|
|
99
98
|
- lib/langsmith/context.rb
|
|
100
99
|
- lib/langsmith/errors.rb
|
|
100
|
+
- lib/langsmith/evaluation.rb
|
|
101
|
+
- lib/langsmith/evaluation/experiment_runner.rb
|
|
101
102
|
- lib/langsmith/railtie.rb
|
|
102
103
|
- lib/langsmith/run.rb
|
|
103
104
|
- lib/langsmith/run_tree.rb
|
|
@@ -126,7 +127,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
|
126
127
|
- !ruby/object:Gem::Version
|
|
127
128
|
version: '0'
|
|
128
129
|
requirements: []
|
|
129
|
-
rubygems_version: 3.
|
|
130
|
+
rubygems_version: 3.4.19
|
|
130
131
|
signing_key:
|
|
131
132
|
specification_version: 4
|
|
132
133
|
summary: Ruby SDK for LangSmith tracing and observability
|
data/langsmith.gemspec
DELETED
|
@@ -1,38 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require_relative "lib/langsmith/version"
|
|
4
|
-
|
|
5
|
-
Gem::Specification.new do |spec|
|
|
6
|
-
spec.name = "langsmith-sdk"
|
|
7
|
-
spec.version = Langsmith::VERSION
|
|
8
|
-
spec.authors = ["Felipe Cabezudo"]
|
|
9
|
-
spec.email = ["felipecabedilo@gmail.com"]
|
|
10
|
-
|
|
11
|
-
spec.summary = "Ruby SDK for LangSmith tracing and observability"
|
|
12
|
-
spec.description = "A Ruby client for LangSmith, providing tracing and observability for LLM applications"
|
|
13
|
-
spec.homepage = "https://github.com/felipekb/langsmith-ruby-sdk"
|
|
14
|
-
spec.license = "MIT"
|
|
15
|
-
spec.required_ruby_version = ">= 3.1.0"
|
|
16
|
-
|
|
17
|
-
spec.metadata["allowed_push_host"] = "https://rubygems.org"
|
|
18
|
-
spec.metadata["homepage_uri"] = spec.homepage
|
|
19
|
-
spec.metadata["source_code_uri"] = spec.homepage
|
|
20
|
-
spec.metadata["changelog_uri"] = "#{spec.homepage}/blob/main/CHANGELOG.md"
|
|
21
|
-
spec.metadata["rubygems_mfa_required"] = "true"
|
|
22
|
-
|
|
23
|
-
spec.files = Dir.chdir(__dir__) do
|
|
24
|
-
`git ls-files -z`.split("\x0").reject do |f|
|
|
25
|
-
(File.expand_path(f) == __FILE__) ||
|
|
26
|
-
f.start_with?(*%w[bin/ test/ spec/ features/ .git .github appveyor Gemfile])
|
|
27
|
-
end
|
|
28
|
-
end
|
|
29
|
-
spec.bindir = "exe"
|
|
30
|
-
spec.executables = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) }
|
|
31
|
-
spec.require_paths = ["lib"]
|
|
32
|
-
|
|
33
|
-
# Runtime dependencies
|
|
34
|
-
spec.add_dependency "concurrent-ruby", ">= 1.1", "< 3.0"
|
|
35
|
-
spec.add_dependency "faraday", "~> 2.0"
|
|
36
|
-
spec.add_dependency "faraday-net_http_persistent", "~> 2.0"
|
|
37
|
-
spec.add_dependency "faraday-retry", "~> 2.0"
|
|
38
|
-
end
|