deepeval 3.8.1__py3-none-any.whl → 3.8.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/confident/api.py +31 -3
- deepeval/integrations/crewai/__init__.py +9 -2
- deepeval/integrations/crewai/handler.py +261 -66
- deepeval/integrations/crewai/subs.py +23 -10
- deepeval/integrations/crewai/tool.py +20 -3
- deepeval/integrations/crewai/wrapper.py +69 -15
- deepeval/integrations/langchain/callback.py +310 -14
- deepeval/integrations/langchain/utils.py +75 -24
- deepeval/integrations/llama_index/handler.py +69 -21
- deepeval/integrations/pydantic_ai/instrumentator.py +50 -14
- deepeval/integrations/pydantic_ai/otel.py +9 -0
- deepeval/metrics/utils.py +11 -0
- deepeval/simulator/conversation_simulator.py +4 -2
- deepeval/telemetry.py +12 -91
- deepeval/tracing/api.py +1 -0
- deepeval/tracing/context.py +3 -0
- deepeval/tracing/trace_context.py +5 -0
- deepeval/tracing/tracing.py +7 -5
- deepeval/tracing/types.py +1 -0
- {deepeval-3.8.1.dist-info → deepeval-3.8.3.dist-info}/METADATA +1 -1
- {deepeval-3.8.1.dist-info → deepeval-3.8.3.dist-info}/RECORD +25 -25
- {deepeval-3.8.1.dist-info → deepeval-3.8.3.dist-info}/LICENSE.md +0 -0
- {deepeval-3.8.1.dist-info → deepeval-3.8.3.dist-info}/WHEEL +0 -0
- {deepeval-3.8.1.dist-info → deepeval-3.8.3.dist-info}/entry_points.txt +0 -0
|
@@ -17,8 +17,9 @@ def wrap_crew_kickoff():
|
|
|
17
17
|
func_name="kickoff",
|
|
18
18
|
metric_collection=metric_collection,
|
|
19
19
|
metrics=metrics,
|
|
20
|
-
):
|
|
20
|
+
) as observer:
|
|
21
21
|
result = original_kickoff(self, *args, **kwargs)
|
|
22
|
+
observer.result = str(result) if result else None
|
|
22
23
|
|
|
23
24
|
return result
|
|
24
25
|
|
|
@@ -36,8 +37,9 @@ def wrap_crew_kickoff_for_each():
|
|
|
36
37
|
func_name="kickoff_for_each",
|
|
37
38
|
metric_collection=metric_collection,
|
|
38
39
|
metrics=metrics,
|
|
39
|
-
):
|
|
40
|
+
) as observer:
|
|
40
41
|
result = original_kickoff_for_each(self, *args, **kwargs)
|
|
42
|
+
observer.result = str(result) if result else None
|
|
41
43
|
|
|
42
44
|
return result
|
|
43
45
|
|
|
@@ -55,8 +57,9 @@ def wrap_crew_kickoff_async():
|
|
|
55
57
|
func_name="kickoff_async",
|
|
56
58
|
metric_collection=metric_collection,
|
|
57
59
|
metrics=metrics,
|
|
58
|
-
):
|
|
60
|
+
) as observer:
|
|
59
61
|
result = await original_kickoff_async(self, *args, **kwargs)
|
|
62
|
+
observer.result = str(result) if result else None
|
|
60
63
|
|
|
61
64
|
return result
|
|
62
65
|
|
|
@@ -74,33 +77,61 @@ def wrap_crew_kickoff_for_each_async():
|
|
|
74
77
|
func_name="kickoff_for_each_async",
|
|
75
78
|
metric_collection=metric_collection,
|
|
76
79
|
metrics=metrics,
|
|
77
|
-
):
|
|
80
|
+
) as observer:
|
|
78
81
|
result = await original_kickoff_for_each_async(
|
|
79
82
|
self, *args, **kwargs
|
|
80
83
|
)
|
|
84
|
+
observer.result = str(result) if result else None
|
|
81
85
|
|
|
82
86
|
return result
|
|
83
87
|
|
|
84
88
|
Crew.kickoff_for_each_async = wrapper
|
|
85
89
|
|
|
86
90
|
|
|
87
|
-
def
|
|
88
|
-
|
|
91
|
+
def wrap_crew_akickoff():
|
|
92
|
+
if not hasattr(Crew, "akickoff"):
|
|
93
|
+
return
|
|
89
94
|
|
|
90
|
-
|
|
91
|
-
|
|
95
|
+
original_akickoff = Crew.akickoff
|
|
96
|
+
|
|
97
|
+
@wraps(original_akickoff)
|
|
98
|
+
async def wrapper(self, *args, **kwargs):
|
|
99
|
+
metric_collection, metrics = _check_metrics_and_metric_collection(self)
|
|
100
|
+
with Observer(
|
|
101
|
+
span_type="crew",
|
|
102
|
+
func_name="akickoff",
|
|
103
|
+
metric_collection=metric_collection,
|
|
104
|
+
metrics=metrics,
|
|
105
|
+
) as observer:
|
|
106
|
+
result = await original_akickoff(self, *args, **kwargs)
|
|
107
|
+
observer.result = str(result) if result else None
|
|
108
|
+
|
|
109
|
+
return result
|
|
110
|
+
|
|
111
|
+
Crew.akickoff = wrapper
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def wrap_crew_akickoff_for_each():
|
|
115
|
+
if not hasattr(Crew, "akickoff_for_each"):
|
|
116
|
+
return
|
|
117
|
+
|
|
118
|
+
original_akickoff_for_each = Crew.akickoff_for_each
|
|
119
|
+
|
|
120
|
+
@wraps(original_akickoff_for_each)
|
|
121
|
+
async def wrapper(self, *args, **kwargs):
|
|
92
122
|
metric_collection, metrics = _check_metrics_and_metric_collection(self)
|
|
93
123
|
with Observer(
|
|
94
|
-
span_type="
|
|
95
|
-
func_name="
|
|
96
|
-
observe_kwargs={"model": "temp_model"},
|
|
124
|
+
span_type="crew",
|
|
125
|
+
func_name="akickoff_for_each",
|
|
97
126
|
metric_collection=metric_collection,
|
|
98
127
|
metrics=metrics,
|
|
99
|
-
):
|
|
100
|
-
result =
|
|
128
|
+
) as observer:
|
|
129
|
+
result = await original_akickoff_for_each(self, *args, **kwargs)
|
|
130
|
+
observer.result = str(result) if result else None
|
|
131
|
+
|
|
101
132
|
return result
|
|
102
133
|
|
|
103
|
-
|
|
134
|
+
Crew.akickoff_for_each = wrapper
|
|
104
135
|
|
|
105
136
|
|
|
106
137
|
def wrap_agent_execute_task():
|
|
@@ -114,13 +145,36 @@ def wrap_agent_execute_task():
|
|
|
114
145
|
func_name="execute_task",
|
|
115
146
|
metric_collection=metric_collection,
|
|
116
147
|
metrics=metrics,
|
|
117
|
-
):
|
|
148
|
+
) as observer:
|
|
118
149
|
result = original_execute_task(self, *args, **kwargs)
|
|
150
|
+
observer.result = str(result) if result else None
|
|
119
151
|
return result
|
|
120
152
|
|
|
121
153
|
Agent.execute_task = wrapper
|
|
122
154
|
|
|
123
155
|
|
|
156
|
+
def wrap_agent_aexecute_task():
|
|
157
|
+
if not hasattr(Agent, "aexecute_task"):
|
|
158
|
+
return
|
|
159
|
+
|
|
160
|
+
original_aexecute_task = Agent.aexecute_task
|
|
161
|
+
|
|
162
|
+
@wraps(original_aexecute_task)
|
|
163
|
+
async def wrapper(self, *args, **kwargs):
|
|
164
|
+
metric_collection, metrics = _check_metrics_and_metric_collection(self)
|
|
165
|
+
with Observer(
|
|
166
|
+
span_type="agent",
|
|
167
|
+
func_name="aexecute_task",
|
|
168
|
+
metric_collection=metric_collection,
|
|
169
|
+
metrics=metrics,
|
|
170
|
+
) as observer:
|
|
171
|
+
result = await original_aexecute_task(self, *args, **kwargs)
|
|
172
|
+
observer.result = str(result) if result else None
|
|
173
|
+
return result
|
|
174
|
+
|
|
175
|
+
Agent.aexecute_task = wrapper
|
|
176
|
+
|
|
177
|
+
|
|
124
178
|
def _check_metrics_and_metric_collection(obj: Any):
|
|
125
179
|
metric_collection = getattr(obj, "_metric_collection", None)
|
|
126
180
|
metrics = getattr(obj, "_metrics", None)
|
|
@@ -1,3 +1,7 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import os
|
|
3
|
+
import threading
|
|
4
|
+
|
|
1
5
|
from typing import Any, Optional, List, Dict
|
|
2
6
|
from uuid import UUID
|
|
3
7
|
from time import perf_counter
|
|
@@ -20,6 +24,19 @@ from deepeval.tracing.types import (
|
|
|
20
24
|
)
|
|
21
25
|
from deepeval.telemetry import capture_tracing_integration
|
|
22
26
|
|
|
27
|
+
# Debug logging for LangChain callbacks (enable with DEEPEVAL_DEBUG_LANGCHAIN_CALLBACKS=1)
|
|
28
|
+
_DEBUG_CALLBACKS = os.environ.get(
|
|
29
|
+
"DEEPEVAL_DEBUG_LANGCHAIN_CALLBACKS", ""
|
|
30
|
+
).lower() in ("1", "true", "yes")
|
|
31
|
+
|
|
32
|
+
_logger = logging.getLogger(__name__)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def _debug_log(msg: str):
|
|
36
|
+
if _DEBUG_CALLBACKS:
|
|
37
|
+
_logger.debug(f"[LangChain Callback] {msg}")
|
|
38
|
+
|
|
39
|
+
|
|
23
40
|
try:
|
|
24
41
|
from langchain_core.callbacks.base import BaseCallbackHandler
|
|
25
42
|
from langchain_core.outputs import LLMResult
|
|
@@ -29,6 +46,7 @@ try:
|
|
|
29
46
|
# contains langchain imports
|
|
30
47
|
from deepeval.integrations.langchain.utils import (
|
|
31
48
|
parse_prompts_to_messages,
|
|
49
|
+
convert_chat_messages_to_input,
|
|
32
50
|
extract_name,
|
|
33
51
|
safe_extract_model_name,
|
|
34
52
|
safe_extract_token_usage,
|
|
@@ -50,6 +68,12 @@ def is_langchain_installed():
|
|
|
50
68
|
|
|
51
69
|
|
|
52
70
|
class CallbackHandler(BaseCallbackHandler):
|
|
71
|
+
# When users create multiple CallbackHandler instances for the same logical
|
|
72
|
+
# conversation (same thread_id), we want spans to land on the same trace.
|
|
73
|
+
# Otherwise, each handler lazily creates its own trace, and multi-turn flows
|
|
74
|
+
# become multiple single-turn traces.
|
|
75
|
+
_thread_id_to_trace_uuid: Dict[str, str] = {}
|
|
76
|
+
_thread_id_lock = threading.Lock()
|
|
53
77
|
|
|
54
78
|
def __init__(
|
|
55
79
|
self,
|
|
@@ -60,6 +84,7 @@ class CallbackHandler(BaseCallbackHandler):
|
|
|
60
84
|
user_id: Optional[str] = None,
|
|
61
85
|
metrics: Optional[List[BaseMetric]] = None,
|
|
62
86
|
metric_collection: Optional[str] = None,
|
|
87
|
+
test_case_id: Optional[str] = None,
|
|
63
88
|
):
|
|
64
89
|
is_langchain_installed()
|
|
65
90
|
with capture_tracing_integration("langchain.callback.CallbackHandler"):
|
|
@@ -74,13 +99,21 @@ class CallbackHandler(BaseCallbackHandler):
|
|
|
74
99
|
self._parent_span = None
|
|
75
100
|
|
|
76
101
|
# Stash trace metadata to apply once we know which trace we are using.
|
|
77
|
-
|
|
102
|
+
# _trace_init_fields is cleared after first apply to prevent re-applying
|
|
103
|
+
# on every callback within the same trace. _original_init_fields is kept
|
|
104
|
+
# permanently so we can re-apply when a new trace is created (e.g., in
|
|
105
|
+
# multi-turn scenarios where the previous trace was ended).
|
|
106
|
+
self._original_init_fields: Dict[str, Any] = {
|
|
78
107
|
"name": name,
|
|
79
108
|
"tags": tags,
|
|
80
109
|
"metadata": metadata,
|
|
81
110
|
"thread_id": thread_id,
|
|
82
111
|
"user_id": user_id,
|
|
112
|
+
"test_case_id": test_case_id,
|
|
83
113
|
}
|
|
114
|
+
self._trace_init_fields: Dict[str, Any] = dict(
|
|
115
|
+
self._original_init_fields
|
|
116
|
+
)
|
|
84
117
|
|
|
85
118
|
# Map LangChain run_id -> our span uuid for parent span restoration
|
|
86
119
|
self._run_id_to_span_uuid: Dict[str, str] = {}
|
|
@@ -96,6 +129,34 @@ class CallbackHandler(BaseCallbackHandler):
|
|
|
96
129
|
This is done lazily during actual callback execution to avoid context
|
|
97
130
|
corruption when the handler is constructed outside the async task/context.
|
|
98
131
|
"""
|
|
132
|
+
# If the user provided a thread_id, attempt to reuse an existing trace for it.
|
|
133
|
+
# This makes multi-turn tests that use multiple CallbackHandler instances behave
|
|
134
|
+
# as expected: one trace containing multiple turns/spans.
|
|
135
|
+
thread_id = None
|
|
136
|
+
fields = self._trace_init_fields or {}
|
|
137
|
+
if fields.get("thread_id"):
|
|
138
|
+
thread_id = fields["thread_id"]
|
|
139
|
+
# In case _trace_init_fields has already been cleared, fall back to trace metadata.
|
|
140
|
+
if thread_id is None and self._trace is not None:
|
|
141
|
+
thread_id = self._trace.thread_id
|
|
142
|
+
|
|
143
|
+
if thread_id:
|
|
144
|
+
with self._thread_id_lock:
|
|
145
|
+
existing_uuid = self._thread_id_to_trace_uuid.get(thread_id)
|
|
146
|
+
if existing_uuid:
|
|
147
|
+
existing_trace = trace_manager.get_trace_by_uuid(existing_uuid)
|
|
148
|
+
if (
|
|
149
|
+
existing_trace
|
|
150
|
+
and existing_trace.uuid in trace_manager.active_traces
|
|
151
|
+
):
|
|
152
|
+
current_trace_context.set(existing_trace)
|
|
153
|
+
self._trace = existing_trace
|
|
154
|
+
self.trace_uuid = existing_trace.uuid
|
|
155
|
+
# Lazily capture the observe parent span if present.
|
|
156
|
+
if self._parent_span is None:
|
|
157
|
+
self._parent_span = current_span_context.get()
|
|
158
|
+
return existing_trace
|
|
159
|
+
|
|
99
160
|
# Prefer current context trace if it is active.
|
|
100
161
|
ctx_trace = current_trace_context.get()
|
|
101
162
|
if ctx_trace and ctx_trace.uuid in trace_manager.active_traces:
|
|
@@ -107,6 +168,10 @@ class CallbackHandler(BaseCallbackHandler):
|
|
|
107
168
|
current_trace_context.set(trace)
|
|
108
169
|
else:
|
|
109
170
|
# Otherwise, create a fresh trace now (in the right context).
|
|
171
|
+
# Restore _trace_init_fields from the original init fields so that
|
|
172
|
+
# the new trace gets the same name/tags/metadata as intended.
|
|
173
|
+
if not self._trace_init_fields and self._original_init_fields:
|
|
174
|
+
self._trace_init_fields = dict(self._original_init_fields)
|
|
110
175
|
trace = trace_manager.start_new_trace()
|
|
111
176
|
current_trace_context.set(trace)
|
|
112
177
|
self._trace = trace
|
|
@@ -114,8 +179,18 @@ class CallbackHandler(BaseCallbackHandler):
|
|
|
114
179
|
# Keep a copy for quick access.
|
|
115
180
|
self.trace_uuid = trace.uuid
|
|
116
181
|
|
|
182
|
+
# Register this trace as the canonical trace for this thread_id (if provided).
|
|
183
|
+
# This allows other CallbackHandler instances created for the same thread_id
|
|
184
|
+
# to reuse the same trace instead of creating parallel traces.
|
|
185
|
+
fields = self._trace_init_fields or {}
|
|
186
|
+
tid = fields.get("thread_id") or trace.thread_id
|
|
187
|
+
if tid:
|
|
188
|
+
with self._thread_id_lock:
|
|
189
|
+
# Only set if absent to preserve the "first trace wins" behavior.
|
|
190
|
+
self._thread_id_to_trace_uuid.setdefault(tid, trace.uuid)
|
|
191
|
+
|
|
117
192
|
# Apply stashed metadata once.
|
|
118
|
-
fields =
|
|
193
|
+
fields = self._trace_init_fields or {}
|
|
119
194
|
if fields:
|
|
120
195
|
if fields.get("name") is not None:
|
|
121
196
|
trace.name = fields["name"]
|
|
@@ -127,6 +202,8 @@ class CallbackHandler(BaseCallbackHandler):
|
|
|
127
202
|
trace.thread_id = fields["thread_id"]
|
|
128
203
|
if fields.get("user_id") is not None:
|
|
129
204
|
trace.user_id = fields["user_id"]
|
|
205
|
+
if fields.get("test_case_id") is not None:
|
|
206
|
+
trace.test_case_id = fields["test_case_id"]
|
|
130
207
|
# prevent re-applying on every callback
|
|
131
208
|
self._trace_init_fields = {}
|
|
132
209
|
|
|
@@ -202,6 +279,9 @@ class CallbackHandler(BaseCallbackHandler):
|
|
|
202
279
|
metadata: Optional[dict[str, Any]] = None,
|
|
203
280
|
**kwargs: Any,
|
|
204
281
|
) -> Any:
|
|
282
|
+
_debug_log(
|
|
283
|
+
f"on_chain_start: run_id={run_id}, parent_run_id={parent_run_id}, name={extract_name(serialized, **kwargs)}"
|
|
284
|
+
)
|
|
205
285
|
# Create spans for all chains to establish proper parent-child hierarchy
|
|
206
286
|
# This is important for LangGraph where there are nested chains
|
|
207
287
|
with self._ctx(run_id=run_id, parent_run_id=parent_run_id):
|
|
@@ -232,6 +312,9 @@ class CallbackHandler(BaseCallbackHandler):
|
|
|
232
312
|
parent_run_id: Optional[UUID] = None,
|
|
233
313
|
**kwargs: Any,
|
|
234
314
|
) -> Any:
|
|
315
|
+
_debug_log(
|
|
316
|
+
f"on_chain_end: run_id={run_id}, parent_run_id={parent_run_id}"
|
|
317
|
+
)
|
|
235
318
|
uuid_str = str(run_id)
|
|
236
319
|
base_span = trace_manager.get_span_by_uuid(uuid_str)
|
|
237
320
|
if base_span:
|
|
@@ -246,6 +329,59 @@ class CallbackHandler(BaseCallbackHandler):
|
|
|
246
329
|
trace.output = output
|
|
247
330
|
exit_current_context(uuid_str=uuid_str)
|
|
248
331
|
|
|
332
|
+
def on_chat_model_start(
|
|
333
|
+
self,
|
|
334
|
+
serialized: dict[str, Any],
|
|
335
|
+
messages: list[list[Any]], # list[list[BaseMessage]]
|
|
336
|
+
*,
|
|
337
|
+
run_id: UUID,
|
|
338
|
+
parent_run_id: Optional[UUID] = None,
|
|
339
|
+
tags: Optional[list[str]] = None,
|
|
340
|
+
metadata: Optional[dict[str, Any]] = None,
|
|
341
|
+
**kwargs: Any,
|
|
342
|
+
) -> Any:
|
|
343
|
+
"""
|
|
344
|
+
Handle chat model start callback. In LangChain v1, chat models emit
|
|
345
|
+
on_chat_model_start instead of on_llm_start. The on_llm_end callback
|
|
346
|
+
is still used for both.
|
|
347
|
+
"""
|
|
348
|
+
_debug_log(
|
|
349
|
+
f"on_chat_model_start: run_id={run_id}, parent_run_id={parent_run_id}, messages_len={len(messages)}"
|
|
350
|
+
)
|
|
351
|
+
|
|
352
|
+
# Guard against double-counting if both on_llm_start and on_chat_model_start fire
|
|
353
|
+
uuid_str = str(run_id)
|
|
354
|
+
existing_span = trace_manager.get_span_by_uuid(uuid_str)
|
|
355
|
+
if existing_span is not None:
|
|
356
|
+
_debug_log(
|
|
357
|
+
f"on_chat_model_start: span already exists for run_id={run_id}, skipping"
|
|
358
|
+
)
|
|
359
|
+
return
|
|
360
|
+
|
|
361
|
+
with self._ctx(run_id=run_id, parent_run_id=parent_run_id):
|
|
362
|
+
# Convert messages to our internal format using the shared helper
|
|
363
|
+
input_messages = convert_chat_messages_to_input(messages, **kwargs)
|
|
364
|
+
|
|
365
|
+
# Safe extraction of model name (handle None metadata)
|
|
366
|
+
md = metadata or {}
|
|
367
|
+
model = safe_extract_model_name(md, **kwargs)
|
|
368
|
+
|
|
369
|
+
llm_span: LlmSpan = enter_current_context(
|
|
370
|
+
uuid_str=uuid_str,
|
|
371
|
+
span_type="llm",
|
|
372
|
+
func_name=extract_name(serialized, **kwargs),
|
|
373
|
+
)
|
|
374
|
+
# Register this run_id -> span mapping for child callbacks
|
|
375
|
+
self._run_id_to_span_uuid[str(run_id)] = uuid_str
|
|
376
|
+
|
|
377
|
+
llm_span.input = input_messages
|
|
378
|
+
llm_span.model = model
|
|
379
|
+
|
|
380
|
+
# Extract metrics and prompt from metadata if provided, but don't mutate original
|
|
381
|
+
llm_span.metrics = md.get("metrics")
|
|
382
|
+
llm_span.metric_collection = md.get("metric_collection")
|
|
383
|
+
llm_span.prompt = md.get("prompt")
|
|
384
|
+
|
|
249
385
|
def on_llm_start(
|
|
250
386
|
self,
|
|
251
387
|
serialized: dict[str, Any],
|
|
@@ -257,10 +393,25 @@ class CallbackHandler(BaseCallbackHandler):
|
|
|
257
393
|
metadata: Optional[dict[str, Any]] = None,
|
|
258
394
|
**kwargs: Any,
|
|
259
395
|
) -> Any:
|
|
396
|
+
_debug_log(
|
|
397
|
+
f"on_llm_start: run_id={run_id}, parent_run_id={parent_run_id}, prompts_len={len(prompts)}"
|
|
398
|
+
)
|
|
399
|
+
|
|
400
|
+
# Guard against double-counting if both on_llm_start and on_chat_model_start fire
|
|
401
|
+
uuid_str = str(run_id)
|
|
402
|
+
existing_span = trace_manager.get_span_by_uuid(uuid_str)
|
|
403
|
+
if existing_span is not None:
|
|
404
|
+
_debug_log(
|
|
405
|
+
f"on_llm_start: span already exists for run_id={run_id}, skipping"
|
|
406
|
+
)
|
|
407
|
+
return
|
|
408
|
+
|
|
260
409
|
with self._ctx(run_id=run_id, parent_run_id=parent_run_id):
|
|
261
|
-
uuid_str = str(run_id)
|
|
262
410
|
input_messages = parse_prompts_to_messages(prompts, **kwargs)
|
|
263
|
-
|
|
411
|
+
|
|
412
|
+
# Safe extraction of model name (handle None metadata)
|
|
413
|
+
md = metadata or {}
|
|
414
|
+
model = safe_extract_model_name(md, **kwargs)
|
|
264
415
|
|
|
265
416
|
llm_span: LlmSpan = enter_current_context(
|
|
266
417
|
uuid_str=uuid_str,
|
|
@@ -272,12 +423,11 @@ class CallbackHandler(BaseCallbackHandler):
|
|
|
272
423
|
|
|
273
424
|
llm_span.input = input_messages
|
|
274
425
|
llm_span.model = model
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
llm_span.
|
|
279
|
-
llm_span.
|
|
280
|
-
llm_span.prompt = prompt
|
|
426
|
+
|
|
427
|
+
# Extract metrics and prompt from metadata if provided, but don't mutate original
|
|
428
|
+
llm_span.metrics = md.get("metrics")
|
|
429
|
+
llm_span.metric_collection = md.get("metric_collection")
|
|
430
|
+
llm_span.prompt = md.get("prompt")
|
|
281
431
|
|
|
282
432
|
def on_llm_end(
|
|
283
433
|
self,
|
|
@@ -287,9 +437,20 @@ class CallbackHandler(BaseCallbackHandler):
|
|
|
287
437
|
parent_run_id: Optional[UUID] = None,
|
|
288
438
|
**kwargs: Any, # un-logged kwargs
|
|
289
439
|
) -> Any:
|
|
440
|
+
_debug_log(
|
|
441
|
+
f"on_llm_end: run_id={run_id}, parent_run_id={parent_run_id}, response_type={type(response).__name__}"
|
|
442
|
+
)
|
|
290
443
|
uuid_str = str(run_id)
|
|
291
444
|
llm_span: LlmSpan = trace_manager.get_span_by_uuid(uuid_str)
|
|
292
445
|
if llm_span is None:
|
|
446
|
+
_debug_log(f"on_llm_end: NO SPAN FOUND for run_id={run_id}")
|
|
447
|
+
return
|
|
448
|
+
|
|
449
|
+
# Guard against double-finalization (if both on_llm_end and on_chat_model_end fire)
|
|
450
|
+
if llm_span.end_time is not None:
|
|
451
|
+
_debug_log(
|
|
452
|
+
f"on_llm_end: span already finalized for run_id={run_id}, skipping"
|
|
453
|
+
)
|
|
293
454
|
return
|
|
294
455
|
|
|
295
456
|
with self._ctx(run_id=run_id, parent_run_id=parent_run_id):
|
|
@@ -336,7 +497,6 @@ class CallbackHandler(BaseCallbackHandler):
|
|
|
336
497
|
)
|
|
337
498
|
|
|
338
499
|
llm_span.model = model if model else llm_span.model
|
|
339
|
-
llm_span.input = llm_span.input
|
|
340
500
|
llm_span.output = output
|
|
341
501
|
llm_span.input_token_count = (
|
|
342
502
|
total_input_tokens if total_input_tokens > 0 else None
|
|
@@ -347,6 +507,121 @@ class CallbackHandler(BaseCallbackHandler):
|
|
|
347
507
|
|
|
348
508
|
exit_current_context(uuid_str=uuid_str)
|
|
349
509
|
|
|
510
|
+
def on_chat_model_end(
|
|
511
|
+
self,
|
|
512
|
+
response: Any,
|
|
513
|
+
*,
|
|
514
|
+
run_id: UUID,
|
|
515
|
+
parent_run_id: Optional[UUID] = None,
|
|
516
|
+
**kwargs: Any,
|
|
517
|
+
) -> Any:
|
|
518
|
+
"""
|
|
519
|
+
Handle chat model end callback. This may be called instead of or
|
|
520
|
+
in addition to on_llm_end depending on the LangChain version.
|
|
521
|
+
"""
|
|
522
|
+
_debug_log(
|
|
523
|
+
f"on_chat_model_end: run_id={run_id}, parent_run_id={parent_run_id}, response_type={type(response).__name__}"
|
|
524
|
+
)
|
|
525
|
+
uuid_str = str(run_id)
|
|
526
|
+
llm_span: LlmSpan = trace_manager.get_span_by_uuid(uuid_str)
|
|
527
|
+
if llm_span is None:
|
|
528
|
+
_debug_log(f"on_chat_model_end: NO SPAN FOUND for run_id={run_id}")
|
|
529
|
+
return
|
|
530
|
+
|
|
531
|
+
# Guard against double-finalization, which could happen if both on_llm_end and on_chat_model_end fire
|
|
532
|
+
if llm_span.end_time is not None:
|
|
533
|
+
_debug_log(
|
|
534
|
+
f"on_chat_model_end: span already finalized for run_id={run_id}, skipping"
|
|
535
|
+
)
|
|
536
|
+
return
|
|
537
|
+
|
|
538
|
+
with self._ctx(run_id=run_id, parent_run_id=parent_run_id):
|
|
539
|
+
output = ""
|
|
540
|
+
total_input_tokens = 0
|
|
541
|
+
total_output_tokens = 0
|
|
542
|
+
model = None
|
|
543
|
+
|
|
544
|
+
# Handle LLMResult (same as on_llm_end)
|
|
545
|
+
if isinstance(response, LLMResult):
|
|
546
|
+
for generation in response.generations:
|
|
547
|
+
for gen in generation:
|
|
548
|
+
if isinstance(gen, ChatGeneration):
|
|
549
|
+
if gen.message.response_metadata and isinstance(
|
|
550
|
+
gen.message.response_metadata, dict
|
|
551
|
+
):
|
|
552
|
+
model = gen.message.response_metadata.get(
|
|
553
|
+
"model_name"
|
|
554
|
+
)
|
|
555
|
+
input_tokens, output_tokens = (
|
|
556
|
+
safe_extract_token_usage(
|
|
557
|
+
gen.message.response_metadata
|
|
558
|
+
)
|
|
559
|
+
)
|
|
560
|
+
total_input_tokens += input_tokens
|
|
561
|
+
total_output_tokens += output_tokens
|
|
562
|
+
|
|
563
|
+
if isinstance(gen.message, AIMessage):
|
|
564
|
+
ai_message = gen.message
|
|
565
|
+
tool_calls = []
|
|
566
|
+
for tool_call in ai_message.tool_calls:
|
|
567
|
+
tool_calls.append(
|
|
568
|
+
LlmToolCall(
|
|
569
|
+
name=tool_call["name"],
|
|
570
|
+
args=tool_call["args"],
|
|
571
|
+
id=tool_call["id"],
|
|
572
|
+
)
|
|
573
|
+
)
|
|
574
|
+
output = LlmOutput(
|
|
575
|
+
role="AI",
|
|
576
|
+
content=ai_message.content,
|
|
577
|
+
tool_calls=tool_calls,
|
|
578
|
+
)
|
|
579
|
+
|
|
580
|
+
llm_span.model = model if model else llm_span.model
|
|
581
|
+
llm_span.output = output
|
|
582
|
+
llm_span.input_token_count = (
|
|
583
|
+
total_input_tokens if total_input_tokens > 0 else None
|
|
584
|
+
)
|
|
585
|
+
llm_span.output_token_count = (
|
|
586
|
+
total_output_tokens if total_output_tokens > 0 else None
|
|
587
|
+
)
|
|
588
|
+
|
|
589
|
+
exit_current_context(uuid_str=uuid_str)
|
|
590
|
+
|
|
591
|
+
def on_chat_model_error(
|
|
592
|
+
self,
|
|
593
|
+
error: BaseException,
|
|
594
|
+
*,
|
|
595
|
+
run_id: UUID,
|
|
596
|
+
parent_run_id: Optional[UUID] = None,
|
|
597
|
+
**kwargs: Any,
|
|
598
|
+
) -> Any:
|
|
599
|
+
"""
|
|
600
|
+
Handle chat model error callback.
|
|
601
|
+
"""
|
|
602
|
+
_debug_log(
|
|
603
|
+
f"on_chat_model_error: run_id={run_id}, parent_run_id={parent_run_id}, error={error}"
|
|
604
|
+
)
|
|
605
|
+
uuid_str = str(run_id)
|
|
606
|
+
llm_span: LlmSpan = trace_manager.get_span_by_uuid(uuid_str)
|
|
607
|
+
if llm_span is None:
|
|
608
|
+
_debug_log(
|
|
609
|
+
f"on_chat_model_error: NO SPAN FOUND for run_id={run_id}"
|
|
610
|
+
)
|
|
611
|
+
return
|
|
612
|
+
|
|
613
|
+
# Guard against double-finalization
|
|
614
|
+
if llm_span.end_time is not None:
|
|
615
|
+
_debug_log(
|
|
616
|
+
f"on_chat_model_error: span already finalized for run_id={run_id}, skipping"
|
|
617
|
+
)
|
|
618
|
+
return
|
|
619
|
+
|
|
620
|
+
with self._ctx(run_id=run_id, parent_run_id=parent_run_id):
|
|
621
|
+
llm_span.status = TraceSpanStatus.ERRORED
|
|
622
|
+
llm_span.error = str(error)
|
|
623
|
+
exit_current_context(uuid_str=uuid_str)
|
|
624
|
+
|
|
350
625
|
def on_llm_error(
|
|
351
626
|
self,
|
|
352
627
|
error: BaseException,
|
|
@@ -355,10 +630,22 @@ class CallbackHandler(BaseCallbackHandler):
|
|
|
355
630
|
parent_run_id: Optional[UUID] = None,
|
|
356
631
|
**kwargs: Any,
|
|
357
632
|
) -> Any:
|
|
633
|
+
_debug_log(
|
|
634
|
+
f"on_llm_error: run_id={run_id}, parent_run_id={parent_run_id}, error={error}"
|
|
635
|
+
)
|
|
358
636
|
uuid_str = str(run_id)
|
|
359
637
|
llm_span: LlmSpan = trace_manager.get_span_by_uuid(uuid_str)
|
|
360
638
|
if llm_span is None:
|
|
639
|
+
_debug_log(f"on_llm_error: NO SPAN FOUND for run_id={run_id}")
|
|
640
|
+
return
|
|
641
|
+
|
|
642
|
+
# Guard against double-finalization
|
|
643
|
+
if llm_span.end_time is not None:
|
|
644
|
+
_debug_log(
|
|
645
|
+
f"on_llm_error: span already finalized for run_id={run_id}, skipping"
|
|
646
|
+
)
|
|
361
647
|
return
|
|
648
|
+
|
|
362
649
|
with self._ctx(run_id=run_id, parent_run_id=parent_run_id):
|
|
363
650
|
llm_span.status = TraceSpanStatus.ERRORED
|
|
364
651
|
llm_span.error = str(error)
|
|
@@ -396,6 +683,9 @@ class CallbackHandler(BaseCallbackHandler):
|
|
|
396
683
|
inputs: Optional[dict[str, Any]] = None,
|
|
397
684
|
**kwargs: Any,
|
|
398
685
|
) -> Any:
|
|
686
|
+
_debug_log(
|
|
687
|
+
f"on_tool_start: run_id={run_id}, parent_run_id={parent_run_id}, name={extract_name(serialized, **kwargs)}"
|
|
688
|
+
)
|
|
399
689
|
with self._ctx(run_id=run_id, parent_run_id=parent_run_id):
|
|
400
690
|
uuid_str = str(run_id)
|
|
401
691
|
|
|
@@ -418,6 +708,9 @@ class CallbackHandler(BaseCallbackHandler):
|
|
|
418
708
|
parent_run_id: Optional[UUID] = None,
|
|
419
709
|
**kwargs: Any, # un-logged kwargs
|
|
420
710
|
) -> Any:
|
|
711
|
+
_debug_log(
|
|
712
|
+
f"on_tool_end: run_id={run_id}, parent_run_id={parent_run_id}"
|
|
713
|
+
)
|
|
421
714
|
uuid_str = str(run_id)
|
|
422
715
|
tool_span: ToolSpan = trace_manager.get_span_by_uuid(uuid_str)
|
|
423
716
|
if tool_span is None:
|
|
@@ -485,20 +778,23 @@ class CallbackHandler(BaseCallbackHandler):
|
|
|
485
778
|
) -> Any:
|
|
486
779
|
with self._ctx(run_id=run_id, parent_run_id=parent_run_id):
|
|
487
780
|
uuid_str = str(run_id)
|
|
781
|
+
# Safe access to metadata (handle None)
|
|
782
|
+
md = metadata or {}
|
|
488
783
|
retriever_span = enter_current_context(
|
|
489
784
|
uuid_str=uuid_str,
|
|
490
785
|
span_type="retriever",
|
|
491
786
|
func_name=extract_name(serialized, **kwargs),
|
|
492
787
|
observe_kwargs={
|
|
493
|
-
"embedder":
|
|
494
|
-
"ls_embedding_provider", "unknown"
|
|
495
|
-
),
|
|
788
|
+
"embedder": md.get("ls_embedding_provider", "unknown"),
|
|
496
789
|
},
|
|
497
790
|
)
|
|
498
791
|
# Register this run_id -> span mapping for child callbacks
|
|
499
792
|
self._run_id_to_span_uuid[str(run_id)] = uuid_str
|
|
500
793
|
retriever_span.input = query
|
|
501
794
|
|
|
795
|
+
# Extract metric_collection from metadata if provided
|
|
796
|
+
retriever_span.metric_collection = md.get("metric_collection")
|
|
797
|
+
|
|
502
798
|
def on_retriever_end(
|
|
503
799
|
self,
|
|
504
800
|
output: Any,
|