deepeval 3.8.1__py3-none-any.whl → 3.8.3__py3-none-any.whl

This diff compares the content of publicly available package versions as released to one of the supported registries, and is provided for informational purposes only. Two modules change: the CrewAI integration's crew/agent kickoff wrappers, and the LangChain integration's CallbackHandler.
@@ -17,8 +17,9 @@ def wrap_crew_kickoff():
             func_name="kickoff",
             metric_collection=metric_collection,
             metrics=metrics,
-        ):
+        ) as observer:
             result = original_kickoff(self, *args, **kwargs)
+            observer.result = str(result) if result else None
 
         return result
 
@@ -36,8 +37,9 @@ def wrap_crew_kickoff_for_each():
             func_name="kickoff_for_each",
             metric_collection=metric_collection,
             metrics=metrics,
-        ):
+        ) as observer:
             result = original_kickoff_for_each(self, *args, **kwargs)
+            observer.result = str(result) if result else None
 
         return result
 
@@ -55,8 +57,9 @@ def wrap_crew_kickoff_async():
             func_name="kickoff_async",
             metric_collection=metric_collection,
             metrics=metrics,
-        ):
+        ) as observer:
             result = await original_kickoff_async(self, *args, **kwargs)
+            observer.result = str(result) if result else None
 
         return result
 
@@ -74,33 +77,61 @@ def wrap_crew_kickoff_for_each_async():
             func_name="kickoff_for_each_async",
             metric_collection=metric_collection,
             metrics=metrics,
-        ):
+        ) as observer:
             result = await original_kickoff_for_each_async(
                 self, *args, **kwargs
             )
+            observer.result = str(result) if result else None
 
         return result
 
     Crew.kickoff_for_each_async = wrapper
 
 
-def wrap_llm_call():
-    original_llm_call = LLM.call
+def wrap_crew_akickoff():
+    if not hasattr(Crew, "akickoff"):
+        return
 
-    @wraps(original_llm_call)
-    def wrapper(self, *args, **kwargs):
+    original_akickoff = Crew.akickoff
+
+    @wraps(original_akickoff)
+    async def wrapper(self, *args, **kwargs):
+        metric_collection, metrics = _check_metrics_and_metric_collection(self)
+        with Observer(
+            span_type="crew",
+            func_name="akickoff",
+            metric_collection=metric_collection,
+            metrics=metrics,
+        ) as observer:
+            result = await original_akickoff(self, *args, **kwargs)
+            observer.result = str(result) if result else None
+
+        return result
+
+    Crew.akickoff = wrapper
+
+
+def wrap_crew_akickoff_for_each():
+    if not hasattr(Crew, "akickoff_for_each"):
+        return
+
+    original_akickoff_for_each = Crew.akickoff_for_each
+
+    @wraps(original_akickoff_for_each)
+    async def wrapper(self, *args, **kwargs):
         metric_collection, metrics = _check_metrics_and_metric_collection(self)
         with Observer(
-            span_type="llm",
-            func_name="call",
-            observe_kwargs={"model": "temp_model"},
+            span_type="crew",
+            func_name="akickoff_for_each",
             metric_collection=metric_collection,
             metrics=metrics,
-        ):
-            result = original_llm_call(self, *args, **kwargs)
+        ) as observer:
+            result = await original_akickoff_for_each(self, *args, **kwargs)
+            observer.result = str(result) if result else None
+
             return result
 
-    LLM.call = wrapper
+    Crew.akickoff_for_each = wrapper
 
 
 def wrap_agent_execute_task():
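
Every wrapper in this file now follows the same shape: bind the Observer context manager to a name and copy the wrapped call's return value onto it, so the span records the crew or agent output. A minimal sketch of that capture pattern with a stand-in observer (the real deepeval Observer takes the span_type/func_name/metrics kwargs shown above; this toy only illustrates the `as observer` capture):

    from functools import wraps

    class ToyObserver:
        # Stand-in for deepeval's Observer: a context manager whose
        # `result` attribute is read when the block exits.
        def __enter__(self):
            self.result = None
            return self

        def __exit__(self, *exc_info):
            print(f"span closed with result={self.result!r}")
            return False  # never swallow exceptions

    def instrument(fn):
        @wraps(fn)
        def wrapper(*args, **kwargs):
            with ToyObserver() as observer:
                result = fn(*args, **kwargs)
                # Same capture as the diff: stringify truthy results only.
                observer.result = str(result) if result else None
            return result
        return wrapper

    @instrument
    def kickoff():
        return {"answer": 42}

    kickoff()  # prints: span closed with result="{'answer': 42}"

Note the truthiness check: a falsy but valid result (an empty string, for example) is recorded as None rather than as itself.
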
@@ -114,13 +145,36 @@ def wrap_agent_execute_task():
             func_name="execute_task",
             metric_collection=metric_collection,
             metrics=metrics,
-        ):
+        ) as observer:
             result = original_execute_task(self, *args, **kwargs)
+            observer.result = str(result) if result else None
             return result
 
     Agent.execute_task = wrapper
 
 
+def wrap_agent_aexecute_task():
+    if not hasattr(Agent, "aexecute_task"):
+        return
+
+    original_aexecute_task = Agent.aexecute_task
+
+    @wraps(original_aexecute_task)
+    async def wrapper(self, *args, **kwargs):
+        metric_collection, metrics = _check_metrics_and_metric_collection(self)
+        with Observer(
+            span_type="agent",
+            func_name="aexecute_task",
+            metric_collection=metric_collection,
+            metrics=metrics,
+        ) as observer:
+            result = await original_aexecute_task(self, *args, **kwargs)
+            observer.result = str(result) if result else None
+            return result
+
+    Agent.aexecute_task = wrapper
+
+
 def _check_metrics_and_metric_collection(obj: Any):
     metric_collection = getattr(obj, "_metric_collection", None)
     metrics = getattr(obj, "_metrics", None)
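
The shared helper above reads per-object evaluation settings from private attributes, so any Crew or Agent carrying `_metric_collection` or `_metrics` gets them attached to its span. A sketch of that lookup in isolation (the attribute names come from the diff; how deepeval populates them, e.g. via an instrument helper, is outside this extract):

    from typing import Any

    def _check_metrics_and_metric_collection(obj: Any):
        # Mirrors the helper shown above: absent attributes yield None,
        # so uninstrumented objects still trace, just without metrics.
        metric_collection = getattr(obj, "_metric_collection", None)
        metrics = getattr(obj, "_metrics", None)
        return metric_collection, metrics

    class FakeCrew:
        pass

    crew = FakeCrew()
    crew._metric_collection = "crewai-e2e"  # illustrative name
    assert _check_metrics_and_metric_collection(crew) == ("crewai-e2e", None)

The hunks that follow belong to the LangChain callback integration.
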
@@ -1,3 +1,7 @@
+import logging
+import os
+import threading
+
 from typing import Any, Optional, List, Dict
 from uuid import UUID
 from time import perf_counter
@@ -20,6 +24,19 @@ from deepeval.tracing.types import (
 )
 from deepeval.telemetry import capture_tracing_integration
 
+# Debug logging for LangChain callbacks (enable with DEEPEVAL_DEBUG_LANGCHAIN_CALLBACKS=1)
+_DEBUG_CALLBACKS = os.environ.get(
+    "DEEPEVAL_DEBUG_LANGCHAIN_CALLBACKS", ""
+).lower() in ("1", "true", "yes")
+
+_logger = logging.getLogger(__name__)
+
+
+def _debug_log(msg: str):
+    if _DEBUG_CALLBACKS:
+        _logger.debug(f"[LangChain Callback] {msg}")
+
+
 try:
     from langchain_core.callbacks.base import BaseCallbackHandler
     from langchain_core.outputs import LLMResult
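
Because `_debug_log` routes through the standard `logging` module at DEBUG level, setting the environment variable alone is not enough; the logger must also be configured to emit DEBUG records. A small sketch of opting in (the env var name is from the diff; note that `_DEBUG_CALLBACKS` is evaluated when the module is imported):

    import logging
    import os

    # Opt in before importing the deepeval LangChain integration, since
    # _DEBUG_CALLBACKS is read from the environment at import time.
    os.environ["DEEPEVAL_DEBUG_LANGCHAIN_CALLBACKS"] = "1"

    # Make DEBUG-level records visible; the module logs via
    # logging.getLogger(__name__).
    logging.basicConfig(level=logging.DEBUG)
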
@@ -29,6 +46,7 @@ try:
     # contains langchain imports
     from deepeval.integrations.langchain.utils import (
         parse_prompts_to_messages,
+        convert_chat_messages_to_input,
         extract_name,
         safe_extract_model_name,
         safe_extract_token_usage,
@@ -50,6 +68,12 @@ def is_langchain_installed():
 
 
 class CallbackHandler(BaseCallbackHandler):
+    # When users create multiple CallbackHandler instances for the same logical
+    # conversation (same thread_id), we want spans to land on the same trace.
+    # Otherwise, each handler lazily creates its own trace, and multi-turn flows
+    # become multiple single-turn traces.
+    _thread_id_to_trace_uuid: Dict[str, str] = {}
+    _thread_id_lock = threading.Lock()
 
     def __init__(
         self,
@@ -60,6 +84,7 @@ class CallbackHandler(BaseCallbackHandler):
         user_id: Optional[str] = None,
         metrics: Optional[List[BaseMetric]] = None,
         metric_collection: Optional[str] = None,
+        test_case_id: Optional[str] = None,
     ):
         is_langchain_installed()
         with capture_tracing_integration("langchain.callback.CallbackHandler"):
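
The new `test_case_id` keyword rides alongside the existing trace fields and, as a later hunk shows, is applied to the trace once one is selected. An illustrative construction (argument values are made up; the import path is assumed from the module layout visible in this diff):

    # Assumed import path; only the keyword names come from the diff.
    from deepeval.integrations.langchain.callback import CallbackHandler

    handler = CallbackHandler(
        name="support-bot",
        thread_id="conversation-123",
        user_id="user-7",
        metric_collection="chatbot-metrics",
        test_case_id="tc-001",  # new in this version
    )
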
@@ -74,13 +99,21 @@ class CallbackHandler(BaseCallbackHandler):
             self._parent_span = None
 
             # Stash trace metadata to apply once we know which trace we are using.
-            self._trace_init_fields: Dict[str, Any] = {
+            # _trace_init_fields is cleared after first apply to prevent re-applying
+            # on every callback within the same trace. _original_init_fields is kept
+            # permanently so we can re-apply when a new trace is created (e.g., in
+            # multi-turn scenarios where the previous trace was ended).
+            self._original_init_fields: Dict[str, Any] = {
                 "name": name,
                 "tags": tags,
                 "metadata": metadata,
                 "thread_id": thread_id,
                 "user_id": user_id,
+                "test_case_id": test_case_id,
             }
+            self._trace_init_fields: Dict[str, Any] = dict(
+                self._original_init_fields
+            )
 
             # Map LangChain run_id -> our span uuid for parent span restoration
             self._run_id_to_span_uuid: Dict[str, str] = {}
@@ -96,6 +129,34 @@ class CallbackHandler(BaseCallbackHandler):
         This is done lazily during actual callback execution to avoid context
         corruption when the handler is constructed outside the async task/context.
         """
+        # If the user provided a thread_id, attempt to reuse an existing trace for it.
+        # This makes multi-turn tests that use multiple CallbackHandler instances behave
+        # as expected: one trace containing multiple turns/spans.
+        thread_id = None
+        fields = self._trace_init_fields or {}
+        if fields.get("thread_id"):
+            thread_id = fields["thread_id"]
+        # In case _trace_init_fields has already been cleared, fall back to trace metadata.
+        if thread_id is None and self._trace is not None:
+            thread_id = self._trace.thread_id
+
+        if thread_id:
+            with self._thread_id_lock:
+                existing_uuid = self._thread_id_to_trace_uuid.get(thread_id)
+                if existing_uuid:
+                    existing_trace = trace_manager.get_trace_by_uuid(existing_uuid)
+                    if (
+                        existing_trace
+                        and existing_trace.uuid in trace_manager.active_traces
+                    ):
+                        current_trace_context.set(existing_trace)
+                        self._trace = existing_trace
+                        self.trace_uuid = existing_trace.uuid
+                        # Lazily capture the observe parent span if present.
+                        if self._parent_span is None:
+                            self._parent_span = current_span_context.get()
+                        return existing_trace
+
         # Prefer current context trace if it is active.
         ctx_trace = current_trace_context.get()
         if ctx_trace and ctx_trace.uuid in trace_manager.active_traces:
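
Concretely, the registry means two handlers constructed with the same thread_id resolve to a single trace, provided the first trace is still in trace_manager.active_traces. A sketch of the intended multi-turn flow (import path assumed as before; RunnableLambda stands in for a real chain):

    from langchain_core.runnables import RunnableLambda

    from deepeval.integrations.langchain.callback import CallbackHandler  # assumed path

    chain = RunnableLambda(lambda text: text.upper())

    # Each turn builds its own handler, as multi-turn tests often do.
    turn_1 = CallbackHandler(thread_id="conversation-123")
    turn_2 = CallbackHandler(thread_id="conversation-123")

    # Both invocations should land on one trace: the second handler finds
    # "conversation-123" in _thread_id_to_trace_uuid and reuses that trace.
    chain.invoke("first turn", config={"callbacks": [turn_1]})
    chain.invoke("second turn", config={"callbacks": [turn_2]})
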
@@ -107,6 +168,10 @@ class CallbackHandler(BaseCallbackHandler):
             current_trace_context.set(trace)
         else:
             # Otherwise, create a fresh trace now (in the right context).
+            # Restore _trace_init_fields from the original init fields so that
+            # the new trace gets the same name/tags/metadata as intended.
+            if not self._trace_init_fields and self._original_init_fields:
+                self._trace_init_fields = dict(self._original_init_fields)
             trace = trace_manager.start_new_trace()
             current_trace_context.set(trace)
             self._trace = trace
@@ -114,8 +179,18 @@ class CallbackHandler(BaseCallbackHandler):
         # Keep a copy for quick access.
         self.trace_uuid = trace.uuid
 
+        # Register this trace as the canonical trace for this thread_id (if provided).
+        # This allows other CallbackHandler instances created for the same thread_id
+        # to reuse the same trace instead of creating parallel traces.
+        fields = self._trace_init_fields or {}
+        tid = fields.get("thread_id") or trace.thread_id
+        if tid:
+            with self._thread_id_lock:
+                # Only set if absent to preserve the "first trace wins" behavior.
+                self._thread_id_to_trace_uuid.setdefault(tid, trace.uuid)
+
         # Apply stashed metadata once.
-        fields = getattr(self, "_trace_init_fields", None) or {}
+        fields = self._trace_init_fields or {}
         if fields:
             if fields.get("name") is not None:
                 trace.name = fields["name"]
@@ -127,6 +202,8 @@ class CallbackHandler(BaseCallbackHandler):
                 trace.thread_id = fields["thread_id"]
             if fields.get("user_id") is not None:
                 trace.user_id = fields["user_id"]
+            if fields.get("test_case_id") is not None:
+                trace.test_case_id = fields["test_case_id"]
             # prevent re-applying on every callback
             self._trace_init_fields = {}
 
@@ -202,6 +279,9 @@ class CallbackHandler(BaseCallbackHandler):
         metadata: Optional[dict[str, Any]] = None,
         **kwargs: Any,
     ) -> Any:
+        _debug_log(
+            f"on_chain_start: run_id={run_id}, parent_run_id={parent_run_id}, name={extract_name(serialized, **kwargs)}"
+        )
         # Create spans for all chains to establish proper parent-child hierarchy
         # This is important for LangGraph where there are nested chains
         with self._ctx(run_id=run_id, parent_run_id=parent_run_id):
@@ -232,6 +312,9 @@ class CallbackHandler(BaseCallbackHandler):
         parent_run_id: Optional[UUID] = None,
         **kwargs: Any,
     ) -> Any:
+        _debug_log(
+            f"on_chain_end: run_id={run_id}, parent_run_id={parent_run_id}"
+        )
         uuid_str = str(run_id)
         base_span = trace_manager.get_span_by_uuid(uuid_str)
         if base_span:
@@ -246,6 +329,59 @@ class CallbackHandler(BaseCallbackHandler):
                 trace.output = output
         exit_current_context(uuid_str=uuid_str)
 
+    def on_chat_model_start(
+        self,
+        serialized: dict[str, Any],
+        messages: list[list[Any]],  # list[list[BaseMessage]]
+        *,
+        run_id: UUID,
+        parent_run_id: Optional[UUID] = None,
+        tags: Optional[list[str]] = None,
+        metadata: Optional[dict[str, Any]] = None,
+        **kwargs: Any,
+    ) -> Any:
+        """
+        Handle chat model start callback. In LangChain v1, chat models emit
+        on_chat_model_start instead of on_llm_start. The on_llm_end callback
+        is still used for both.
+        """
+        _debug_log(
+            f"on_chat_model_start: run_id={run_id}, parent_run_id={parent_run_id}, messages_len={len(messages)}"
+        )
+
+        # Guard against double-counting if both on_llm_start and on_chat_model_start fire
+        uuid_str = str(run_id)
+        existing_span = trace_manager.get_span_by_uuid(uuid_str)
+        if existing_span is not None:
+            _debug_log(
+                f"on_chat_model_start: span already exists for run_id={run_id}, skipping"
+            )
+            return
+
+        with self._ctx(run_id=run_id, parent_run_id=parent_run_id):
+            # Convert messages to our internal format using the shared helper
+            input_messages = convert_chat_messages_to_input(messages, **kwargs)
+
+            # Safe extraction of model name (handle None metadata)
+            md = metadata or {}
+            model = safe_extract_model_name(md, **kwargs)
+
+            llm_span: LlmSpan = enter_current_context(
+                uuid_str=uuid_str,
+                span_type="llm",
+                func_name=extract_name(serialized, **kwargs),
+            )
+            # Register this run_id -> span mapping for child callbacks
+            self._run_id_to_span_uuid[str(run_id)] = uuid_str
+
+            llm_span.input = input_messages
+            llm_span.model = model
+
+            # Extract metrics and prompt from metadata if provided, but don't mutate original
+            llm_span.metrics = md.get("metrics")
+            llm_span.metric_collection = md.get("metric_collection")
+            llm_span.prompt = md.get("prompt")
+
     def on_llm_start(
         self,
         serialized: dict[str, Any],
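
This is the hunk that keeps LLM spans working on newer LangChain versions: chat models emit on_chat_model_start rather than on_llm_start, so without this method a chat model call would produce no LLM span at all. A usage sketch (ChatOpenAI is one example of a chat model, not prescribed by the diff; the metadata keys are the ones the handler reads with .get()):

    from langchain_openai import ChatOpenAI

    from deepeval.integrations.langchain.callback import CallbackHandler  # assumed path

    llm = ChatOpenAI(model="gpt-4o-mini")
    handler = CallbackHandler()

    llm.invoke(
        "Hello!",
        config={
            "callbacks": [handler],
            # Read non-destructively by on_chat_model_start / on_llm_start.
            "metadata": {"metric_collection": "llm-metrics"},
        },
    )
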
@@ -257,10 +393,25 @@ class CallbackHandler(BaseCallbackHandler):
         metadata: Optional[dict[str, Any]] = None,
         **kwargs: Any,
     ) -> Any:
+        _debug_log(
+            f"on_llm_start: run_id={run_id}, parent_run_id={parent_run_id}, prompts_len={len(prompts)}"
+        )
+
+        # Guard against double-counting if both on_llm_start and on_chat_model_start fire
+        uuid_str = str(run_id)
+        existing_span = trace_manager.get_span_by_uuid(uuid_str)
+        if existing_span is not None:
+            _debug_log(
+                f"on_llm_start: span already exists for run_id={run_id}, skipping"
+            )
+            return
+
         with self._ctx(run_id=run_id, parent_run_id=parent_run_id):
-            uuid_str = str(run_id)
             input_messages = parse_prompts_to_messages(prompts, **kwargs)
-            model = safe_extract_model_name(metadata, **kwargs)
+
+            # Safe extraction of model name (handle None metadata)
+            md = metadata or {}
+            model = safe_extract_model_name(md, **kwargs)
 
             llm_span: LlmSpan = enter_current_context(
                 uuid_str=uuid_str,
@@ -272,12 +423,11 @@ class CallbackHandler(BaseCallbackHandler):
 
             llm_span.input = input_messages
             llm_span.model = model
-            metrics = metadata.pop("metrics", None)
-            metric_collection = metadata.pop("metric_collection", None)
-            prompt = metadata.pop("prompt", None)
-            llm_span.metrics = metrics
-            llm_span.metric_collection = metric_collection
-            llm_span.prompt = prompt
+
+            # Extract metrics and prompt from metadata if provided, but don't mutate original
+            llm_span.metrics = md.get("metrics")
+            llm_span.metric_collection = md.get("metric_collection")
+            llm_span.prompt = md.get("prompt")
 
     def on_llm_end(
         self,
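
The move from metadata.pop(...) to md.get(...) is more than style: the metadata dict originates in the caller's run config and may be seen by other callbacks, so popping keys out of it is an observable side effect. In miniature:

    run_metadata = {"metric_collection": "llm-metrics", "prompt": "v2"}

    # Old behavior: destructive read; later consumers lose the key.
    value = run_metadata.pop("metric_collection", None)
    assert "metric_collection" not in run_metadata

    # New behavior: read-only; the dict stays intact for everyone else.
    prompt = run_metadata.get("prompt")
    assert run_metadata == {"prompt": "v2"}
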
@@ -287,9 +437,20 @@ class CallbackHandler(BaseCallbackHandler):
         parent_run_id: Optional[UUID] = None,
         **kwargs: Any,  # un-logged kwargs
     ) -> Any:
+        _debug_log(
+            f"on_llm_end: run_id={run_id}, parent_run_id={parent_run_id}, response_type={type(response).__name__}"
+        )
         uuid_str = str(run_id)
         llm_span: LlmSpan = trace_manager.get_span_by_uuid(uuid_str)
         if llm_span is None:
+            _debug_log(f"on_llm_end: NO SPAN FOUND for run_id={run_id}")
+            return
+
+        # Guard against double-finalization (if both on_llm_end and on_chat_model_end fire)
+        if llm_span.end_time is not None:
+            _debug_log(
+                f"on_llm_end: span already finalized for run_id={run_id}, skipping"
+            )
             return
 
         with self._ctx(run_id=run_id, parent_run_id=parent_run_id):
@@ -336,7 +497,6 @@ class CallbackHandler(BaseCallbackHandler):
                                 )
 
             llm_span.model = model if model else llm_span.model
-            llm_span.input = llm_span.input
             llm_span.output = output
             llm_span.input_token_count = (
                 total_input_tokens if total_input_tokens > 0 else None
@@ -347,6 +507,121 @@ class CallbackHandler(BaseCallbackHandler):
             )
 
         exit_current_context(uuid_str=uuid_str)
 
+    def on_chat_model_end(
+        self,
+        response: Any,
+        *,
+        run_id: UUID,
+        parent_run_id: Optional[UUID] = None,
+        **kwargs: Any,
+    ) -> Any:
+        """
+        Handle chat model end callback. This may be called instead of or
+        in addition to on_llm_end depending on the LangChain version.
+        """
+        _debug_log(
+            f"on_chat_model_end: run_id={run_id}, parent_run_id={parent_run_id}, response_type={type(response).__name__}"
+        )
+        uuid_str = str(run_id)
+        llm_span: LlmSpan = trace_manager.get_span_by_uuid(uuid_str)
+        if llm_span is None:
+            _debug_log(f"on_chat_model_end: NO SPAN FOUND for run_id={run_id}")
+            return
+
+        # Guard against double-finalization, which could happen if both on_llm_end and on_chat_model_end fire
+        if llm_span.end_time is not None:
+            _debug_log(
+                f"on_chat_model_end: span already finalized for run_id={run_id}, skipping"
+            )
+            return
+
+        with self._ctx(run_id=run_id, parent_run_id=parent_run_id):
+            output = ""
+            total_input_tokens = 0
+            total_output_tokens = 0
+            model = None
+
+            # Handle LLMResult (same as on_llm_end)
+            if isinstance(response, LLMResult):
+                for generation in response.generations:
+                    for gen in generation:
+                        if isinstance(gen, ChatGeneration):
+                            if gen.message.response_metadata and isinstance(
+                                gen.message.response_metadata, dict
+                            ):
+                                model = gen.message.response_metadata.get(
+                                    "model_name"
+                                )
+                                input_tokens, output_tokens = (
+                                    safe_extract_token_usage(
+                                        gen.message.response_metadata
+                                    )
+                                )
+                                total_input_tokens += input_tokens
+                                total_output_tokens += output_tokens
+
+                            if isinstance(gen.message, AIMessage):
+                                ai_message = gen.message
+                                tool_calls = []
+                                for tool_call in ai_message.tool_calls:
+                                    tool_calls.append(
+                                        LlmToolCall(
+                                            name=tool_call["name"],
+                                            args=tool_call["args"],
+                                            id=tool_call["id"],
+                                        )
+                                    )
+                                output = LlmOutput(
+                                    role="AI",
+                                    content=ai_message.content,
+                                    tool_calls=tool_calls,
+                                )
+
+            llm_span.model = model if model else llm_span.model
+            llm_span.output = output
+            llm_span.input_token_count = (
+                total_input_tokens if total_input_tokens > 0 else None
+            )
+            llm_span.output_token_count = (
+                total_output_tokens if total_output_tokens > 0 else None
+            )
+
+        exit_current_context(uuid_str=uuid_str)
+
+    def on_chat_model_error(
+        self,
+        error: BaseException,
+        *,
+        run_id: UUID,
+        parent_run_id: Optional[UUID] = None,
+        **kwargs: Any,
+    ) -> Any:
+        """
+        Handle chat model error callback.
+        """
+        _debug_log(
+            f"on_chat_model_error: run_id={run_id}, parent_run_id={parent_run_id}, error={error}"
+        )
+        uuid_str = str(run_id)
+        llm_span: LlmSpan = trace_manager.get_span_by_uuid(uuid_str)
+        if llm_span is None:
+            _debug_log(
+                f"on_chat_model_error: NO SPAN FOUND for run_id={run_id}"
+            )
+            return
+
+        # Guard against double-finalization
+        if llm_span.end_time is not None:
+            _debug_log(
+                f"on_chat_model_error: span already finalized for run_id={run_id}, skipping"
+            )
+            return
+
+        with self._ctx(run_id=run_id, parent_run_id=parent_run_id):
+            llm_span.status = TraceSpanStatus.ERRORED
+            llm_span.error = str(error)
+            exit_current_context(uuid_str=uuid_str)
+
     def on_llm_error(
         self,
         error: BaseException,
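
The end-path parsing is the same in on_llm_end and on_chat_model_end: walk response.generations, pull the model name and token usage from each ChatGeneration's response_metadata, and collect tool calls from AIMessage. A minimal LLMResult that would exercise this path (the response_metadata key shapes are an assumption; the shapes safe_extract_token_usage accepts are not shown in this diff):

    from langchain_core.messages import AIMessage
    from langchain_core.outputs import ChatGeneration, LLMResult

    response = LLMResult(
        generations=[[
            ChatGeneration(
                message=AIMessage(
                    content="42",
                    response_metadata={
                        # Read via .get("model_name") in the handler.
                        "model_name": "gpt-4o-mini",
                        # Passed to safe_extract_token_usage; this
                        # OpenAI-style shape is an assumption.
                        "token_usage": {
                            "prompt_tokens": 12,
                            "completion_tokens": 1,
                        },
                    },
                )
            )
        ]]
    )
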
@@ -355,10 +630,22 @@ class CallbackHandler(BaseCallbackHandler):
         parent_run_id: Optional[UUID] = None,
         **kwargs: Any,
     ) -> Any:
+        _debug_log(
+            f"on_llm_error: run_id={run_id}, parent_run_id={parent_run_id}, error={error}"
+        )
         uuid_str = str(run_id)
         llm_span: LlmSpan = trace_manager.get_span_by_uuid(uuid_str)
         if llm_span is None:
+            _debug_log(f"on_llm_error: NO SPAN FOUND for run_id={run_id}")
+            return
+
+        # Guard against double-finalization
+        if llm_span.end_time is not None:
+            _debug_log(
+                f"on_llm_error: span already finalized for run_id={run_id}, skipping"
+            )
             return
+
         with self._ctx(run_id=run_id, parent_run_id=parent_run_id):
             llm_span.status = TraceSpanStatus.ERRORED
             llm_span.error = str(error)
@@ -396,6 +683,9 @@ class CallbackHandler(BaseCallbackHandler):
         inputs: Optional[dict[str, Any]] = None,
         **kwargs: Any,
     ) -> Any:
+        _debug_log(
+            f"on_tool_start: run_id={run_id}, parent_run_id={parent_run_id}, name={extract_name(serialized, **kwargs)}"
+        )
         with self._ctx(run_id=run_id, parent_run_id=parent_run_id):
             uuid_str = str(run_id)
 
@@ -418,6 +708,9 @@ class CallbackHandler(BaseCallbackHandler):
         parent_run_id: Optional[UUID] = None,
         **kwargs: Any,  # un-logged kwargs
     ) -> Any:
+        _debug_log(
+            f"on_tool_end: run_id={run_id}, parent_run_id={parent_run_id}"
+        )
         uuid_str = str(run_id)
         tool_span: ToolSpan = trace_manager.get_span_by_uuid(uuid_str)
         if tool_span is None:
@@ -485,20 +778,23 @@ class CallbackHandler(BaseCallbackHandler):
     ) -> Any:
         with self._ctx(run_id=run_id, parent_run_id=parent_run_id):
             uuid_str = str(run_id)
+            # Safe access to metadata (handle None)
+            md = metadata or {}
             retriever_span = enter_current_context(
                 uuid_str=uuid_str,
                 span_type="retriever",
                 func_name=extract_name(serialized, **kwargs),
                 observe_kwargs={
-                    "embedder": metadata.get(
-                        "ls_embedding_provider", "unknown"
-                    ),
+                    "embedder": md.get("ls_embedding_provider", "unknown"),
                 },
             )
             # Register this run_id -> span mapping for child callbacks
             self._run_id_to_span_uuid[str(run_id)] = uuid_str
             retriever_span.input = query
 
+            # Extract metric_collection from metadata if provided
+            retriever_span.metric_collection = md.get("metric_collection")
+
     def on_retriever_end(
         self,
         output: Any,
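
With this change, retriever spans can carry a metric collection the same way LLM spans do, read from the run metadata. A self-contained sketch with a toy retriever (the BaseRetriever subclass and all names are illustrative; the import path for CallbackHandler is assumed):

    from typing import List

    from langchain_core.callbacks import CallbackManagerForRetrieverRun
    from langchain_core.documents import Document
    from langchain_core.retrievers import BaseRetriever

    from deepeval.integrations.langchain.callback import CallbackHandler  # assumed path

    class StaticRetriever(BaseRetriever):
        # Toy retriever so the snippet runs without a vector store.
        def _get_relevant_documents(
            self, query: str, *, run_manager: CallbackManagerForRetrieverRun
        ) -> List[Document]:
            return [Document(page_content="hello world")]

    StaticRetriever().invoke(
        "greeting",
        config={
            "callbacks": [CallbackHandler()],
            # on_retriever_start now reads metric_collection from here.
            "metadata": {"metric_collection": "retrieval-metrics"},
        },
    )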