deepeval 3.8.1__py3-none-any.whl → 3.8.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/confident/api.py +31 -3
- deepeval/integrations/langchain/callback.py +306 -14
- deepeval/integrations/langchain/utils.py +75 -24
- deepeval/integrations/pydantic_ai/instrumentator.py +43 -11
- deepeval/integrations/pydantic_ai/otel.py +9 -0
- deepeval/metrics/utils.py +11 -0
- deepeval/simulator/conversation_simulator.py +4 -2
- deepeval/telemetry.py +12 -91
- deepeval/tracing/tracing.py +6 -5
- {deepeval-3.8.1.dist-info → deepeval-3.8.2.dist-info}/METADATA +1 -1
- {deepeval-3.8.1.dist-info → deepeval-3.8.2.dist-info}/RECORD +15 -15
- {deepeval-3.8.1.dist-info → deepeval-3.8.2.dist-info}/LICENSE.md +0 -0
- {deepeval-3.8.1.dist-info → deepeval-3.8.2.dist-info}/WHEEL +0 -0
- {deepeval-3.8.1.dist-info → deepeval-3.8.2.dist-info}/entry_points.txt +0 -0
deepeval/_version.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__: str = "3.8.1"
|
|
1
|
+
__version__: str = "3.8.2"
|
deepeval/confident/api.py
CHANGED
|
@@ -26,16 +26,44 @@ API_BASE_URL_EU = "https://eu.api.confident-ai.com"
|
|
|
26
26
|
retryable_exceptions = requests.exceptions.SSLError
|
|
27
27
|
|
|
28
28
|
|
|
29
|
+
def _infer_region_from_api_key(api_key: Optional[str]) -> Optional[str]:
|
|
30
|
+
"""
|
|
31
|
+
Infer region from Confident API key prefix.
|
|
32
|
+
|
|
33
|
+
Supported:
|
|
34
|
+
- confident_eu_... => "EU"
|
|
35
|
+
- confident_us_... => "US"
|
|
36
|
+
|
|
37
|
+
Returns None if prefix is not recognized or api_key is falsy.
|
|
38
|
+
"""
|
|
39
|
+
if not api_key:
|
|
40
|
+
return None
|
|
41
|
+
key = api_key.strip().lower()
|
|
42
|
+
if key.startswith("confident_eu_"):
|
|
43
|
+
return "EU"
|
|
44
|
+
if key.startswith("confident_us_"):
|
|
45
|
+
return "US"
|
|
46
|
+
return None
|
|
47
|
+
|
|
48
|
+
|
|
29
49
|
def get_base_api_url():
|
|
30
50
|
s = get_settings()
|
|
31
51
|
if s.CONFIDENT_BASE_URL:
|
|
32
52
|
base_url = s.CONFIDENT_BASE_URL.rstrip("/")
|
|
33
53
|
return base_url
|
|
54
|
+
# If the user has explicitly set a region, respect it.
|
|
34
55
|
region = KEY_FILE_HANDLER.fetch_data(KeyValues.CONFIDENT_REGION)
|
|
35
|
-
if region == "EU":
|
|
56
|
+
if region:
|
|
57
|
+
return API_BASE_URL_EU if region == "EU" else API_BASE_URL
|
|
58
|
+
|
|
59
|
+
# Otherwise, infer region from the API key prefix.
|
|
60
|
+
api_key = get_confident_api_key()
|
|
61
|
+
inferred = _infer_region_from_api_key(api_key)
|
|
62
|
+
if inferred == "EU":
|
|
36
63
|
return API_BASE_URL_EU
|
|
37
|
-
|
|
38
|
-
|
|
64
|
+
|
|
65
|
+
# Default to US (backwards compatible)
|
|
66
|
+
return API_BASE_URL
|
|
39
67
|
|
|
40
68
|
|
|
41
69
|
def get_confident_api_key() -> Optional[str]:
|
deepeval/integrations/langchain/callback.py
CHANGED
|
@@ -1,3 +1,7 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import os
|
|
3
|
+
import threading
|
|
4
|
+
|
|
1
5
|
from typing import Any, Optional, List, Dict
|
|
2
6
|
from uuid import UUID
|
|
3
7
|
from time import perf_counter
|
|
@@ -20,6 +24,19 @@ from deepeval.tracing.types import (
|
|
|
20
24
|
)
|
|
21
25
|
from deepeval.telemetry import capture_tracing_integration
|
|
22
26
|
|
|
27
|
+
# Debug logging for LangChain callbacks (enable with DEEPEVAL_DEBUG_LANGCHAIN_CALLBACKS=1)
|
|
28
|
+
_DEBUG_CALLBACKS = os.environ.get(
|
|
29
|
+
"DEEPEVAL_DEBUG_LANGCHAIN_CALLBACKS", ""
|
|
30
|
+
).lower() in ("1", "true", "yes")
|
|
31
|
+
|
|
32
|
+
_logger = logging.getLogger(__name__)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def _debug_log(msg: str):
|
|
36
|
+
if _DEBUG_CALLBACKS:
|
|
37
|
+
_logger.debug(f"[LangChain Callback] {msg}")
|
|
38
|
+
|
|
39
|
+
|
|
23
40
|
try:
|
|
24
41
|
from langchain_core.callbacks.base import BaseCallbackHandler
|
|
25
42
|
from langchain_core.outputs import LLMResult
|
|
@@ -29,6 +46,7 @@ try:
|
|
|
29
46
|
# contains langchain imports
|
|
30
47
|
from deepeval.integrations.langchain.utils import (
|
|
31
48
|
parse_prompts_to_messages,
|
|
49
|
+
convert_chat_messages_to_input,
|
|
32
50
|
extract_name,
|
|
33
51
|
safe_extract_model_name,
|
|
34
52
|
safe_extract_token_usage,
|
|
@@ -50,6 +68,12 @@ def is_langchain_installed():
|
|
|
50
68
|
|
|
51
69
|
|
|
52
70
|
class CallbackHandler(BaseCallbackHandler):
|
|
71
|
+
# When users create multiple CallbackHandler instances for the same logical
|
|
72
|
+
# conversation (same thread_id), we want spans to land on the same trace.
|
|
73
|
+
# Otherwise, each handler lazily creates its own trace, and multi-turn flows
|
|
74
|
+
# become multiple single-turn traces.
|
|
75
|
+
_thread_id_to_trace_uuid: Dict[str, str] = {}
|
|
76
|
+
_thread_id_lock = threading.Lock()
|
|
53
77
|
|
|
54
78
|
def __init__(
|
|
55
79
|
self,
|
|
@@ -74,13 +98,20 @@ class CallbackHandler(BaseCallbackHandler):
|
|
|
74
98
|
self._parent_span = None
|
|
75
99
|
|
|
76
100
|
# Stash trace metadata to apply once we know which trace we are using.
|
|
77
|
-
|
|
101
|
+
# _trace_init_fields is cleared after first apply to prevent re-applying
|
|
102
|
+
# on every callback within the same trace. _original_init_fields is kept
|
|
103
|
+
# permanently so we can re-apply when a new trace is created (e.g., in
|
|
104
|
+
# multi-turn scenarios where the previous trace was ended).
|
|
105
|
+
self._original_init_fields: Dict[str, Any] = {
|
|
78
106
|
"name": name,
|
|
79
107
|
"tags": tags,
|
|
80
108
|
"metadata": metadata,
|
|
81
109
|
"thread_id": thread_id,
|
|
82
110
|
"user_id": user_id,
|
|
83
111
|
}
|
|
112
|
+
self._trace_init_fields: Dict[str, Any] = dict(
|
|
113
|
+
self._original_init_fields
|
|
114
|
+
)
|
|
84
115
|
|
|
85
116
|
# Map LangChain run_id -> our span uuid for parent span restoration
|
|
86
117
|
self._run_id_to_span_uuid: Dict[str, str] = {}
|
|
@@ -96,6 +127,34 @@ class CallbackHandler(BaseCallbackHandler):
|
|
|
96
127
|
This is done lazily during actual callback execution to avoid context
|
|
97
128
|
corruption when the handler is constructed outside the async task/context.
|
|
98
129
|
"""
|
|
130
|
+
# If the user provided a thread_id, attempt to reuse an existing trace for it.
|
|
131
|
+
# This makes multi-turn tests that use multiple CallbackHandler instances behave
|
|
132
|
+
# as expected: one trace containing multiple turns/spans.
|
|
133
|
+
thread_id = None
|
|
134
|
+
fields = self._trace_init_fields or {}
|
|
135
|
+
if fields.get("thread_id"):
|
|
136
|
+
thread_id = fields["thread_id"]
|
|
137
|
+
# In case _trace_init_fields has already been cleared, fall back to trace metadata.
|
|
138
|
+
if thread_id is None and self._trace is not None:
|
|
139
|
+
thread_id = self._trace.thread_id
|
|
140
|
+
|
|
141
|
+
if thread_id:
|
|
142
|
+
with self._thread_id_lock:
|
|
143
|
+
existing_uuid = self._thread_id_to_trace_uuid.get(thread_id)
|
|
144
|
+
if existing_uuid:
|
|
145
|
+
existing_trace = trace_manager.get_trace_by_uuid(existing_uuid)
|
|
146
|
+
if (
|
|
147
|
+
existing_trace
|
|
148
|
+
and existing_trace.uuid in trace_manager.active_traces
|
|
149
|
+
):
|
|
150
|
+
current_trace_context.set(existing_trace)
|
|
151
|
+
self._trace = existing_trace
|
|
152
|
+
self.trace_uuid = existing_trace.uuid
|
|
153
|
+
# Lazily capture the observe parent span if present.
|
|
154
|
+
if self._parent_span is None:
|
|
155
|
+
self._parent_span = current_span_context.get()
|
|
156
|
+
return existing_trace
|
|
157
|
+
|
|
99
158
|
# Prefer current context trace if it is active.
|
|
100
159
|
ctx_trace = current_trace_context.get()
|
|
101
160
|
if ctx_trace and ctx_trace.uuid in trace_manager.active_traces:
|
|
@@ -107,6 +166,10 @@ class CallbackHandler(BaseCallbackHandler):
|
|
|
107
166
|
current_trace_context.set(trace)
|
|
108
167
|
else:
|
|
109
168
|
# Otherwise, create a fresh trace now (in the right context).
|
|
169
|
+
# Restore _trace_init_fields from the original init fields so that
|
|
170
|
+
# the new trace gets the same name/tags/metadata as intended.
|
|
171
|
+
if not self._trace_init_fields and self._original_init_fields:
|
|
172
|
+
self._trace_init_fields = dict(self._original_init_fields)
|
|
110
173
|
trace = trace_manager.start_new_trace()
|
|
111
174
|
current_trace_context.set(trace)
|
|
112
175
|
self._trace = trace
|
|
@@ -114,8 +177,18 @@ class CallbackHandler(BaseCallbackHandler):
|
|
|
114
177
|
# Keep a copy for quick access.
|
|
115
178
|
self.trace_uuid = trace.uuid
|
|
116
179
|
|
|
180
|
+
# Register this trace as the canonical trace for this thread_id (if provided).
|
|
181
|
+
# This allows other CallbackHandler instances created for the same thread_id
|
|
182
|
+
# to reuse the same trace instead of creating parallel traces.
|
|
183
|
+
fields = self._trace_init_fields or {}
|
|
184
|
+
tid = fields.get("thread_id") or trace.thread_id
|
|
185
|
+
if tid:
|
|
186
|
+
with self._thread_id_lock:
|
|
187
|
+
# Only set if absent to preserve the "first trace wins" behavior.
|
|
188
|
+
self._thread_id_to_trace_uuid.setdefault(tid, trace.uuid)
|
|
189
|
+
|
|
117
190
|
# Apply stashed metadata once.
|
|
118
|
-
fields =
|
|
191
|
+
fields = self._trace_init_fields or {}
|
|
119
192
|
if fields:
|
|
120
193
|
if fields.get("name") is not None:
|
|
121
194
|
trace.name = fields["name"]
|
|
@@ -202,6 +275,9 @@ class CallbackHandler(BaseCallbackHandler):
|
|
|
202
275
|
metadata: Optional[dict[str, Any]] = None,
|
|
203
276
|
**kwargs: Any,
|
|
204
277
|
) -> Any:
|
|
278
|
+
_debug_log(
|
|
279
|
+
f"on_chain_start: run_id={run_id}, parent_run_id={parent_run_id}, name={extract_name(serialized, **kwargs)}"
|
|
280
|
+
)
|
|
205
281
|
# Create spans for all chains to establish proper parent-child hierarchy
|
|
206
282
|
# This is important for LangGraph where there are nested chains
|
|
207
283
|
with self._ctx(run_id=run_id, parent_run_id=parent_run_id):
|
|
@@ -232,6 +308,9 @@ class CallbackHandler(BaseCallbackHandler):
|
|
|
232
308
|
parent_run_id: Optional[UUID] = None,
|
|
233
309
|
**kwargs: Any,
|
|
234
310
|
) -> Any:
|
|
311
|
+
_debug_log(
|
|
312
|
+
f"on_chain_end: run_id={run_id}, parent_run_id={parent_run_id}"
|
|
313
|
+
)
|
|
235
314
|
uuid_str = str(run_id)
|
|
236
315
|
base_span = trace_manager.get_span_by_uuid(uuid_str)
|
|
237
316
|
if base_span:
|
|
@@ -246,6 +325,59 @@ class CallbackHandler(BaseCallbackHandler):
|
|
|
246
325
|
trace.output = output
|
|
247
326
|
exit_current_context(uuid_str=uuid_str)
|
|
248
327
|
|
|
328
|
+
def on_chat_model_start(
|
|
329
|
+
self,
|
|
330
|
+
serialized: dict[str, Any],
|
|
331
|
+
messages: list[list[Any]], # list[list[BaseMessage]]
|
|
332
|
+
*,
|
|
333
|
+
run_id: UUID,
|
|
334
|
+
parent_run_id: Optional[UUID] = None,
|
|
335
|
+
tags: Optional[list[str]] = None,
|
|
336
|
+
metadata: Optional[dict[str, Any]] = None,
|
|
337
|
+
**kwargs: Any,
|
|
338
|
+
) -> Any:
|
|
339
|
+
"""
|
|
340
|
+
Handle chat model start callback. In LangChain v1, chat models emit
|
|
341
|
+
on_chat_model_start instead of on_llm_start. The on_llm_end callback
|
|
342
|
+
is still used for both.
|
|
343
|
+
"""
|
|
344
|
+
_debug_log(
|
|
345
|
+
f"on_chat_model_start: run_id={run_id}, parent_run_id={parent_run_id}, messages_len={len(messages)}"
|
|
346
|
+
)
|
|
347
|
+
|
|
348
|
+
# Guard against double-counting if both on_llm_start and on_chat_model_start fire
|
|
349
|
+
uuid_str = str(run_id)
|
|
350
|
+
existing_span = trace_manager.get_span_by_uuid(uuid_str)
|
|
351
|
+
if existing_span is not None:
|
|
352
|
+
_debug_log(
|
|
353
|
+
f"on_chat_model_start: span already exists for run_id={run_id}, skipping"
|
|
354
|
+
)
|
|
355
|
+
return
|
|
356
|
+
|
|
357
|
+
with self._ctx(run_id=run_id, parent_run_id=parent_run_id):
|
|
358
|
+
# Convert messages to our internal format using the shared helper
|
|
359
|
+
input_messages = convert_chat_messages_to_input(messages, **kwargs)
|
|
360
|
+
|
|
361
|
+
# Safe extraction of model name (handle None metadata)
|
|
362
|
+
md = metadata or {}
|
|
363
|
+
model = safe_extract_model_name(md, **kwargs)
|
|
364
|
+
|
|
365
|
+
llm_span: LlmSpan = enter_current_context(
|
|
366
|
+
uuid_str=uuid_str,
|
|
367
|
+
span_type="llm",
|
|
368
|
+
func_name=extract_name(serialized, **kwargs),
|
|
369
|
+
)
|
|
370
|
+
# Register this run_id -> span mapping for child callbacks
|
|
371
|
+
self._run_id_to_span_uuid[str(run_id)] = uuid_str
|
|
372
|
+
|
|
373
|
+
llm_span.input = input_messages
|
|
374
|
+
llm_span.model = model
|
|
375
|
+
|
|
376
|
+
# Extract metrics and prompt from metadata if provided, but don't mutate original
|
|
377
|
+
llm_span.metrics = md.get("metrics")
|
|
378
|
+
llm_span.metric_collection = md.get("metric_collection")
|
|
379
|
+
llm_span.prompt = md.get("prompt")
|
|
380
|
+
|
|
249
381
|
def on_llm_start(
|
|
250
382
|
self,
|
|
251
383
|
serialized: dict[str, Any],
|
|
@@ -257,10 +389,25 @@ class CallbackHandler(BaseCallbackHandler):
|
|
|
257
389
|
metadata: Optional[dict[str, Any]] = None,
|
|
258
390
|
**kwargs: Any,
|
|
259
391
|
) -> Any:
|
|
392
|
+
_debug_log(
|
|
393
|
+
f"on_llm_start: run_id={run_id}, parent_run_id={parent_run_id}, prompts_len={len(prompts)}"
|
|
394
|
+
)
|
|
395
|
+
|
|
396
|
+
# Guard against double-counting if both on_llm_start and on_chat_model_start fire
|
|
397
|
+
uuid_str = str(run_id)
|
|
398
|
+
existing_span = trace_manager.get_span_by_uuid(uuid_str)
|
|
399
|
+
if existing_span is not None:
|
|
400
|
+
_debug_log(
|
|
401
|
+
f"on_llm_start: span already exists for run_id={run_id}, skipping"
|
|
402
|
+
)
|
|
403
|
+
return
|
|
404
|
+
|
|
260
405
|
with self._ctx(run_id=run_id, parent_run_id=parent_run_id):
|
|
261
|
-
uuid_str = str(run_id)
|
|
262
406
|
input_messages = parse_prompts_to_messages(prompts, **kwargs)
|
|
263
|
-
|
|
407
|
+
|
|
408
|
+
# Safe extraction of model name (handle None metadata)
|
|
409
|
+
md = metadata or {}
|
|
410
|
+
model = safe_extract_model_name(md, **kwargs)
|
|
264
411
|
|
|
265
412
|
llm_span: LlmSpan = enter_current_context(
|
|
266
413
|
uuid_str=uuid_str,
|
|
@@ -272,12 +419,11 @@ class CallbackHandler(BaseCallbackHandler):
|
|
|
272
419
|
|
|
273
420
|
llm_span.input = input_messages
|
|
274
421
|
llm_span.model = model
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
llm_span.
|
|
279
|
-
llm_span.
|
|
280
|
-
llm_span.prompt = prompt
|
|
422
|
+
|
|
423
|
+
# Extract metrics and prompt from metadata if provided, but don't mutate original
|
|
424
|
+
llm_span.metrics = md.get("metrics")
|
|
425
|
+
llm_span.metric_collection = md.get("metric_collection")
|
|
426
|
+
llm_span.prompt = md.get("prompt")
|
|
281
427
|
|
|
282
428
|
def on_llm_end(
|
|
283
429
|
self,
|
|
@@ -287,9 +433,20 @@ class CallbackHandler(BaseCallbackHandler):
|
|
|
287
433
|
parent_run_id: Optional[UUID] = None,
|
|
288
434
|
**kwargs: Any, # un-logged kwargs
|
|
289
435
|
) -> Any:
|
|
436
|
+
_debug_log(
|
|
437
|
+
f"on_llm_end: run_id={run_id}, parent_run_id={parent_run_id}, response_type={type(response).__name__}"
|
|
438
|
+
)
|
|
290
439
|
uuid_str = str(run_id)
|
|
291
440
|
llm_span: LlmSpan = trace_manager.get_span_by_uuid(uuid_str)
|
|
292
441
|
if llm_span is None:
|
|
442
|
+
_debug_log(f"on_llm_end: NO SPAN FOUND for run_id={run_id}")
|
|
443
|
+
return
|
|
444
|
+
|
|
445
|
+
# Guard against double-finalization (if both on_llm_end and on_chat_model_end fire)
|
|
446
|
+
if llm_span.end_time is not None:
|
|
447
|
+
_debug_log(
|
|
448
|
+
f"on_llm_end: span already finalized for run_id={run_id}, skipping"
|
|
449
|
+
)
|
|
293
450
|
return
|
|
294
451
|
|
|
295
452
|
with self._ctx(run_id=run_id, parent_run_id=parent_run_id):
|
|
@@ -336,7 +493,6 @@ class CallbackHandler(BaseCallbackHandler):
|
|
|
336
493
|
)
|
|
337
494
|
|
|
338
495
|
llm_span.model = model if model else llm_span.model
|
|
339
|
-
llm_span.input = llm_span.input
|
|
340
496
|
llm_span.output = output
|
|
341
497
|
llm_span.input_token_count = (
|
|
342
498
|
total_input_tokens if total_input_tokens > 0 else None
|
|
@@ -347,6 +503,121 @@ class CallbackHandler(BaseCallbackHandler):
|
|
|
347
503
|
|
|
348
504
|
exit_current_context(uuid_str=uuid_str)
|
|
349
505
|
|
|
506
|
+
def on_chat_model_end(
|
|
507
|
+
self,
|
|
508
|
+
response: Any,
|
|
509
|
+
*,
|
|
510
|
+
run_id: UUID,
|
|
511
|
+
parent_run_id: Optional[UUID] = None,
|
|
512
|
+
**kwargs: Any,
|
|
513
|
+
) -> Any:
|
|
514
|
+
"""
|
|
515
|
+
Handle chat model end callback. This may be called instead of or
|
|
516
|
+
in addition to on_llm_end depending on the LangChain version.
|
|
517
|
+
"""
|
|
518
|
+
_debug_log(
|
|
519
|
+
f"on_chat_model_end: run_id={run_id}, parent_run_id={parent_run_id}, response_type={type(response).__name__}"
|
|
520
|
+
)
|
|
521
|
+
uuid_str = str(run_id)
|
|
522
|
+
llm_span: LlmSpan = trace_manager.get_span_by_uuid(uuid_str)
|
|
523
|
+
if llm_span is None:
|
|
524
|
+
_debug_log(f"on_chat_model_end: NO SPAN FOUND for run_id={run_id}")
|
|
525
|
+
return
|
|
526
|
+
|
|
527
|
+
# Guard against double-finalization, which could happen if both on_llm_end and on_chat_model_end fire
|
|
528
|
+
if llm_span.end_time is not None:
|
|
529
|
+
_debug_log(
|
|
530
|
+
f"on_chat_model_end: span already finalized for run_id={run_id}, skipping"
|
|
531
|
+
)
|
|
532
|
+
return
|
|
533
|
+
|
|
534
|
+
with self._ctx(run_id=run_id, parent_run_id=parent_run_id):
|
|
535
|
+
output = ""
|
|
536
|
+
total_input_tokens = 0
|
|
537
|
+
total_output_tokens = 0
|
|
538
|
+
model = None
|
|
539
|
+
|
|
540
|
+
# Handle LLMResult (same as on_llm_end)
|
|
541
|
+
if isinstance(response, LLMResult):
|
|
542
|
+
for generation in response.generations:
|
|
543
|
+
for gen in generation:
|
|
544
|
+
if isinstance(gen, ChatGeneration):
|
|
545
|
+
if gen.message.response_metadata and isinstance(
|
|
546
|
+
gen.message.response_metadata, dict
|
|
547
|
+
):
|
|
548
|
+
model = gen.message.response_metadata.get(
|
|
549
|
+
"model_name"
|
|
550
|
+
)
|
|
551
|
+
input_tokens, output_tokens = (
|
|
552
|
+
safe_extract_token_usage(
|
|
553
|
+
gen.message.response_metadata
|
|
554
|
+
)
|
|
555
|
+
)
|
|
556
|
+
total_input_tokens += input_tokens
|
|
557
|
+
total_output_tokens += output_tokens
|
|
558
|
+
|
|
559
|
+
if isinstance(gen.message, AIMessage):
|
|
560
|
+
ai_message = gen.message
|
|
561
|
+
tool_calls = []
|
|
562
|
+
for tool_call in ai_message.tool_calls:
|
|
563
|
+
tool_calls.append(
|
|
564
|
+
LlmToolCall(
|
|
565
|
+
name=tool_call["name"],
|
|
566
|
+
args=tool_call["args"],
|
|
567
|
+
id=tool_call["id"],
|
|
568
|
+
)
|
|
569
|
+
)
|
|
570
|
+
output = LlmOutput(
|
|
571
|
+
role="AI",
|
|
572
|
+
content=ai_message.content,
|
|
573
|
+
tool_calls=tool_calls,
|
|
574
|
+
)
|
|
575
|
+
|
|
576
|
+
llm_span.model = model if model else llm_span.model
|
|
577
|
+
llm_span.output = output
|
|
578
|
+
llm_span.input_token_count = (
|
|
579
|
+
total_input_tokens if total_input_tokens > 0 else None
|
|
580
|
+
)
|
|
581
|
+
llm_span.output_token_count = (
|
|
582
|
+
total_output_tokens if total_output_tokens > 0 else None
|
|
583
|
+
)
|
|
584
|
+
|
|
585
|
+
exit_current_context(uuid_str=uuid_str)
|
|
586
|
+
|
|
587
|
+
def on_chat_model_error(
|
|
588
|
+
self,
|
|
589
|
+
error: BaseException,
|
|
590
|
+
*,
|
|
591
|
+
run_id: UUID,
|
|
592
|
+
parent_run_id: Optional[UUID] = None,
|
|
593
|
+
**kwargs: Any,
|
|
594
|
+
) -> Any:
|
|
595
|
+
"""
|
|
596
|
+
Handle chat model error callback.
|
|
597
|
+
"""
|
|
598
|
+
_debug_log(
|
|
599
|
+
f"on_chat_model_error: run_id={run_id}, parent_run_id={parent_run_id}, error={error}"
|
|
600
|
+
)
|
|
601
|
+
uuid_str = str(run_id)
|
|
602
|
+
llm_span: LlmSpan = trace_manager.get_span_by_uuid(uuid_str)
|
|
603
|
+
if llm_span is None:
|
|
604
|
+
_debug_log(
|
|
605
|
+
f"on_chat_model_error: NO SPAN FOUND for run_id={run_id}"
|
|
606
|
+
)
|
|
607
|
+
return
|
|
608
|
+
|
|
609
|
+
# Guard against double-finalization
|
|
610
|
+
if llm_span.end_time is not None:
|
|
611
|
+
_debug_log(
|
|
612
|
+
f"on_chat_model_error: span already finalized for run_id={run_id}, skipping"
|
|
613
|
+
)
|
|
614
|
+
return
|
|
615
|
+
|
|
616
|
+
with self._ctx(run_id=run_id, parent_run_id=parent_run_id):
|
|
617
|
+
llm_span.status = TraceSpanStatus.ERRORED
|
|
618
|
+
llm_span.error = str(error)
|
|
619
|
+
exit_current_context(uuid_str=uuid_str)
|
|
620
|
+
|
|
350
621
|
def on_llm_error(
|
|
351
622
|
self,
|
|
352
623
|
error: BaseException,
|
|
@@ -355,10 +626,22 @@ class CallbackHandler(BaseCallbackHandler):
|
|
|
355
626
|
parent_run_id: Optional[UUID] = None,
|
|
356
627
|
**kwargs: Any,
|
|
357
628
|
) -> Any:
|
|
629
|
+
_debug_log(
|
|
630
|
+
f"on_llm_error: run_id={run_id}, parent_run_id={parent_run_id}, error={error}"
|
|
631
|
+
)
|
|
358
632
|
uuid_str = str(run_id)
|
|
359
633
|
llm_span: LlmSpan = trace_manager.get_span_by_uuid(uuid_str)
|
|
360
634
|
if llm_span is None:
|
|
635
|
+
_debug_log(f"on_llm_error: NO SPAN FOUND for run_id={run_id}")
|
|
636
|
+
return
|
|
637
|
+
|
|
638
|
+
# Guard against double-finalization
|
|
639
|
+
if llm_span.end_time is not None:
|
|
640
|
+
_debug_log(
|
|
641
|
+
f"on_llm_error: span already finalized for run_id={run_id}, skipping"
|
|
642
|
+
)
|
|
361
643
|
return
|
|
644
|
+
|
|
362
645
|
with self._ctx(run_id=run_id, parent_run_id=parent_run_id):
|
|
363
646
|
llm_span.status = TraceSpanStatus.ERRORED
|
|
364
647
|
llm_span.error = str(error)
|
|
@@ -396,6 +679,9 @@ class CallbackHandler(BaseCallbackHandler):
|
|
|
396
679
|
inputs: Optional[dict[str, Any]] = None,
|
|
397
680
|
**kwargs: Any,
|
|
398
681
|
) -> Any:
|
|
682
|
+
_debug_log(
|
|
683
|
+
f"on_tool_start: run_id={run_id}, parent_run_id={parent_run_id}, name={extract_name(serialized, **kwargs)}"
|
|
684
|
+
)
|
|
399
685
|
with self._ctx(run_id=run_id, parent_run_id=parent_run_id):
|
|
400
686
|
uuid_str = str(run_id)
|
|
401
687
|
|
|
@@ -418,6 +704,9 @@ class CallbackHandler(BaseCallbackHandler):
|
|
|
418
704
|
parent_run_id: Optional[UUID] = None,
|
|
419
705
|
**kwargs: Any, # un-logged kwargs
|
|
420
706
|
) -> Any:
|
|
707
|
+
_debug_log(
|
|
708
|
+
f"on_tool_end: run_id={run_id}, parent_run_id={parent_run_id}"
|
|
709
|
+
)
|
|
421
710
|
uuid_str = str(run_id)
|
|
422
711
|
tool_span: ToolSpan = trace_manager.get_span_by_uuid(uuid_str)
|
|
423
712
|
if tool_span is None:
|
|
@@ -485,20 +774,23 @@ class CallbackHandler(BaseCallbackHandler):
|
|
|
485
774
|
) -> Any:
|
|
486
775
|
with self._ctx(run_id=run_id, parent_run_id=parent_run_id):
|
|
487
776
|
uuid_str = str(run_id)
|
|
777
|
+
# Safe access to metadata (handle None)
|
|
778
|
+
md = metadata or {}
|
|
488
779
|
retriever_span = enter_current_context(
|
|
489
780
|
uuid_str=uuid_str,
|
|
490
781
|
span_type="retriever",
|
|
491
782
|
func_name=extract_name(serialized, **kwargs),
|
|
492
783
|
observe_kwargs={
|
|
493
|
-
"embedder":
|
|
494
|
-
"ls_embedding_provider", "unknown"
|
|
495
|
-
),
|
|
784
|
+
"embedder": md.get("ls_embedding_provider", "unknown"),
|
|
496
785
|
},
|
|
497
786
|
)
|
|
498
787
|
# Register this run_id -> span mapping for child callbacks
|
|
499
788
|
self._run_id_to_span_uuid[str(run_id)] = uuid_str
|
|
500
789
|
retriever_span.input = query
|
|
501
790
|
|
|
791
|
+
# Extract metric_collection from metadata if provided
|
|
792
|
+
retriever_span.metric_collection = md.get("metric_collection")
|
|
793
|
+
|
|
502
794
|
def on_retriever_end(
|
|
503
795
|
self,
|
|
504
796
|
output: Any,
|
deepeval/integrations/langchain/utils.py
CHANGED
|
@@ -1,5 +1,77 @@
|
|
|
1
|
-
|
|
1
|
+
import uuid
|
|
2
|
+
from typing import Any, List, Dict, Optional, Union, Literal, Callable
|
|
3
|
+
from time import perf_counter
|
|
2
4
|
from langchain_core.outputs import ChatGeneration
|
|
5
|
+
from rich.progress import Progress
|
|
6
|
+
|
|
7
|
+
from deepeval.metrics import BaseMetric
|
|
8
|
+
from deepeval.tracing.context import current_span_context, current_trace_context
|
|
9
|
+
from deepeval.tracing.tracing import trace_manager
|
|
10
|
+
from deepeval.tracing.types import (
|
|
11
|
+
AgentSpan,
|
|
12
|
+
BaseSpan,
|
|
13
|
+
LlmSpan,
|
|
14
|
+
RetrieverSpan,
|
|
15
|
+
SpanType,
|
|
16
|
+
ToolSpan,
|
|
17
|
+
TraceSpanStatus,
|
|
18
|
+
)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def convert_chat_messages_to_input(
|
|
22
|
+
messages: list[list[Any]], **kwargs
|
|
23
|
+
) -> List[Dict[str, str]]:
|
|
24
|
+
"""
|
|
25
|
+
Convert LangChain chat messages to our internal format.
|
|
26
|
+
|
|
27
|
+
Args:
|
|
28
|
+
messages: list[list[BaseMessage]] - outer list is batches, inner is messages.
|
|
29
|
+
**kwargs: May contain invocation_params with tools definitions.
|
|
30
|
+
|
|
31
|
+
Returns:
|
|
32
|
+
List of dicts with 'role' and 'content' keys, matching the schema used
|
|
33
|
+
by parse_prompts_to_messages for consistency.
|
|
34
|
+
"""
|
|
35
|
+
# Valid roles matching parse_prompts_to_messages
|
|
36
|
+
ROLE_MAPPING = {
|
|
37
|
+
"human": "human",
|
|
38
|
+
"user": "human",
|
|
39
|
+
"ai": "ai",
|
|
40
|
+
"assistant": "ai",
|
|
41
|
+
"system": "system",
|
|
42
|
+
"tool": "tool",
|
|
43
|
+
"function": "function",
|
|
44
|
+
}
|
|
45
|
+
|
|
46
|
+
result: List[Dict[str, str]] = []
|
|
47
|
+
for batch in messages:
|
|
48
|
+
for msg in batch:
|
|
49
|
+
# BaseMessage has .type (role) and .content
|
|
50
|
+
raw_role = getattr(msg, "type", "unknown")
|
|
51
|
+
content = getattr(msg, "content", "")
|
|
52
|
+
|
|
53
|
+
# Normalize role using same conventions as prompt parsing
|
|
54
|
+
role = ROLE_MAPPING.get(raw_role.lower(), raw_role)
|
|
55
|
+
|
|
56
|
+
# Convert content to string (handles empty content, lists, etc.)
|
|
57
|
+
if isinstance(content, list):
|
|
58
|
+
# Some messages have content as a list of content blocks
|
|
59
|
+
content_str = " ".join(
|
|
60
|
+
str(c.get("text", c) if isinstance(c, dict) else c)
|
|
61
|
+
for c in content
|
|
62
|
+
)
|
|
63
|
+
else:
|
|
64
|
+
content_str = str(content) if content else ""
|
|
65
|
+
|
|
66
|
+
result.append({"role": role, "content": content_str})
|
|
67
|
+
|
|
68
|
+
# Append tool definitions if present which matches parse_prompts_to_messages behavior
|
|
69
|
+
tools = kwargs.get("invocation_params", {}).get("tools", None)
|
|
70
|
+
if tools and isinstance(tools, list):
|
|
71
|
+
for tool in tools:
|
|
72
|
+
result.append({"role": "Tool Input", "content": str(tool)})
|
|
73
|
+
|
|
74
|
+
return result
|
|
3
75
|
|
|
4
76
|
|
|
5
77
|
def parse_prompts_to_messages(
|
|
@@ -112,27 +184,6 @@ def safe_extract_model_name(
|
|
|
112
184
|
return None
|
|
113
185
|
|
|
114
186
|
|
|
115
|
-
from typing import Any, List, Dict, Optional, Union, Literal, Callable
|
|
116
|
-
from langchain_core.outputs import ChatGeneration
|
|
117
|
-
from time import perf_counter
|
|
118
|
-
import uuid
|
|
119
|
-
from rich.progress import Progress
|
|
120
|
-
from deepeval.tracing.tracing import Observer
|
|
121
|
-
|
|
122
|
-
from deepeval.metrics import BaseMetric
|
|
123
|
-
from deepeval.tracing.context import current_span_context, current_trace_context
|
|
124
|
-
from deepeval.tracing.tracing import trace_manager
|
|
125
|
-
from deepeval.tracing.types import (
|
|
126
|
-
AgentSpan,
|
|
127
|
-
BaseSpan,
|
|
128
|
-
LlmSpan,
|
|
129
|
-
RetrieverSpan,
|
|
130
|
-
SpanType,
|
|
131
|
-
ToolSpan,
|
|
132
|
-
TraceSpanStatus,
|
|
133
|
-
)
|
|
134
|
-
|
|
135
|
-
|
|
136
187
|
def enter_current_context(
|
|
137
188
|
span_type: Optional[
|
|
138
189
|
Union[Literal["agent", "llm", "retriever", "tool"], str]
|
|
@@ -239,8 +290,8 @@ def enter_current_context(
|
|
|
239
290
|
|
|
240
291
|
if (
|
|
241
292
|
parent_span
|
|
242
|
-
and
|
|
243
|
-
and
|
|
293
|
+
and parent_span.progress is not None
|
|
294
|
+
and parent_span.pbar_callback_id is not None
|
|
244
295
|
):
|
|
245
296
|
progress = parent_span.progress
|
|
246
297
|
pbar_callback_id = parent_span.pbar_callback_id
|
deepeval/integrations/pydantic_ai/instrumentator.py
CHANGED
|
@@ -40,6 +40,7 @@ try:
|
|
|
40
40
|
from opentelemetry.exporter.otlp.proto.http.trace_exporter import (
|
|
41
41
|
OTLPSpanExporter,
|
|
42
42
|
)
|
|
43
|
+
from opentelemetry.trace import set_tracer_provider
|
|
43
44
|
from pydantic_ai.models.instrumented import (
|
|
44
45
|
InstrumentationSettings as _BaseInstrumentationSettings,
|
|
45
46
|
)
|
|
@@ -131,7 +132,12 @@ class ConfidentInstrumentationSettings(InstrumentationSettings):
|
|
|
131
132
|
):
|
|
132
133
|
is_dependency_installed()
|
|
133
134
|
|
|
134
|
-
|
|
135
|
+
if trace_manager.environment is not None:
|
|
136
|
+
_environment = trace_manager.environment
|
|
137
|
+
elif settings.CONFIDENT_TRACE_ENVIRONMENT is not None:
|
|
138
|
+
_environment = settings.CONFIDENT_TRACE_ENVIRONMENT
|
|
139
|
+
else:
|
|
140
|
+
_environment = "development"
|
|
135
141
|
if _environment and _environment in [
|
|
136
142
|
"production",
|
|
137
143
|
"staging",
|
|
@@ -176,6 +182,12 @@ class ConfidentInstrumentationSettings(InstrumentationSettings):
|
|
|
176
182
|
)
|
|
177
183
|
)
|
|
178
184
|
)
|
|
185
|
+
try:
|
|
186
|
+
set_tracer_provider(trace_provider)
|
|
187
|
+
except Exception as e:
|
|
188
|
+
# Handle case where provider is already set (optional warning)
|
|
189
|
+
logger.warning(f"Could not set global tracer provider: {e}")
|
|
190
|
+
|
|
179
191
|
super().__init__(tracer_provider=trace_provider)
|
|
180
192
|
|
|
181
193
|
|
|
@@ -234,16 +246,14 @@ class SpanInterceptor(SpanProcessor):
|
|
|
234
246
|
)
|
|
235
247
|
|
|
236
248
|
# set agent name and metric collection
|
|
237
|
-
|
|
238
|
-
span.
|
|
239
|
-
span.
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
self.settings.agent_metric_collection,
|
|
246
|
-
)
|
|
249
|
+
agent_name = (
|
|
250
|
+
span.attributes.get("gen_ai.agent.name")
|
|
251
|
+
or span.attributes.get("pydantic_ai.agent.name")
|
|
252
|
+
or span.attributes.get("agent_name")
|
|
253
|
+
)
|
|
254
|
+
|
|
255
|
+
if agent_name:
|
|
256
|
+
self._add_agent_span(span, agent_name)
|
|
247
257
|
|
|
248
258
|
# set llm metric collection
|
|
249
259
|
if span.attributes.get("gen_ai.operation.name") in [
|
|
@@ -270,6 +280,19 @@ class SpanInterceptor(SpanProcessor):
|
|
|
270
280
|
)
|
|
271
281
|
|
|
272
282
|
def on_end(self, span):
|
|
283
|
+
|
|
284
|
+
already_processed = (
|
|
285
|
+
span.attributes.get("confident.span.type") == "agent"
|
|
286
|
+
)
|
|
287
|
+
if not already_processed:
|
|
288
|
+
agent_name = (
|
|
289
|
+
span.attributes.get("gen_ai.agent.name")
|
|
290
|
+
or span.attributes.get("pydantic_ai.agent.name")
|
|
291
|
+
or span.attributes.get("agent_name")
|
|
292
|
+
)
|
|
293
|
+
if agent_name:
|
|
294
|
+
self._add_agent_span(span, agent_name)
|
|
295
|
+
|
|
273
296
|
if self.settings.is_test_mode:
|
|
274
297
|
if span.attributes.get("confident.span.type") == "agent":
|
|
275
298
|
|
|
@@ -323,3 +346,12 @@ class SpanInterceptor(SpanProcessor):
|
|
|
323
346
|
trace.end_time = perf_counter()
|
|
324
347
|
trace_manager.traces_to_evaluate.append(trace)
|
|
325
348
|
test_exporter.clear_span_json_list()
|
|
349
|
+
|
|
350
|
+
def _add_agent_span(self, span, name):
|
|
351
|
+
span.set_attribute("confident.span.type", "agent")
|
|
352
|
+
span.set_attribute("confident.span.name", name)
|
|
353
|
+
if self.settings.agent_metric_collection:
|
|
354
|
+
span.set_attribute(
|
|
355
|
+
"confident.span.metric_collection",
|
|
356
|
+
self.settings.agent_metric_collection,
|
|
357
|
+
)
|
deepeval/integrations/pydantic_ai/otel.py
CHANGED
|
@@ -2,6 +2,7 @@ import warnings
|
|
|
2
2
|
from typing import Optional
|
|
3
3
|
from deepeval.telemetry import capture_tracing_integration
|
|
4
4
|
from deepeval.config.settings import get_settings
|
|
5
|
+
import logging
|
|
5
6
|
|
|
6
7
|
try:
|
|
7
8
|
from opentelemetry import trace
|
|
@@ -24,6 +25,9 @@ def is_opentelemetry_available():
|
|
|
24
25
|
return True
|
|
25
26
|
|
|
26
27
|
|
|
28
|
+
logger = logging.getLogger(__name__)
|
|
29
|
+
settings = get_settings()
|
|
30
|
+
|
|
27
31
|
settings = get_settings()
|
|
28
32
|
# OTLP_ENDPOINT = "https://otel.confident-ai.com/v1/traces"
|
|
29
33
|
|
|
@@ -51,6 +55,11 @@ def instrument_pydantic_ai(api_key: Optional[str] = None):
|
|
|
51
55
|
)
|
|
52
56
|
)
|
|
53
57
|
)
|
|
58
|
+
try:
|
|
59
|
+
trace.set_tracer_provider(tracer_provider)
|
|
60
|
+
except Exception as e:
|
|
61
|
+
# Handle case where provider is already set (optional warning)
|
|
62
|
+
logger.warning(f"Could not set global tracer provider: {e}")
|
|
54
63
|
|
|
55
64
|
# create an instrumented exporter
|
|
56
65
|
from pydantic_ai.models.instrumented import InstrumentationSettings
|
deepeval/metrics/utils.py
CHANGED
|
@@ -320,6 +320,17 @@ def check_llm_test_case_params(
|
|
|
320
320
|
metric.error = error_str
|
|
321
321
|
raise ValueError(error_str)
|
|
322
322
|
|
|
323
|
+
# Centralized: if a metric requires actual_output, reject empty/whitespace
|
|
324
|
+
# (including empty multimodal outputs) as "missing params".
|
|
325
|
+
if LLMTestCaseParams.ACTUAL_OUTPUT in test_case_params:
|
|
326
|
+
actual_output = getattr(
|
|
327
|
+
test_case, LLMTestCaseParams.ACTUAL_OUTPUT.value
|
|
328
|
+
)
|
|
329
|
+
if isinstance(actual_output, str) and actual_output == "":
|
|
330
|
+
error_str = f"'actual_output' cannot be empty for the '{metric.__name__}' metric"
|
|
331
|
+
metric.error = error_str
|
|
332
|
+
raise MissingTestCaseParamsError(error_str)
|
|
333
|
+
|
|
323
334
|
missing_params = []
|
|
324
335
|
for param in test_case_params:
|
|
325
336
|
if getattr(test_case, param.value) is None:
|
|
@@ -610,7 +610,8 @@ class ConversationSimulator:
|
|
|
610
610
|
) -> BaseModel:
|
|
611
611
|
if self.using_native_model:
|
|
612
612
|
res, cost = self.simulator_model.generate(prompt, schema=schema)
|
|
613
|
-
|
|
613
|
+
if cost is not None:
|
|
614
|
+
self.simulation_cost += cost
|
|
614
615
|
return res
|
|
615
616
|
else:
|
|
616
617
|
try:
|
|
@@ -630,7 +631,8 @@ class ConversationSimulator:
|
|
|
630
631
|
res, cost = await self.simulator_model.a_generate(
|
|
631
632
|
prompt, schema=schema
|
|
632
633
|
)
|
|
633
|
-
|
|
634
|
+
if cost is not None:
|
|
635
|
+
self.simulation_cost += cost
|
|
634
636
|
return res
|
|
635
637
|
else:
|
|
636
638
|
try:
|
deepeval/telemetry.py
CHANGED
|
@@ -1,5 +1,4 @@
|
|
|
1
1
|
from contextlib import contextmanager
|
|
2
|
-
import logging
|
|
3
2
|
import os
|
|
4
3
|
import socket
|
|
5
4
|
import sys
|
|
@@ -85,13 +84,6 @@ if not telemetry_opt_out():
|
|
|
85
84
|
anonymous_public_ip = None
|
|
86
85
|
|
|
87
86
|
if not telemetry_opt_out():
|
|
88
|
-
from opentelemetry import trace
|
|
89
|
-
from opentelemetry.sdk.trace import TracerProvider
|
|
90
|
-
from opentelemetry.sdk.trace.export import BatchSpanProcessor
|
|
91
|
-
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import (
|
|
92
|
-
OTLPSpanExporter,
|
|
93
|
-
)
|
|
94
|
-
|
|
95
87
|
anonymous_public_ip = get_anonymous_public_ip()
|
|
96
88
|
sentry_sdk.init(
|
|
97
89
|
dsn="https://5ef587d58109ee45d6544f3657efdd1f@o4506098477236224.ingest.sentry.io/4506098479136768",
|
|
@@ -102,27 +94,6 @@ if not telemetry_opt_out():
|
|
|
102
94
|
default_integrations=False, # Disable Sentry's default integrations
|
|
103
95
|
)
|
|
104
96
|
|
|
105
|
-
# Set up the Tracer Provider
|
|
106
|
-
trace.set_tracer_provider(TracerProvider())
|
|
107
|
-
tracer_provider = trace.get_tracer_provider()
|
|
108
|
-
|
|
109
|
-
# New Relic License Key and OTLP Endpoint
|
|
110
|
-
NEW_RELIC_LICENSE_KEY = "1711c684db8a30361a7edb0d0398772cFFFFNRAL"
|
|
111
|
-
NEW_RELIC_OTLP_ENDPOINT = "https://otlp.nr-data.net:4317"
|
|
112
|
-
otlp_exporter = OTLPSpanExporter(
|
|
113
|
-
endpoint=NEW_RELIC_OTLP_ENDPOINT,
|
|
114
|
-
headers={"api-key": NEW_RELIC_LICENSE_KEY},
|
|
115
|
-
)
|
|
116
|
-
|
|
117
|
-
# Add the OTLP exporter to the span processor
|
|
118
|
-
span_processor = BatchSpanProcessor(otlp_exporter)
|
|
119
|
-
tracer_provider.add_span_processor(span_processor)
|
|
120
|
-
|
|
121
|
-
logging.getLogger("opentelemetry.exporter.otlp").setLevel(logging.CRITICAL)
|
|
122
|
-
|
|
123
|
-
# Create a tracer for your application
|
|
124
|
-
tracer = trace.get_tracer(__name__)
|
|
125
|
-
|
|
126
97
|
# Initialize PostHog
|
|
127
98
|
posthog = Posthog(
|
|
128
99
|
project_api_key="phc_IXvGRcscJJoIb049PtjIZ65JnXQguOUZ5B5MncunFdB",
|
|
@@ -199,11 +170,7 @@ def capture_evaluation_run(type: str):
|
|
|
199
170
|
posthog.capture(
|
|
200
171
|
distinct_id=distinct_id, event=event, properties=properties
|
|
201
172
|
)
|
|
202
|
-
|
|
203
|
-
with tracer.start_as_current_span(event) as span:
|
|
204
|
-
for property, value in properties.items():
|
|
205
|
-
span.set_attribute(property, value)
|
|
206
|
-
yield span
|
|
173
|
+
yield
|
|
207
174
|
|
|
208
175
|
|
|
209
176
|
@contextmanager
|
|
@@ -227,11 +194,7 @@ def capture_recommend_metrics():
|
|
|
227
194
|
posthog.capture(
|
|
228
195
|
distinct_id=distinct_id, event=event, properties=properties
|
|
229
196
|
)
|
|
230
|
-
|
|
231
|
-
with tracer.start_as_current_span(event) as span:
|
|
232
|
-
for property, value in properties.items():
|
|
233
|
-
span.set_attribute(property, value)
|
|
234
|
-
yield span
|
|
197
|
+
yield
|
|
235
198
|
|
|
236
199
|
|
|
237
200
|
@contextmanager
|
|
@@ -259,11 +222,7 @@ def capture_metric_type(
|
|
|
259
222
|
posthog.capture(
|
|
260
223
|
distinct_id=distinct_id, event=event, properties=properties
|
|
261
224
|
)
|
|
262
|
-
|
|
263
|
-
with tracer.start_as_current_span(event) as span:
|
|
264
|
-
for property, value in properties.items():
|
|
265
|
-
span.set_attribute(property, value)
|
|
266
|
-
yield span
|
|
225
|
+
yield
|
|
267
226
|
|
|
268
227
|
|
|
269
228
|
@contextmanager
|
|
@@ -297,11 +256,7 @@ def capture_synthesizer_run(
|
|
|
297
256
|
posthog.capture(
|
|
298
257
|
distinct_id=distinct_id, event=event, properties=properties
|
|
299
258
|
)
|
|
300
|
-
|
|
301
|
-
with tracer.start_as_current_span(event) as span:
|
|
302
|
-
for property, value in properties.items():
|
|
303
|
-
span.set_attribute(property, value)
|
|
304
|
-
yield span
|
|
259
|
+
yield
|
|
305
260
|
|
|
306
261
|
|
|
307
262
|
@contextmanager
|
|
@@ -330,11 +285,7 @@ def capture_conversation_simulator_run(num_conversations: int):
|
|
|
330
285
|
posthog.capture(
|
|
331
286
|
distinct_id=distinct_id, event=event, properties=properties
|
|
332
287
|
)
|
|
333
|
-
|
|
334
|
-
with tracer.start_as_current_span(event) as span:
|
|
335
|
-
for property, value in properties.items():
|
|
336
|
-
span.set_attribute(property, value)
|
|
337
|
-
yield span
|
|
288
|
+
yield
|
|
338
289
|
|
|
339
290
|
|
|
340
291
|
@contextmanager
|
|
@@ -360,11 +311,7 @@ def capture_guardrails(guards: List[str]):
|
|
|
360
311
|
posthog.capture(
|
|
361
312
|
distinct_id=distinct_id, event=event, properties=properties
|
|
362
313
|
)
|
|
363
|
-
|
|
364
|
-
with tracer.start_as_current_span(event) as span:
|
|
365
|
-
for property, value in properties.items():
|
|
366
|
-
span.set_attribute(property, value)
|
|
367
|
-
yield span
|
|
314
|
+
yield
|
|
368
315
|
|
|
369
316
|
|
|
370
317
|
@contextmanager
|
|
@@ -391,11 +338,7 @@ def capture_benchmark_run(benchmark: str, num_tasks: int):
|
|
|
391
338
|
posthog.capture(
|
|
392
339
|
distinct_id=distinct_id, event=event, properties=properties
|
|
393
340
|
)
|
|
394
|
-
|
|
395
|
-
with tracer.start_as_current_span(event) as span:
|
|
396
|
-
for property, value in properties.items():
|
|
397
|
-
span.set_attribute(property, value)
|
|
398
|
-
yield span
|
|
341
|
+
yield
|
|
399
342
|
|
|
400
343
|
|
|
401
344
|
@contextmanager
|
|
@@ -421,11 +364,7 @@ def capture_login_event():
|
|
|
421
364
|
posthog.capture(
|
|
422
365
|
distinct_id=distinct_id, event=event, properties=properties
|
|
423
366
|
)
|
|
424
|
-
|
|
425
|
-
with tracer.start_as_current_span(event) as span:
|
|
426
|
-
for property, value in properties.items():
|
|
427
|
-
span.set_attribute(property, value)
|
|
428
|
-
yield span
|
|
367
|
+
yield
|
|
429
368
|
|
|
430
369
|
|
|
431
370
|
@contextmanager
|
|
@@ -451,11 +390,7 @@ def capture_view_event():
|
|
|
451
390
|
posthog.capture(
|
|
452
391
|
distinct_id=distinct_id, event=event, properties=properties
|
|
453
392
|
)
|
|
454
|
-
|
|
455
|
-
with tracer.start_as_current_span(event) as span:
|
|
456
|
-
for property, value in properties.items():
|
|
457
|
-
span.set_attribute(property, value)
|
|
458
|
-
yield span
|
|
393
|
+
yield
|
|
459
394
|
|
|
460
395
|
|
|
461
396
|
@contextmanager
|
|
@@ -478,11 +413,7 @@ def capture_pull_dataset():
|
|
|
478
413
|
posthog.capture(
|
|
479
414
|
distinct_id=distinct_id, event=event, properties=properties
|
|
480
415
|
)
|
|
481
|
-
|
|
482
|
-
with tracer.start_as_current_span(event) as span:
|
|
483
|
-
for property, value in properties.items():
|
|
484
|
-
span.set_attribute(property, value)
|
|
485
|
-
yield span
|
|
416
|
+
yield
|
|
486
417
|
|
|
487
418
|
|
|
488
419
|
# track metrics that are components and metrics that aren't components
|
|
@@ -509,11 +440,7 @@ def capture_send_trace():
|
|
|
509
440
|
posthog.capture(
|
|
510
441
|
distinct_id=distinct_id, event=event, properties=properties
|
|
511
442
|
)
|
|
512
|
-
|
|
513
|
-
with tracer.start_as_current_span(event) as span:
|
|
514
|
-
for property, value in properties.items():
|
|
515
|
-
span.set_attribute(property, value)
|
|
516
|
-
yield span
|
|
443
|
+
yield
|
|
517
444
|
|
|
518
445
|
|
|
519
446
|
# tracing integration
|
|
@@ -542,13 +469,7 @@ def capture_tracing_integration(integration_name: str):
|
|
|
542
469
|
posthog.capture(
|
|
543
470
|
distinct_id=distinct_id, event=event, properties=properties
|
|
544
471
|
)
|
|
545
|
-
|
|
546
|
-
with tracer.start_as_current_span(event) as span:
|
|
547
|
-
for property, value in properties.items():
|
|
548
|
-
span.set_attribute(property, value)
|
|
549
|
-
# OTEL/New Relic filtering attributes
|
|
550
|
-
span.set_attribute("integration.name", integration_name)
|
|
551
|
-
yield span
|
|
472
|
+
yield
|
|
552
473
|
|
|
553
474
|
|
|
554
475
|
#########################################################
|
deepeval/tracing/tracing.py
CHANGED
|
@@ -969,9 +969,9 @@ class Observer:
|
|
|
969
969
|
else:
|
|
970
970
|
current_trace = current_trace_context.get()
|
|
971
971
|
if current_trace.input is None:
|
|
972
|
-
current_trace.input = self.function_kwargs
|
|
972
|
+
current_trace.input = trace_manager.mask(self.function_kwargs)
|
|
973
973
|
if current_trace.output is None:
|
|
974
|
-
current_trace.output = self.result
|
|
974
|
+
current_trace.output = trace_manager.mask(self.result)
|
|
975
975
|
if current_span.status == TraceSpanStatus.ERRORED:
|
|
976
976
|
current_trace.status = TraceSpanStatus.ERRORED
|
|
977
977
|
if current_trace and current_trace.uuid == current_span.trace_uuid:
|
|
@@ -1037,7 +1037,8 @@ class Observer:
|
|
|
1037
1037
|
return RetrieverSpan(**span_kwargs, embedder=embedder)
|
|
1038
1038
|
|
|
1039
1039
|
elif self.span_type == SpanType.TOOL.value:
|
|
1040
|
-
|
|
1040
|
+
description = self.observe_kwargs.get("description", None)
|
|
1041
|
+
return ToolSpan(**span_kwargs, description=description)
|
|
1041
1042
|
else:
|
|
1042
1043
|
return BaseSpan(**span_kwargs)
|
|
1043
1044
|
|
|
@@ -1107,7 +1108,7 @@ def observe(
|
|
|
1107
1108
|
yield chunk
|
|
1108
1109
|
observer.__exit__(None, None, None)
|
|
1109
1110
|
except Exception as e:
|
|
1110
|
-
observer.__exit__(
|
|
1111
|
+
observer.__exit__(e.__class__, e, e.__traceback__)
|
|
1111
1112
|
raise
|
|
1112
1113
|
|
|
1113
1114
|
return gen()
|
|
@@ -1150,7 +1151,7 @@ def observe(
|
|
|
1150
1151
|
yield from original_gen
|
|
1151
1152
|
observer.__exit__(None, None, None)
|
|
1152
1153
|
except Exception as e:
|
|
1153
|
-
observer.__exit__(
|
|
1154
|
+
observer.__exit__(e.__class__, e, e.__traceback__)
|
|
1154
1155
|
raise
|
|
1155
1156
|
|
|
1156
1157
|
return gen()
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
deepeval/__init__.py,sha256=tle4lT4FONApg3OeztGPEdrpGMEGLWajyGTu7bEd3s0,2976
|
|
2
|
-
deepeval/_version.py,sha256=
|
|
2
|
+
deepeval/_version.py,sha256=A8HjzlffHmJot3rBAExqN_D-QxaG8UT8zqiP26xCL2M,27
|
|
3
3
|
deepeval/annotation/__init__.py,sha256=ZFhUVNNuH_YgQSZJ-m5E9iUb9TkAkEV33a6ouMDZ8EI,111
|
|
4
4
|
deepeval/annotation/annotation.py,sha256=WLFZRkx6wRJcNzaOMMGXuTfw6Q1_1Mv5A4jpD7Ea4sU,2300
|
|
5
5
|
deepeval/annotation/api.py,sha256=EYN33ACVzVxsFleRYm60KB4Exvff3rPJKt1VBuuX970,2147
|
|
@@ -142,7 +142,7 @@ deepeval/cli/test.py,sha256=aoBPMfk0HTvOqb2xdvMykkx_s4SHst7lEnoUiSXo1lU,5483
|
|
|
142
142
|
deepeval/cli/types.py,sha256=_7KdthstHNc-JKCWrfpDQCf_j8h9PMxh0qJCHmVXJr0,310
|
|
143
143
|
deepeval/cli/utils.py,sha256=3fgH5WPTTe7Cz_QOLCHyflXB81kmFaSxXHJ2tnxvFLw,10649
|
|
144
144
|
deepeval/confident/__init__.py,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
|
|
145
|
-
deepeval/confident/api.py,sha256=
|
|
145
|
+
deepeval/confident/api.py,sha256=rxMNMK5VYPQKdEDSRsovlULV14QlGW3TNDkARLj_Pt4,9589
|
|
146
146
|
deepeval/confident/types.py,sha256=9bgePDaU31yY7JGwCLZcc7pev9VGtNDZLbjsVpCLVdc,574
|
|
147
147
|
deepeval/config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
148
148
|
deepeval/config/dotenv_handler.py,sha256=lOosoC7fm9RljriY8EFl5ywSGfSiQsVf_vmYqzpbZ8s,588
|
|
@@ -180,16 +180,16 @@ deepeval/integrations/hugging_face/rich_manager.py,sha256=WvFtPGpPmGeg2Ftsnojga6
|
|
|
180
180
|
deepeval/integrations/hugging_face/tests/test_callbacks.py,sha256=88Wyg-aDaXujj9jHeGdFF3ITSl2-y7eaJGWgSyvvDi8,4607
|
|
181
181
|
deepeval/integrations/hugging_face/utils.py,sha256=HUKdQcTIb76Ct69AS737oPxmlVxk5fw2UbT2pLn-o8k,1817
|
|
182
182
|
deepeval/integrations/langchain/__init__.py,sha256=G1Qey5WkKou2-PA34KwWgmayQ_TbvXqPyotTbzmD8tw,84
|
|
183
|
-
deepeval/integrations/langchain/callback.py,sha256=
|
|
183
|
+
deepeval/integrations/langchain/callback.py,sha256=uZrhmlzw2dcFunqQzNPfMo9vWkHCX7PbGbazKRhBbBY,32687
|
|
184
184
|
deepeval/integrations/langchain/patch.py,sha256=fCHfZXU9xX3IJ6SG8GEYzn3qrifyUkT0i_uUABTsmcs,1255
|
|
185
|
-
deepeval/integrations/langchain/utils.py,sha256=
|
|
185
|
+
deepeval/integrations/langchain/utils.py,sha256=mhv0anU5ZnbBsESMuCooT9FSNPkx2ObrVLlq7QNEZOI,13104
|
|
186
186
|
deepeval/integrations/llama_index/__init__.py,sha256=Ujs9ZBJFkuCWUDBJOF88UbM1Y-S6QFQhxSo0oQnEWNw,90
|
|
187
187
|
deepeval/integrations/llama_index/handler.py,sha256=uTvNXmAF4xBh8t9bBm5sBFX6ETp8SrkOZlFlE_GWdmM,10771
|
|
188
188
|
deepeval/integrations/llama_index/utils.py,sha256=onmmo1vpn6cpOY5EhfTc0Uui7X6l1M0HD3sq-KVAesg,3380
|
|
189
189
|
deepeval/integrations/pydantic_ai/__init__.py,sha256=UIkXn_g6h9LTQXG1PaWu1eCFkCssIwG48WSvN46UWgU,202
|
|
190
190
|
deepeval/integrations/pydantic_ai/agent.py,sha256=-NKvpTUw3AxRNhuxVFcx9mw5BWCujzOwsaC8u7K0ubc,1178
|
|
191
|
-
deepeval/integrations/pydantic_ai/instrumentator.py,sha256=
|
|
192
|
-
deepeval/integrations/pydantic_ai/otel.py,sha256=
|
|
191
|
+
deepeval/integrations/pydantic_ai/instrumentator.py,sha256=COqw4FJsUZacaP4Dfn1aaOXvUTvZOuhcqqQD-_sLD04,13047
|
|
192
|
+
deepeval/integrations/pydantic_ai/otel.py,sha256=xWYnMT1HwcAmyWdoJa6C1sHwd5frP9_IcR8dj9sKsG0,2386
|
|
193
193
|
deepeval/integrations/pydantic_ai/test_instrumentator.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
194
194
|
deepeval/key_handler.py,sha256=lajMBgF2lCzbQpW4e6Y7cD9FOw0Qk5UOKS4_kIIHj6Y,9562
|
|
195
195
|
deepeval/metrics/__init__.py,sha256=19Df323r8aAlx2sRfV9BHJLicORhTLpogR8M1deJetw,4680
|
|
@@ -384,7 +384,7 @@ deepeval/metrics/turn_relevancy/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm
|
|
|
384
384
|
deepeval/metrics/turn_relevancy/schema.py,sha256=om0zFJcM6qu2GWS9aJTP3lUmuEXX8KpoACEvCsJqfq4,234
|
|
385
385
|
deepeval/metrics/turn_relevancy/template.py,sha256=k02QVclRtCTVBZ7Xd4f-LdTrSO_dBxquQiFYqRYmiSA,3245
|
|
386
386
|
deepeval/metrics/turn_relevancy/turn_relevancy.py,sha256=gMx5o5vfPJjVKior96L_A-4o3IoAyxSoTgI8U9sJtRY,9468
|
|
387
|
-
deepeval/metrics/utils.py,sha256=
|
|
387
|
+
deepeval/metrics/utils.py,sha256=osdTrK0jMiMynfks3uUFx6KmhcbRmr41ZXoGMisx2xY,21932
|
|
388
388
|
deepeval/model_integrations/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
389
389
|
deepeval/model_integrations/types.py,sha256=rbVMhC_2yWwD6JqzkRO9D7aMVC_KtXN686G_S7de7S8,630
|
|
390
390
|
deepeval/model_integrations/utils.py,sha256=Zt9SYPgTxlGsQFZgpZvh_a5fWuL8mmIFVSe6uoQywZ4,3562
|
|
@@ -468,7 +468,7 @@ deepeval/red_teaming/README.md,sha256=BY5rAdpp3-sMMToEKwq0Nsd9ivkGDzPE16DeDb8GY7
|
|
|
468
468
|
deepeval/scorer/__init__.py,sha256=hTvtoV3a4l0dSBjERm-jX7jveTtKZXK0c9JerQo0T_w,27
|
|
469
469
|
deepeval/scorer/scorer.py,sha256=EmXo1wEMMAL2it8WxNJ4cTqZLCH1ad4BY2VewoX6b10,18348
|
|
470
470
|
deepeval/simulator/__init__.py,sha256=wkyevg9nh46rsVnVrBjY3K5bHlkqjwx4TtrTfyjDCO0,96
|
|
471
|
-
deepeval/simulator/conversation_simulator.py,sha256=
|
|
471
|
+
deepeval/simulator/conversation_simulator.py,sha256=RfCZZmxiKNiSmd_g9CN-Un_ekkqeyDARP3aXcj3rUck,27282
|
|
472
472
|
deepeval/simulator/schema.py,sha256=16X2-m92plP52YTd-dvECt_-6gsz0U4j7Ut3UdI6gKY,252
|
|
473
473
|
deepeval/simulator/template.py,sha256=6wh6xiUaZQn-pvkBWgOK7pWfsv5nntgjGfTqUkcKn0A,6461
|
|
474
474
|
deepeval/singleton.py,sha256=irNbt0-IRI7rD4t05OZHsrNovpeva0XPc8PoieFytG8,532
|
|
@@ -486,7 +486,7 @@ deepeval/synthesizer/templates/template_extraction.py,sha256=jmvr8AOOUzDgsHYIOsq
|
|
|
486
486
|
deepeval/synthesizer/templates/template_prompt.py,sha256=bzfC71AXZqBrmoDWmBvuIQKD6hPJZ0ZAWX4hy-lPlnQ,21478
|
|
487
487
|
deepeval/synthesizer/types.py,sha256=wUZntvCAE29sM9K8hk9RPwUpkTip1ObOCExyMEo3sME,493
|
|
488
488
|
deepeval/synthesizer/utils.py,sha256=o-9z5gApQcHqDqusgrD0LagXWAju17LVc27BxtaA7og,1018
|
|
489
|
-
deepeval/telemetry.py,sha256=
|
|
489
|
+
deepeval/telemetry.py,sha256=VSPAv1XWS0jzDIjPzgAg42WDfYgqaR4Iwi8RrM_aPns,18041
|
|
490
490
|
deepeval/test_case/__init__.py,sha256=i1hIGeE_J1Zm-KmDVFqmogvBKzyOlIsENrfhL-3B8_M,658
|
|
491
491
|
deepeval/test_case/api.py,sha256=i9e1ggt4O9w_cu7tMSArw-LkiIZ_u_WPgpM2YAhfgks,3408
|
|
492
492
|
deepeval/test_case/arena_test_case.py,sha256=ngEU5_-YVQ-qPSOVVuSUJ_nuvdQR-MGA_QZQst5c8MI,1482
|
|
@@ -516,12 +516,12 @@ deepeval/tracing/patchers.py,sha256=Oi9wao3oDYhcviv7p0KoWBeS9ne7rHLa2gh9AR9EyiU,
|
|
|
516
516
|
deepeval/tracing/perf_epoch_bridge.py,sha256=iyAPddB6Op7NpMtPHJ29lDm53Btz9yLaN6xSCfTRQm4,1825
|
|
517
517
|
deepeval/tracing/trace_context.py,sha256=Z0n0Cu1A5g9dXiZnzTFO5TzeOYHKeNuO6v3_EU_Gi_c,3568
|
|
518
518
|
deepeval/tracing/trace_test_manager.py,sha256=wt4y7EWTRc4Bw938-UFFtXHkdFFOrnx6JaIk7J5Iulw,555
|
|
519
|
-
deepeval/tracing/tracing.py,sha256=
|
|
519
|
+
deepeval/tracing/tracing.py,sha256=ge3XXJkxlmCk5KfrqOOjxXIuA1CIXFOKJxhRTmXRSVQ,46849
|
|
520
520
|
deepeval/tracing/types.py,sha256=3QkF0toQ6f0fEDARYOUV6Iv9UJFbg14kSpn3dL1H5CE,6040
|
|
521
521
|
deepeval/tracing/utils.py,sha256=mdvhYAxDNsdnusaEXJd-c-_O2Jn6S3xSuzRvLO1Jz4U,5684
|
|
522
522
|
deepeval/utils.py,sha256=Wsu95g6t1wdttxWIESVwuUxbml7C-9ZTsV7qHCQI3Xg,27259
|
|
523
|
-
deepeval-3.8.
|
|
524
|
-
deepeval-3.8.
|
|
525
|
-
deepeval-3.8.
|
|
526
|
-
deepeval-3.8.
|
|
527
|
-
deepeval-3.8.
|
|
523
|
+
deepeval-3.8.2.dist-info/LICENSE.md,sha256=0ATkuLv6QgsJTBODUHC5Rak_PArA6gv2t7inJzNTP38,11352
|
|
524
|
+
deepeval-3.8.2.dist-info/METADATA,sha256=SUHVBa7pgBKF2XG3L3c_cItJWvuCdAQxzQSctzeAezQ,18752
|
|
525
|
+
deepeval-3.8.2.dist-info/WHEEL,sha256=d2fvjOD7sXsVzChCqf0Ty0JbHKBaLYwDbGQDwQTnJ50,88
|
|
526
|
+
deepeval-3.8.2.dist-info/entry_points.txt,sha256=NoismUQfwLOojSGZmBrdcpwfaoFRAzUhBvZD3UwOKog,95
|
|
527
|
+
deepeval-3.8.2.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|