deepeval 3.5.3__py3-none-any.whl → 3.5.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,8 +1,29 @@
-from typing import List
-from pydantic_ai.messages import ModelResponsePart
+from time import perf_counter
+from contextlib import asynccontextmanager
+import inspect
+import functools
+from typing import Any, Callable, List, Optional
+
+from pydantic_ai.models import Model
 from pydantic_ai.agent import AgentRunResult
 from pydantic_ai._run_context import RunContext
+from pydantic_ai.messages import (
+    ModelRequest,
+    ModelResponse,
+    ModelResponsePart,
+    SystemPromptPart,
+    TextPart,
+    ToolCallPart,
+    ToolReturnPart,
+    UserPromptPart,
+)
+
+from deepeval.prompt import Prompt
+from deepeval.tracing.tracing import Observer
+from deepeval.metrics.base_metric import BaseMetric
 from deepeval.test_case.llm_test_case import ToolCall
+from deepeval.tracing.context import current_trace_context, current_span_context
+from deepeval.tracing.types import AgentSpan, LlmOutput, LlmSpan, LlmToolCall
 
 
 # llm tools called
@@ -84,3 +105,219 @@ def sanitize_run_context(value):
         return {sanitize_run_context(v) for v in value}
 
     return value
+
+
+def patch_llm_model(
+    model: Model,
+    llm_metric_collection: Optional[str] = None,
+    llm_metrics: Optional[List[BaseMetric]] = None,
+    llm_prompt: Optional[Prompt] = None,
+):
+    original_func = model.request
+    sig = inspect.signature(original_func)
+
+    try:
+        model_name = model.model_name
+    except Exception:
+        model_name = "unknown"
+
+    @functools.wraps(original_func)
+    async def wrapper(*args, **kwargs):
+        bound = sig.bind_partial(*args, **kwargs)
+        bound.apply_defaults()
+        request = bound.arguments.get("messages", [])
+
+        with Observer(
+            span_type="llm",
+            func_name="LLM",
+            observe_kwargs={"model": model_name},
+            metrics=llm_metrics,
+            metric_collection=llm_metric_collection,
+        ) as observer:
+            result = await original_func(*args, **kwargs)
+            observer.update_span_properties = (
+                lambda llm_span: set_llm_span_attributes(
+                    llm_span, request, result, llm_prompt
+                )
+            )
+            observer.result = result
+        return result
+
+    model.request = wrapper
+
+    stream_original_func = model.request_stream
+    stream_sig = inspect.signature(stream_original_func)
+
+    @asynccontextmanager
+    async def stream_wrapper(*args, **kwargs):
+        bound = stream_sig.bind_partial(*args, **kwargs)
+        bound.apply_defaults()
+        request = bound.arguments.get("messages", [])
+
+        with Observer(
+            span_type="llm",
+            func_name="LLM",
+            observe_kwargs={"model": model_name},
+            metrics=llm_metrics,
+            metric_collection=llm_metric_collection,
+        ) as observer:
+            llm_span: LlmSpan = current_span_context.get()
+            async with stream_original_func(
+                *args, **kwargs
+            ) as streamed_response:
+                try:
+                    yield streamed_response
+                    if not llm_span.token_intervals:
+                        llm_span.token_intervals = {perf_counter(): "NA"}
+                    else:
+                        llm_span.token_intervals[perf_counter()] = "NA"
+                finally:
+                    try:
+                        result = streamed_response.get()
+                        observer.update_span_properties = (
+                            lambda llm_span: set_llm_span_attributes(
+                                llm_span, request, result, llm_prompt
+                            )
+                        )
+                        observer.result = result
+                    except Exception:
+                        pass
+
+    model.request_stream = stream_wrapper
+
+
+def create_patched_tool(
+    func: Callable,
+    metrics: Optional[List[BaseMetric]] = None,
+    metric_collection: Optional[str] = None,
+):
+    import asyncio
+
+    original_func = func
+
+    is_async = asyncio.iscoroutinefunction(original_func)
+
+    if is_async:
+
+        @functools.wraps(original_func)
+        async def async_wrapper(*args, **kwargs):
+            sanitized_args = sanitize_run_context(args)
+            sanitized_kwargs = sanitize_run_context(kwargs)
+            with Observer(
+                span_type="tool",
+                func_name=original_func.__name__,
+                metrics=metrics,
+                metric_collection=metric_collection,
+                function_kwargs={"args": sanitized_args, **sanitized_kwargs},
+            ) as observer:
+                result = await original_func(*args, **kwargs)
+                observer.result = result
+
+            return result
+
+        return async_wrapper
+    else:
+
+        @functools.wraps(original_func)
+        def sync_wrapper(*args, **kwargs):
+            sanitized_args = sanitize_run_context(args)
+            sanitized_kwargs = sanitize_run_context(kwargs)
+            with Observer(
+                span_type="tool",
+                func_name=original_func.__name__,
+                metrics=metrics,
+                metric_collection=metric_collection,
+                function_kwargs={"args": sanitized_args, **sanitized_kwargs},
+            ) as observer:
+                result = original_func(*args, **kwargs)
+                observer.result = result
+
+            return result
+
+        return sync_wrapper
+
+
+def update_trace_context(
+    trace_name: Optional[str] = None,
+    trace_tags: Optional[List[str]] = None,
+    trace_metadata: Optional[dict] = None,
+    trace_thread_id: Optional[str] = None,
+    trace_user_id: Optional[str] = None,
+    trace_metric_collection: Optional[str] = None,
+    trace_metrics: Optional[List[BaseMetric]] = None,
+    trace_input: Optional[Any] = None,
+    trace_output: Optional[Any] = None,
+):
+
+    current_trace = current_trace_context.get()
+
+    if trace_name:
+        current_trace.name = trace_name
+    if trace_tags:
+        current_trace.tags = trace_tags
+    if trace_metadata:
+        current_trace.metadata = trace_metadata
+    if trace_thread_id:
+        current_trace.thread_id = trace_thread_id
+    if trace_user_id:
+        current_trace.user_id = trace_user_id
+    if trace_metric_collection:
+        current_trace.metric_collection = trace_metric_collection
+    if trace_metrics:
+        current_trace.metrics = trace_metrics
+    if trace_input:
+        current_trace.input = trace_input
+    if trace_output:
+        current_trace.output = trace_output
+
+
+def set_llm_span_attributes(
+    llm_span: LlmSpan,
+    requests: List[ModelRequest],
+    result: ModelResponse,
+    llm_prompt: Optional[Prompt] = None,
+):
+    llm_span.prompt = llm_prompt
+
+    input = []
+    for request in requests:
+        for part in request.parts:
+            if isinstance(part, SystemPromptPart):
+                input.append({"role": "System", "content": part.content})
+            elif isinstance(part, UserPromptPart):
+                input.append({"role": "User", "content": part.content})
+            elif isinstance(part, ToolCallPart):
+                input.append(
+                    {
+                        "role": "Tool Call",
+                        "name": part.tool_name,
+                        "content": part.args_as_json_str(),
+                    }
+                )
+            elif isinstance(part, ToolReturnPart):
+                input.append(
+                    {
+                        "role": "Tool Return",
+                        "name": part.tool_name,
+                        "content": part.model_response_str(),
+                    }
+                )
+    llm_span.input = input
+
+    content = ""
+    tool_calls = []
+    for part in result.parts:
+        if isinstance(part, TextPart):
+            content += part.content + "\n"
+        elif isinstance(part, ToolCallPart):
+            tool_calls.append(
+                LlmToolCall(name=part.tool_name, args=part.args_as_dict())
+            )
+    llm_span.output = LlmOutput(
+        role="Assistant", content=content, tool_calls=tool_calls
+    )
+    llm_span.tools_called = extract_tools_called_from_llm_response(result.parts)
+
+
+def set_agent_span_attributes(agent_span: AgentSpan, result: AgentRunResult):
+    agent_span.tools_called = extract_tools_called(result)
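
The helpers added above wrap Model.request / Model.request_stream and plain tool functions with deepeval's Observer, so each LLM call and tool call is recorded as a span, and update_trace_context mutates whatever trace is currently active. A minimal sketch of how they could be wired together follows, assuming these helpers live in deepeval/integrations/pydantic_ai/utils.py (the RECORD section below shows that file growing from 3211 to 10998 bytes) and that pydantic-ai with OpenAI support is installed; the model name and the lookup_weather tool are illustrative placeholders, not part of either library.

    # Sketch only -- not the shipped integration surface.
    import asyncio

    from pydantic_ai import Agent
    from pydantic_ai.models.openai import OpenAIModel  # OpenAIChatModel on newer pydantic-ai

    from deepeval.integrations.pydantic_ai.utils import (
        create_patched_tool,
        patch_llm_model,
        update_trace_context,
    )

    def lookup_weather(city: str) -> str:
        # Hypothetical tool; it runs inside an Observer span, so a trace is active.
        update_trace_context(trace_tags=["weather"])
        return f"Sunny in {city}"

    model = OpenAIModel("gpt-4o-mini")   # any pydantic_ai.models.Model instance
    patch_llm_model(model)               # request/request_stream now emit "llm" spans
    agent = Agent(model, tools=[create_patched_tool(lookup_weather)])

    async def main():
        print(await agent.run("What is the weather in Paris?"))  # needs OPENAI_API_KEY

    asyncio.run(main())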
@@ -283,8 +283,9 @@ class MCPUseMetric(BaseMetric):
         mcp_resources_called: List[MCPResourceCall],
         mcp_prompts_called: List[MCPPromptCall],
     ) -> tuple[str, str]:
+        available_primitives = "MCP Primitives Available: \n"
         for mcp_server in mcp_servers:
-            available_primitives = f"MCP Server {mcp_server.server_name}\n"
+            available_primitives += f"MCP Server {mcp_server.server_name}\n"
             available_primitives += (
                 (
                     "\nAvailable Tools:\n[\n"
@@ -43,7 +43,7 @@ class NonAdviceMetric(BaseMetric):
                 "or ['financial', 'medical'] for multiple types."
             )
 
-        self.threshold = 0 if strict_mode else threshold
+        self.threshold = 1 if strict_mode else threshold
         self.advice_types = advice_types
         self.model, self.using_native_model = initialize_model(model)
         self.evaluation_model = self.model.get_model_name()
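
This flips the strict-mode threshold to its intended meaning. deepeval metrics generally pass when score >= threshold, so a strict threshold of 0 let every run pass (scores fall in [0, 1]), while a threshold of 1 demands a perfect score. A sketch, assuming the score >= threshold convention:

    score = 0.7                # an imperfect run
    assert score >= 0          # old strict threshold: always passed
    assert not (score >= 1)    # new strict threshold: only a perfect score passes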
@@ -293,7 +293,7 @@ class NonAdviceMetric(BaseMetric):
                 appropriate_advice_count += 1
 
         score = appropriate_advice_count / number_of_verdicts
-        return 1 if self.strict_mode and score < 1 else score
+        return 0 if self.strict_mode and score < self.threshold else score
 
     def is_successful(self) -> bool:
         if self.error is not None:
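
The old return line was inverted: under strict mode an imperfect score was promoted to 1 (a pass) instead of being demoted to 0 (a fail). Side by side:

    def old_score(score, strict_mode):
        return 1 if strict_mode and score < 1 else score   # bug: failures became 1

    def new_score(score, strict_mode, threshold=1):
        return 0 if strict_mode and score < threshold else score

    assert old_score(0.6, strict_mode=True) == 1   # imperfect run scored perfect
    assert new_score(0.6, strict_mode=True) == 0   # imperfect run now fails with 0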
@@ -35,7 +35,7 @@ class PIILeakageMetric(BaseMetric):
         verbose_mode: bool = False,
         evaluation_template: Type[PIILeakageTemplate] = PIILeakageTemplate,
     ):
-        self.threshold = 0 if strict_mode else threshold
+        self.threshold = 1 if strict_mode else threshold
         self.model, self.using_native_model = initialize_model(model)
         self.evaluation_model = self.model.get_model_name()
         self.include_reason = include_reason
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: deepeval
-Version: 3.5.3
+Version: 3.5.4
 Summary: The LLM Evaluation Framework
 Home-page: https://github.com/confident-ai/deepeval
 License: Apache-2.0
@@ -1,5 +1,5 @@
 deepeval/__init__.py,sha256=6fsb813LD_jNhqR-xZnSdE5E-KsBbC3tc4oIg5ZMgTw,2115
-deepeval/_version.py,sha256=FauBIJ2kq1CmkcSxNhoO0BZN8gc3azHQFgdBkDjya18,27
+deepeval/_version.py,sha256=Vy_DqdUIdzt42W7BKglfMO5ghp2Wa6OV5Tatx__sA2U,27
 deepeval/annotation/__init__.py,sha256=ZFhUVNNuH_YgQSZJ-m5E9iUb9TkAkEV33a6ouMDZ8EI,111
 deepeval/annotation/annotation.py,sha256=3j3-syeJepAcEj3u3e4T_BeRDzNr7yXGDIoNQGMKpwQ,2298
 deepeval/annotation/api.py,sha256=EYN33ACVzVxsFleRYm60KB4Exvff3rPJKt1VBuuX970,2147
@@ -179,10 +179,11 @@ deepeval/integrations/llama_index/__init__.py,sha256=zBwUFQXDp6QFtp1cfANy8ucV08r
 deepeval/integrations/llama_index/agent/patched.py,sha256=4JbH0WQmt4lct7xxIH0phj8_Y-V35dgVv7DEDXK0jZI,2149
 deepeval/integrations/llama_index/handler.py,sha256=eqI1n8E4MsvfKoFs5Zrm9IdCR7g9eBgNedISs7UkU_I,8947
 deepeval/integrations/llama_index/utils.py,sha256=mxW71-3PjvBvJpLIU0kNWuTzCidy5l_-roLt8ZyWYA0,2599
-deepeval/integrations/pydantic_ai/__init__.py,sha256=36fBKBLRo1y5jFlj0Y4xhDJsiq4ZnqtmFO32R90Azo4,96
+deepeval/integrations/pydantic_ai/__init__.py,sha256=0-GZpWgCnFI-fVHI-3DosWQK85rk6CoRRhl4AiytBAw,258
+deepeval/integrations/pydantic_ai/agent.py,sha256=HxfeTLsdWGgRMy00ymgXdE6dcFDmFBsdgfl9BbvyJns,12311
 deepeval/integrations/pydantic_ai/otel.py,sha256=2DpO3RapdztXPlT9BWhQfF4dJDMyp2X7YvuplJ0SwC8,1661
-deepeval/integrations/pydantic_ai/patcher.py,sha256=C8CpY6UTO9oaai36l5C_GMT0Lqx9UVoJQYRPF0u0tMc,13127
-deepeval/integrations/pydantic_ai/utils.py,sha256=0BT3v1heuAnfhd9_FSp4XL8818MGdaI6oY2sTQtotfs,3211
+deepeval/integrations/pydantic_ai/patcher.py,sha256=yy4SZRmRhgYxh6qGVWWf8DnSMCDA9GLkFw1HbPToQ1w,17696
+deepeval/integrations/pydantic_ai/utils.py,sha256=734e9un-fn5V7MueAmVsXh304qgumv_fdcmdOC4HrJw,10998
 deepeval/key_handler.py,sha256=damdQEBLGy4IVk5DR5-E3blIZdLbcMtyeGAFn_4_SG4,6505
 deepeval/metrics/__init__.py,sha256=nvO0Wv2JROjK1I9MDNIFUJlrRAZI2C0xbGYSBZK5q4g,4013
 deepeval/metrics/answer_relevancy/__init__.py,sha256=WbZUpoSg2GQoqJ4VIRirVVQ1JDx5xwT-RskwqNKfWGM,46
@@ -262,7 +263,7 @@ deepeval/metrics/mcp/multi_turn_mcp_use_metric.py,sha256=XegYpPVH0qR5lKqQUjMg8dx
 deepeval/metrics/mcp/schema.py,sha256=e9_bFfI8uHeejaePu-YIX8qpAax1noPaKhpiD_NYlgg,310
 deepeval/metrics/mcp/template.py,sha256=iL1V9W40piCAlstk_qYOTHAy2aymqbMmujHempUk25s,5482
 deepeval/metrics/mcp_use_metric/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-deepeval/metrics/mcp_use_metric/mcp_use_metric.py,sha256=71WhX-GPD8zdkUhcdSfB81aGORojK6TODpx-Fw3NOac,14017
+deepeval/metrics/mcp_use_metric/mcp_use_metric.py,sha256=2Q0fhNfLEoUCiz-NaG8UKNthdAywgexT2mqBXxipTPk,14080
 deepeval/metrics/mcp_use_metric/schema.py,sha256=GAWacRNl0i7ir_AE_f1_OBEj0Q5xhcIwEytmTFpiwyw,169
 deepeval/metrics/mcp_use_metric/template.py,sha256=iUttypDHYPc_QPB8hvuPhmSodJMhqUpau3P53thnE_Y,5625
 deepeval/metrics/misuse/__init__.py,sha256=TqtaJf0zzFceJtb3BSTT_hTA6OzgnF3Y-XuWjR2bgVs,37
@@ -318,11 +319,11 @@ deepeval/metrics/multimodal_metrics/text_to_image/schema.py,sha256=ygt_RGnVlYh__
 deepeval/metrics/multimodal_metrics/text_to_image/template.py,sha256=WSXXI0Tee1wE7FPyQJwHYXuqHaevYz9T04ns1P85Qec,2568
 deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py,sha256=20wzu09kQhNMxjp7oJ_sMfWXOWNhuj0kkyEed9lwS2g,11160
 deepeval/metrics/non_advice/__init__.py,sha256=GP55jVADpkODABIjzK0JX1MKpFNZ0bM7Q2Jczlc4BSU,40
-deepeval/metrics/non_advice/non_advice.py,sha256=DebtVrzAdwvoJ1eWovC6ahjY1qdsxUCYl2lQYkS3vI4,11234
+deepeval/metrics/non_advice/non_advice.py,sha256=-pyy1uXfCn0Yuo-JnM4H9lakKzYyzRlIWQ3cA5SwpuM,11247
 deepeval/metrics/non_advice/schema.py,sha256=bODTV8jfjIYTwnYRHz32p47tqdXkTRqLXj_s5ZUxYAQ,299
 deepeval/metrics/non_advice/template.py,sha256=KiRoU_Re3JFHylKZ1O8hztZ3yEQf3vW_HWwHxQjDb6o,2864
 deepeval/metrics/pii_leakage/__init__.py,sha256=tBc9OGp4gmgoYz6FA3ipr48fpsCMvq6WtlwOjMqhCD0,42
-deepeval/metrics/pii_leakage/pii_leakage.py,sha256=sZPCjlegbs_djexoOGg5WEiYUgEYZQFPPYlRDk_FRUc,10851
+deepeval/metrics/pii_leakage/pii_leakage.py,sha256=EIQMS_hOiYhEW5x4nYJwS6AhWl9jhN261atVoWZI3f4,10851
 deepeval/metrics/pii_leakage/schema.py,sha256=Jk9jdf4HAa76J237mnosWOCV71pBBNdLfaVhf-4dKEg,313
 deepeval/metrics/pii_leakage/template.py,sha256=DEW21CyR2lEI1y2C_fXgZnGJlYw0fvnB-LF-HEKZnqo,2418
 deepeval/metrics/prompt_alignment/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -460,8 +461,8 @@ deepeval/tracing/tracing.py,sha256=vOVFdN6fVMW53XhyqTZSfp4vI7DCqRez4TKNhdhr-sg,4
 deepeval/tracing/types.py,sha256=l_utWKerNlE5H3mOKpeUJLsvpP3cMyjH7HRANNgTmSQ,5306
 deepeval/tracing/utils.py,sha256=w_kdhuyBCygllnbqLpDdKJqpJo42t3ZMlGhNicV2A8c,6467
 deepeval/utils.py,sha256=r8tV_NYJSi6ib-oQw6cLw3L7ZSe4KIJVJc1ng6-kDX4,17179
-deepeval-3.5.3.dist-info/LICENSE.md,sha256=0ATkuLv6QgsJTBODUHC5Rak_PArA6gv2t7inJzNTP38,11352
-deepeval-3.5.3.dist-info/METADATA,sha256=WDYyYAc2YdYGWufFtlvsEFsV___80J_xPPCMkvlwau8,18682
-deepeval-3.5.3.dist-info/WHEEL,sha256=d2fvjOD7sXsVzChCqf0Ty0JbHKBaLYwDbGQDwQTnJ50,88
-deepeval-3.5.3.dist-info/entry_points.txt,sha256=fVr8UphXTfJe9I2rObmUtfU3gkSrYeM0pLy-NbJYg10,94
-deepeval-3.5.3.dist-info/RECORD,,
+deepeval-3.5.4.dist-info/LICENSE.md,sha256=0ATkuLv6QgsJTBODUHC5Rak_PArA6gv2t7inJzNTP38,11352
+deepeval-3.5.4.dist-info/METADATA,sha256=fJ15yXxlzKTfOsoW5z7uxIJ4Qx6X-UTpKj7pabi5Tv8,18682
+deepeval-3.5.4.dist-info/WHEEL,sha256=d2fvjOD7sXsVzChCqf0Ty0JbHKBaLYwDbGQDwQTnJ50,88
+deepeval-3.5.4.dist-info/entry_points.txt,sha256=fVr8UphXTfJe9I2rObmUtfU3gkSrYeM0pLy-NbJYg10,94
+deepeval-3.5.4.dist-info/RECORD,,