deepeval 3.6.8__py3-none-any.whl → 3.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. deepeval/_version.py +1 -1
  2. deepeval/anthropic/__init__.py +19 -0
  3. deepeval/anthropic/extractors.py +94 -0
  4. deepeval/anthropic/patch.py +169 -0
  5. deepeval/anthropic/utils.py +225 -0
  6. deepeval/benchmarks/drop/drop.py +40 -14
  7. deepeval/benchmarks/ifeval/ifeval.py +2 -2
  8. deepeval/confident/types.py +4 -2
  9. deepeval/config/settings.py +258 -47
  10. deepeval/config/settings_manager.py +4 -0
  11. deepeval/config/utils.py +5 -0
  12. deepeval/dataset/dataset.py +162 -30
  13. deepeval/dataset/utils.py +41 -13
  14. deepeval/evaluate/execute.py +1099 -633
  15. deepeval/integrations/crewai/handler.py +36 -0
  16. deepeval/integrations/langchain/callback.py +27 -2
  17. deepeval/integrations/llama_index/handler.py +58 -4
  18. deepeval/integrations/llama_index/utils.py +24 -0
  19. deepeval/metrics/__init__.py +5 -0
  20. deepeval/metrics/exact_match/__init__.py +0 -0
  21. deepeval/metrics/exact_match/exact_match.py +94 -0
  22. deepeval/metrics/indicator.py +21 -1
  23. deepeval/metrics/pattern_match/__init__.py +0 -0
  24. deepeval/metrics/pattern_match/pattern_match.py +103 -0
  25. deepeval/metrics/task_completion/task_completion.py +9 -2
  26. deepeval/model_integrations/__init__.py +0 -0
  27. deepeval/model_integrations/utils.py +116 -0
  28. deepeval/models/base_model.py +3 -1
  29. deepeval/models/llms/amazon_bedrock_model.py +20 -17
  30. deepeval/models/llms/openai_model.py +10 -1
  31. deepeval/models/retry_policy.py +103 -20
  32. deepeval/openai/__init__.py +3 -1
  33. deepeval/openai/extractors.py +2 -2
  34. deepeval/openai/utils.py +7 -31
  35. deepeval/prompt/api.py +11 -10
  36. deepeval/prompt/prompt.py +5 -4
  37. deepeval/simulator/conversation_simulator.py +25 -18
  38. deepeval/synthesizer/chunking/context_generator.py +9 -1
  39. deepeval/telemetry.py +3 -3
  40. deepeval/test_case/llm_test_case.py +3 -2
  41. deepeval/test_run/api.py +3 -2
  42. deepeval/test_run/cache.py +4 -3
  43. deepeval/test_run/test_run.py +24 -5
  44. deepeval/tracing/api.py +11 -10
  45. deepeval/tracing/otel/exporter.py +11 -0
  46. deepeval/tracing/patchers.py +102 -1
  47. deepeval/tracing/trace_context.py +13 -4
  48. deepeval/tracing/tracing.py +10 -1
  49. deepeval/tracing/types.py +8 -8
  50. deepeval/tracing/utils.py +9 -0
  51. deepeval/utils.py +44 -2
  52. {deepeval-3.6.8.dist-info → deepeval-3.7.0.dist-info}/METADATA +2 -2
  53. {deepeval-3.6.8.dist-info → deepeval-3.7.0.dist-info}/RECORD +57 -47
  54. /deepeval/{openai → model_integrations}/types.py +0 -0
  55. {deepeval-3.6.8.dist-info → deepeval-3.7.0.dist-info}/LICENSE.md +0 -0
  56. {deepeval-3.6.8.dist-info → deepeval-3.7.0.dist-info}/WHEEL +0 -0
  57. {deepeval-3.6.8.dist-info → deepeval-3.7.0.dist-info}/entry_points.txt +0 -0
deepeval/integrations/crewai/handler.py
@@ -23,6 +23,8 @@ try:
         AgentExecutionCompletedEvent,
         ToolUsageStartedEvent,
         ToolUsageFinishedEvent,
+        KnowledgeRetrievalStartedEvent,
+        KnowledgeRetrievalCompletedEvent,
     )

     crewai_installed = True
@@ -69,6 +71,14 @@ class CrewAIEventsListener(BaseEventListener):

         return execution_id

+    @staticmethod
+    def get_knowledge_execution_id(source, event) -> str:
+        source_id = id(source)
+        agent_id = id(event.agent) if hasattr(event, "agent") else "unknown"
+        execution_id = f"_knowledge_{source_id}_{agent_id}"
+
+        return execution_id
+
     def setup_listeners(self, crewai_event_bus):
         @crewai_event_bus.on(CrewKickoffStartedEvent)
         def on_crew_started(source, event: CrewKickoffStartedEvent):
@@ -161,6 +171,32 @@ class CrewAIEventsListener(BaseEventListener):
                     current_span.output = event.output
                 observer.__exit__(None, None, None)

+        @crewai_event_bus.on(KnowledgeRetrievalStartedEvent)
+        def on_knowledge_started(source, event: KnowledgeRetrievalStartedEvent):
+            observer = Observer(
+                span_type="tool",
+                func_name="knowledge_retrieval",
+                function_kwargs={},
+            )
+            self.span_observers[
+                self.get_knowledge_execution_id(source, event)
+            ] = observer
+            observer.__enter__()
+
+        @crewai_event_bus.on(KnowledgeRetrievalCompletedEvent)
+        def on_knowledge_completed(
+            source, event: KnowledgeRetrievalCompletedEvent
+        ):
+            observer = self.span_observers.pop(
+                self.get_knowledge_execution_id(source, event)
+            )
+            if observer:
+                current_span = current_span_context.get()
+                if current_span:
+                    current_span.input = event.query
+                    current_span.output = event.retrieved_knowledge
+                observer.__exit__(None, None, None)
+


 def instrument_crewai(api_key: Optional[str] = None):
     is_crewai_installed()
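With these listeners, a CrewAI knowledge lookup now shows up as its own "tool" span whose input is the query and whose output is the retrieved knowledge. A minimal sketch of turning the integration on, assuming instrument_crewai is importable from deepeval.integrations.crewai as in deepeval's existing CrewAI integration (the crew setup itself is omitted as application-specific):

    # Sketch only: enable deepeval's CrewAI instrumentation so the new
    # KnowledgeRetrievalStartedEvent / KnowledgeRetrievalCompletedEvent
    # listeners record knowledge lookups as "tool" spans.
    from deepeval.integrations.crewai import instrument_crewai

    instrument_crewai()  # api_key is optional, per the signature above
    # ... build and kick off your Crew as usual; knowledge retrievals will
    # appear as child spans with input=query, output=retrieved_knowledge.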
deepeval/integrations/langchain/callback.py
@@ -1,12 +1,15 @@
 from typing import Any, Optional, List, Dict
 from uuid import UUID
 from time import perf_counter
-from deepeval.tracing.context import current_trace_context
+
+from deepeval.tracing.context import current_span_context, current_trace_context
+from deepeval.test_case.llm_test_case import ToolCall
 from deepeval.tracing.types import (
     LlmOutput,
     LlmToolCall,
 )
 from deepeval.metrics import BaseMetric
+from deepeval.tracing.utils import prepare_tool_call_input_parameters

 try:
     from langchain_core.callbacks.base import BaseCallbackHandler
@@ -266,12 +269,34 @@ class CallbackHandler(BaseCallbackHandler):
         parent_run_id: Optional[UUID] = None,
         **kwargs: Any,  # un-logged kwargs
     ) -> Any:
-
         uuid_str = str(run_id)
         tool_span: ToolSpan = trace_manager.get_span_by_uuid(uuid_str)
         tool_span.output = output
         exit_current_context(uuid_str=uuid_str)

+        # set the tools called in the parent span as well as on the trace level
+        tool_call = ToolCall(
+            name=tool_span.name,
+            description=tool_span.description,
+            output=output,
+            input_parameters=prepare_tool_call_input_parameters(
+                tool_span.input
+            ),
+        )
+        parent_span = current_span_context.get()
+        if parent_span:
+            if parent_span.tools_called is None:
+                parent_span.tools_called = []
+
+            parent_span.tools_called.append(tool_call)
+
+        trace = current_trace_context.get()
+        if trace:
+            if trace.tools_called is None:
+                trace.tools_called = []
+
+            trace.tools_called.append(tool_call)
+
     def on_tool_error(
         self,
         error: BaseException,
deepeval/integrations/llama_index/handler.py
@@ -2,13 +2,28 @@ from typing import Any, Dict, Optional
 import inspect
 from time import perf_counter
 import uuid
+
+from llama_index.core.agent.workflow.workflow_events import (
+    AgentWorkflowStartEvent,
+)
+from deepeval.integrations.llama_index.utils import (
+    extract_output_from_llm_chat_end_event,
+)
 from deepeval.telemetry import capture_tracing_integration
 from deepeval.tracing import trace_manager
-from deepeval.tracing.types import AgentSpan, BaseSpan, LlmSpan, TraceSpanStatus
+from deepeval.tracing.types import (
+    ToolSpan,
+    AgentSpan,
+    BaseSpan,
+    LlmSpan,
+    TraceSpanStatus,
+)
 from deepeval.tracing.trace_context import (
     current_llm_context,
     current_agent_context,
 )
+from deepeval.test_case import ToolCall
+from deepeval.tracing.utils import make_json_serializable

 try:
     from llama_index.core.instrumentation.events.base import BaseEvent
@@ -89,6 +104,7 @@ class LLamaIndexHandler(BaseEventHandler, BaseSpanHandler):
                     if llm_span_context
                     else None
                 ),
+                prompt=llm_span_context.prompt if llm_span_context else None,
             )
             trace_manager.add_span(llm_span)
             trace_manager.add_span_to_trace(llm_span)
@@ -106,7 +122,9 @@ class LLamaIndexHandler(BaseEventHandler, BaseSpanHandler):
                 llm_span.status = TraceSpanStatus.SUCCESS
                 llm_span.end_time = perf_counter()
                 llm_span.input = llm_span.input
-                llm_span.output = event.response.message.blocks[0].text
+                llm_span.output = extract_output_from_llm_chat_end_event(
+                    event
+                )
                 trace_manager.remove_span(llm_span.uuid)
                 del self.open_ai_astream_to_llm_span_map[event.span_id]

@@ -151,6 +169,14 @@ class LLamaIndexHandler(BaseEventHandler, BaseSpanHandler):
         # conditions to qualify as agent start run span
         if method_name == "run":
             agent_span_context = current_agent_context.get()
+            start_event = bound_args.arguments.get("start_event")
+
+            if start_event and isinstance(start_event, AgentWorkflowStartEvent):
+                input = start_event.model_dump()
+
+            else:
+                input = bound_args.arguments
+
             span = AgentSpan(
                 uuid=id_,
                 status=TraceSpanStatus.IN_PROGRESS,
@@ -159,7 +185,7 @@ class LLamaIndexHandler(BaseEventHandler, BaseSpanHandler):
                 parent_uuid=parent_span_id,
                 start_time=perf_counter(),
                 name="Agent",  # TODO: decide the name of the span
-                input=bound_args.arguments,
+                input=input,
                 metrics=(
                     agent_span_context.metrics if agent_span_context else None
                 ),
@@ -169,7 +195,17 @@ class LLamaIndexHandler(BaseEventHandler, BaseSpanHandler):
                     else None
                 ),
             )
-
+        elif method_name == "acall":
+            span = ToolSpan(
+                uuid=id_,
+                status=TraceSpanStatus.IN_PROGRESS,
+                children=[],
+                trace_uuid=trace_uuid,
+                parent_uuid=parent_span_id,
+                start_time=perf_counter(),
+                input=bound_args.arguments,
+                name="Tool",
+            )
         # prepare input test case params for the span
         prepare_input_llm_test_case_params(
             class_name, method_name, span, bound_args.arguments
@@ -192,10 +228,28 @@ class LLamaIndexHandler(BaseEventHandler, BaseSpanHandler):
         if base_span is None:
             return None

+        class_name, method_name = parse_id(id_)
+        if method_name == "call_tool":
+            output_json = make_json_serializable(result)
+            if output_json and isinstance(output_json, dict):
+                if base_span.tools_called is None:
+                    base_span.tools_called = []
+                base_span.tools_called.append(
+                    ToolCall(
+                        name=output_json.get("tool_name", "Tool"),
+                        input_parameters=output_json.get("tool_kwargs", {}),
+                        output=output_json.get("tool_output", {}),
+                    )
+                )
         base_span.end_time = perf_counter()
         base_span.status = TraceSpanStatus.SUCCESS
         base_span.output = result

+        if isinstance(base_span, ToolSpan):
+            result_json = make_json_serializable(result)
+            if result_json and isinstance(result_json, dict):
+                base_span.name = result_json.get("tool_name", "Tool")
+
         if base_span.llm_test_case:
             class_name, method_name = parse_id(id_)
             prepare_output_llm_test_case_params(
deepeval/integrations/llama_index/utils.py
@@ -1,3 +1,4 @@
+from llama_index.core.instrumentation.events.llm import LLMChatEndEvent
 from deepeval.test_case.llm_test_case import LLMTestCase, ToolCall
 from deepeval.tracing.types import BaseSpan
 from typing import Any
@@ -81,3 +82,26 @@ def prepare_output_llm_test_case_params(
         )

     span.llm_test_case.tools_called = tool_calls
+
+
+def extract_output_from_llm_chat_end_event(event: LLMChatEndEvent) -> list:
+    messages = []
+    for msg in event.response.message.blocks:
+        if msg.block_type == "text":
+            messages.append(
+                {
+                    "role": event.response.message.role.value,
+                    "content": msg.text,
+                }
+            )
+        elif msg.block_type == "tool_call":
+            messages.append(
+                {
+                    "name": msg.tool_name,
+                    "input_parameters": msg.tool_kwargs,
+                    "id": msg.tool_call_id,
+                }
+            )
+        else:
+            messages.append(msg.model_dump())
+    return messages
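The new extractor flattens every block in the final chat message into a plain dict, so an LLM span's output can carry tool calls as well as text instead of only the first text block. A purely illustrative example of the returned shape (keys follow the branches above; values are made up):

    # Illustrative output of extract_output_from_llm_chat_end_event for a
    # response containing one text block and one tool-call block.
    example_output = [
        {"role": "assistant", "content": "Let me check the docs."},  # text block
        {
            "name": "search_docs",                         # msg.tool_name
            "input_parameters": {"query": "rate limits"},  # msg.tool_kwargs
            "id": "call_abc123",                           # msg.tool_call_id
        },
    ]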
deepeval/metrics/__init__.py
@@ -8,6 +8,8 @@ from .base_metric import (
 from .dag.dag import DAGMetric, DeepAcyclicGraph
 from .conversational_dag.conversational_dag import ConversationalDAGMetric
 from .bias.bias import BiasMetric
+from .exact_match.exact_match import ExactMatchMetric
+from .pattern_match.pattern_match import PatternMatchMetric
 from .toxicity.toxicity import ToxicityMetric
 from .pii_leakage.pii_leakage import PIILeakageMetric
 from .non_advice.non_advice import NonAdviceMetric
@@ -69,6 +71,9 @@ __all__ = [
     "BaseConversationalMetric",
     "BaseMultimodalMetric",
     "BaseArenaMetric",
+    # Non-LLM metrics
+    "ExactMatchMetric",
+    "PatternMatchMetric",
     # Core metrics
     "GEval",
     "ArenaGEval",
deepeval/metrics/exact_match/__init__.py: file without changes
deepeval/metrics/exact_match/exact_match.py
@@ -0,0 +1,94 @@
+from typing import List
+
+from deepeval.metrics.indicator import metric_progress_indicator
+from deepeval.metrics.utils import (
+    check_llm_test_case_params,
+    construct_verbose_logs,
+)
+from deepeval.metrics.api import metric_data_manager
+from deepeval.metrics import BaseMetric
+from deepeval.test_case import LLMTestCase, LLMTestCaseParams
+
+
+class ExactMatchMetric(BaseMetric):
+    _required_params: List[LLMTestCaseParams] = [
+        LLMTestCaseParams.INPUT,
+        LLMTestCaseParams.ACTUAL_OUTPUT,
+        LLMTestCaseParams.EXPECTED_OUTPUT,
+    ]
+
+    def __init__(
+        self,
+        threshold: float = 1,
+        verbose_mode: bool = False,
+    ):
+        self.threshold = threshold
+        self.verbose_mode = verbose_mode
+
+    def measure(
+        self,
+        test_case: LLMTestCase,
+        _show_indicator: bool = True,
+        _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
+    ) -> float:
+        check_llm_test_case_params(test_case, self._required_params, self)
+
+        with metric_progress_indicator(
+            self, _show_indicator=_show_indicator, _in_component=_in_component
+        ):
+            expected = test_case.expected_output.strip()
+            actual = test_case.actual_output.strip()
+
+            if expected == actual:
+                self.score = self.precision = self.recall = self.f1 = 1.0
+                self.reason = (
+                    "The actual and expected outputs are exact matches."
+                )
+            else:
+                self.score = self.precision = self.recall = self.f1 = 0.0
+                self.reason = "The actual and expected outputs are different."
+
+            self.success = self.score >= self.threshold
+
+            if self.verbose_mode:
+                self.verbose_logs = construct_verbose_logs(
+                    self,
+                    steps=[
+                        f"Score: {self.score:.2f}",
+                        f"Reason: {self.reason}",
+                    ],
+                )
+
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
+
+            return self.score
+
+    async def a_measure(
+        self,
+        test_case: LLMTestCase,
+        _show_indicator: bool = True,
+        _in_component: bool = False,
+    ) -> float:
+        return self.measure(
+            test_case,
+            _show_indicator=_show_indicator,
+            _in_component=_in_component,
+        )
+
+    def is_successful(self) -> bool:
+        if self.error is not None:
+            self.success = False
+        else:
+            try:
+                self.success = self.score >= self.threshold
+            except:
+                self.success = False
+        return self.success
+
+    @property
+    def __name__(self):
+        return "Exact Match"
deepeval/metrics/indicator.py
@@ -18,6 +18,10 @@ from deepeval.test_run.cache import CachedTestCase, Cache
 from deepeval.telemetry import capture_metric_type
 from deepeval.utils import update_pbar

+import logging
+
+logger = logging.getLogger(__name__)
+

 def format_metric_description(
     metric: Union[BaseMetric, BaseConversationalMetric, BaseArenaMetric],
@@ -43,7 +47,7 @@ def metric_progress_indicator(
     _show_indicator: bool = True,
     _in_component: bool = False,
 ):
-    captured_async_mode = False if async_mode == None else async_mode
+    captured_async_mode = False if async_mode is None else async_mode
     with capture_metric_type(
         metric.__name__,
         async_mode=captured_async_mode,
@@ -250,6 +254,21 @@ async def safe_a_measure(
                 _log_metric_to_confident=False,
             )
             update_pbar(progress, pbar_eval_id)
+
+    except asyncio.CancelledError:
+        logger.info("caught asyncio.CancelledError")
+
+        # treat cancellation as a timeout so we still emit a MetricData
+        metric.error = (
+            "Timed out/cancelled while evaluating metric. "
+            "Increase DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE or set "
+            "DEEPEVAL_LOG_STACK_TRACES=1 for full traceback."
+        )
+        metric.success = False
+
+        if not ignore_errors:
+            raise
+
     except MissingTestCaseParamsError as e:
         if skip_on_missing_params:
             metric.skipped = True
@@ -277,5 +296,6 @@ async def safe_a_measure(
         if ignore_errors:
             metric.error = str(e)
             metric.success = False  # Assuming you want to set success to False
+            logger.info("a metric was marked as errored")
         else:
             raise
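The cancellation handler's error message names two environment variables; both names are taken verbatim from the string above, though how and when they are read is not shown in this diff. A hedged sketch of raising the per-task timeout and enabling full stack traces before running an evaluation:

    import os

    # Variable names copied from the error message above; values are illustrative.
    os.environ["DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE"] = "600"
    os.environ["DEEPEVAL_LOG_STACK_TRACES"] = "1"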
deepeval/metrics/pattern_match/__init__.py: file without changes
deepeval/metrics/pattern_match/pattern_match.py
@@ -0,0 +1,103 @@
+import re
+from typing import List
+
+from deepeval.metrics.indicator import metric_progress_indicator
+from deepeval.metrics.utils import (
+    check_llm_test_case_params,
+    construct_verbose_logs,
+)
+from deepeval.metrics.api import metric_data_manager
+from deepeval.metrics import BaseMetric
+from deepeval.test_case import LLMTestCase, LLMTestCaseParams
+
+
+class PatternMatchMetric(BaseMetric):
+    _required_params: List[LLMTestCaseParams] = [
+        LLMTestCaseParams.INPUT,
+        LLMTestCaseParams.ACTUAL_OUTPUT,
+    ]
+
+    def __init__(
+        self,
+        pattern: str,
+        ignore_case: bool = False,
+        threshold: float = 1.0,
+        verbose_mode: bool = False,
+    ):
+        self.pattern = pattern.strip()
+        self.ignore_case = ignore_case
+        self.verbose_mode = verbose_mode
+        self.threshold = threshold
+
+        flags = re.IGNORECASE if ignore_case else 0
+        try:
+            self._compiled_pattern = re.compile(self.pattern, flags)
+        except re.error as e:
+            raise ValueError(f"Invalid regex pattern: {pattern} — {e}")
+
+    def measure(
+        self,
+        test_case: LLMTestCase,
+        _show_indicator: bool = True,
+        _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
+    ) -> float:
+        check_llm_test_case_params(test_case, self._required_params, self)
+
+        with metric_progress_indicator(
+            self, _show_indicator=_show_indicator, _in_component=_in_component
+        ):
+            actual = test_case.actual_output.strip()
+            full_match = self._compiled_pattern.fullmatch(actual)
+
+            self.score = 1.0 if full_match else 0.0
+            self.reason = (
+                f"The actual output fully matches the pattern."
+                if full_match
+                else f"The actual output does not match the pattern."
+            )
+            self.success = self.score >= self.threshold
+
+            if self.verbose_mode:
+                self.verbose_logs = construct_verbose_logs(
+                    self,
+                    steps=[
+                        f"Pattern: {self.pattern}",
+                        f"Actual: {actual}",
+                        f"Score: {self.score:.2f}",
+                        f"Reason: {self.reason}",
+                    ],
+                )
+
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
+
+            return self.score
+
+    async def a_measure(
+        self,
+        test_case: LLMTestCase,
+        _show_indicator: bool = True,
+        _in_component: bool = False,
+    ) -> float:
+        return self.measure(
+            test_case,
+            _show_indicator=_show_indicator,
+            _in_component=_in_component,
+        )
+
+    def is_successful(self) -> bool:
+        if self.error is not None:
+            self.success = False
+        else:
+            try:
+                self.success = self.score >= self.threshold
+            except:
+                self.success = False
+        return self.success
+
+    @property
+    def __name__(self):
+        return "Pattern Match"
deepeval/metrics/task_completion/task_completion.py
@@ -36,6 +36,11 @@ class TaskCompletionMetric(BaseMetric):
         strict_mode: bool = False,
         verbose_mode: bool = False,
     ):
+        if task is None:
+            self._is_task_provided = False
+        else:
+            self._is_task_provided = True
+
         self.task = task
         self.threshold = 1 if strict_mode else threshold
         self.model, self.using_native_model = initialize_model(model)
@@ -73,7 +78,8 @@ class TaskCompletionMetric(BaseMetric):
             )
         else:
             task, self.outcome = self._extract_task_and_outcome(test_case)
-            self.task = task if self.task is None else self.task
+            if self.task is None or not self._is_task_provided:
+                self.task = task
             self.verdict, self.reason = self._generate_verdicts()
             self.score = self._calculate_score()
             self.success = self.score >= self.threshold
@@ -108,7 +114,8 @@ class TaskCompletionMetric(BaseMetric):
                 task, self.outcome = await self._a_extract_task_and_outcome(
                     test_case
                 )
-                self.task = task if self.task is None else self.task
+                if self.task is None or not self._is_task_provided:
+                    self.task = task
                 self.verdict, self.reason = await self._a_generate_verdicts()
                 self.score = self._calculate_score()
                 self.success = self.score >= self.threshold
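The new _is_task_provided flag fixes task reuse: previously, once self.task had been filled in from the first test case, it was never re-extracted, so later test cases were judged against the wrong task. A behavioral sketch (constructor arguments other than task are left at their defaults, which rely on the default evaluation model):

    from deepeval.metrics import TaskCompletionMetric

    # Explicit task: kept as-is on every measure() call.
    pinned = TaskCompletionMetric(task="Book a table for two at 7pm")

    # No task given: as of 3.7.0 the task is re-extracted from each test
    # case's trace, instead of sticking to whatever the first call produced.
    inferred = TaskCompletionMetric()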
deepeval/model_integrations/__init__.py: file without changes
deepeval/model_integrations/utils.py
@@ -0,0 +1,116 @@
+import json
+import uuid
+from typing import Any, List, Optional
+
+from deepeval.model_integrations.types import InputParameters, OutputParameters
+from deepeval.test_case.llm_test_case import ToolCall
+from deepeval.tracing.context import (
+    current_span_context,
+    current_trace_context,
+    update_current_span,
+    update_llm_span,
+)
+from deepeval.tracing.trace_context import current_llm_context
+from deepeval.tracing.types import ToolSpan, TraceSpanStatus
+from deepeval.utils import shorten, len_long
+
+
+def _update_all_attributes(
+    input_parameters: InputParameters,
+    output_parameters: OutputParameters,
+    expected_tools: List[ToolCall],
+    expected_output: str,
+    context: List[str],
+    retrieval_context: List[str],
+):
+    """Update span and trace attributes with input/output parameters."""
+    update_current_span(
+        input=input_parameters.input or input_parameters.messages or "NA",
+        output=output_parameters.output or "NA",
+        tools_called=output_parameters.tools_called,
+        # attributes to be added
+        expected_output=expected_output,
+        expected_tools=expected_tools,
+        context=context,
+        retrieval_context=retrieval_context,
+    )
+
+    llm_context = current_llm_context.get()
+
+    update_llm_span(
+        input_token_count=output_parameters.prompt_tokens,
+        output_token_count=output_parameters.completion_tokens,
+        prompt=llm_context.prompt,
+    )
+
+    if output_parameters.tools_called:
+        create_child_tool_spans(output_parameters)
+
+    __update_input_and_output_of_current_trace(
+        input_parameters, output_parameters
+    )
+
+
+def __update_input_and_output_of_current_trace(
+    input_parameters: InputParameters, output_parameters: OutputParameters
+):
+
+    current_trace = current_trace_context.get()
+    if current_trace:
+        if current_trace.input is None:
+            current_trace.input = (
+                input_parameters.input or input_parameters.messages
+            )
+        if current_trace.output is None:
+            current_trace.output = output_parameters.output
+
+    return
+
+
+def create_child_tool_spans(output_parameters: OutputParameters):
+    if output_parameters.tools_called is None:
+        return
+
+    current_span = current_span_context.get()
+    for tool_called in output_parameters.tools_called:
+        tool_span = ToolSpan(
+            **{
+                "uuid": str(uuid.uuid4()),
+                "trace_uuid": current_span.trace_uuid,
+                "parent_uuid": current_span.uuid,
+                "start_time": current_span.start_time,
+                "end_time": current_span.start_time,
+                "status": TraceSpanStatus.SUCCESS,
+                "children": [],
+                "name": tool_called.name,
+                "input": tool_called.input_parameters,
+                "output": None,
+                "metrics": None,
+                "description": tool_called.description,
+            }
+        )
+        current_span.children.append(tool_span)
+
+
+_URL_MAX = 200
+_JSON_MAX = max(
+    len_long(), 400
+)  # <- make this bigger by increasing DEEPEVAL_MAXLEN_LONG above 400
+
+
+def compact_dump(value: Any) -> str:
+    try:
+        dumped = json.dumps(
+            value, ensure_ascii=False, default=str, separators=(",", ":")
+        )
+    except Exception:
+        dumped = repr(value)
+    return shorten(dumped, max_len=_JSON_MAX)
+
+
+def fmt_url(url: Optional[str]) -> str:
+    if not url:
+        return ""
+    if url.startswith("data:"):
+        return "[data-uri]"
+    return shorten(url, max_len=_URL_MAX)
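These helpers keep logged payloads bounded: compact_dump JSON-encodes a value (falling back to repr) and truncates it via shorten(max_len=_JSON_MAX), while fmt_url hides data URIs and caps other URLs via shorten(max_len=200). Illustrative calls, with return values paraphrased in comments:

    from deepeval.model_integrations.utils import compact_dump, fmt_url

    compact_dump({"model": "gpt-4o", "messages": ["hi"] * 1000})
    # -> compact JSON string, shortened to roughly _JSON_MAX characters

    fmt_url("data:image/png;base64,iVBORw0KGgo...")   # -> "[data-uri]"
    fmt_url("https://example.com/" + "a" * 500)       # -> shortened to ~200 characters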
deepeval/models/base_model.py
@@ -68,7 +68,9 @@ class DeepEvalBaseLLM(ABC):
         Returns:
             A list of strings.
         """
-        raise AttributeError
+        raise NotImplementedError(
+            "batch_generate is not implemented for this model"
+        )

     @abstractmethod
     def get_model_name(self, *args, **kwargs) -> str:
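With the clearer NotImplementedError, custom models that don't support batching now fail with an explicit message instead of a bare AttributeError. A hedged sketch of a custom model that opts in; the abstract methods shown (load_model, generate, a_generate, get_model_name) follow deepeval's documented custom-model interface, and MyLLM itself is illustrative:

    from typing import List
    from deepeval.models.base_model import DeepEvalBaseLLM

    class MyLLM(DeepEvalBaseLLM):
        def load_model(self):
            return None  # illustrative: return your client/model handle here

        def generate(self, prompt: str) -> str:
            return "stub output"

        async def a_generate(self, prompt: str) -> str:
            return "stub output"

        def batch_generate(self, prompts: List[str]) -> List[str]:
            # override to avoid the NotImplementedError raised by the base class
            return [self.generate(p) for p in prompts]

        def get_model_name(self) -> str:
            return "my-llm"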