deepeval 3.6.4__py3-none-any.whl → 3.6.5__py3-none-any.whl

deepeval/_version.py CHANGED
@@ -1 +1 @@
- __version__: str = "3.6.4"
+ __version__: str = "3.6.5"
@@ -180,6 +180,19 @@ class Settings(BaseSettings):
  # into this directory. The directory will be created on demand.
  DEEPEVAL_RESULTS_FOLDER: Optional[Path] = None

+ # Display / Truncation
+ DEEPEVAL_MAXLEN_TINY: Optional[int] = 40
+ DEEPEVAL_MAXLEN_SHORT: Optional[int] = 60
+ DEEPEVAL_MAXLEN_MEDIUM: Optional[int] = 120
+ DEEPEVAL_MAXLEN_LONG: Optional[int] = 240
+
+ # If set, this overrides the default max_len used by deepeval/utils shorten;
+ # falls back to DEEPEVAL_MAXLEN_LONG when None.
+ DEEPEVAL_SHORTEN_DEFAULT_MAXLEN: Optional[int] = None
+
+ # Optional global suffix (keeps the "..." default).
+ DEEPEVAL_SHORTEN_SUFFIX: Optional[str] = "..."
+
  #
  # GPU and perf toggles
  #
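The new limits follow the existing Settings pattern, so they should be tunable through environment variables before deepeval loads its settings. A minimal sketch, assuming standard pydantic-settings env-var resolution (the override values below are illustrative, not defaults):

    import os

    # Hypothetical override: cap shortened display strings at 80 characters
    # and use a different suffix than the "..." default.
    os.environ["DEEPEVAL_SHORTEN_DEFAULT_MAXLEN"] = "80"
    os.environ["DEEPEVAL_SHORTEN_SUFFIX"] = " [truncated]"

    from deepeval.config.settings import get_settings

    settings = get_settings()
    print(settings.DEEPEVAL_SHORTEN_DEFAULT_MAXLEN, settings.DEEPEVAL_SHORTEN_SUFFIX)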
@@ -1266,11 +1266,17 @@ class EvaluationDataset:
  detach(ctx_token)

  else:
- confident_link = global_test_run_manager.wrap_up_test_run(
+ res = global_test_run_manager.wrap_up_test_run(
  run_duration, display_table=False
  )
+ if isinstance(res, tuple):
+ confident_link, test_run_id = res
+ else:
+ confident_link = test_run_id = None
  return EvaluationResult(
- test_results=test_results, confident_link=confident_link
+ test_results=test_results,
+ confident_link=confident_link,
+ test_run_id=test_run_id,
  )

  def evaluate(self, task: Task):
@@ -268,11 +268,17 @@ def evaluate(
  test_run = global_test_run_manager.get_test_run()
  test_run.hyperparameters = process_hyperparameters(hyperparameters)
  global_test_run_manager.save_test_run(TEMP_FILE_PATH)
- confident_link = global_test_run_manager.wrap_up_test_run(
+ res = global_test_run_manager.wrap_up_test_run(
  run_duration, display_table=False
  )
+ if isinstance(res, tuple):
+ confident_link, test_run_id = res
+ else:
+ confident_link = test_run_id = None
  return EvaluationResult(
- test_results=test_results, confident_link=confident_link
+ test_results=test_results,
+ confident_link=confident_link,
+ test_run_id=test_run_id,
  )
  elif metric_collection:
  api = Api()
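With both call sites unpacking wrap_up_test_run() the same way, callers of the public evaluate() API can now read the run id next to the Confident AI link. A minimal sketch of the intended usage, assuming the usual test case/metric setup (the metric choice and test case contents are illustrative):

    from deepeval import evaluate
    from deepeval.metrics import AnswerRelevancyMetric
    from deepeval.test_case import LLMTestCase

    result = evaluate(
        test_cases=[
            LLMTestCase(
                input="What does deepeval do?",
                actual_output="It evaluates LLM outputs against metrics.",
            )
        ],
        metrics=[AnswerRelevancyMetric()],
    )

    # New in 3.6.5: test_run_id travels alongside confident_link; both may be
    # None when the run is not wrapped up against Confident AI.
    print(result.confident_link, result.test_run_id)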
@@ -45,9 +45,7 @@ from deepeval.dataset import Golden
  from deepeval.contextvars import set_current_golden, reset_current_golden
  from deepeval.errors import MissingTestCaseParamsError
  from deepeval.metrics.utils import copy_metrics
- from deepeval.utils import (
- get_or_create_event_loop,
- )
+ from deepeval.utils import get_or_create_event_loop, shorten, len_medium
  from deepeval.telemetry import capture_evaluation_run
  from deepeval.metrics import (
  BaseMetric,
@@ -1802,14 +1800,11 @@ def a_execute_agentic_test_cases_from_loop(
  )

  # record metadata for debugging
- MAX_META_INPUT_LENGTH = 120
  started = time.perf_counter()
- short_input = current_golden_ctx["input"]
- if (
- isinstance(short_input, str)
- and len(short_input) > MAX_META_INPUT_LENGTH
- ):
- short_input = short_input[:MAX_META_INPUT_LENGTH] + "…"
+ short_input = current_golden_ctx.get("input")
+ if isinstance(short_input, str):
+ short_input = shorten(short_input, len_medium())
+
  task_meta[task] = {
  "golden_index": current_golden_ctx["index"],
  "golden_name": current_golden_ctx["name"],
@@ -1972,7 +1967,7 @@ def a_execute_agentic_test_cases_from_loop(

  if settings.DEEPEVAL_DEBUG_ASYNC:
  logger.warning(
- "[deepeval] %d stray task(s) not tracked; cancelling",
+ "[deepeval] %d stray task(s) not tracked; cancelling...",
  len(leftovers),
  )
  for t in leftovers:
@@ -1,7 +1,8 @@
  from typing import Optional, List, Union, Dict
  from dataclasses import dataclass
  from pydantic import BaseModel
- from deepeval.test_run import MetricData
+
+ from deepeval.test_run.api import MetricData, TurnApi
  from deepeval.test_case import MLLMImage


@@ -19,9 +20,11 @@ class TestResult:
  expected_output: Optional[str] = None
  context: Optional[List[str]] = None
  retrieval_context: Optional[List[str]] = None
+ turns: Optional[List[TurnApi]] = None
  additional_metadata: Optional[Dict] = None


  class EvaluationResult(BaseModel):
  test_results: List[TestResult]
  confident_link: Optional[str]
+ test_run_id: Optional[str]
@@ -1,9 +1,10 @@
  import ast
  import inspect
- from typing import Optional, List, Callable, Union, Dict
- import os, time
-
+ from typing import Optional, List, Callable, Union
+ import os
+ import time

+ from deepeval.utils import format_turn
  from deepeval.test_case.conversational_test_case import Turn
  from deepeval.test_run.api import TurnApi
  from deepeval.test_run.test_run import TestRunResultDisplay
@@ -34,6 +35,29 @@ from deepeval.tracing.utils import (
  )


+ def _is_metric_successful(metric_data: MetricData) -> bool:
+ """
+ Robustly determine success for a metric row.
+
+ Rationale:
+ - If the metric recorded an error, treat as failure.
+ - Be defensive: custom rows may not be MetricData at runtime.
+ """
+ if getattr(metric_data, "error", None):
+ return False
+
+ s = getattr(metric_data, "success", None)
+ if isinstance(s, bool):
+ return s
+ if s is None:
+ return False
+ if isinstance(s, (int, float)):
+ return bool(s)
+ if isinstance(s, str):
+ return s.strip().lower() in {"true", "t", "1", "yes", "y"}
+ return False
+
+
  def create_metric_data(metric: BaseMetric) -> MetricData:
  if metric.error is not None:
  return MetricData(
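The helper centralizes the success check that print_test_result and write_test_result_to_file previously duplicated, and it coerces non-boolean success values coming from custom metrics. A quick illustration of the intended behavior, using throwaway stand-in objects rather than real MetricData rows:

    from types import SimpleNamespace

    # Stand-in rows for illustration only; real callers pass MetricData.
    assert _is_metric_successful(SimpleNamespace(error=None, success=True))
    assert _is_metric_successful(SimpleNamespace(error=None, success="YES"))
    assert not _is_metric_successful(SimpleNamespace(error="timeout", success=True))
    assert not _is_metric_successful(SimpleNamespace(error=None, success=None))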
@@ -75,6 +99,7 @@ def create_test_result(
  metrics_data=api_test_case.metrics_data,
  conversational=True,
  additional_metadata=api_test_case.additional_metadata,
+ turns=api_test_case.turns,
  )
  else:
  multimodal = (
@@ -112,6 +137,7 @@ def create_api_turn(turn: Turn, index: int) -> TurnApi:
  return TurnApi(
  role=turn.role,
  content=turn.content,
+ user_id=turn.user_id,
  retrievalContext=turn.retrieval_context,
  toolsCalled=turn.tools_called,
  additionalMetadata=turn.additional_metadata,
@@ -372,17 +398,7 @@ def print_test_result(test_result: TestResult, display: TestRunResultDisplay):
  print("Metrics Summary\n")

  for metric_data in test_result.metrics_data:
- successful = True
- if metric_data.error is not None:
- successful = False
- else:
- # This try block is for user defined custom metrics,
- # which might not handle the score == undefined case elegantly
- try:
- if not metric_data.success:
- successful = False
- except:
- successful = False
+ successful = _is_metric_successful(metric_data)

  if not successful:
  print(
@@ -401,9 +417,14 @@ def print_test_result(test_result: TestResult, display: TestRunResultDisplay):

  elif test_result.conversational:
  print("For conversational test case:\n")
- print(
- f" - Unable to print conversational test case. Run 'deepeval login' to view conversational evaluations in full."
- )
+ if test_result.turns:
+ print(" Turns:")
+ turns = sorted(test_result.turns, key=lambda t: t.order)
+ for t in turns:
+ print(format_turn(t))
+ else:
+ print(" - No turns recorded in this test case.")
+
  else:
  print("For test case:\n")
  print(f" - input: {test_result.input}")
@@ -470,15 +491,7 @@ def write_test_result_to_file(
  file.write("Metrics Summary\n\n")

  for metric_data in test_result.metrics_data:
- successful = True
- if metric_data.error is not None:
- successful = False
- else:
- try:
- if not metric_data.success:
- successful = False
- except:
- successful = False
+ successful = _is_metric_successful(metric_data)

  if not successful:
  file.write(
@@ -500,9 +513,13 @@ def write_test_result_to_file(
  file.write(f" - actual output: {test_result.actual_output}\n")
  elif test_result.conversational:
  file.write("For conversational test case:\n\n")
- file.write(
- " - Unable to print conversational test case. Run 'deepeval login' to view conversational evaluations in full.\n"
- )
+ if test_result.turns:
+ file.write(" Turns:\n")
+ turns = sorted(test_result.turns, key=lambda t: t.order)
+ for t in turns:
+ file.write(format_turn(t) + "\n")
+ else:
+ file.write(" - No turns recorded in this test case.\n")
  else:
  file.write("For test case:\n\n")
  file.write(f" - input: {test_result.input}\n")
@@ -1,4 +1,3 @@
  from .handler import instrument_crewai
- from .agent import Agent

- __all__ = ["instrument_crewai", "Agent"]
+ __all__ = ["instrument_crewai"]
@@ -1,30 +1,50 @@
- from typing import Optional
+ import logging
  import deepeval
- from deepeval.integrations.crewai.agent import (
- Agent as PatchedAgent,
- agent_registry,
- )
- from deepeval.integrations.crewai.patch import patch_build_context_for_task
+
+ from typing import Optional
  from deepeval.telemetry import capture_tracing_integration
- from deepeval.tracing.types import AgentSpan, LlmSpan
+ from deepeval.tracing.context import current_span_context, current_trace_context
+ from deepeval.tracing.tracing import Observer
+ from deepeval.tracing.types import LlmSpan
+ from deepeval.config.settings import get_settings
+
+
+ logger = logging.getLogger(__name__)
+

  try:
- from crewai.crew import Crew
- from crewai.llm import LLM
- from crewai.agent import Agent
- from crewai.utilities.events import AgentExecutionCompletedEvent
  from crewai.utilities.events.base_event_listener import BaseEventListener
- from crewai.task import Task
- from crewai.agents.crew_agent_executor import CrewAgentExecutor
- from crewai.utilities.events import ToolUsageFinishedEvent
- from crewai.tools.tool_usage import ToolUsage
- from crewai.utilities.events import LLMCallCompletedEvent
- from crewai.memory.contextual.contextual_memory import ContextualMemory
+ from crewai.events import (
+ CrewKickoffStartedEvent,
+ CrewKickoffCompletedEvent,
+ LLMCallStartedEvent,
+ LLMCallCompletedEvent,
+ AgentExecutionStartedEvent,
+ AgentExecutionCompletedEvent,
+ ToolUsageStartedEvent,
+ ToolUsageFinishedEvent,
+ )

  crewai_installed = True
- except:
+ except ImportError as e:
+ if get_settings().DEEPEVAL_VERBOSE_MODE:
+ if isinstance(e, ModuleNotFoundError):
+ logger.warning(
+ "Optional crewai dependency not installed: %s",
+ e.name,
+ stacklevel=2,
+ )
+ else:
+ logger.warning(
+ "Optional crewai import failed: %s",
+ e,
+ stacklevel=2,
+ )
+
  crewai_installed = False

+ IS_WRAPPED_ALL = False
+

  def is_crewai_installed():
  if not crewai_installed:
@@ -33,81 +53,114 @@ def is_crewai_installed():
  )


- from deepeval.test_case.llm_test_case import LLMTestCase
- from deepeval.tracing.tracing import (
- observe,
- current_span_context,
- trace_manager,
- current_trace_context,
- )
-
-
  class CrewAIEventsListener(BaseEventListener):
  def __init__(self):
  is_crewai_installed()
  super().__init__()
+ self.span_observers: dict[str, Observer] = {}

- def setup_listeners(self, crewai_event_bus):
+ @staticmethod
+ def get_tool_execution_id(source, event) -> str:
+ source_id = id(source)
+ task_id = getattr(event, "task_id", "unknown")
+ agent_id = getattr(event, "agent_id", "unknown")
+ tool_name = getattr(event, "tool_name", "unknown")
+ execution_id = f"tool_{source_id}_{task_id}_{agent_id}_{tool_name}"

- @crewai_event_bus.on(AgentExecutionCompletedEvent)
- def on_agent_execution_completed(
- source, event: AgentExecutionCompletedEvent
- ):
+ return execution_id
+
+ def setup_listeners(self, crewai_event_bus):
+ @crewai_event_bus.on(CrewKickoffStartedEvent)
+ def on_crew_started(source, event: CrewKickoffStartedEvent):
+ # Assuming that this event is called in the crew.kickoff method
  current_span = current_span_context.get()

- if isinstance(current_span, AgentSpan):
- if isinstance(source, Agent):
- current_span.name = source.role
- current_span.available_tools = [
- tool.name for tool in source.tools
- ]
+ # set the input
+ if current_span:
+ current_span.input = event.inputs
+
+ # set trace input
+ current_trace = current_trace_context.get()
+ if current_trace:
+ current_trace.input = event.inputs
+
+ @crewai_event_bus.on(CrewKickoffCompletedEvent)
+ def on_crew_completed(source, event: CrewKickoffCompletedEvent):
+ # Assuming that this event is called in the crew.kickoff method
+ current_span = current_span_context.get()

+ # set the output
  if current_span:
- # set llm test case
- input = None
- actual_output = None
- expected_output = None
-
- if isinstance(event.task, Task):
- input = event.task.prompt()
- actual_output = event.output
- expected_output = event.task.expected_output
-
- current_span.input = input
- current_span.output = actual_output
- current_span.expected_output = expected_output
-
- # set metrics
- if isinstance(source, PatchedAgent):
- current_span.metrics = agent_registry.get_metrics(source)
- current_span.metric_collection = (
- agent_registry.get_metric_collection(source)
- )
-
- # set offline evals
- if current_span.metric_collection:
- trace_manager.integration_traces_to_evaluate.append(
- current_trace_context.get()
- )
+ current_span.output = str(event.output)

- @crewai_event_bus.on(ToolUsageFinishedEvent)
- def on_tool_usage_finished(source, event: ToolUsageFinishedEvent):
+ # set trace output
+ current_trace = current_trace_context.get()
+ if current_trace:
+ current_trace.output = str(event.output)
+
+ @crewai_event_bus.on(LLMCallStartedEvent)
+ def on_llm_started(source, event: LLMCallStartedEvent):
+ # Assuming that this event is called in the llm.call method
  current_span = current_span_context.get()
- current_span.input = event.tool_args
- current_span.output = event.output
- current_span.name = event.tool_name
+
+ # set the input
+ if current_span:
+ current_span.input = event.messages
+
+ # set the model
+ if isinstance(current_span, LlmSpan):
+ current_span.model = event.model

  @crewai_event_bus.on(LLMCallCompletedEvent)
- def on_llm_call_finished(source, event: LLMCallCompletedEvent):
+ def on_llm_completed(source, event: LLMCallCompletedEvent):
+ # Assuming that this event is called in the llm.call method
  current_span = current_span_context.get()

- if isinstance(current_span, LlmSpan):
- if isinstance(source, LLM):
- current_span.model = source.model
-
- current_span.input = event.messages
+ # set the output
+ if current_span:
  current_span.output = event.response

+ @crewai_event_bus.on(AgentExecutionStartedEvent)
+ def on_agent_started(source, event: AgentExecutionStartedEvent):
+ # Assuming that this event is called in the agent.execute_task method
+ current_span = current_span_context.get()
+
+ # set the input
+ if current_span:
+ current_span.input = event.task_prompt
+
+ @crewai_event_bus.on(AgentExecutionCompletedEvent)
+ def on_agent_completed(source, event: AgentExecutionCompletedEvent):
+ # Assuming that this event is called in the agent.execute_task method
+ current_span = current_span_context.get()
+
+ # set the output
+ if current_span:
+ current_span.output = event.output
+
+ @crewai_event_bus.on(ToolUsageStartedEvent)
+ def on_tool_started(source, event: ToolUsageStartedEvent):
+ observer = Observer(
+ span_type="tool",
+ func_name=event.tool_name,
+ function_kwargs=event.tool_args,
+ )
+ self.span_observers[self.get_tool_execution_id(source, event)] = (
+ observer
+ )
+ observer.__enter__()
+
+ @crewai_event_bus.on(ToolUsageFinishedEvent)
+ def on_tool_completed(source, event: ToolUsageFinishedEvent):
+ observer = self.span_observers.pop(
+ self.get_tool_execution_id(source, event)
+ )
+ if observer:
+ current_span = current_span_context.get()
+ if current_span:
+ current_span.output = event.output
+ observer.__exit__(None, None, None)
+

  def instrument_crewai(api_key: Optional[str] = None):
  is_crewai_installed()
@@ -115,10 +168,29 @@ def instrument_crewai(api_key: Optional[str] = None):
  if api_key:
  deepeval.login(api_key)

- Crew.kickoff = observe(Crew.kickoff)
- LLM.call = observe(LLM.call, type="llm", model="")
- Agent.execute_task = observe(Agent.execute_task, type="agent")
- CrewAgentExecutor.invoke = observe(CrewAgentExecutor.invoke)
- ToolUsage.use = observe(ToolUsage.use, type="tool")
- patch_build_context_for_task()
+ wrap_all()
+
  CrewAIEventsListener()
+
+
+ def wrap_all():
+ global IS_WRAPPED_ALL
+
+ if not IS_WRAPPED_ALL:
+ from deepeval.integrations.crewai.wrapper import (
+ wrap_crew_kickoff,
+ wrap_crew_kickoff_for_each,
+ wrap_crew_kickoff_async,
+ wrap_crew_kickoff_for_each_async,
+ wrap_llm_call,
+ wrap_agent_execute_task,
+ )
+
+ wrap_crew_kickoff()
+ wrap_crew_kickoff_for_each()
+ wrap_crew_kickoff_async()
+ wrap_crew_kickoff_for_each_async()
+ wrap_llm_call()
+ wrap_agent_execute_task()
+
+ IS_WRAPPED_ALL = True
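With the method wrapping guarded by IS_WRAPPED_ALL, calling the integration entry point more than once should not stack Observer spans. A minimal sketch of wiring the integration up from user code, assuming a standard crewai Crew/Agent/Task setup (the agent, task, and goal strings below are illustrative):

    from crewai import Agent, Crew, Task
    from deepeval.integrations.crewai import instrument_crewai

    instrument_crewai()  # wraps Crew.kickoff, LLM.call, Agent.execute_task, etc.

    researcher = Agent(role="Researcher", goal="Summarize a topic", backstory="...")
    task = Task(
        description="Summarize the deepeval 3.6.5 changes",
        expected_output="A short summary",
        agent=researcher,
    )
    crew = Crew(agents=[researcher], tasks=[task])

    crew.kickoff()  # traced as crew/agent/llm spans via the wrappers plus tool spans via events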
@@ -0,0 +1,87 @@
+ from crewai.llm import LLM
+ from crewai.crew import Crew
+ from crewai.agent import Agent
+ from functools import wraps
+ from deepeval.tracing.tracing import Observer
+
+
+ def wrap_crew_kickoff():
+ original_kickoff = Crew.kickoff
+
+ @wraps(original_kickoff)
+ def wrapper(self, *args, **kwargs):
+ with Observer(span_type="crew", func_name="kickoff"):
+ result = original_kickoff(self, *args, **kwargs)
+
+ return result
+
+ Crew.kickoff = wrapper
+
+
+ def wrap_crew_kickoff_for_each():
+ original_kickoff_for_each = Crew.kickoff_for_each
+
+ @wraps(original_kickoff_for_each)
+ def wrapper(self, *args, **kwargs):
+ with Observer(span_type="crew", func_name="kickoff_for_each"):
+ result = original_kickoff_for_each(self, *args, **kwargs)
+
+ return result
+
+ Crew.kickoff_for_each = wrapper
+
+
+ def wrap_crew_kickoff_async():
+ original_kickoff_async = Crew.kickoff_async
+
+ @wraps(original_kickoff_async)
+ async def wrapper(self, *args, **kwargs):
+ with Observer(span_type="crew", func_name="kickoff_async"):
+ result = await original_kickoff_async(self, *args, **kwargs)
+
+ return result
+
+ Crew.kickoff_async = wrapper
+
+
+ def wrap_crew_kickoff_for_each_async():
+ original_kickoff_for_each_async = Crew.kickoff_for_each_async
+
+ @wraps(original_kickoff_for_each_async)
+ async def wrapper(self, *args, **kwargs):
+ with Observer(span_type="crew", func_name="kickoff_for_each_async"):
+ result = await original_kickoff_for_each_async(
+ self, *args, **kwargs
+ )
+
+ return result
+
+ Crew.kickoff_for_each_async = wrapper
+
+
+ def wrap_llm_call():
+ original_llm_call = LLM.call
+
+ @wraps(original_llm_call)
+ def wrapper(self, *args, **kwargs):
+ with Observer(
+ span_type="llm",
+ func_name="call",
+ observe_kwargs={"model": "temp_model"},
+ ):
+ result = original_llm_call(self, *args, **kwargs)
+ return result
+
+ LLM.call = wrapper
+
+
+ def wrap_agent_execute_task():
+ original_execute_task = Agent.execute_task
+
+ @wraps(original_execute_task)
+ def wrapper(self, *args, **kwargs):
+ with Observer(span_type="agent", func_name="execute_task"):
+ result = original_execute_task(self, *args, **kwargs)
+ return result
+
+ Agent.execute_task = wrapper