deepeval 3.6.5__py3-none-any.whl → 3.6.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (97)
  1. deepeval/__init__.py +42 -10
  2. deepeval/_version.py +1 -1
  3. deepeval/benchmarks/equity_med_qa/equity_med_qa.py +1 -0
  4. deepeval/cli/main.py +42 -0
  5. deepeval/confident/api.py +1 -0
  6. deepeval/config/logging.py +33 -0
  7. deepeval/config/settings.py +176 -16
  8. deepeval/constants.py +8 -1
  9. deepeval/dataset/dataset.py +2 -11
  10. deepeval/dataset/utils.py +1 -1
  11. deepeval/evaluate/evaluate.py +5 -1
  12. deepeval/evaluate/execute.py +118 -60
  13. deepeval/evaluate/utils.py +20 -116
  14. deepeval/integrations/crewai/__init__.py +6 -1
  15. deepeval/integrations/crewai/handler.py +1 -1
  16. deepeval/integrations/crewai/subs.py +51 -0
  17. deepeval/integrations/crewai/wrapper.py +45 -5
  18. deepeval/metrics/answer_relevancy/answer_relevancy.py +12 -3
  19. deepeval/metrics/api.py +281 -0
  20. deepeval/metrics/argument_correctness/argument_correctness.py +12 -2
  21. deepeval/metrics/bias/bias.py +12 -3
  22. deepeval/metrics/contextual_precision/contextual_precision.py +12 -3
  23. deepeval/metrics/contextual_recall/contextual_recall.py +12 -3
  24. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +12 -1
  25. deepeval/metrics/conversation_completeness/conversation_completeness.py +12 -0
  26. deepeval/metrics/conversational_dag/conversational_dag.py +12 -0
  27. deepeval/metrics/conversational_dag/nodes.py +12 -4
  28. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +73 -59
  29. deepeval/metrics/dag/dag.py +12 -0
  30. deepeval/metrics/dag/nodes.py +12 -4
  31. deepeval/metrics/faithfulness/faithfulness.py +12 -1
  32. deepeval/metrics/g_eval/g_eval.py +37 -15
  33. deepeval/metrics/hallucination/hallucination.py +12 -1
  34. deepeval/metrics/indicator.py +8 -2
  35. deepeval/metrics/json_correctness/json_correctness.py +12 -1
  36. deepeval/metrics/knowledge_retention/knowledge_retention.py +12 -0
  37. deepeval/metrics/mcp/mcp_task_completion.py +13 -0
  38. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +13 -0
  39. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +12 -1
  40. deepeval/metrics/misuse/misuse.py +12 -1
  41. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +3 -0
  42. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +3 -0
  43. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +3 -0
  44. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +3 -0
  45. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +6 -1
  46. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +6 -1
  47. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +3 -0
  48. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +3 -0
  49. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +3 -0
  50. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +3 -0
  51. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +10 -5
  52. deepeval/metrics/non_advice/non_advice.py +12 -0
  53. deepeval/metrics/pii_leakage/pii_leakage.py +12 -1
  54. deepeval/metrics/prompt_alignment/prompt_alignment.py +53 -24
  55. deepeval/metrics/role_adherence/role_adherence.py +12 -0
  56. deepeval/metrics/role_violation/role_violation.py +12 -0
  57. deepeval/metrics/summarization/summarization.py +12 -1
  58. deepeval/metrics/task_completion/task_completion.py +3 -0
  59. deepeval/metrics/tool_correctness/tool_correctness.py +8 -0
  60. deepeval/metrics/toxicity/toxicity.py +12 -0
  61. deepeval/metrics/turn_relevancy/turn_relevancy.py +12 -0
  62. deepeval/models/llms/grok_model.py +1 -1
  63. deepeval/models/llms/openai_model.py +2 -0
  64. deepeval/models/retry_policy.py +202 -11
  65. deepeval/openai/__init__.py +14 -32
  66. deepeval/openai/extractors.py +24 -34
  67. deepeval/openai/patch.py +256 -161
  68. deepeval/openai/types.py +20 -0
  69. deepeval/openai/utils.py +98 -56
  70. deepeval/prompt/__init__.py +19 -1
  71. deepeval/prompt/api.py +160 -0
  72. deepeval/prompt/prompt.py +244 -62
  73. deepeval/prompt/utils.py +144 -2
  74. deepeval/synthesizer/chunking/context_generator.py +209 -152
  75. deepeval/synthesizer/chunking/doc_chunker.py +46 -12
  76. deepeval/synthesizer/synthesizer.py +8 -5
  77. deepeval/test_case/api.py +131 -0
  78. deepeval/test_run/__init__.py +1 -0
  79. deepeval/test_run/hyperparameters.py +47 -8
  80. deepeval/test_run/test_run.py +104 -1
  81. deepeval/tracing/api.py +3 -1
  82. deepeval/tracing/message_types/__init__.py +10 -0
  83. deepeval/tracing/message_types/base.py +6 -0
  84. deepeval/tracing/message_types/messages.py +14 -0
  85. deepeval/tracing/message_types/tools.py +18 -0
  86. deepeval/tracing/otel/exporter.py +0 -6
  87. deepeval/tracing/otel/utils.py +58 -8
  88. deepeval/tracing/trace_context.py +73 -4
  89. deepeval/tracing/trace_test_manager.py +19 -0
  90. deepeval/tracing/tracing.py +52 -4
  91. deepeval/tracing/types.py +16 -0
  92. deepeval/tracing/utils.py +8 -0
  93. {deepeval-3.6.5.dist-info → deepeval-3.6.7.dist-info}/METADATA +1 -1
  94. {deepeval-3.6.5.dist-info → deepeval-3.6.7.dist-info}/RECORD +97 -87
  95. {deepeval-3.6.5.dist-info → deepeval-3.6.7.dist-info}/LICENSE.md +0 -0
  96. {deepeval-3.6.5.dist-info → deepeval-3.6.7.dist-info}/WHEEL +0 -0
  97. {deepeval-3.6.5.dist-info → deepeval-3.6.7.dist-info}/entry_points.txt +0 -0
deepeval/evaluate/execute.py

@@ -61,6 +61,7 @@ from deepeval.test_case import (
     ConversationalTestCase,
     MLLMTestCase,
 )
+from deepeval.test_case.api import create_api_test_case
 from deepeval.test_run import (
     global_test_run_manager,
     LLMApiTestCase,
@@ -80,18 +81,20 @@ from deepeval.evaluate.utils import (
     create_api_trace,
     create_metric_data,
     create_test_result,
-    create_api_test_case,
     count_metrics_in_trace,
     extract_trace_test_results,
 )
 from deepeval.utils import add_pbar, update_pbar, custom_console
-from deepeval.openai.utils import openai_test_case_pairs
 from deepeval.tracing.types import TestCaseMetricPair
 from deepeval.config.settings import get_settings
-
+from deepeval.test_run import TEMP_FILE_PATH
+from deepeval.confident.api import is_confident
+from deepeval.test_run.hyperparameters import (
+    process_hyperparameters,
+    process_prompts,
+)

 logger = logging.getLogger(__name__)
-settings = get_settings()


 async def _snapshot_tasks():
@@ -100,6 +103,18 @@ async def _snapshot_tasks():
     return {t for t in asyncio.all_tasks() if t is not cur}


+def _per_task_timeout() -> float:
+    return get_settings().DEEPEVAL_PER_TASK_TIMEOUT_SECONDS
+
+
+def _gather_timeout() -> float:
+    s = get_settings()
+    return (
+        s.DEEPEVAL_PER_TASK_TIMEOUT_SECONDS
+        + s.DEEPEVAL_TASK_GATHER_BUFFER_SECONDS
+    )
+
+
 ###########################################
 ### E2E Evals #############################
 ###########################################
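The two helpers above replace the module-level `settings = get_settings()` snapshot, so timeout values are re-read on every call instead of being frozen at import time. A minimal sketch of how these knobs might be tuned before a run, assuming the settings defined in deepeval/config/settings.py can be overridden via environment variables of the same name (that mechanism is not shown in this diff):

import os

# Assumed: env vars of the same name feed the settings object.
# The setting names themselves come from the hunks above.
os.environ["DEEPEVAL_PER_TASK_TIMEOUT_SECONDS"] = "600"   # allow each task up to 10 minutes
os.environ["DEEPEVAL_TASK_GATHER_BUFFER_SECONDS"] = "60"  # headroom added by _gather_timeout()
os.environ["DEEPEVAL_DEBUG_ASYNC"] = "1"                  # surface stray-task / timeout logs

from deepeval.config.settings import get_settings

print(get_settings().DEEPEVAL_PER_TASK_TIMEOUT_SECONDS)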
@@ -838,7 +853,7 @@ def execute_agentic_test_cases(
                 loop.run_until_complete(
                     asyncio.wait_for(
                         coro,
-                        timeout=settings.DEEPEVAL_PER_TASK_TIMEOUT_SECONDS,
+                        timeout=_per_task_timeout(),
                     )
                 )
             else:
@@ -891,6 +906,7 @@ def execute_agentic_test_cases(
                     trace_api.agent_spans.append(api_span)
                 elif isinstance(span, LlmSpan):
                     trace_api.llm_spans.append(api_span)
+                    log_prompt(span, test_run_manager)
                 elif isinstance(span, RetrieverSpan):
                     trace_api.retriever_spans.append(api_span)
                 elif isinstance(span, ToolSpan):
@@ -1196,7 +1212,7 @@ async def _a_execute_agentic_test_case(
             if asyncio.iscoroutinefunction(observed_callback):
                 await asyncio.wait_for(
                     observed_callback(golden.input),
-                    timeout=settings.DEEPEVAL_PER_TASK_TIMEOUT_SECONDS,
+                    timeout=_per_task_timeout(),
                 )
             else:
                 observed_callback(golden.input)
@@ -1273,6 +1289,7 @@ async def _a_execute_agentic_test_case(
             verbose_mode=verbose_mode,
             progress=progress,
             pbar_eval_id=pbar_eval_id,
+            test_run_manager=test_run_manager,
             _use_bar_indicator=_use_bar_indicator,
         )
         child_tasks = [dfs(child) for child in span.children]
@@ -1280,7 +1297,18 @@ async def _a_execute_agentic_test_case(
         await asyncio.gather(*child_tasks)

     test_start_time = time.perf_counter()
-    await dfs(current_trace.root_spans[0])
+    if current_trace and current_trace.root_spans:
+        await dfs(current_trace.root_spans[0])
+    else:
+        if (
+            logger.isEnabledFor(logging.DEBUG)
+            and get_settings().DEEPEVAL_VERBOSE_MODE
+        ):
+            logger.debug(
+                "Skipping DFS: empty trace or no root spans (trace=%s)",
+                current_trace.uuid if current_trace else None,
+            )
+
     test_end_time = time.perf_counter()
     run_duration = test_end_time - test_start_time

@@ -1302,6 +1330,7 @@ async def _a_execute_span_test_case(
     verbose_mode: Optional[bool],
     progress: Optional[Progress],
     pbar_eval_id: Optional[int],
+    test_run_manager: Optional[TestRunManager],
     _use_bar_indicator: bool,
 ):
     api_span: BaseApiSpan = trace_manager._convert_span_to_api_span(span)
@@ -1309,6 +1338,7 @@ async def _a_execute_span_test_case(
         trace_api.agent_spans.append(api_span)
     elif isinstance(span, LlmSpan):
         trace_api.llm_spans.append(api_span)
+        log_prompt(span, test_run_manager)
     elif isinstance(span, RetrieverSpan):
         trace_api.retriever_spans.append(api_span)
     elif isinstance(span, ToolSpan):
@@ -1557,6 +1587,7 @@ def execute_agentic_test_cases_from_loop(
                     trace_api.agent_spans.append(api_span)
                 elif isinstance(span, LlmSpan):
                     trace_api.llm_spans.append(api_span)
+                    log_prompt(span, test_run_manager)
                 elif isinstance(span, RetrieverSpan):
                     trace_api.retriever_spans.append(api_span)
                 elif isinstance(span, ToolSpan):
@@ -1737,6 +1768,7 @@ def execute_agentic_test_cases_from_loop(
         local_trace_manager.evaluating = False
         local_trace_manager.traces_to_evaluate_order.clear()
         local_trace_manager.traces_to_evaluate.clear()
+        local_trace_manager.trace_uuid_to_golden.clear()


 def a_execute_agentic_test_cases_from_loop(
@@ -1753,11 +1785,6 @@ def a_execute_agentic_test_cases_from_loop(
     _is_assert_test: bool = False,
 ) -> Iterator[TestResult]:

-    GATHER_TIMEOUT_SECONDS = (
-        settings.DEEPEVAL_PER_TASK_TIMEOUT_SECONDS
-        + settings.DEEPEVAL_TASK_GATHER_BUFFER_SECONDS
-    )
-
     semaphore = asyncio.Semaphore(async_config.max_concurrent)
     original_create_task = asyncio.create_task
@@ -1772,7 +1799,7 @@ def a_execute_agentic_test_cases_from_loop(
     async def execute_callback_with_semaphore(coroutine: Awaitable):
         async with semaphore:
             return await asyncio.wait_for(
-                coroutine, timeout=settings.DEEPEVAL_PER_TASK_TIMEOUT_SECONDS
+                coroutine, timeout=_per_task_timeout()
             )

     def evaluate_test_cases(
@@ -1814,7 +1841,7 @@ def a_execute_agentic_test_cases_from_loop(
             }

         def on_task_done(t: asyncio.Task):
-            if settings.DEEPEVAL_DEBUG_ASYNC:
+            if get_settings().DEEPEVAL_DEBUG_ASYNC:
                 # Using info level here to make it easy to spot these logs.
                 # We are gated by DEEPEVAL_DEBUG_ASYNC
                 meta = task_meta.get(t, {})
@@ -1888,7 +1915,7 @@ def a_execute_agentic_test_cases_from_loop(
             loop.run_until_complete(
                 asyncio.wait_for(
                     asyncio.gather(*created_tasks, return_exceptions=True),
-                    timeout=GATHER_TIMEOUT_SECONDS,
+                    timeout=_gather_timeout(),
                 )
             )
         except asyncio.TimeoutError:
@@ -1903,16 +1930,13 @@ def a_execute_agentic_test_cases_from_loop(
                     elapsed_time = time.perf_counter() - start_time

                     # Determine if it was a per task or gather timeout based on task's elapsed time
-                    if (
-                        elapsed_time
-                        >= settings.DEEPEVAL_PER_TASK_TIMEOUT_SECONDS
-                    ):
+                    if elapsed_time >= _per_task_timeout():
                         timeout_type = "per-task"
                     else:
                         timeout_type = "gather"

             logger.warning(
-                f"[deepeval] gather TIMEOUT after {GATHER_TIMEOUT_SECONDS}s; "
+                f"[deepeval] gather TIMEOUT after {_gather_timeout()}s; "
                 f"pending={len(pending)} tasks. Timeout type: {timeout_type}. "
                 f"To give tasks more time, consider increasing "
                 f"DEEPEVAL_PER_TASK_TIMEOUT_SECONDS for longer task completion time or "
@@ -1926,7 +1950,7 @@ def a_execute_agentic_test_cases_from_loop(
                         elapsed_time,
                         meta,
                     )
-                    if loop.get_debug() and settings.DEEPEVAL_DEBUG_ASYNC:
+                    if loop.get_debug() and get_settings().DEEPEVAL_DEBUG_ASYNC:
                         frames = t.get_stack(limit=6)
                         if frames:
                             logger.info(" stack:")
@@ -1947,12 +1971,12 @@ def a_execute_agentic_test_cases_from_loop(
             return

         try:
+            current_tasks = set()
             # Find tasks that were created during this run but we didn’t track
             current_tasks = loop.run_until_complete(_snapshot_tasks())
         except RuntimeError:
             # this might happen if the loop is already closing
-            # nothing we can do
-            return
+            pass

         leftovers = [
             t
@@ -1962,10 +1986,7 @@ def a_execute_agentic_test_cases_from_loop(
             and not t.done()
         ]

-        if not leftovers:
-            return
-
-        if settings.DEEPEVAL_DEBUG_ASYNC:
+        if get_settings().DEEPEVAL_DEBUG_ASYNC:
             logger.warning(
                 "[deepeval] %d stray task(s) not tracked; cancelling...",
                 len(leftovers),
@@ -1975,20 +1996,21 @@ def a_execute_agentic_test_cases_from_loop(
                 name = t.get_name()
                 logger.warning(" - STRAY %s meta=%s", name, meta)

-        for t in leftovers:
-            t.cancel()
+        if leftovers:
+            for t in leftovers:
+                t.cancel()

-        # Drain strays so they don’t leak into the next iteration
-        try:
-            loop.run_until_complete(
-                asyncio.gather(*leftovers, return_exceptions=True)
-            )
-        except RuntimeError:
-            # If the loop is closing here, just continue
-            if settings.DEEPEVAL_DEBUG_ASYNC:
-                logger.warning(
-                    "[deepeval] failed to drain stray tasks because loop is closing"
+            # Drain strays so they don’t leak into the next iteration
+            try:
+                loop.run_until_complete(
+                    asyncio.gather(*leftovers, return_exceptions=True)
                 )
+            except RuntimeError:
+                # If the loop is closing here, just continue
+                if get_settings().DEEPEVAL_DEBUG_ASYNC:
+                    logger.warning(
+                        "[deepeval] failed to drain stray tasks because loop is closing"
+                    )

     # Evaluate traces
     if trace_manager.traces_to_evaluate:
@@ -2011,25 +2033,6 @@ def a_execute_agentic_test_cases_from_loop(
                     pbar_id=pbar_id,
                 )
             )
-        elif openai_test_case_pairs:
-            loop.run_until_complete(
-                _evaluate_test_case_pairs(
-                    test_case_pairs=openai_test_case_pairs,
-                    test_run=test_run,
-                    test_run_manager=test_run_manager,
-                    test_results=test_results,
-                    ignore_errors=error_config.ignore_errors,
-                    skip_on_missing_params=error_config.skip_on_missing_params,
-                    show_indicator=display_config.show_indicator,
-                    verbose_mode=display_config.verbose_mode,
-                    throttle_value=async_config.throttle_value,
-                    max_concurrent=async_config.max_concurrent,
-                    _use_bar_indicator=_use_bar_indicator,
-                    _is_assert_test=_is_assert_test,
-                    progress=progress,
-                    pbar_id=pbar_id,
-                )
-            )
         elif trace_manager.integration_traces_to_evaluate:
             loop.run_until_complete(
                 _a_evaluate_traces(
@@ -2103,6 +2106,7 @@ def a_execute_agentic_test_cases_from_loop(
     local_trace_manager.evaluating = False
     local_trace_manager.traces_to_evaluate_order.clear()
     local_trace_manager.traces_to_evaluate.clear()
+    local_trace_manager.trace_uuid_to_golden.clear()


 async def _a_evaluate_traces(
@@ -2129,8 +2133,26 @@ async def _a_evaluate_traces(
             return await func(*args, **kwargs)

     eval_tasks = []
-    for count, trace in enumerate(traces_to_evaluate):
-        golden = goldens[count]
+    # Here, we will work off a fixed-set copy to avoid surprises from potential
+    # mid-iteration mutation
+    traces_snapshot = list(traces_to_evaluate or [])
+
+    for count, trace in enumerate(traces_snapshot):
+        # Prefer the explicit mapping from trace -> golden captured at trace creation.
+        golden = trace_manager.trace_uuid_to_golden.get(trace.uuid)
+        if not golden:
+            # trace started during evaluation_loop but the CURRENT_GOLDEN was
+            # not set for some reason. We can’t map it to a golden, so the best
+            # we can do is skip evaluation for this trace.
+            if (
+                logger.isEnabledFor(logging.DEBUG)
+                and get_settings().DEEPEVAL_VERBOSE_MODE
+            ):
+                logger.debug(
+                    "Skipping trace %s: no golden association found during evaluation_loop ",
+                    trace.uuid,
+                )
+            continue
         with capture_evaluation_run("golden"):
             task = execute_evals_with_semaphore(
                 func=_a_execute_agentic_test_case,
@@ -2222,6 +2244,7 @@ def _execute_metric(
                 test_case,
                 _show_indicator=show_metric_indicator,
                 _in_component=in_component,
+                _log_metric_to_confident=False,
             )
     except MissingTestCaseParamsError as e:
         if error_config.skip_on_missing_params:
@@ -2256,3 +2279,38 @@ def _execute_metric(
             metric.success = False
         else:
             raise
+
+
+def log_prompt(
+    llm_span: LlmSpan,
+    test_run_manager: TestRunManager,
+):
+    prompt = llm_span.prompt
+    if prompt is None:
+        return
+
+    span_hyperparameters = {}
+    prompt_version = prompt.version if is_confident() else None
+    key = f"{prompt.alias}_{prompt_version}"
+    span_hyperparameters[key] = prompt
+
+    test_run = test_run_manager.get_test_run()
+    if test_run.prompts is None:
+        test_run.prompts = []
+    if test_run.hyperparameters is None:
+        test_run.hyperparameters = {}
+
+    if key not in test_run.hyperparameters:
+        test_run.hyperparameters.update(
+            process_hyperparameters(span_hyperparameters, False)
+        )
+    existing_prompt_keys = {
+        f"{p.alias}_{p.version}" for p in test_run.prompts
+    }
+    new_prompts = process_prompts(span_hyperparameters)
+    for new_prompt in new_prompts:
+        new_prompt_key = f"{new_prompt.alias}_{new_prompt.version}"
+        if new_prompt_key not in existing_prompt_keys:
+            test_run.prompts.append(new_prompt)
+
+    global_test_run_manager.save_test_run(TEMP_FILE_PATH)
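The new `log_prompt` helper registers any prompt attached to an `LlmSpan` onto the current test run, keyed by `{alias}_{version}` so repeated spans reusing the same prompt are only recorded once. A small self-contained sketch of that dedup idea, using a hypothetical stand-in object rather than deepeval's real `Prompt`/`TestRun` types:

from dataclasses import dataclass
from typing import Dict, Optional

@dataclass
class FakePrompt:              # hypothetical stand-in, not deepeval.prompt.Prompt
    alias: str
    version: Optional[str]

recorded: Dict[str, FakePrompt] = {}

def record_prompt(prompt: FakePrompt) -> None:
    # Mirrors log_prompt's keying: one entry per (alias, version) pair.
    key = f"{prompt.alias}_{prompt.version}"
    recorded.setdefault(key, prompt)

for _ in range(3):             # three LLM spans that reuse the same prompt...
    record_prompt(FakePrompt(alias="rag-answer", version="00.00.01"))

print(len(recorded))           # ...are recorded once: prints 1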
deepeval/evaluate/utils.py

@@ -28,7 +28,6 @@ from deepeval.evaluate.types import TestResult
 from deepeval.tracing.api import TraceApi, BaseApiSpan, TraceSpanApiStatus
 from deepeval.tracing.tracing import BaseSpan, Trace
 from deepeval.tracing.types import TraceSpanStatus
-from deepeval.constants import PYTEST_RUN_TEST_NAME
 from deepeval.tracing.utils import (
     perf_counter_to_datetime,
     to_zod_compatible_iso,
@@ -133,121 +132,6 @@ def create_test_result(
     )


-def create_api_turn(turn: Turn, index: int) -> TurnApi:
-    return TurnApi(
-        role=turn.role,
-        content=turn.content,
-        user_id=turn.user_id,
-        retrievalContext=turn.retrieval_context,
-        toolsCalled=turn.tools_called,
-        additionalMetadata=turn.additional_metadata,
-        order=index,
-    )
-
-
-def create_api_test_case(
-    test_case: Union[LLMTestCase, ConversationalTestCase, MLLMTestCase],
-    trace: Optional[TraceApi] = None,
-    index: Optional[int] = None,
-) -> Union[LLMApiTestCase, ConversationalApiTestCase]:
-    if isinstance(test_case, ConversationalTestCase):
-        order = (
-            test_case._dataset_rank
-            if test_case._dataset_rank is not None
-            else index
-        )
-        if test_case.name:
-            name = test_case.name
-        else:
-            name = os.getenv(
-                PYTEST_RUN_TEST_NAME, f"conversational_test_case_{order}"
-            )
-
-        api_test_case = ConversationalApiTestCase(
-            name=name,
-            success=True,
-            metricsData=[],
-            runDuration=0,
-            evaluationCost=None,
-            order=order,
-            scenario=test_case.scenario,
-            expectedOutcome=test_case.expected_outcome,
-            userDescription=test_case.user_description,
-            context=test_case.context,
-            tags=test_case.tags,
-            comments=test_case.comments,
-            additionalMetadata=test_case.additional_metadata,
-        )
-        api_test_case.turns = [
-            create_api_turn(
-                turn=turn,
-                index=index,
-            )
-            for index, turn in enumerate(test_case.turns)
-        ]
-
-        return api_test_case
-    else:
-        order = (
-            test_case._dataset_rank
-            if test_case._dataset_rank is not None
-            else index
-        )
-
-        success = True
-        if test_case.name is not None:
-            name = test_case.name
-        else:
-            name = os.getenv(PYTEST_RUN_TEST_NAME, f"test_case_{order}")
-        metrics_data = []
-
-        if isinstance(test_case, LLMTestCase):
-            api_test_case = LLMApiTestCase(
-                name=name,
-                input=test_case.input,
-                actualOutput=test_case.actual_output,
-                expectedOutput=test_case.expected_output,
-                context=test_case.context,
-                retrievalContext=test_case.retrieval_context,
-                toolsCalled=test_case.tools_called,
-                expectedTools=test_case.expected_tools,
-                tokenCost=test_case.token_cost,
-                completionTime=test_case.completion_time,
-                tags=test_case.tags,
-                success=success,
-                metricsData=metrics_data,
-                runDuration=None,
-                evaluationCost=None,
-                order=order,
-                additionalMetadata=test_case.additional_metadata,
-                comments=test_case.comments,
-                trace=trace,
-            )
-        elif isinstance(test_case, MLLMTestCase):
-            api_test_case = LLMApiTestCase(
-                name=name,
-                input="",
-                multimodalInput=test_case.input,
-                multimodalActualOutput=test_case.actual_output,
-                multimodalExpectedOutput=test_case.expected_output,
-                multimodalRetrievalContext=test_case.retrieval_context,
-                multimodalContext=test_case.context,
-                toolsCalled=test_case.tools_called,
-                expectedTools=test_case.expected_tools,
-                tokenCost=test_case.token_cost,
-                completionTime=test_case.completion_time,
-                success=success,
-                metricsData=metrics_data,
-                runDuration=None,
-                evaluationCost=None,
-                order=order,
-                additionalMetadata=test_case.additional_metadata,
-                comments=test_case.comments,
-            )
-        # llm_test_case_lookup_map[instance_id] = api_test_case
-        return api_test_case
-
-
 def create_api_trace(trace: Trace, golden: Golden) -> TraceApi:
     return TraceApi(
         uuid=trace.uuid,
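`create_api_turn` and `create_api_test_case` are moved rather than deleted outright: the file list shows a new deepeval/test_case/api.py (+131), and the execute.py hunk above now imports the helper from there. Callers that imported it from the old module would update along these lines:

# Before (3.6.5):
# from deepeval.evaluate.utils import create_api_test_case

# After (3.6.7), matching the import change in deepeval/evaluate/execute.py:
from deepeval.test_case.api import create_api_test_case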
@@ -309,6 +193,26 @@ def validate_assert_test_inputs(
             "Both 'test_case' and 'metrics' must be provided together."
         )

+    if test_case and metrics:
+        if isinstance(test_case, LLMTestCase) and not all(
+            isinstance(metric, BaseMetric) for metric in metrics
+        ):
+            raise ValueError(
+                "All 'metrics' for an 'LLMTestCase' must be instances of 'BaseMetric' only."
+            )
+        if isinstance(test_case, ConversationalTestCase) and not all(
+            isinstance(metric, BaseConversationalMetric) for metric in metrics
+        ):
+            raise ValueError(
+                "All 'metrics' for an 'ConversationalTestCase' must be instances of 'BaseConversationalMetric' only."
+            )
+        if isinstance(test_case, MLLMTestCase) and not all(
+            isinstance(metric, BaseMultimodalMetric) for metric in metrics
+        ):
+            raise ValueError(
+                "All 'metrics' for an 'MLLMTestCase' must be instances of 'BaseMultimodalMetric' only."
+            )
+
     if not ((golden and observed_callback) or (test_case and metrics)):
         raise ValueError(
             "You must provide either ('golden' + 'observed_callback') or ('test_case' + 'metrics')."
deepeval/integrations/crewai/__init__.py

@@ -1,3 +1,8 @@
 from .handler import instrument_crewai
+from .subs import (
+    DeepEvalCrew as Crew,
+    DeepEvalAgent as Agent,
+    DeepEvalLLM as LLM,
+)

-__all__ = ["instrument_crewai"]
+__all__ = ["instrument_crewai", "Crew", "Agent", "LLM"]
deepeval/integrations/crewai/handler.py

@@ -13,7 +13,7 @@ logger = logging.getLogger(__name__)


 try:
-    from crewai.utilities.events.base_event_listener import BaseEventListener
+    from crewai.events import BaseEventListener
     from crewai.events import (
         CrewKickoffStartedEvent,
         CrewKickoffCompletedEvent,
deepeval/integrations/crewai/subs.py (new file)

@@ -0,0 +1,51 @@
+from typing import List, Optional, Type, TypeVar
+from pydantic import PrivateAttr
+
+from deepeval.metrics.base_metric import BaseMetric
+
+try:
+    from crewai import Crew, Agent, LLM
+
+    is_crewai_installed = True
+except ImportError:
+    is_crewai_installed = False
+
+
+def is_crewai_installed():
+    if not is_crewai_installed:
+        raise ImportError(
+            "CrewAI is not installed. Please install it with `pip install crewai`."
+        )
+
+
+T = TypeVar("T")
+
+
+def create_deepeval_class(base_class: Type[T], class_name: str) -> Type[T]:
+    """Factory function to create DeepEval-enabled CrewAI classes"""
+
+    class DeepEvalClass(base_class):
+        _metric_collection: Optional[str] = PrivateAttr(default=None)
+        _metrics: Optional[List[BaseMetric]] = PrivateAttr(default=None)
+
+        def __init__(
+            self,
+            *args,
+            metrics: Optional[List[BaseMetric]] = None,
+            metric_collection: Optional[str] = None,
+            **kwargs
+        ):
+            is_crewai_installed()
+            super().__init__(*args, **kwargs)
+            self._metric_collection = metric_collection
+            self._metrics = metrics
+
+    DeepEvalClass.__name__ = class_name
+    DeepEvalClass.__qualname__ = class_name
+    return DeepEvalClass
+
+
+# Create the classes
+DeepEvalCrew = create_deepeval_class(Crew, "DeepEvalCrew")
+DeepEvalAgent = create_deepeval_class(Agent, "DeepEvalAgent")
+DeepEvalLLM = create_deepeval_class(LLM, "DeepEvalLLM")
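Together with the `__init__.py` change above, the new subs.py lets CrewAI users import drop-in `Crew`, `Agent`, and `LLM` subclasses straight from the integration and pass the DeepEval-specific kwargs (`metrics`, `metric_collection`), which the factory stores as private attributes. A rough usage sketch, assuming `instrument_crewai()` keeps its existing no-argument call and that regular CrewAI constructor arguments pass through unchanged:

from deepeval.integrations.crewai import instrument_crewai, Agent, Crew, LLM

instrument_crewai()  # assumed: same call as in earlier 3.6.x releases

researcher = Agent(
    role="Researcher",
    goal="Answer questions about the DeepEval package",
    backstory="An agent whose LLM calls are traced and evaluated.",
    llm=LLM(model="gpt-4o-mini"),
    metric_collection="crewai-agent-metrics",  # hypothetical collection name
)

# Crew and LLM accept the same `metrics` / `metric_collection` kwargs, since all
# three classes are produced by the same create_deepeval_class factory.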