deepeval 3.6.6__py3-none-any.whl → 3.6.7__py3-none-any.whl

This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
Files changed (92)
  1. deepeval/_version.py +1 -1
  2. deepeval/benchmarks/equity_med_qa/equity_med_qa.py +1 -0
  3. deepeval/cli/main.py +42 -0
  4. deepeval/confident/api.py +1 -0
  5. deepeval/config/settings.py +22 -4
  6. deepeval/constants.py +8 -1
  7. deepeval/dataset/dataset.py +2 -11
  8. deepeval/dataset/utils.py +1 -1
  9. deepeval/evaluate/evaluate.py +5 -1
  10. deepeval/evaluate/execute.py +97 -42
  11. deepeval/evaluate/utils.py +20 -116
  12. deepeval/integrations/crewai/__init__.py +6 -1
  13. deepeval/integrations/crewai/handler.py +1 -1
  14. deepeval/integrations/crewai/subs.py +51 -0
  15. deepeval/integrations/crewai/wrapper.py +45 -5
  16. deepeval/metrics/answer_relevancy/answer_relevancy.py +12 -3
  17. deepeval/metrics/api.py +281 -0
  18. deepeval/metrics/argument_correctness/argument_correctness.py +12 -2
  19. deepeval/metrics/bias/bias.py +12 -3
  20. deepeval/metrics/contextual_precision/contextual_precision.py +12 -3
  21. deepeval/metrics/contextual_recall/contextual_recall.py +12 -3
  22. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +12 -1
  23. deepeval/metrics/conversation_completeness/conversation_completeness.py +12 -0
  24. deepeval/metrics/conversational_dag/conversational_dag.py +12 -0
  25. deepeval/metrics/conversational_dag/nodes.py +12 -4
  26. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +73 -59
  27. deepeval/metrics/dag/dag.py +12 -0
  28. deepeval/metrics/dag/nodes.py +12 -4
  29. deepeval/metrics/faithfulness/faithfulness.py +12 -1
  30. deepeval/metrics/g_eval/g_eval.py +11 -0
  31. deepeval/metrics/hallucination/hallucination.py +12 -1
  32. deepeval/metrics/indicator.py +8 -2
  33. deepeval/metrics/json_correctness/json_correctness.py +12 -1
  34. deepeval/metrics/knowledge_retention/knowledge_retention.py +12 -0
  35. deepeval/metrics/mcp/mcp_task_completion.py +13 -0
  36. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +13 -0
  37. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +12 -1
  38. deepeval/metrics/misuse/misuse.py +12 -1
  39. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +3 -0
  40. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +3 -0
  41. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +3 -0
  42. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +3 -0
  43. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +6 -1
  44. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +6 -1
  45. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +3 -0
  46. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +3 -0
  47. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +3 -0
  48. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +3 -0
  49. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +10 -5
  50. deepeval/metrics/non_advice/non_advice.py +12 -0
  51. deepeval/metrics/pii_leakage/pii_leakage.py +12 -1
  52. deepeval/metrics/prompt_alignment/prompt_alignment.py +12 -1
  53. deepeval/metrics/role_adherence/role_adherence.py +12 -0
  54. deepeval/metrics/role_violation/role_violation.py +12 -0
  55. deepeval/metrics/summarization/summarization.py +12 -1
  56. deepeval/metrics/task_completion/task_completion.py +3 -0
  57. deepeval/metrics/tool_correctness/tool_correctness.py +8 -0
  58. deepeval/metrics/toxicity/toxicity.py +12 -0
  59. deepeval/metrics/turn_relevancy/turn_relevancy.py +12 -0
  60. deepeval/models/llms/grok_model.py +1 -1
  61. deepeval/models/llms/openai_model.py +2 -0
  62. deepeval/openai/__init__.py +14 -32
  63. deepeval/openai/extractors.py +24 -34
  64. deepeval/openai/patch.py +256 -161
  65. deepeval/openai/types.py +20 -0
  66. deepeval/openai/utils.py +98 -56
  67. deepeval/prompt/__init__.py +19 -1
  68. deepeval/prompt/api.py +160 -0
  69. deepeval/prompt/prompt.py +244 -62
  70. deepeval/prompt/utils.py +144 -2
  71. deepeval/synthesizer/chunking/context_generator.py +209 -152
  72. deepeval/synthesizer/chunking/doc_chunker.py +46 -12
  73. deepeval/synthesizer/synthesizer.py +8 -5
  74. deepeval/test_case/api.py +131 -0
  75. deepeval/test_run/__init__.py +1 -0
  76. deepeval/test_run/hyperparameters.py +47 -8
  77. deepeval/test_run/test_run.py +104 -1
  78. deepeval/tracing/api.py +3 -1
  79. deepeval/tracing/message_types/__init__.py +10 -0
  80. deepeval/tracing/message_types/base.py +6 -0
  81. deepeval/tracing/message_types/messages.py +14 -0
  82. deepeval/tracing/message_types/tools.py +18 -0
  83. deepeval/tracing/otel/utils.py +1 -1
  84. deepeval/tracing/trace_context.py +73 -4
  85. deepeval/tracing/tracing.py +51 -3
  86. deepeval/tracing/types.py +16 -0
  87. deepeval/tracing/utils.py +8 -0
  88. {deepeval-3.6.6.dist-info → deepeval-3.6.7.dist-info}/METADATA +1 -1
  89. {deepeval-3.6.6.dist-info → deepeval-3.6.7.dist-info}/RECORD +92 -84
  90. {deepeval-3.6.6.dist-info → deepeval-3.6.7.dist-info}/LICENSE.md +0 -0
  91. {deepeval-3.6.6.dist-info → deepeval-3.6.7.dist-info}/WHEEL +0 -0
  92. {deepeval-3.6.6.dist-info → deepeval-3.6.7.dist-info}/entry_points.txt +0 -0
deepeval/_version.py CHANGED
@@ -1 +1 @@
- __version__: str = "3.6.6"
+ __version__: str = "3.6.7"
deepeval/benchmarks/equity_med_qa/equity_med_qa.py CHANGED
@@ -121,6 +121,7 @@ class EquityMedQA(DeepEvalBaseBenchmark):
  score = metric.measure(
  LLMTestCase(input=golden.input, actual_output=prediction),
  _show_indicator=False,
+ _log_metric_to_confident=False,
  )
  flipped_score = (
  1 - metric.score if metric.score in [0, 1] else metric.score
deepeval/cli/main.py CHANGED
@@ -328,6 +328,31 @@ def set_debug(
  "--trace-flush/--no-trace-flush",
  help="Enable / disable CONFIDENT_TRACE_FLUSH.",
  ),
+ trace_sample_rate: Optional[float] = typer.Option(
+ None,
+ "--trace-sample-rate",
+ help="Set CONFIDENT_TRACE_SAMPLE_RATE.",
+ ),
+ metric_logging_verbose: Optional[bool] = typer.Option(
+ None,
+ "--metric-logging-verbose/--no-metric-logging-verbose",
+ help="Enable / disable CONFIDENT_METRIC_LOGGING_VERBOSE.",
+ ),
+ metric_logging_flush: Optional[bool] = typer.Option(
+ None,
+ "--metric-logging-flush/--no-metric-logging-flush",
+ help="Enable / disable CONFIDENT_METRIC_LOGGING_FLUSH.",
+ ),
+ metric_logging_sample_rate: Optional[float] = typer.Option(
+ None,
+ "--metric-logging-sample-rate",
+ help="Set CONFIDENT_METRIC_LOGGING_SAMPLE_RATE.",
+ ),
+ metric_logging_enabled: Optional[bool] = typer.Option(
+ None,
+ "--metric-logging-enabled/--no-metric-logging-enabled",
+ help="Enable / disable CONFIDENT_METRIC_LOGGING_ENABLED.",
+ ),
  # Advanced / potentially surprising
  error_reporting: Optional[bool] = typer.Option(
  None,
@@ -387,6 +412,20 @@ def set_debug(
  settings.CONFIDENT_TRACE_ENVIRONMENT = trace_env
  if trace_flush is not None:
  settings.CONFIDENT_TRACE_FLUSH = trace_flush
+ if trace_sample_rate is not None:
+ settings.CONFIDENT_TRACE_SAMPLE_RATE = trace_sample_rate
+
+ # Confident metrics
+ if metric_logging_verbose is not None:
+ settings.CONFIDENT_METRIC_LOGGING_VERBOSE = metric_logging_verbose
+ if metric_logging_flush is not None:
+ settings.CONFIDENT_METRIC_LOGGING_FLUSH = metric_logging_flush
+ if metric_logging_sample_rate is not None:
+ settings.CONFIDENT_METRIC_LOGGING_SAMPLE_RATE = (
+ metric_logging_sample_rate
+ )
+ if metric_logging_enabled is not None:
+ settings.CONFIDENT_METRIC_LOGGING_ENABLED = metric_logging_enabled

  # Advanced
  if error_reporting is not None:
@@ -438,6 +477,8 @@ def unset_debug(
  settings.LOG_LEVEL = "info"
  settings.CONFIDENT_TRACE_ENVIRONMENT = "development"
  settings.CONFIDENT_TRACE_VERBOSE = True
+ settings.CONFIDENT_METRIC_LOGGING_VERBOSE = True
+ settings.CONFIDENT_METRIC_LOGGING_ENABLED = True

  # Clear optional toggles/overrides
  settings.DEEPEVAL_VERBOSE_MODE = None
@@ -449,6 +490,7 @@ def unset_debug(
  settings.GRPC_TRACE = None

  settings.CONFIDENT_TRACE_FLUSH = None
+ settings.CONFIDENT_METRIC_LOGGING_FLUSH = None

  settings.ERROR_REPORTING = None
  settings.IGNORE_DEEPEVAL_ERRORS = None
deepeval/confident/api.py CHANGED
@@ -87,6 +87,7 @@ class Endpoints(Enum):
  DATASET_ALIAS_QUEUE_ENDPOINT = "/v1/datasets/:alias/queue"

  TEST_RUN_ENDPOINT = "/v1/test-run"
+ METRIC_DATA_ENDPOINT = "/v1/metric-data"
  TRACES_ENDPOINT = "/v1/traces"
  ANNOTATIONS_ENDPOINT = "/v1/annotations"
  PROMPTS_VERSION_ID_ENDPOINT = "/v1/prompts/:alias/versions/:versionId"
deepeval/config/settings.py CHANGED
@@ -337,10 +337,17 @@ class Settings(BaseSettings):
  SKIP_DEEPEVAL_MISSING_PARAMS: Optional[bool] = None
  DEEPEVAL_VERBOSE_MODE: Optional[bool] = None
  ENABLE_DEEPEVAL_CACHE: Optional[bool] = None
+
  CONFIDENT_TRACE_FLUSH: Optional[bool] = None
  CONFIDENT_TRACE_ENVIRONMENT: Optional[str] = "development"
  CONFIDENT_TRACE_VERBOSE: Optional[bool] = True
- CONFIDENT_SAMPLE_RATE: Optional[float] = 1.0
+ CONFIDENT_TRACE_SAMPLE_RATE: Optional[float] = 1.0
+
+ CONFIDENT_METRIC_LOGGING_FLUSH: Optional[bool] = None
+ CONFIDENT_METRIC_LOGGING_VERBOSE: Optional[bool] = True
+ CONFIDENT_METRIC_LOGGING_SAMPLE_RATE: Optional[float] = 1.0
+ CONFIDENT_METRIC_LOGGING_ENABLED: Optional[bool] = True
+
  OTEL_EXPORTER_OTLP_ENDPOINT: Optional[AnyUrl] = None

  #
@@ -355,6 +362,12 @@ class Settings(BaseSettings):
  None # per-attempt timeout. Set 0/None to disable
  )

+ #
+ # Async Document Pipelines
+ #
+
+ DEEPEVAL_MAX_CONCURRENT_DOC_PROCESSING: conint(ge=1) = 2
+
  #
  # Async Task Configuration
  #
@@ -484,7 +497,8 @@ class Settings(BaseSettings):
  "OPENAI_COST_PER_INPUT_TOKEN",
  "OPENAI_COST_PER_OUTPUT_TOKEN",
  "TEMPERATURE",
- "CONFIDENT_SAMPLE_RATE",
+ "CONFIDENT_TRACE_SAMPLE_RATE",
+ "CONFIDENT_METRIC_LOGGING_SAMPLE_RATE",
  mode="before",
  )
  @classmethod
@@ -496,13 +510,17 @@ class Settings(BaseSettings):
  return None
  return float(v)

- @field_validator("CONFIDENT_SAMPLE_RATE")
+ @field_validator(
+ "CONFIDENT_TRACE_SAMPLE_RATE", "CONFIDENT_METRIC_LOGGING_SAMPLE_RATE"
+ )
  @classmethod
  def _validate_sample_rate(cls, v):
  if v is None:
  return None
  if not (0.0 <= float(v) <= 1.0):
- raise ValueError("CONFIDENT_SAMPLE_RATE must be between 0 and 1")
+ raise ValueError(
+ "CONFIDENT_TRACE_SAMPLE_RATE or CONFIDENT_METRIC_LOGGING_SAMPLE_RATE must be between 0 and 1"
+ )
  return float(v)

  @field_validator("DEEPEVAL_DEFAULT_SAVE", mode="before")
deepeval/constants.py CHANGED
@@ -9,9 +9,16 @@ LOGIN_PROMPT = "\n✨👀 Looking for a place for your LLM test data to live

  CONFIDENT_TRACE_VERBOSE = "CONFIDENT_TRACE_VERBOSE"
  CONFIDENT_TRACE_FLUSH = "CONFIDENT_TRACE_FLUSH"
- CONFIDENT_SAMPLE_RATE = "CONFIDENT_SAMPLE_RATE"
+ CONFIDENT_TRACE_SAMPLE_RATE = "CONFIDENT_TRACE_SAMPLE_RATE"
  CONFIDENT_TRACE_ENVIRONMENT = "CONFIDENT_TRACE_ENVIRONMENT"
  CONFIDENT_TRACING_ENABLED = "CONFIDENT_TRACING_ENABLED"
+
+ CONFIDENT_METRIC_LOGGING_VERBOSE = "CONFIDENT_METRIC_LOGGING_VERBOSE"
+ CONFIDENT_METRIC_LOGGING_FLUSH = "CONFIDENT_METRIC_LOGGING_FLUSH"
+ CONFIDENT_METRIC_LOGGING_SAMPLE_RATE = "CONFIDENT_METRIC_LOGGING_SAMPLE_RATE"
+ CONFIDENT_METRIC_LOGGING_ENABLED = "CONFIDENT_METRIC_LOGGING_ENABLED"
+
+
  CONFIDENT_OPEN_BROWSER = "CONFIDENT_OPEN_BROWSER"
  CONFIDENT_TEST_CASE_BATCH_SIZE = "CONFIDENT_TEST_CASE_BATCH_SIZE"
deepeval/dataset/dataset.py CHANGED
@@ -49,7 +49,7 @@ from deepeval.utils import (
  from deepeval.test_run import (
  global_test_run_manager,
  )
- from deepeval.openai.utils import openai_test_case_pairs
+
  from deepeval.tracing import trace_manager
  from deepeval.tracing.tracing import EVAL_DUMMY_SPAN_NAME

@@ -1248,16 +1248,7 @@ class EvaluationDataset:
  display_config.file_output_dir,
  )

- # update hyperparameters
- test_run = global_test_run_manager.get_test_run()
- if len(openai_test_case_pairs) > 0:
- raw_hyperparameters = openai_test_case_pairs[-1].hyperparameters
- test_run.hyperparameters = process_hyperparameters(
- raw_hyperparameters
- )
-
- # clean up
- openai_test_case_pairs.clear()
+ # save test run
  global_test_run_manager.save_test_run(TEMP_FILE_PATH)

  # sandwich end trace for OTEL
deepeval/dataset/utils.py CHANGED
@@ -120,7 +120,7 @@ def format_turns(turns: List[Turn]) -> str:
  }
  res.append(cur_turn)
  try:
- return json.dumps(res)
+ return json.dumps(res, ensure_ascii=False)
  except Exception as e:
  raise ValueError(f"Error serializing turns: {e}")
deepeval/evaluate/evaluate.py CHANGED
@@ -28,7 +28,10 @@ from deepeval.evaluate.utils import (
  from deepeval.dataset import Golden
  from deepeval.prompt import Prompt
  from deepeval.test_case.utils import check_valid_test_cases_type
- from deepeval.test_run.hyperparameters import process_hyperparameters
+ from deepeval.test_run.hyperparameters import (
+ process_hyperparameters,
+ process_prompts,
+ )
  from deepeval.test_run.test_run import TEMP_FILE_PATH
  from deepeval.utils import (
  get_or_create_event_loop,
@@ -267,6 +270,7 @@ def evaluate(

  test_run = global_test_run_manager.get_test_run()
  test_run.hyperparameters = process_hyperparameters(hyperparameters)
+ test_run.prompts = process_prompts(hyperparameters)
  global_test_run_manager.save_test_run(TEMP_FILE_PATH)
  res = global_test_run_manager.wrap_up_test_run(
  run_duration, display_table=False
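The hunk above routes hyperparameters through both process_hyperparameters() and the new process_prompts(), so Prompt objects passed as hyperparameters are also recorded on test_run.prompts. A hedged sketch of the calling side, assuming the public evaluate() accepts test_cases, metrics, and a hyperparameters dict; the metric choice, model name, and prompt alias are illustrative only:

# Hypothetical sketch (not from the diff): passing a Prompt via hyperparameters.
from deepeval import evaluate
from deepeval.prompt import Prompt
from deepeval.test_case import LLMTestCase
from deepeval.metrics import AnswerRelevancyMetric

system_prompt = Prompt(alias="example-system-prompt")  # illustrative alias

evaluate(
    test_cases=[
        LLMTestCase(input="What is deepeval?", actual_output="An LLM evaluation framework.")
    ],
    metrics=[AnswerRelevancyMetric()],
    # With this release, prompts passed here are also collected into test_run.prompts.
    hyperparameters={"model": "gpt-4o-mini", "system prompt": system_prompt},
)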
deepeval/evaluate/execute.py CHANGED
@@ -61,6 +61,7 @@ from deepeval.test_case import (
  ConversationalTestCase,
  MLLMTestCase,
  )
+ from deepeval.test_case.api import create_api_test_case
  from deepeval.test_run import (
  global_test_run_manager,
  LLMApiTestCase,
@@ -80,15 +81,18 @@ from deepeval.evaluate.utils import (
  create_api_trace,
  create_metric_data,
  create_test_result,
- create_api_test_case,
  count_metrics_in_trace,
  extract_trace_test_results,
  )
  from deepeval.utils import add_pbar, update_pbar, custom_console
- from deepeval.openai.utils import openai_test_case_pairs
  from deepeval.tracing.types import TestCaseMetricPair
  from deepeval.config.settings import get_settings
-
+ from deepeval.test_run import TEMP_FILE_PATH
+ from deepeval.confident.api import is_confident
+ from deepeval.test_run.hyperparameters import (
+ process_hyperparameters,
+ process_prompts,
+ )

  logger = logging.getLogger(__name__)

@@ -902,6 +906,7 @@ def execute_agentic_test_cases(
  trace_api.agent_spans.append(api_span)
  elif isinstance(span, LlmSpan):
  trace_api.llm_spans.append(api_span)
+ log_prompt(span, test_run_manager)
  elif isinstance(span, RetrieverSpan):
  trace_api.retriever_spans.append(api_span)
  elif isinstance(span, ToolSpan):
@@ -1284,6 +1289,7 @@ async def _a_execute_agentic_test_case(
  verbose_mode=verbose_mode,
  progress=progress,
  pbar_eval_id=pbar_eval_id,
+ test_run_manager=test_run_manager,
  _use_bar_indicator=_use_bar_indicator,
  )
  child_tasks = [dfs(child) for child in span.children]
@@ -1291,7 +1297,18 @@ async def _a_execute_agentic_test_case(
  await asyncio.gather(*child_tasks)

  test_start_time = time.perf_counter()
- await dfs(current_trace.root_spans[0])
+ if current_trace and current_trace.root_spans:
+ await dfs(current_trace.root_spans[0])
+ else:
+ if (
+ logger.isEnabledFor(logging.DEBUG)
+ and get_settings().DEEPEVAL_VERBOSE_MODE
+ ):
+ logger.debug(
+ "Skipping DFS: empty trace or no root spans (trace=%s)",
+ current_trace.uuid if current_trace else None,
+ )
+
  test_end_time = time.perf_counter()
  run_duration = test_end_time - test_start_time

@@ -1313,6 +1330,7 @@ async def _a_execute_span_test_case(
  verbose_mode: Optional[bool],
  progress: Optional[Progress],
  pbar_eval_id: Optional[int],
+ test_run_manager: Optional[TestRunManager],
  _use_bar_indicator: bool,
  ):
  api_span: BaseApiSpan = trace_manager._convert_span_to_api_span(span)
@@ -1320,6 +1338,7 @@ async def _a_execute_span_test_case(
  trace_api.agent_spans.append(api_span)
  elif isinstance(span, LlmSpan):
  trace_api.llm_spans.append(api_span)
+ log_prompt(span, test_run_manager)
  elif isinstance(span, RetrieverSpan):
  trace_api.retriever_spans.append(api_span)
  elif isinstance(span, ToolSpan):
@@ -1568,6 +1587,7 @@ def execute_agentic_test_cases_from_loop(
  trace_api.agent_spans.append(api_span)
  elif isinstance(span, LlmSpan):
  trace_api.llm_spans.append(api_span)
+ log_prompt(span, test_run_manager)
  elif isinstance(span, RetrieverSpan):
  trace_api.retriever_spans.append(api_span)
  elif isinstance(span, ToolSpan):
@@ -1748,6 +1768,7 @@ def execute_agentic_test_cases_from_loop(
  local_trace_manager.evaluating = False
  local_trace_manager.traces_to_evaluate_order.clear()
  local_trace_manager.traces_to_evaluate.clear()
+ local_trace_manager.trace_uuid_to_golden.clear()


  def a_execute_agentic_test_cases_from_loop(
@@ -1950,12 +1971,12 @@ def a_execute_agentic_test_cases_from_loop(
  return

  try:
+ current_tasks = set()
  # Find tasks that were created during this run but we didn’t track
  current_tasks = loop.run_until_complete(_snapshot_tasks())
  except RuntimeError:
  # this might happen if the loop is already closing
- # nothing we can do
- return
+ pass

  leftovers = [
  t
@@ -1965,9 +1986,6 @@ def a_execute_agentic_test_cases_from_loop(
  and not t.done()
  ]

- if not leftovers:
- return
-
  if get_settings().DEEPEVAL_DEBUG_ASYNC:
  logger.warning(
  "[deepeval] %d stray task(s) not tracked; cancelling...",
@@ -1978,20 +1996,21 @@ def a_execute_agentic_test_cases_from_loop(
  name = t.get_name()
  logger.warning(" - STRAY %s meta=%s", name, meta)

- for t in leftovers:
- t.cancel()
+ if leftovers:
+ for t in leftovers:
+ t.cancel()

- # Drain strays so they don’t leak into the next iteration
- try:
- loop.run_until_complete(
- asyncio.gather(*leftovers, return_exceptions=True)
- )
- except RuntimeError:
- # If the loop is closing here, just continue
- if get_settings().DEEPEVAL_DEBUG_ASYNC:
- logger.warning(
- "[deepeval] failed to drain stray tasks because loop is closing"
+ # Drain strays so they don’t leak into the next iteration
+ try:
+ loop.run_until_complete(
+ asyncio.gather(*leftovers, return_exceptions=True)
  )
+ except RuntimeError:
+ # If the loop is closing here, just continue
+ if get_settings().DEEPEVAL_DEBUG_ASYNC:
+ logger.warning(
+ "[deepeval] failed to drain stray tasks because loop is closing"
+ )

  # Evaluate traces
  if trace_manager.traces_to_evaluate:
@@ -2014,25 +2033,6 @@ def a_execute_agentic_test_cases_from_loop(
  pbar_id=pbar_id,
  )
  )
- elif openai_test_case_pairs:
- loop.run_until_complete(
- _evaluate_test_case_pairs(
- test_case_pairs=openai_test_case_pairs,
- test_run=test_run,
- test_run_manager=test_run_manager,
- test_results=test_results,
- ignore_errors=error_config.ignore_errors,
- skip_on_missing_params=error_config.skip_on_missing_params,
- show_indicator=display_config.show_indicator,
- verbose_mode=display_config.verbose_mode,
- throttle_value=async_config.throttle_value,
- max_concurrent=async_config.max_concurrent,
- _use_bar_indicator=_use_bar_indicator,
- _is_assert_test=_is_assert_test,
- progress=progress,
- pbar_id=pbar_id,
- )
- )
  elif trace_manager.integration_traces_to_evaluate:
  loop.run_until_complete(
  _a_evaluate_traces(
@@ -2106,6 +2106,7 @@ def a_execute_agentic_test_cases_from_loop(
  local_trace_manager.evaluating = False
  local_trace_manager.traces_to_evaluate_order.clear()
  local_trace_manager.traces_to_evaluate.clear()
+ local_trace_manager.trace_uuid_to_golden.clear()


  async def _a_evaluate_traces(
@@ -2132,8 +2133,26 @@ async def _a_evaluate_traces(
  return await func(*args, **kwargs)

  eval_tasks = []
- for count, trace in enumerate(traces_to_evaluate):
- golden = goldens[count]
+ # Here, we will work off a fixed-set copy to avoid surprises from potential
+ # mid-iteration mutation
+ traces_snapshot = list(traces_to_evaluate or [])
+
+ for count, trace in enumerate(traces_snapshot):
+ # Prefer the explicit mapping from trace -> golden captured at trace creation.
+ golden = trace_manager.trace_uuid_to_golden.get(trace.uuid)
+ if not golden:
+ # trace started during evaluation_loop but the CURRENT_GOLDEN was
+ # not set for some reason. We can’t map it to a golden, so the best
+ # we can do is skip evaluation for this trace.
+ if (
+ logger.isEnabledFor(logging.DEBUG)
+ and get_settings().DEEPEVAL_VERBOSE_MODE
+ ):
+ logger.debug(
+ "Skipping trace %s: no golden association found during evaluation_loop ",
+ trace.uuid,
+ )
+ continue
  with capture_evaluation_run("golden"):
  task = execute_evals_with_semaphore(
  func=_a_execute_agentic_test_case,
@@ -2225,6 +2244,7 @@ def _execute_metric(
  test_case,
  _show_indicator=show_metric_indicator,
  _in_component=in_component,
+ _log_metric_to_confident=False,
  )
  except MissingTestCaseParamsError as e:
  if error_config.skip_on_missing_params:
@@ -2259,3 +2279,38 @@ def _execute_metric(
  metric.success = False
  else:
  raise
+
+
+ def log_prompt(
+ llm_span: LlmSpan,
+ test_run_manager: TestRunManager,
+ ):
+ prompt = llm_span.prompt
+ if prompt is None:
+ return
+
+ span_hyperparameters = {}
+ prompt_version = prompt.version if is_confident() else None
+ key = f"{prompt.alias}_{prompt_version}"
+ span_hyperparameters[key] = prompt
+
+ test_run = test_run_manager.get_test_run()
+ if test_run.prompts is None:
+ test_run.prompts = []
+ if test_run.hyperparameters is None:
+ test_run.hyperparameters = {}
+
+ if key not in test_run.hyperparameters:
+ test_run.hyperparameters.update(
+ process_hyperparameters(span_hyperparameters, False)
+ )
+ existing_prompt_keys = {
+ f"{p.alias}_{p.version}" for p in test_run.prompts
+ }
+ new_prompts = process_prompts(span_hyperparameters)
+ for new_prompt in new_prompts:
+ new_prompt_key = f"{new_prompt.alias}_{new_prompt.version}"
+ if new_prompt_key not in existing_prompt_keys:
+ test_run.prompts.append(new_prompt)
+
+ global_test_run_manager.save_test_run(TEMP_FILE_PATH)
deepeval/evaluate/utils.py CHANGED
@@ -28,7 +28,6 @@ from deepeval.evaluate.types import TestResult
  from deepeval.tracing.api import TraceApi, BaseApiSpan, TraceSpanApiStatus
  from deepeval.tracing.tracing import BaseSpan, Trace
  from deepeval.tracing.types import TraceSpanStatus
- from deepeval.constants import PYTEST_RUN_TEST_NAME
  from deepeval.tracing.utils import (
  perf_counter_to_datetime,
  to_zod_compatible_iso,
@@ -133,121 +132,6 @@ def create_test_result(
  )


- def create_api_turn(turn: Turn, index: int) -> TurnApi:
- return TurnApi(
- role=turn.role,
- content=turn.content,
- user_id=turn.user_id,
- retrievalContext=turn.retrieval_context,
- toolsCalled=turn.tools_called,
- additionalMetadata=turn.additional_metadata,
- order=index,
- )
-
-
- def create_api_test_case(
- test_case: Union[LLMTestCase, ConversationalTestCase, MLLMTestCase],
- trace: Optional[TraceApi] = None,
- index: Optional[int] = None,
- ) -> Union[LLMApiTestCase, ConversationalApiTestCase]:
- if isinstance(test_case, ConversationalTestCase):
- order = (
- test_case._dataset_rank
- if test_case._dataset_rank is not None
- else index
- )
- if test_case.name:
- name = test_case.name
- else:
- name = os.getenv(
- PYTEST_RUN_TEST_NAME, f"conversational_test_case_{order}"
- )
-
- api_test_case = ConversationalApiTestCase(
- name=name,
- success=True,
- metricsData=[],
- runDuration=0,
- evaluationCost=None,
- order=order,
- scenario=test_case.scenario,
- expectedOutcome=test_case.expected_outcome,
- userDescription=test_case.user_description,
- context=test_case.context,
- tags=test_case.tags,
- comments=test_case.comments,
- additionalMetadata=test_case.additional_metadata,
- )
- api_test_case.turns = [
- create_api_turn(
- turn=turn,
- index=index,
- )
- for index, turn in enumerate(test_case.turns)
- ]
-
- return api_test_case
- else:
- order = (
- test_case._dataset_rank
- if test_case._dataset_rank is not None
- else index
- )
-
- success = True
- if test_case.name is not None:
- name = test_case.name
- else:
- name = os.getenv(PYTEST_RUN_TEST_NAME, f"test_case_{order}")
- metrics_data = []
-
- if isinstance(test_case, LLMTestCase):
- api_test_case = LLMApiTestCase(
- name=name,
- input=test_case.input,
- actualOutput=test_case.actual_output,
- expectedOutput=test_case.expected_output,
- context=test_case.context,
- retrievalContext=test_case.retrieval_context,
- toolsCalled=test_case.tools_called,
- expectedTools=test_case.expected_tools,
- tokenCost=test_case.token_cost,
- completionTime=test_case.completion_time,
- tags=test_case.tags,
- success=success,
- metricsData=metrics_data,
- runDuration=None,
- evaluationCost=None,
- order=order,
- additionalMetadata=test_case.additional_metadata,
- comments=test_case.comments,
- trace=trace,
- )
- elif isinstance(test_case, MLLMTestCase):
- api_test_case = LLMApiTestCase(
- name=name,
- input="",
- multimodalInput=test_case.input,
- multimodalActualOutput=test_case.actual_output,
- multimodalExpectedOutput=test_case.expected_output,
- multimodalRetrievalContext=test_case.retrieval_context,
- multimodalContext=test_case.context,
- toolsCalled=test_case.tools_called,
- expectedTools=test_case.expected_tools,
- tokenCost=test_case.token_cost,
- completionTime=test_case.completion_time,
- success=success,
- metricsData=metrics_data,
- runDuration=None,
- evaluationCost=None,
- order=order,
- additionalMetadata=test_case.additional_metadata,
- comments=test_case.comments,
- )
- # llm_test_case_lookup_map[instance_id] = api_test_case
- return api_test_case
-
-
  def create_api_trace(trace: Trace, golden: Golden) -> TraceApi:
  return TraceApi(
  uuid=trace.uuid,
@@ -309,6 +193,26 @@ def validate_assert_test_inputs(
  "Both 'test_case' and 'metrics' must be provided together."
  )

+ if test_case and metrics:
+ if isinstance(test_case, LLMTestCase) and not all(
+ isinstance(metric, BaseMetric) for metric in metrics
+ ):
+ raise ValueError(
+ "All 'metrics' for an 'LLMTestCase' must be instances of 'BaseMetric' only."
+ )
+ if isinstance(test_case, ConversationalTestCase) and not all(
+ isinstance(metric, BaseConversationalMetric) for metric in metrics
+ ):
+ raise ValueError(
+ "All 'metrics' for an 'ConversationalTestCase' must be instances of 'BaseConversationalMetric' only."
+ )
+ if isinstance(test_case, MLLMTestCase) and not all(
+ isinstance(metric, BaseMultimodalMetric) for metric in metrics
+ ):
+ raise ValueError(
+ "All 'metrics' for an 'MLLMTestCase' must be instances of 'BaseMultimodalMetric' only."
+ )
+
  if not ((golden and observed_callback) or (test_case and metrics)):
  raise ValueError(
  "You must provide either ('golden' + 'observed_callback') or ('test_case' + 'metrics')."
deepeval/integrations/crewai/__init__.py CHANGED
@@ -1,3 +1,8 @@
  from .handler import instrument_crewai
+ from .subs import (
+ DeepEvalCrew as Crew,
+ DeepEvalAgent as Agent,
+ DeepEvalLLM as LLM,
+ )

- __all__ = ["instrument_crewai"]
+ __all__ = ["instrument_crewai", "Crew", "Agent", "LLM"]
deepeval/integrations/crewai/handler.py CHANGED
@@ -13,7 +13,7 @@ logger = logging.getLogger(__name__)


  try:
- from crewai.utilities.events.base_event_listener import BaseEventListener
+ from crewai.events import BaseEventListener
  from crewai.events import (
  CrewKickoffStartedEvent,
  CrewKickoffCompletedEvent,