deepeval 3.6.5__py3-none-any.whl → 3.6.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/__init__.py +42 -10
- deepeval/_version.py +1 -1
- deepeval/benchmarks/equity_med_qa/equity_med_qa.py +1 -0
- deepeval/cli/main.py +42 -0
- deepeval/confident/api.py +1 -0
- deepeval/config/logging.py +33 -0
- deepeval/config/settings.py +176 -16
- deepeval/constants.py +8 -1
- deepeval/dataset/dataset.py +2 -11
- deepeval/dataset/utils.py +1 -1
- deepeval/evaluate/evaluate.py +5 -1
- deepeval/evaluate/execute.py +118 -60
- deepeval/evaluate/utils.py +20 -116
- deepeval/integrations/crewai/__init__.py +6 -1
- deepeval/integrations/crewai/handler.py +1 -1
- deepeval/integrations/crewai/subs.py +51 -0
- deepeval/integrations/crewai/wrapper.py +45 -5
- deepeval/metrics/answer_relevancy/answer_relevancy.py +12 -3
- deepeval/metrics/api.py +281 -0
- deepeval/metrics/argument_correctness/argument_correctness.py +12 -2
- deepeval/metrics/bias/bias.py +12 -3
- deepeval/metrics/contextual_precision/contextual_precision.py +12 -3
- deepeval/metrics/contextual_recall/contextual_recall.py +12 -3
- deepeval/metrics/contextual_relevancy/contextual_relevancy.py +12 -1
- deepeval/metrics/conversation_completeness/conversation_completeness.py +12 -0
- deepeval/metrics/conversational_dag/conversational_dag.py +12 -0
- deepeval/metrics/conversational_dag/nodes.py +12 -4
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +73 -59
- deepeval/metrics/dag/dag.py +12 -0
- deepeval/metrics/dag/nodes.py +12 -4
- deepeval/metrics/faithfulness/faithfulness.py +12 -1
- deepeval/metrics/g_eval/g_eval.py +37 -15
- deepeval/metrics/hallucination/hallucination.py +12 -1
- deepeval/metrics/indicator.py +8 -2
- deepeval/metrics/json_correctness/json_correctness.py +12 -1
- deepeval/metrics/knowledge_retention/knowledge_retention.py +12 -0
- deepeval/metrics/mcp/mcp_task_completion.py +13 -0
- deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +13 -0
- deepeval/metrics/mcp_use_metric/mcp_use_metric.py +12 -1
- deepeval/metrics/misuse/misuse.py +12 -1
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +3 -0
- deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +3 -0
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +3 -0
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +6 -1
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +6 -1
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +10 -5
- deepeval/metrics/non_advice/non_advice.py +12 -0
- deepeval/metrics/pii_leakage/pii_leakage.py +12 -1
- deepeval/metrics/prompt_alignment/prompt_alignment.py +53 -24
- deepeval/metrics/role_adherence/role_adherence.py +12 -0
- deepeval/metrics/role_violation/role_violation.py +12 -0
- deepeval/metrics/summarization/summarization.py +12 -1
- deepeval/metrics/task_completion/task_completion.py +3 -0
- deepeval/metrics/tool_correctness/tool_correctness.py +8 -0
- deepeval/metrics/toxicity/toxicity.py +12 -0
- deepeval/metrics/turn_relevancy/turn_relevancy.py +12 -0
- deepeval/models/llms/grok_model.py +1 -1
- deepeval/models/llms/openai_model.py +2 -0
- deepeval/models/retry_policy.py +202 -11
- deepeval/openai/__init__.py +14 -32
- deepeval/openai/extractors.py +24 -34
- deepeval/openai/patch.py +256 -161
- deepeval/openai/types.py +20 -0
- deepeval/openai/utils.py +98 -56
- deepeval/prompt/__init__.py +19 -1
- deepeval/prompt/api.py +160 -0
- deepeval/prompt/prompt.py +244 -62
- deepeval/prompt/utils.py +144 -2
- deepeval/synthesizer/chunking/context_generator.py +209 -152
- deepeval/synthesizer/chunking/doc_chunker.py +46 -12
- deepeval/synthesizer/synthesizer.py +8 -5
- deepeval/test_case/api.py +131 -0
- deepeval/test_run/__init__.py +1 -0
- deepeval/test_run/hyperparameters.py +47 -8
- deepeval/test_run/test_run.py +104 -1
- deepeval/tracing/api.py +3 -1
- deepeval/tracing/message_types/__init__.py +10 -0
- deepeval/tracing/message_types/base.py +6 -0
- deepeval/tracing/message_types/messages.py +14 -0
- deepeval/tracing/message_types/tools.py +18 -0
- deepeval/tracing/otel/exporter.py +0 -6
- deepeval/tracing/otel/utils.py +58 -8
- deepeval/tracing/trace_context.py +73 -4
- deepeval/tracing/trace_test_manager.py +19 -0
- deepeval/tracing/tracing.py +52 -4
- deepeval/tracing/types.py +16 -0
- deepeval/tracing/utils.py +8 -0
- {deepeval-3.6.5.dist-info → deepeval-3.6.7.dist-info}/METADATA +1 -1
- {deepeval-3.6.5.dist-info → deepeval-3.6.7.dist-info}/RECORD +97 -87
- {deepeval-3.6.5.dist-info → deepeval-3.6.7.dist-info}/LICENSE.md +0 -0
- {deepeval-3.6.5.dist-info → deepeval-3.6.7.dist-info}/WHEEL +0 -0
- {deepeval-3.6.5.dist-info → deepeval-3.6.7.dist-info}/entry_points.txt +0 -0
deepeval/evaluate/execute.py
CHANGED

@@ -61,6 +61,7 @@ from deepeval.test_case import (
     ConversationalTestCase,
     MLLMTestCase,
 )
+from deepeval.test_case.api import create_api_test_case
 from deepeval.test_run import (
     global_test_run_manager,
     LLMApiTestCase,
@@ -80,18 +81,20 @@ from deepeval.evaluate.utils import (
     create_api_trace,
     create_metric_data,
     create_test_result,
-    create_api_test_case,
     count_metrics_in_trace,
     extract_trace_test_results,
 )
 from deepeval.utils import add_pbar, update_pbar, custom_console
-from deepeval.openai.utils import openai_test_case_pairs
 from deepeval.tracing.types import TestCaseMetricPair
 from deepeval.config.settings import get_settings
-
+from deepeval.test_run import TEMP_FILE_PATH
+from deepeval.confident.api import is_confident
+from deepeval.test_run.hyperparameters import (
+    process_hyperparameters,
+    process_prompts,
+)

 logger = logging.getLogger(__name__)
-settings = get_settings()


 async def _snapshot_tasks():
@@ -100,6 +103,18 @@ async def _snapshot_tasks():
     return {t for t in asyncio.all_tasks() if t is not cur}


+def _per_task_timeout() -> float:
+    return get_settings().DEEPEVAL_PER_TASK_TIMEOUT_SECONDS
+
+
+def _gather_timeout() -> float:
+    s = get_settings()
+    return (
+        s.DEEPEVAL_PER_TASK_TIMEOUT_SECONDS
+        + s.DEEPEVAL_TASK_GATHER_BUFFER_SECONDS
+    )
+
+
 ###########################################
 ### E2E Evals #############################
 ###########################################
@@ -838,7 +853,7 @@ def execute_agentic_test_cases(
                 loop.run_until_complete(
                     asyncio.wait_for(
                         coro,
-                        timeout=
+                        timeout=_per_task_timeout(),
                     )
                 )
             else:
@@ -891,6 +906,7 @@ def execute_agentic_test_cases(
                 trace_api.agent_spans.append(api_span)
             elif isinstance(span, LlmSpan):
                 trace_api.llm_spans.append(api_span)
+                log_prompt(span, test_run_manager)
             elif isinstance(span, RetrieverSpan):
                 trace_api.retriever_spans.append(api_span)
             elif isinstance(span, ToolSpan):
@@ -1196,7 +1212,7 @@ async def _a_execute_agentic_test_case(
         if asyncio.iscoroutinefunction(observed_callback):
             await asyncio.wait_for(
                 observed_callback(golden.input),
-                timeout=
+                timeout=_per_task_timeout(),
             )
         else:
             observed_callback(golden.input)
@@ -1273,6 +1289,7 @@ async def _a_execute_agentic_test_case(
                 verbose_mode=verbose_mode,
                 progress=progress,
                 pbar_eval_id=pbar_eval_id,
+                test_run_manager=test_run_manager,
                 _use_bar_indicator=_use_bar_indicator,
             )
             child_tasks = [dfs(child) for child in span.children]
@@ -1280,7 +1297,18 @@ async def _a_execute_agentic_test_case(
             await asyncio.gather(*child_tasks)

     test_start_time = time.perf_counter()
-
+    if current_trace and current_trace.root_spans:
+        await dfs(current_trace.root_spans[0])
+    else:
+        if (
+            logger.isEnabledFor(logging.DEBUG)
+            and get_settings().DEEPEVAL_VERBOSE_MODE
+        ):
+            logger.debug(
+                "Skipping DFS: empty trace or no root spans (trace=%s)",
+                current_trace.uuid if current_trace else None,
+            )
+
     test_end_time = time.perf_counter()
     run_duration = test_end_time - test_start_time

@@ -1302,6 +1330,7 @@ async def _a_execute_span_test_case(
     verbose_mode: Optional[bool],
     progress: Optional[Progress],
     pbar_eval_id: Optional[int],
+    test_run_manager: Optional[TestRunManager],
     _use_bar_indicator: bool,
 ):
     api_span: BaseApiSpan = trace_manager._convert_span_to_api_span(span)
@@ -1309,6 +1338,7 @@ async def _a_execute_span_test_case(
         trace_api.agent_spans.append(api_span)
     elif isinstance(span, LlmSpan):
         trace_api.llm_spans.append(api_span)
+        log_prompt(span, test_run_manager)
     elif isinstance(span, RetrieverSpan):
         trace_api.retriever_spans.append(api_span)
     elif isinstance(span, ToolSpan):
@@ -1557,6 +1587,7 @@ def execute_agentic_test_cases_from_loop(
                 trace_api.agent_spans.append(api_span)
             elif isinstance(span, LlmSpan):
                 trace_api.llm_spans.append(api_span)
+                log_prompt(span, test_run_manager)
             elif isinstance(span, RetrieverSpan):
                 trace_api.retriever_spans.append(api_span)
             elif isinstance(span, ToolSpan):
@@ -1737,6 +1768,7 @@ def execute_agentic_test_cases_from_loop(
     local_trace_manager.evaluating = False
     local_trace_manager.traces_to_evaluate_order.clear()
     local_trace_manager.traces_to_evaluate.clear()
+    local_trace_manager.trace_uuid_to_golden.clear()


 def a_execute_agentic_test_cases_from_loop(
@@ -1753,11 +1785,6 @@ def a_execute_agentic_test_cases_from_loop(
     _is_assert_test: bool = False,
 ) -> Iterator[TestResult]:

-    GATHER_TIMEOUT_SECONDS = (
-        settings.DEEPEVAL_PER_TASK_TIMEOUT_SECONDS
-        + settings.DEEPEVAL_TASK_GATHER_BUFFER_SECONDS
-    )
-
     semaphore = asyncio.Semaphore(async_config.max_concurrent)
     original_create_task = asyncio.create_task

@@ -1772,7 +1799,7 @@
     async def execute_callback_with_semaphore(coroutine: Awaitable):
         async with semaphore:
             return await asyncio.wait_for(
-                coroutine, timeout=
+                coroutine, timeout=_per_task_timeout()
             )

     def evaluate_test_cases(
@@ -1814,7 +1841,7 @@ def a_execute_agentic_test_cases_from_loop(
         }

         def on_task_done(t: asyncio.Task):
-            if
+            if get_settings().DEEPEVAL_DEBUG_ASYNC:
                 # Using info level here to make it easy to spot these logs.
                 # We are gated by DEEPEVAL_DEBUG_ASYNC
                 meta = task_meta.get(t, {})
@@ -1888,7 +1915,7 @@ def a_execute_agentic_test_cases_from_loop(
             loop.run_until_complete(
                 asyncio.wait_for(
                     asyncio.gather(*created_tasks, return_exceptions=True),
-                    timeout=
+                    timeout=_gather_timeout(),
                 )
             )
         except asyncio.TimeoutError:
@@ -1903,16 +1930,13 @@
                 elapsed_time = time.perf_counter() - start_time

                # Determine if it was a per task or gather timeout based on task's elapsed time
-                if (
-                    elapsed_time
-                    >= settings.DEEPEVAL_PER_TASK_TIMEOUT_SECONDS
-                ):
+                if elapsed_time >= _per_task_timeout():
                     timeout_type = "per-task"
                 else:
                     timeout_type = "gather"

             logger.warning(
-                f"[deepeval] gather TIMEOUT after {
+                f"[deepeval] gather TIMEOUT after {_gather_timeout()}s; "
                 f"pending={len(pending)} tasks. Timeout type: {timeout_type}. "
                 f"To give tasks more time, consider increasing "
                 f"DEEPEVAL_PER_TASK_TIMEOUT_SECONDS for longer task completion time or "
@@ -1926,7 +1950,7 @@ def a_execute_agentic_test_cases_from_loop(
                     elapsed_time,
                     meta,
                 )
-                if loop.get_debug() and
+                if loop.get_debug() and get_settings().DEEPEVAL_DEBUG_ASYNC:
                     frames = t.get_stack(limit=6)
                     if frames:
                         logger.info(" stack:")
@@ -1947,12 +1971,12 @@ def a_execute_agentic_test_cases_from_loop(
             return

         try:
+            current_tasks = set()
             # Find tasks that were created during this run but we didn’t track
             current_tasks = loop.run_until_complete(_snapshot_tasks())
         except RuntimeError:
             # this might happen if the loop is already closing
-
-            return
+            pass

         leftovers = [
             t
@@ -1962,10 +1986,7 @@ def a_execute_agentic_test_cases_from_loop(
             and not t.done()
         ]

-        if
-            return
-
-        if settings.DEEPEVAL_DEBUG_ASYNC:
+        if get_settings().DEEPEVAL_DEBUG_ASYNC:
             logger.warning(
                 "[deepeval] %d stray task(s) not tracked; cancelling...",
                 len(leftovers),
@@ -1975,20 +1996,21 @@ def a_execute_agentic_test_cases_from_loop(
                 name = t.get_name()
                 logger.warning(" - STRAY %s meta=%s", name, meta)

-
-        t
+        if leftovers:
+            for t in leftovers:
+                t.cancel()

-
-
-
-
-            )
-        except RuntimeError:
-            # If the loop is closing here, just continue
-            if settings.DEEPEVAL_DEBUG_ASYNC:
-                logger.warning(
-                    "[deepeval] failed to drain stray tasks because loop is closing"
+            # Drain strays so they don’t leak into the next iteration
+            try:
+                loop.run_until_complete(
+                    asyncio.gather(*leftovers, return_exceptions=True)
                 )
+            except RuntimeError:
+                # If the loop is closing here, just continue
+                if get_settings().DEEPEVAL_DEBUG_ASYNC:
+                    logger.warning(
+                        "[deepeval] failed to drain stray tasks because loop is closing"
+                    )

         # Evaluate traces
         if trace_manager.traces_to_evaluate:
@@ -2011,25 +2033,6 @@ def a_execute_agentic_test_cases_from_loop(
                     pbar_id=pbar_id,
                 )
             )
-        elif openai_test_case_pairs:
-            loop.run_until_complete(
-                _evaluate_test_case_pairs(
-                    test_case_pairs=openai_test_case_pairs,
-                    test_run=test_run,
-                    test_run_manager=test_run_manager,
-                    test_results=test_results,
-                    ignore_errors=error_config.ignore_errors,
-                    skip_on_missing_params=error_config.skip_on_missing_params,
-                    show_indicator=display_config.show_indicator,
-                    verbose_mode=display_config.verbose_mode,
-                    throttle_value=async_config.throttle_value,
-                    max_concurrent=async_config.max_concurrent,
-                    _use_bar_indicator=_use_bar_indicator,
-                    _is_assert_test=_is_assert_test,
-                    progress=progress,
-                    pbar_id=pbar_id,
-                )
-            )
         elif trace_manager.integration_traces_to_evaluate:
             loop.run_until_complete(
                 _a_evaluate_traces(
@@ -2103,6 +2106,7 @@ def a_execute_agentic_test_cases_from_loop(
     local_trace_manager.evaluating = False
     local_trace_manager.traces_to_evaluate_order.clear()
     local_trace_manager.traces_to_evaluate.clear()
+    local_trace_manager.trace_uuid_to_golden.clear()


 async def _a_evaluate_traces(
@@ -2129,8 +2133,26 @@ async def _a_evaluate_traces(
             return await func(*args, **kwargs)

     eval_tasks = []
-
-
+    # Here, we will work off a fixed-set copy to avoid surprises from potential
+    # mid-iteration mutation
+    traces_snapshot = list(traces_to_evaluate or [])
+
+    for count, trace in enumerate(traces_snapshot):
+        # Prefer the explicit mapping from trace -> golden captured at trace creation.
+        golden = trace_manager.trace_uuid_to_golden.get(trace.uuid)
+        if not golden:
+            # trace started during evaluation_loop but the CURRENT_GOLDEN was
+            # not set for some reason. We can’t map it to a golden, so the best
+            # we can do is skip evaluation for this trace.
+            if (
+                logger.isEnabledFor(logging.DEBUG)
+                and get_settings().DEEPEVAL_VERBOSE_MODE
+            ):
+                logger.debug(
+                    "Skipping trace %s: no golden association found during evaluation_loop ",
+                    trace.uuid,
+                )
+            continue
         with capture_evaluation_run("golden"):
             task = execute_evals_with_semaphore(
                 func=_a_execute_agentic_test_case,
@@ -2222,6 +2244,7 @@ def _execute_metric(
             test_case,
             _show_indicator=show_metric_indicator,
             _in_component=in_component,
+            _log_metric_to_confident=False,
         )
     except MissingTestCaseParamsError as e:
         if error_config.skip_on_missing_params:
@@ -2256,3 +2279,38 @@ def _execute_metric(
             metric.success = False
         else:
             raise
+
+
+def log_prompt(
+    llm_span: LlmSpan,
+    test_run_manager: TestRunManager,
+):
+    prompt = llm_span.prompt
+    if prompt is None:
+        return
+
+    span_hyperparameters = {}
+    prompt_version = prompt.version if is_confident() else None
+    key = f"{prompt.alias}_{prompt_version}"
+    span_hyperparameters[key] = prompt
+
+    test_run = test_run_manager.get_test_run()
+    if test_run.prompts is None:
+        test_run.prompts = []
+    if test_run.hyperparameters is None:
+        test_run.hyperparameters = {}
+
+    if key not in test_run.hyperparameters:
+        test_run.hyperparameters.update(
+            process_hyperparameters(span_hyperparameters, False)
+        )
+    existing_prompt_keys = {
+        f"{p.alias}_{p.version}" for p in test_run.prompts
+    }
+    new_prompts = process_prompts(span_hyperparameters)
+    for new_prompt in new_prompts:
+        new_prompt_key = f"{new_prompt.alias}_{new_prompt.version}"
+        if new_prompt_key not in existing_prompt_keys:
+            test_run.prompts.append(new_prompt)
+
+    global_test_run_manager.save_test_run(TEMP_FILE_PATH)
deepeval/evaluate/utils.py
CHANGED

@@ -28,7 +28,6 @@ from deepeval.evaluate.types import TestResult
 from deepeval.tracing.api import TraceApi, BaseApiSpan, TraceSpanApiStatus
 from deepeval.tracing.tracing import BaseSpan, Trace
 from deepeval.tracing.types import TraceSpanStatus
-from deepeval.constants import PYTEST_RUN_TEST_NAME
 from deepeval.tracing.utils import (
     perf_counter_to_datetime,
     to_zod_compatible_iso,
@@ -133,121 +132,6 @@ def create_test_result(
     )


-def create_api_turn(turn: Turn, index: int) -> TurnApi:
-    return TurnApi(
-        role=turn.role,
-        content=turn.content,
-        user_id=turn.user_id,
-        retrievalContext=turn.retrieval_context,
-        toolsCalled=turn.tools_called,
-        additionalMetadata=turn.additional_metadata,
-        order=index,
-    )
-
-
-def create_api_test_case(
-    test_case: Union[LLMTestCase, ConversationalTestCase, MLLMTestCase],
-    trace: Optional[TraceApi] = None,
-    index: Optional[int] = None,
-) -> Union[LLMApiTestCase, ConversationalApiTestCase]:
-    if isinstance(test_case, ConversationalTestCase):
-        order = (
-            test_case._dataset_rank
-            if test_case._dataset_rank is not None
-            else index
-        )
-        if test_case.name:
-            name = test_case.name
-        else:
-            name = os.getenv(
-                PYTEST_RUN_TEST_NAME, f"conversational_test_case_{order}"
-            )
-
-        api_test_case = ConversationalApiTestCase(
-            name=name,
-            success=True,
-            metricsData=[],
-            runDuration=0,
-            evaluationCost=None,
-            order=order,
-            scenario=test_case.scenario,
-            expectedOutcome=test_case.expected_outcome,
-            userDescription=test_case.user_description,
-            context=test_case.context,
-            tags=test_case.tags,
-            comments=test_case.comments,
-            additionalMetadata=test_case.additional_metadata,
-        )
-        api_test_case.turns = [
-            create_api_turn(
-                turn=turn,
-                index=index,
-            )
-            for index, turn in enumerate(test_case.turns)
-        ]
-
-        return api_test_case
-    else:
-        order = (
-            test_case._dataset_rank
-            if test_case._dataset_rank is not None
-            else index
-        )
-
-        success = True
-        if test_case.name is not None:
-            name = test_case.name
-        else:
-            name = os.getenv(PYTEST_RUN_TEST_NAME, f"test_case_{order}")
-        metrics_data = []
-
-        if isinstance(test_case, LLMTestCase):
-            api_test_case = LLMApiTestCase(
-                name=name,
-                input=test_case.input,
-                actualOutput=test_case.actual_output,
-                expectedOutput=test_case.expected_output,
-                context=test_case.context,
-                retrievalContext=test_case.retrieval_context,
-                toolsCalled=test_case.tools_called,
-                expectedTools=test_case.expected_tools,
-                tokenCost=test_case.token_cost,
-                completionTime=test_case.completion_time,
-                tags=test_case.tags,
-                success=success,
-                metricsData=metrics_data,
-                runDuration=None,
-                evaluationCost=None,
-                order=order,
-                additionalMetadata=test_case.additional_metadata,
-                comments=test_case.comments,
-                trace=trace,
-            )
-        elif isinstance(test_case, MLLMTestCase):
-            api_test_case = LLMApiTestCase(
-                name=name,
-                input="",
-                multimodalInput=test_case.input,
-                multimodalActualOutput=test_case.actual_output,
-                multimodalExpectedOutput=test_case.expected_output,
-                multimodalRetrievalContext=test_case.retrieval_context,
-                multimodalContext=test_case.context,
-                toolsCalled=test_case.tools_called,
-                expectedTools=test_case.expected_tools,
-                tokenCost=test_case.token_cost,
-                completionTime=test_case.completion_time,
-                success=success,
-                metricsData=metrics_data,
-                runDuration=None,
-                evaluationCost=None,
-                order=order,
-                additionalMetadata=test_case.additional_metadata,
-                comments=test_case.comments,
-            )
-        # llm_test_case_lookup_map[instance_id] = api_test_case
-        return api_test_case
-
-
 def create_api_trace(trace: Trace, golden: Golden) -> TraceApi:
     return TraceApi(
         uuid=trace.uuid,
@@ -309,6 +193,26 @@ def validate_assert_test_inputs(
             "Both 'test_case' and 'metrics' must be provided together."
         )

+    if test_case and metrics:
+        if isinstance(test_case, LLMTestCase) and not all(
+            isinstance(metric, BaseMetric) for metric in metrics
+        ):
+            raise ValueError(
+                "All 'metrics' for an 'LLMTestCase' must be instances of 'BaseMetric' only."
+            )
+        if isinstance(test_case, ConversationalTestCase) and not all(
+            isinstance(metric, BaseConversationalMetric) for metric in metrics
+        ):
+            raise ValueError(
+                "All 'metrics' for an 'ConversationalTestCase' must be instances of 'BaseConversationalMetric' only."
+            )
+        if isinstance(test_case, MLLMTestCase) and not all(
+            isinstance(metric, BaseMultimodalMetric) for metric in metrics
+        ):
+            raise ValueError(
+                "All 'metrics' for an 'MLLMTestCase' must be instances of 'BaseMultimodalMetric' only."
+            )
+
     if not ((golden and observed_callback) or (test_case and metrics)):
         raise ValueError(
             "You must provide either ('golden' + 'observed_callback') or ('test_case' + 'metrics')."
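The new block in `validate_assert_test_inputs` enforces that the metric types match the test case type. An illustrative sketch, assuming `assert_test` routes its `test_case`/`metrics` arguments through this validator (the metric classes below are just examples of a single-turn versus conversational metric):

```python
# Hedged example: the pairing rule comes from the diff; whether assert_test calls
# validate_assert_test_inputs directly is an assumption based on the function name.
from deepeval import assert_test
from deepeval.test_case import LLMTestCase
from deepeval.metrics import AnswerRelevancyMetric, ConversationCompletenessMetric

test_case = LLMTestCase(
    input="What does DeepEval do?",
    actual_output="It evaluates LLM applications.",
)

# Passes the new type check: AnswerRelevancyMetric is a BaseMetric, which is
# what an LLMTestCase requires (the metric still runs afterwards, so a judge
# model must be configured).
assert_test(test_case=test_case, metrics=[AnswerRelevancyMetric()])

# Fails fast in 3.6.7 with:
# ValueError: All 'metrics' for an 'LLMTestCase' must be instances of 'BaseMetric' only.
assert_test(test_case=test_case, metrics=[ConversationCompletenessMetric()])
```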
deepeval/integrations/crewai/handler.py
CHANGED

@@ -13,7 +13,7 @@ logger = logging.getLogger(__name__)


 try:
-    from crewai.
+    from crewai.events import BaseEventListener
     from crewai.events import (
         CrewKickoffStartedEvent,
         CrewKickoffCompletedEvent,
deepeval/integrations/crewai/subs.py
ADDED

@@ -0,0 +1,51 @@
+from typing import List, Optional, Type, TypeVar
+from pydantic import PrivateAttr
+
+from deepeval.metrics.base_metric import BaseMetric
+
+try:
+    from crewai import Crew, Agent, LLM
+
+    is_crewai_installed = True
+except ImportError:
+    is_crewai_installed = False
+
+
+def is_crewai_installed():
+    if not is_crewai_installed:
+        raise ImportError(
+            "CrewAI is not installed. Please install it with `pip install crewai`."
+        )
+
+
+T = TypeVar("T")
+
+
+def create_deepeval_class(base_class: Type[T], class_name: str) -> Type[T]:
+    """Factory function to create DeepEval-enabled CrewAI classes"""
+
+    class DeepEvalClass(base_class):
+        _metric_collection: Optional[str] = PrivateAttr(default=None)
+        _metrics: Optional[List[BaseMetric]] = PrivateAttr(default=None)
+
+        def __init__(
+            self,
+            *args,
+            metrics: Optional[List[BaseMetric]] = None,
+            metric_collection: Optional[str] = None,
+            **kwargs
+        ):
+            is_crewai_installed()
+            super().__init__(*args, **kwargs)
+            self._metric_collection = metric_collection
+            self._metrics = metrics
+
+    DeepEvalClass.__name__ = class_name
+    DeepEvalClass.__qualname__ = class_name
+    return DeepEvalClass
+
+
+# Create the classes
+DeepEvalCrew = create_deepeval_class(Crew, "DeepEvalCrew")
+DeepEvalAgent = create_deepeval_class(Agent, "DeepEvalAgent")
+DeepEvalLLM = create_deepeval_class(LLM, "DeepEvalLLM")