deepeval 3.6.5__py3-none-any.whl → 3.6.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/__init__.py +42 -10
- deepeval/_version.py +1 -1
- deepeval/benchmarks/equity_med_qa/equity_med_qa.py +1 -0
- deepeval/cli/main.py +42 -0
- deepeval/confident/api.py +1 -0
- deepeval/config/logging.py +33 -0
- deepeval/config/settings.py +176 -16
- deepeval/constants.py +8 -1
- deepeval/dataset/dataset.py +2 -11
- deepeval/dataset/utils.py +1 -1
- deepeval/evaluate/evaluate.py +5 -1
- deepeval/evaluate/execute.py +118 -60
- deepeval/evaluate/utils.py +20 -116
- deepeval/integrations/crewai/__init__.py +6 -1
- deepeval/integrations/crewai/handler.py +1 -1
- deepeval/integrations/crewai/subs.py +51 -0
- deepeval/integrations/crewai/wrapper.py +45 -5
- deepeval/metrics/answer_relevancy/answer_relevancy.py +12 -3
- deepeval/metrics/api.py +281 -0
- deepeval/metrics/argument_correctness/argument_correctness.py +12 -2
- deepeval/metrics/bias/bias.py +12 -3
- deepeval/metrics/contextual_precision/contextual_precision.py +12 -3
- deepeval/metrics/contextual_recall/contextual_recall.py +12 -3
- deepeval/metrics/contextual_relevancy/contextual_relevancy.py +12 -1
- deepeval/metrics/conversation_completeness/conversation_completeness.py +12 -0
- deepeval/metrics/conversational_dag/conversational_dag.py +12 -0
- deepeval/metrics/conversational_dag/nodes.py +12 -4
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +73 -59
- deepeval/metrics/dag/dag.py +12 -0
- deepeval/metrics/dag/nodes.py +12 -4
- deepeval/metrics/faithfulness/faithfulness.py +12 -1
- deepeval/metrics/g_eval/g_eval.py +37 -15
- deepeval/metrics/hallucination/hallucination.py +12 -1
- deepeval/metrics/indicator.py +8 -2
- deepeval/metrics/json_correctness/json_correctness.py +12 -1
- deepeval/metrics/knowledge_retention/knowledge_retention.py +12 -0
- deepeval/metrics/mcp/mcp_task_completion.py +13 -0
- deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +13 -0
- deepeval/metrics/mcp_use_metric/mcp_use_metric.py +12 -1
- deepeval/metrics/misuse/misuse.py +12 -1
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +3 -0
- deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +3 -0
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +3 -0
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +6 -1
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +6 -1
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +10 -5
- deepeval/metrics/non_advice/non_advice.py +12 -0
- deepeval/metrics/pii_leakage/pii_leakage.py +12 -1
- deepeval/metrics/prompt_alignment/prompt_alignment.py +53 -24
- deepeval/metrics/role_adherence/role_adherence.py +12 -0
- deepeval/metrics/role_violation/role_violation.py +12 -0
- deepeval/metrics/summarization/summarization.py +12 -1
- deepeval/metrics/task_completion/task_completion.py +3 -0
- deepeval/metrics/tool_correctness/tool_correctness.py +8 -0
- deepeval/metrics/toxicity/toxicity.py +12 -0
- deepeval/metrics/turn_relevancy/turn_relevancy.py +12 -0
- deepeval/models/llms/grok_model.py +1 -1
- deepeval/models/llms/openai_model.py +2 -0
- deepeval/models/retry_policy.py +202 -11
- deepeval/openai/__init__.py +14 -32
- deepeval/openai/extractors.py +24 -34
- deepeval/openai/patch.py +256 -161
- deepeval/openai/types.py +20 -0
- deepeval/openai/utils.py +98 -56
- deepeval/prompt/__init__.py +19 -1
- deepeval/prompt/api.py +160 -0
- deepeval/prompt/prompt.py +244 -62
- deepeval/prompt/utils.py +144 -2
- deepeval/synthesizer/chunking/context_generator.py +209 -152
- deepeval/synthesizer/chunking/doc_chunker.py +46 -12
- deepeval/synthesizer/synthesizer.py +8 -5
- deepeval/test_case/api.py +131 -0
- deepeval/test_run/__init__.py +1 -0
- deepeval/test_run/hyperparameters.py +47 -8
- deepeval/test_run/test_run.py +104 -1
- deepeval/tracing/api.py +3 -1
- deepeval/tracing/message_types/__init__.py +10 -0
- deepeval/tracing/message_types/base.py +6 -0
- deepeval/tracing/message_types/messages.py +14 -0
- deepeval/tracing/message_types/tools.py +18 -0
- deepeval/tracing/otel/exporter.py +0 -6
- deepeval/tracing/otel/utils.py +58 -8
- deepeval/tracing/trace_context.py +73 -4
- deepeval/tracing/trace_test_manager.py +19 -0
- deepeval/tracing/tracing.py +52 -4
- deepeval/tracing/types.py +16 -0
- deepeval/tracing/utils.py +8 -0
- {deepeval-3.6.5.dist-info → deepeval-3.6.7.dist-info}/METADATA +1 -1
- {deepeval-3.6.5.dist-info → deepeval-3.6.7.dist-info}/RECORD +97 -87
- {deepeval-3.6.5.dist-info → deepeval-3.6.7.dist-info}/LICENSE.md +0 -0
- {deepeval-3.6.5.dist-info → deepeval-3.6.7.dist-info}/WHEEL +0 -0
- {deepeval-3.6.5.dist-info → deepeval-3.6.7.dist-info}/entry_points.txt +0 -0
deepeval/evaluate/execute.py
CHANGED

@@ -61,6 +61,7 @@ from deepeval.test_case import (
     ConversationalTestCase,
     MLLMTestCase,
 )
+from deepeval.test_case.api import create_api_test_case
 from deepeval.test_run import (
     global_test_run_manager,
     LLMApiTestCase,
@@ -80,18 +81,20 @@ from deepeval.evaluate.utils import (
     create_api_trace,
     create_metric_data,
     create_test_result,
-    create_api_test_case,
     count_metrics_in_trace,
     extract_trace_test_results,
 )
 from deepeval.utils import add_pbar, update_pbar, custom_console
-from deepeval.openai.utils import openai_test_case_pairs
 from deepeval.tracing.types import TestCaseMetricPair
 from deepeval.config.settings import get_settings
-
+from deepeval.test_run import TEMP_FILE_PATH
+from deepeval.confident.api import is_confident
+from deepeval.test_run.hyperparameters import (
+    process_hyperparameters,
+    process_prompts,
+)

 logger = logging.getLogger(__name__)
-settings = get_settings()


 async def _snapshot_tasks():
@@ -100,6 +103,18 @@ async def _snapshot_tasks():
     return {t for t in asyncio.all_tasks() if t is not cur}


+def _per_task_timeout() -> float:
+    return get_settings().DEEPEVAL_PER_TASK_TIMEOUT_SECONDS
+
+
+def _gather_timeout() -> float:
+    s = get_settings()
+    return (
+        s.DEEPEVAL_PER_TASK_TIMEOUT_SECONDS
+        + s.DEEPEVAL_TASK_GATHER_BUFFER_SECONDS
+    )
+
+
 ###########################################
 ### E2E Evals #############################
 ###########################################
@@ -838,7 +853,7 @@ def execute_agentic_test_cases(
                 loop.run_until_complete(
                     asyncio.wait_for(
                         coro,
-                        timeout=
+                        timeout=_per_task_timeout(),
                     )
                 )
             else:
@@ -891,6 +906,7 @@ def execute_agentic_test_cases(
                 trace_api.agent_spans.append(api_span)
             elif isinstance(span, LlmSpan):
                 trace_api.llm_spans.append(api_span)
+                log_prompt(span, test_run_manager)
             elif isinstance(span, RetrieverSpan):
                 trace_api.retriever_spans.append(api_span)
             elif isinstance(span, ToolSpan):
@@ -1196,7 +1212,7 @@ async def _a_execute_agentic_test_case(
         if asyncio.iscoroutinefunction(observed_callback):
             await asyncio.wait_for(
                 observed_callback(golden.input),
-                timeout=
+                timeout=_per_task_timeout(),
             )
         else:
             observed_callback(golden.input)
@@ -1273,6 +1289,7 @@ async def _a_execute_agentic_test_case(
                 verbose_mode=verbose_mode,
                 progress=progress,
                 pbar_eval_id=pbar_eval_id,
+                test_run_manager=test_run_manager,
                 _use_bar_indicator=_use_bar_indicator,
             )
             child_tasks = [dfs(child) for child in span.children]
@@ -1280,7 +1297,18 @@ async def _a_execute_agentic_test_case(
             await asyncio.gather(*child_tasks)

     test_start_time = time.perf_counter()
-
+    if current_trace and current_trace.root_spans:
+        await dfs(current_trace.root_spans[0])
+    else:
+        if (
+            logger.isEnabledFor(logging.DEBUG)
+            and get_settings().DEEPEVAL_VERBOSE_MODE
+        ):
+            logger.debug(
+                "Skipping DFS: empty trace or no root spans (trace=%s)",
+                current_trace.uuid if current_trace else None,
+            )
+
     test_end_time = time.perf_counter()
     run_duration = test_end_time - test_start_time

@@ -1302,6 +1330,7 @@ async def _a_execute_span_test_case(
     verbose_mode: Optional[bool],
     progress: Optional[Progress],
     pbar_eval_id: Optional[int],
+    test_run_manager: Optional[TestRunManager],
     _use_bar_indicator: bool,
 ):
     api_span: BaseApiSpan = trace_manager._convert_span_to_api_span(span)
@@ -1309,6 +1338,7 @@ async def _a_execute_span_test_case(
         trace_api.agent_spans.append(api_span)
     elif isinstance(span, LlmSpan):
         trace_api.llm_spans.append(api_span)
+        log_prompt(span, test_run_manager)
     elif isinstance(span, RetrieverSpan):
         trace_api.retriever_spans.append(api_span)
     elif isinstance(span, ToolSpan):
@@ -1557,6 +1587,7 @@ def execute_agentic_test_cases_from_loop(
                 trace_api.agent_spans.append(api_span)
             elif isinstance(span, LlmSpan):
                 trace_api.llm_spans.append(api_span)
+                log_prompt(span, test_run_manager)
             elif isinstance(span, RetrieverSpan):
                 trace_api.retriever_spans.append(api_span)
             elif isinstance(span, ToolSpan):
@@ -1737,6 +1768,7 @@ def execute_agentic_test_cases_from_loop(
     local_trace_manager.evaluating = False
     local_trace_manager.traces_to_evaluate_order.clear()
     local_trace_manager.traces_to_evaluate.clear()
+    local_trace_manager.trace_uuid_to_golden.clear()


 def a_execute_agentic_test_cases_from_loop(
@@ -1753,11 +1785,6 @@ def a_execute_agentic_test_cases_from_loop(
     _is_assert_test: bool = False,
 ) -> Iterator[TestResult]:

-    GATHER_TIMEOUT_SECONDS = (
-        settings.DEEPEVAL_PER_TASK_TIMEOUT_SECONDS
-        + settings.DEEPEVAL_TASK_GATHER_BUFFER_SECONDS
-    )
-
     semaphore = asyncio.Semaphore(async_config.max_concurrent)
     original_create_task = asyncio.create_task

@@ -1772,7 +1799,7 @@
     async def execute_callback_with_semaphore(coroutine: Awaitable):
         async with semaphore:
             return await asyncio.wait_for(
-                coroutine, timeout=
+                coroutine, timeout=_per_task_timeout()
             )

     def evaluate_test_cases(
@@ -1814,7 +1841,7 @@ def a_execute_agentic_test_cases_from_loop(
         }

         def on_task_done(t: asyncio.Task):
-            if
+            if get_settings().DEEPEVAL_DEBUG_ASYNC:
                 # Using info level here to make it easy to spot these logs.
                 # We are gated by DEEPEVAL_DEBUG_ASYNC
                 meta = task_meta.get(t, {})
@@ -1888,7 +1915,7 @@ def a_execute_agentic_test_cases_from_loop(
             loop.run_until_complete(
                 asyncio.wait_for(
                     asyncio.gather(*created_tasks, return_exceptions=True),
-                    timeout=
+                    timeout=_gather_timeout(),
                 )
             )
         except asyncio.TimeoutError:
@@ -1903,16 +1930,13 @@
                 elapsed_time = time.perf_counter() - start_time

                # Determine if it was a per task or gather timeout based on task's elapsed time
-                if (
-                    elapsed_time
-                    >= settings.DEEPEVAL_PER_TASK_TIMEOUT_SECONDS
-                ):
+                if elapsed_time >= _per_task_timeout():
                     timeout_type = "per-task"
                 else:
                     timeout_type = "gather"

             logger.warning(
-                f"[deepeval] gather TIMEOUT after {
+                f"[deepeval] gather TIMEOUT after {_gather_timeout()}s; "
                 f"pending={len(pending)} tasks. Timeout type: {timeout_type}. "
                 f"To give tasks more time, consider increasing "
                 f"DEEPEVAL_PER_TASK_TIMEOUT_SECONDS for longer task completion time or "
@@ -1926,7 +1950,7 @@ def a_execute_agentic_test_cases_from_loop(
                     elapsed_time,
                     meta,
                 )
-                if loop.get_debug() and
+                if loop.get_debug() and get_settings().DEEPEVAL_DEBUG_ASYNC:
                     frames = t.get_stack(limit=6)
                     if frames:
                         logger.info(" stack:")
@@ -1947,12 +1971,12 @@ def a_execute_agentic_test_cases_from_loop(
             return

         try:
+            current_tasks = set()
             # Find tasks that were created during this run but we didn’t track
             current_tasks = loop.run_until_complete(_snapshot_tasks())
         except RuntimeError:
             # this might happen if the loop is already closing
-
-            return
+            pass

         leftovers = [
             t
@@ -1962,10 +1986,7 @@ def a_execute_agentic_test_cases_from_loop(
             and not t.done()
         ]

-        if
-            return
-
-        if settings.DEEPEVAL_DEBUG_ASYNC:
+        if get_settings().DEEPEVAL_DEBUG_ASYNC:
             logger.warning(
                 "[deepeval] %d stray task(s) not tracked; cancelling...",
                 len(leftovers),
@@ -1975,20 +1996,21 @@ def a_execute_agentic_test_cases_from_loop(
                 name = t.get_name()
                 logger.warning(" - STRAY %s meta=%s", name, meta)

-
-        t
+        if leftovers:
+            for t in leftovers:
+                t.cancel()

-
-
-
-
-            )
-        except RuntimeError:
-            # If the loop is closing here, just continue
-            if settings.DEEPEVAL_DEBUG_ASYNC:
-                logger.warning(
-                    "[deepeval] failed to drain stray tasks because loop is closing"
+            # Drain strays so they don’t leak into the next iteration
+            try:
+                loop.run_until_complete(
+                    asyncio.gather(*leftovers, return_exceptions=True)
                 )
+            except RuntimeError:
+                # If the loop is closing here, just continue
+                if get_settings().DEEPEVAL_DEBUG_ASYNC:
+                    logger.warning(
+                        "[deepeval] failed to drain stray tasks because loop is closing"
+                    )

         # Evaluate traces
         if trace_manager.traces_to_evaluate:
@@ -2011,25 +2033,6 @@ def a_execute_agentic_test_cases_from_loop(
                     pbar_id=pbar_id,
                 )
             )
-        elif openai_test_case_pairs:
-            loop.run_until_complete(
-                _evaluate_test_case_pairs(
-                    test_case_pairs=openai_test_case_pairs,
-                    test_run=test_run,
-                    test_run_manager=test_run_manager,
-                    test_results=test_results,
-                    ignore_errors=error_config.ignore_errors,
-                    skip_on_missing_params=error_config.skip_on_missing_params,
-                    show_indicator=display_config.show_indicator,
-                    verbose_mode=display_config.verbose_mode,
-                    throttle_value=async_config.throttle_value,
-                    max_concurrent=async_config.max_concurrent,
-                    _use_bar_indicator=_use_bar_indicator,
-                    _is_assert_test=_is_assert_test,
-                    progress=progress,
-                    pbar_id=pbar_id,
-                )
-            )
         elif trace_manager.integration_traces_to_evaluate:
             loop.run_until_complete(
                 _a_evaluate_traces(
@@ -2103,6 +2106,7 @@ def a_execute_agentic_test_cases_from_loop(
     local_trace_manager.evaluating = False
     local_trace_manager.traces_to_evaluate_order.clear()
     local_trace_manager.traces_to_evaluate.clear()
+    local_trace_manager.trace_uuid_to_golden.clear()


 async def _a_evaluate_traces(
@@ -2129,8 +2133,26 @@ async def _a_evaluate_traces(
             return await func(*args, **kwargs)

     eval_tasks = []
-
-
+    # Here, we will work off a fixed-set copy to avoid surprises from potential
+    # mid-iteration mutation
+    traces_snapshot = list(traces_to_evaluate or [])
+
+    for count, trace in enumerate(traces_snapshot):
+        # Prefer the explicit mapping from trace -> golden captured at trace creation.
+        golden = trace_manager.trace_uuid_to_golden.get(trace.uuid)
+        if not golden:
+            # trace started during evaluation_loop but the CURRENT_GOLDEN was
+            # not set for some reason. We can’t map it to a golden, so the best
+            # we can do is skip evaluation for this trace.
+            if (
+                logger.isEnabledFor(logging.DEBUG)
+                and get_settings().DEEPEVAL_VERBOSE_MODE
+            ):
+                logger.debug(
+                    "Skipping trace %s: no golden association found during evaluation_loop ",
+                    trace.uuid,
+                )
+            continue
         with capture_evaluation_run("golden"):
             task = execute_evals_with_semaphore(
                 func=_a_execute_agentic_test_case,
@@ -2222,6 +2244,7 @@ def _execute_metric(
             test_case,
             _show_indicator=show_metric_indicator,
             _in_component=in_component,
+            _log_metric_to_confident=False,
         )
     except MissingTestCaseParamsError as e:
         if error_config.skip_on_missing_params:
@@ -2256,3 +2279,38 @@ def _execute_metric(
             metric.success = False
         else:
             raise
+
+
+def log_prompt(
+    llm_span: LlmSpan,
+    test_run_manager: TestRunManager,
+):
+    prompt = llm_span.prompt
+    if prompt is None:
+        return
+
+    span_hyperparameters = {}
+    prompt_version = prompt.version if is_confident() else None
+    key = f"{prompt.alias}_{prompt_version}"
+    span_hyperparameters[key] = prompt
+
+    test_run = test_run_manager.get_test_run()
+    if test_run.prompts is None:
+        test_run.prompts = []
+    if test_run.hyperparameters is None:
+        test_run.hyperparameters = {}
+
+    if key not in test_run.hyperparameters:
+        test_run.hyperparameters.update(
+            process_hyperparameters(span_hyperparameters, False)
+        )
+    existing_prompt_keys = {
+        f"{p.alias}_{p.version}" for p in test_run.prompts
+    }
+    new_prompts = process_prompts(span_hyperparameters)
+    for new_prompt in new_prompts:
+        new_prompt_key = f"{new_prompt.alias}_{new_prompt.version}"
+        if new_prompt_key not in existing_prompt_keys:
+            test_run.prompts.append(new_prompt)
+
+    global_test_run_manager.save_test_run(TEMP_FILE_PATH)
deepeval/evaluate/utils.py
CHANGED

@@ -28,7 +28,6 @@ from deepeval.evaluate.types import TestResult
 from deepeval.tracing.api import TraceApi, BaseApiSpan, TraceSpanApiStatus
 from deepeval.tracing.tracing import BaseSpan, Trace
 from deepeval.tracing.types import TraceSpanStatus
-from deepeval.constants import PYTEST_RUN_TEST_NAME
 from deepeval.tracing.utils import (
     perf_counter_to_datetime,
     to_zod_compatible_iso,
@@ -133,121 +132,6 @@ def create_test_result(
     )


-def create_api_turn(turn: Turn, index: int) -> TurnApi:
-    return TurnApi(
-        role=turn.role,
-        content=turn.content,
-        user_id=turn.user_id,
-        retrievalContext=turn.retrieval_context,
-        toolsCalled=turn.tools_called,
-        additionalMetadata=turn.additional_metadata,
-        order=index,
-    )
-
-
-def create_api_test_case(
-    test_case: Union[LLMTestCase, ConversationalTestCase, MLLMTestCase],
-    trace: Optional[TraceApi] = None,
-    index: Optional[int] = None,
-) -> Union[LLMApiTestCase, ConversationalApiTestCase]:
-    if isinstance(test_case, ConversationalTestCase):
-        order = (
-            test_case._dataset_rank
-            if test_case._dataset_rank is not None
-            else index
-        )
-        if test_case.name:
-            name = test_case.name
-        else:
-            name = os.getenv(
-                PYTEST_RUN_TEST_NAME, f"conversational_test_case_{order}"
-            )
-
-        api_test_case = ConversationalApiTestCase(
-            name=name,
-            success=True,
-            metricsData=[],
-            runDuration=0,
-            evaluationCost=None,
-            order=order,
-            scenario=test_case.scenario,
-            expectedOutcome=test_case.expected_outcome,
-            userDescription=test_case.user_description,
-            context=test_case.context,
-            tags=test_case.tags,
-            comments=test_case.comments,
-            additionalMetadata=test_case.additional_metadata,
-        )
-        api_test_case.turns = [
-            create_api_turn(
-                turn=turn,
-                index=index,
-            )
-            for index, turn in enumerate(test_case.turns)
-        ]
-
-        return api_test_case
-    else:
-        order = (
-            test_case._dataset_rank
-            if test_case._dataset_rank is not None
-            else index
-        )
-
-        success = True
-        if test_case.name is not None:
-            name = test_case.name
-        else:
-            name = os.getenv(PYTEST_RUN_TEST_NAME, f"test_case_{order}")
-        metrics_data = []
-
-        if isinstance(test_case, LLMTestCase):
-            api_test_case = LLMApiTestCase(
-                name=name,
-                input=test_case.input,
-                actualOutput=test_case.actual_output,
-                expectedOutput=test_case.expected_output,
-                context=test_case.context,
-                retrievalContext=test_case.retrieval_context,
-                toolsCalled=test_case.tools_called,
-                expectedTools=test_case.expected_tools,
-                tokenCost=test_case.token_cost,
-                completionTime=test_case.completion_time,
-                tags=test_case.tags,
-                success=success,
-                metricsData=metrics_data,
-                runDuration=None,
-                evaluationCost=None,
-                order=order,
-                additionalMetadata=test_case.additional_metadata,
-                comments=test_case.comments,
-                trace=trace,
-            )
-        elif isinstance(test_case, MLLMTestCase):
-            api_test_case = LLMApiTestCase(
-                name=name,
-                input="",
-                multimodalInput=test_case.input,
-                multimodalActualOutput=test_case.actual_output,
-                multimodalExpectedOutput=test_case.expected_output,
-                multimodalRetrievalContext=test_case.retrieval_context,
-                multimodalContext=test_case.context,
-                toolsCalled=test_case.tools_called,
-                expectedTools=test_case.expected_tools,
-                tokenCost=test_case.token_cost,
-                completionTime=test_case.completion_time,
-                success=success,
-                metricsData=metrics_data,
-                runDuration=None,
-                evaluationCost=None,
-                order=order,
-                additionalMetadata=test_case.additional_metadata,
-                comments=test_case.comments,
-            )
-        # llm_test_case_lookup_map[instance_id] = api_test_case
-        return api_test_case
-
-
 def create_api_trace(trace: Trace, golden: Golden) -> TraceApi:
     return TraceApi(
         uuid=trace.uuid,
@@ -309,6 +193,26 @@ def validate_assert_test_inputs(
             "Both 'test_case' and 'metrics' must be provided together."
         )

+    if test_case and metrics:
+        if isinstance(test_case, LLMTestCase) and not all(
+            isinstance(metric, BaseMetric) for metric in metrics
+        ):
+            raise ValueError(
+                "All 'metrics' for an 'LLMTestCase' must be instances of 'BaseMetric' only."
+            )
+        if isinstance(test_case, ConversationalTestCase) and not all(
+            isinstance(metric, BaseConversationalMetric) for metric in metrics
+        ):
+            raise ValueError(
+                "All 'metrics' for an 'ConversationalTestCase' must be instances of 'BaseConversationalMetric' only."
+            )
+        if isinstance(test_case, MLLMTestCase) and not all(
+            isinstance(metric, BaseMultimodalMetric) for metric in metrics
+        ):
+            raise ValueError(
+                "All 'metrics' for an 'MLLMTestCase' must be instances of 'BaseMultimodalMetric' only."
+            )
+
     if not ((golden and observed_callback) or (test_case and metrics)):
         raise ValueError(
             "You must provide either ('golden' + 'observed_callback') or ('test_case' + 'metrics')."
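The new block in `validate_assert_test_inputs` enforces that the metric types match the test case type. An illustrative sketch, assuming `assert_test` routes its `test_case`/`metrics` arguments through this validator (the metric classes below are just examples of a single-turn versus conversational metric):

```python
# Hedged example: the pairing rule comes from the diff; whether assert_test calls
# validate_assert_test_inputs directly is an assumption based on the function name.
from deepeval import assert_test
from deepeval.test_case import LLMTestCase
from deepeval.metrics import AnswerRelevancyMetric, ConversationCompletenessMetric

test_case = LLMTestCase(
    input="What does DeepEval do?",
    actual_output="It evaluates LLM applications.",
)

# Passes the new type check: AnswerRelevancyMetric is a BaseMetric, which is
# what an LLMTestCase requires (the metric still runs afterwards, so a judge
# model must be configured).
assert_test(test_case=test_case, metrics=[AnswerRelevancyMetric()])

# Fails fast in 3.6.7 with:
# ValueError: All 'metrics' for an 'LLMTestCase' must be instances of 'BaseMetric' only.
assert_test(test_case=test_case, metrics=[ConversationCompletenessMetric()])
```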
deepeval/integrations/crewai/handler.py
CHANGED

@@ -13,7 +13,7 @@ logger = logging.getLogger(__name__)


 try:
-    from crewai.
+    from crewai.events import BaseEventListener
     from crewai.events import (
         CrewKickoffStartedEvent,
         CrewKickoffCompletedEvent,
deepeval/integrations/crewai/subs.py
ADDED

@@ -0,0 +1,51 @@
+from typing import List, Optional, Type, TypeVar
+from pydantic import PrivateAttr
+
+from deepeval.metrics.base_metric import BaseMetric
+
+try:
+    from crewai import Crew, Agent, LLM
+
+    is_crewai_installed = True
+except ImportError:
+    is_crewai_installed = False
+
+
+def is_crewai_installed():
+    if not is_crewai_installed:
+        raise ImportError(
+            "CrewAI is not installed. Please install it with `pip install crewai`."
+        )
+
+
+T = TypeVar("T")
+
+
+def create_deepeval_class(base_class: Type[T], class_name: str) -> Type[T]:
+    """Factory function to create DeepEval-enabled CrewAI classes"""
+
+    class DeepEvalClass(base_class):
+        _metric_collection: Optional[str] = PrivateAttr(default=None)
+        _metrics: Optional[List[BaseMetric]] = PrivateAttr(default=None)
+
+        def __init__(
+            self,
+            *args,
+            metrics: Optional[List[BaseMetric]] = None,
+            metric_collection: Optional[str] = None,
+            **kwargs
+        ):
+            is_crewai_installed()
+            super().__init__(*args, **kwargs)
+            self._metric_collection = metric_collection
+            self._metrics = metrics
+
+    DeepEvalClass.__name__ = class_name
+    DeepEvalClass.__qualname__ = class_name
+    return DeepEvalClass
+
+
+# Create the classes
+DeepEvalCrew = create_deepeval_class(Crew, "DeepEvalCrew")
+DeepEvalAgent = create_deepeval_class(Agent, "DeepEvalAgent")
+DeepEvalLLM = create_deepeval_class(LLM, "DeepEvalLLM")