deepeval 3.6.6__py3-none-any.whl → 3.6.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/benchmarks/equity_med_qa/equity_med_qa.py +1 -0
- deepeval/cli/main.py +42 -0
- deepeval/confident/api.py +1 -0
- deepeval/config/settings.py +22 -4
- deepeval/constants.py +8 -1
- deepeval/dataset/dataset.py +2 -11
- deepeval/dataset/utils.py +1 -1
- deepeval/errors.py +20 -2
- deepeval/evaluate/evaluate.py +5 -1
- deepeval/evaluate/execute.py +811 -248
- deepeval/evaluate/types.py +1 -0
- deepeval/evaluate/utils.py +33 -119
- deepeval/integrations/crewai/__init__.py +7 -1
- deepeval/integrations/crewai/handler.py +1 -1
- deepeval/integrations/crewai/subs.py +51 -0
- deepeval/integrations/crewai/tool.py +71 -0
- deepeval/integrations/crewai/wrapper.py +45 -5
- deepeval/integrations/llama_index/__init__.py +0 -4
- deepeval/integrations/llama_index/handler.py +20 -21
- deepeval/integrations/pydantic_ai/instrumentator.py +125 -76
- deepeval/metrics/__init__.py +13 -0
- deepeval/metrics/answer_relevancy/answer_relevancy.py +12 -3
- deepeval/metrics/api.py +281 -0
- deepeval/metrics/argument_correctness/argument_correctness.py +12 -2
- deepeval/metrics/base_metric.py +1 -0
- deepeval/metrics/bias/bias.py +12 -3
- deepeval/metrics/contextual_precision/contextual_precision.py +39 -24
- deepeval/metrics/contextual_recall/contextual_recall.py +12 -3
- deepeval/metrics/contextual_relevancy/contextual_relevancy.py +12 -1
- deepeval/metrics/conversation_completeness/conversation_completeness.py +12 -0
- deepeval/metrics/conversational_dag/conversational_dag.py +12 -0
- deepeval/metrics/conversational_dag/nodes.py +12 -4
- deepeval/metrics/conversational_g_eval/__init__.py +3 -0
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +84 -66
- deepeval/metrics/dag/dag.py +12 -0
- deepeval/metrics/dag/nodes.py +12 -4
- deepeval/metrics/dag/schema.py +1 -1
- deepeval/metrics/dag/templates.py +2 -2
- deepeval/metrics/faithfulness/faithfulness.py +12 -1
- deepeval/metrics/g_eval/g_eval.py +11 -0
- deepeval/metrics/goal_accuracy/__init__.py +1 -0
- deepeval/metrics/goal_accuracy/goal_accuracy.py +349 -0
- deepeval/metrics/goal_accuracy/schema.py +17 -0
- deepeval/metrics/goal_accuracy/template.py +235 -0
- deepeval/metrics/hallucination/hallucination.py +20 -9
- deepeval/metrics/indicator.py +8 -2
- deepeval/metrics/json_correctness/json_correctness.py +12 -1
- deepeval/metrics/knowledge_retention/knowledge_retention.py +12 -0
- deepeval/metrics/mcp/mcp_task_completion.py +20 -2
- deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +29 -6
- deepeval/metrics/mcp_use_metric/mcp_use_metric.py +14 -2
- deepeval/metrics/misuse/misuse.py +12 -1
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +3 -0
- deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +3 -0
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +3 -0
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +6 -1
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +38 -25
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +10 -5
- deepeval/metrics/non_advice/non_advice.py +12 -0
- deepeval/metrics/pii_leakage/pii_leakage.py +12 -1
- deepeval/metrics/plan_adherence/__init__.py +1 -0
- deepeval/metrics/plan_adherence/plan_adherence.py +292 -0
- deepeval/metrics/plan_adherence/schema.py +11 -0
- deepeval/metrics/plan_adherence/template.py +170 -0
- deepeval/metrics/plan_quality/__init__.py +1 -0
- deepeval/metrics/plan_quality/plan_quality.py +292 -0
- deepeval/metrics/plan_quality/schema.py +11 -0
- deepeval/metrics/plan_quality/template.py +101 -0
- deepeval/metrics/prompt_alignment/prompt_alignment.py +12 -1
- deepeval/metrics/role_adherence/role_adherence.py +12 -0
- deepeval/metrics/role_violation/role_violation.py +12 -0
- deepeval/metrics/step_efficiency/__init__.py +1 -0
- deepeval/metrics/step_efficiency/schema.py +11 -0
- deepeval/metrics/step_efficiency/step_efficiency.py +234 -0
- deepeval/metrics/step_efficiency/template.py +256 -0
- deepeval/metrics/summarization/summarization.py +12 -1
- deepeval/metrics/task_completion/task_completion.py +4 -0
- deepeval/metrics/tool_correctness/schema.py +6 -0
- deepeval/metrics/tool_correctness/template.py +88 -0
- deepeval/metrics/tool_correctness/tool_correctness.py +233 -21
- deepeval/metrics/tool_use/__init__.py +1 -0
- deepeval/metrics/tool_use/schema.py +19 -0
- deepeval/metrics/tool_use/template.py +220 -0
- deepeval/metrics/tool_use/tool_use.py +458 -0
- deepeval/metrics/topic_adherence/__init__.py +1 -0
- deepeval/metrics/topic_adherence/schema.py +16 -0
- deepeval/metrics/topic_adherence/template.py +162 -0
- deepeval/metrics/topic_adherence/topic_adherence.py +355 -0
- deepeval/metrics/toxicity/toxicity.py +12 -0
- deepeval/metrics/turn_relevancy/turn_relevancy.py +12 -0
- deepeval/models/embedding_models/azure_embedding_model.py +37 -36
- deepeval/models/embedding_models/local_embedding_model.py +30 -32
- deepeval/models/embedding_models/ollama_embedding_model.py +18 -20
- deepeval/models/embedding_models/openai_embedding_model.py +22 -31
- deepeval/models/llms/grok_model.py +1 -1
- deepeval/models/llms/openai_model.py +2 -0
- deepeval/openai/__init__.py +14 -32
- deepeval/openai/extractors.py +85 -50
- deepeval/openai/patch.py +258 -167
- deepeval/openai/types.py +20 -0
- deepeval/openai/utils.py +205 -56
- deepeval/prompt/__init__.py +19 -1
- deepeval/prompt/api.py +160 -0
- deepeval/prompt/prompt.py +245 -62
- deepeval/prompt/utils.py +186 -15
- deepeval/synthesizer/chunking/context_generator.py +209 -152
- deepeval/synthesizer/chunking/doc_chunker.py +46 -12
- deepeval/synthesizer/synthesizer.py +19 -15
- deepeval/test_case/api.py +131 -0
- deepeval/test_case/llm_test_case.py +6 -2
- deepeval/test_run/__init__.py +1 -0
- deepeval/test_run/hyperparameters.py +47 -8
- deepeval/test_run/test_run.py +292 -206
- deepeval/tracing/__init__.py +2 -1
- deepeval/tracing/api.py +3 -1
- deepeval/tracing/otel/exporter.py +3 -4
- deepeval/tracing/otel/utils.py +24 -5
- deepeval/tracing/trace_context.py +89 -5
- deepeval/tracing/tracing.py +74 -3
- deepeval/tracing/types.py +20 -2
- deepeval/tracing/utils.py +8 -0
- deepeval/utils.py +21 -0
- {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/METADATA +1 -1
- {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/RECORD +133 -103
- deepeval/integrations/llama_index/agent/patched.py +0 -68
- {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/LICENSE.md +0 -0
- {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/WHEEL +0 -0
- {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/entry_points.txt +0 -0
deepeval/test_run/test_run.py
CHANGED

@@ -32,6 +32,15 @@ from deepeval.utils import (
 )
 from deepeval.test_run.cache import global_test_run_cache_manager
 from deepeval.constants import CONFIDENT_TEST_CASE_BATCH_SIZE, HIDDEN_DIR
+from deepeval.prompt import (
+    PromptMessage,
+    ModelSettings,
+    PromptInterpolationType,
+    OutputType,
+)
+from rich.panel import Panel
+from rich.columns import Columns
+

 TEMP_FILE_PATH = f"{HIDDEN_DIR}/.temp_test_run_data.json"
 LATEST_TEST_RUN_FILE_PATH = f"{HIDDEN_DIR}/.latest_test_run.json"
@@ -71,6 +80,16 @@ class TraceMetricScores(BaseModel):
     base: Dict[str, Dict[str, MetricScores]] = Field(default_factory=dict)


+class PromptData(BaseModel):
+    alias: Optional[str] = None
+    version: Optional[str] = None
+    text_template: Optional[str] = None
+    messages_template: Optional[List[PromptMessage]] = None
+    model_settings: Optional[ModelSettings] = None
+    output_type: Optional[OutputType] = None
+    interpolation_type: Optional[PromptInterpolationType] = None
+
+
 class MetricsAverageDict:
     def __init__(self):
         self.metric_dict = {}
@@ -123,6 +142,7 @@ class TestRun(BaseModel):
     )
     identifier: Optional[str] = None
     hyperparameters: Optional[Dict[str, Any]] = Field(None)
+    prompts: Optional[List[PromptData]] = Field(None)
     test_passed: Optional[int] = Field(None, alias="testPassed")
     test_failed: Optional[int] = Field(None, alias="testFailed")
     run_duration: float = Field(0.0, alias="runDuration")
@@ -191,65 +211,91 @@ class TestRun(BaseModel):
         valid_scores = 0

         def process_metric_data(metric_data: MetricData):
+            """
+            Process and aggregate metric data for overall test metrics.
+
+            Args:
+                metric_data: The metric data to process
+            """
             nonlocal valid_scores
-
+            metric_name = metric_data.name
             score = metric_data.score
             success = metric_data.success
-
-            if
-            metrics_dict[
+
+            if metric_name not in metrics_dict:
+                metrics_dict[metric_name] = {
                     "scores": [],
                     "passes": 0,
                     "fails": 0,
                     "errors": 0,
                 }

+            metric_dict = metrics_dict[metric_name]
+
             if score is None or success is None:
-
+                metric_dict["errors"] += 1
             else:
                 valid_scores += 1
-
-                # Append the score.
-                metrics_dict[name]["scores"].append(score)
-
-                # Increment passes or fails based on the metric_data.success flag.
+                metric_dict["scores"].append(score)
                 if success:
-
+                    metric_dict["passes"] += 1
                 else:
-
+                    metric_dict["fails"] += 1

         def process_span_metric_data(
-            metric_data: MetricData,
+            metric_data: MetricData,
+            span_type: span_api_type_literals,
+            span_name: str,
         ):
+            """
+            Process and aggregate metric data for a specific span.
+
+            Args:
+                metric_data: The metric data to process
+                span_type: The type of span (agent, tool, retriever, llm, base)
+                span_name: The name of the span
+            """
             metric_name = metric_data.name
             score = metric_data.score
             success = metric_data.success

-
-
-            trace_metrics_dict[type][name] = {}
+            if span_name not in trace_metrics_dict[span_type]:
+                trace_metrics_dict[span_type][span_name] = {}

-            if metric_name not in trace_metrics_dict[
-                trace_metrics_dict[
+            if metric_name not in trace_metrics_dict[span_type][span_name]:
+                trace_metrics_dict[span_type][span_name][metric_name] = {
                     "scores": [],
                     "passes": 0,
                     "fails": 0,
                     "errors": 0,
                 }

+            metric_dict = trace_metrics_dict[span_type][span_name][metric_name]
+
             if score is None or success is None:
-
+                metric_dict["errors"] += 1
             else:
-
-                trace_metrics_dict[type][name][metric_name]["scores"].append(
-                    score
-                )
-
-                # Increment passes or fails
+                metric_dict["scores"].append(score)
                 if success:
-
+                    metric_dict["passes"] += 1
                 else:
-
+                    metric_dict["fails"] += 1
+
+        def process_spans(spans, span_type: span_api_type_literals):
+            """
+            Process all metrics for a list of spans of a specific type.
+
+            Args:
+                spans: List of spans to process
+                span_type: The type of spans being processed
+            """
+            for span in spans:
+                if span.metrics_data is not None:
+                    for metric_data in span.metrics_data:
+                        process_metric_data(metric_data)
+                        process_span_metric_data(
+                            metric_data, span_type, span.name
+                        )

         # Process non-conversational test cases.
         for test_case in self.test_cases:
@@ -261,45 +307,14 @@ class TestRun(BaseModel):
             if test_case.trace is None:
                 continue

-
-
-
-
-
-
-
-
-            for span in test_case.trace.tool_spans:
-                if span.metrics_data is not None:
-                    for metric_data in span.metrics_data:
-                        process_metric_data(metric_data)
-                        process_span_metric_data(
-                            metric_data, SpanApiType.TOOL.value, span.name
-                        )
-
-            for span in test_case.trace.retriever_spans:
-                if span.metrics_data is not None:
-                    for metric_data in span.metrics_data:
-                        process_metric_data(metric_data)
-                        process_span_metric_data(
-                            metric_data, SpanApiType.RETRIEVER.value, span.name
-                        )
-
-            for span in test_case.trace.llm_spans:
-                if span.metrics_data is not None:
-                    for metric_data in span.metrics_data:
-                        process_metric_data(metric_data)
-                        process_span_metric_data(
-                            metric_data, SpanApiType.LLM.value, span.name
-                        )
-
-            for span in test_case.trace.base_spans:
-                if span.metrics_data is not None:
-                    for metric_data in span.metrics_data:
-                        process_metric_data(metric_data)
-                        process_span_metric_data(
-                            metric_data, SpanApiType.BASE.value, span.name
-                        )
+            # Process all span types using the helper function
+            process_spans(test_case.trace.agent_spans, SpanApiType.AGENT.value)
+            process_spans(test_case.trace.tool_spans, SpanApiType.TOOL.value)
+            process_spans(
+                test_case.trace.retriever_spans, SpanApiType.RETRIEVER.value
+            )
+            process_spans(test_case.trace.llm_spans, SpanApiType.LLM.value)
+            process_spans(test_case.trace.base_spans, SpanApiType.BASE.value)

         # Process conversational test cases.
         for convo_test_case in self.conversational_test_cases:
@@ -532,105 +547,141 @@ class TestRunManager:
     def clear_test_run(self):
         self.test_run = None

-
-
-
-
-
-
-
-
-
-
-
-
-
+    @staticmethod
+    def _calculate_success_rate(pass_count: int, fail_count: int) -> str:
+        """Calculate success rate percentage or return error message."""
+        total = pass_count + fail_count
+        if total > 0:
+            return str(round((100 * pass_count) / total, 2))
+        return "Cannot display metrics for component-level evals, please run 'deepeval view' to see results on Confident AI."
+
+    @staticmethod
+    def _get_metric_status(metric_data: MetricData) -> str:
+        """Get formatted status string for a metric."""
+        if metric_data.error:
+            return "[red]ERRORED[/red]"
+        elif metric_data.success:
+            return "[green]PASSED[/green]"
+        return "[red]FAILED[/red]"
+
+    @staticmethod
+    def _format_metric_score(metric_data: MetricData) -> str:
+        """Format metric score with evaluation details."""
+        evaluation_model = metric_data.evaluation_model or "n/a"
+        metric_score = (
+            round(metric_data.score, 2)
+            if metric_data.score is not None
+            else None
+        )

-
-
-
-
-
-
-
+        return (
+            f"{metric_score} "
+            f"(threshold={metric_data.threshold}, "
+            f"evaluation model={evaluation_model}, "
+            f"reason={metric_data.reason}, "
+            f"error={metric_data.error})"
+        )

-
-
-
+    @staticmethod
+    def _should_skip_test_case(
+        test_case, display: TestRunResultDisplay
+    ) -> bool:
+        """Determine if test case should be skipped based on display filter."""
+        if display == TestRunResultDisplay.PASSING and not test_case.success:
+            return True
+        elif display == TestRunResultDisplay.FAILING and test_case.success:
+            return True
+        return False
+
+    @staticmethod
+    def _count_metric_results(
+        metrics_data: List[MetricData],
+    ) -> tuple[int, int]:
+        """Count passing and failing metrics."""
+        pass_count = 0
+        fail_count = 0
+        for metric_data in metrics_data:
+            if metric_data.success:
+                pass_count += 1
+            else:
+                fail_count += 1
+        return pass_count, fail_count

-
-
-
+    def _add_test_case_header_row(
+        self,
+        table: Table,
+        test_case_name: str,
+        pass_count: int,
+        fail_count: int,
+    ):
+        """Add test case header row with name and success rate."""
+        success_rate = self._calculate_success_rate(pass_count, fail_count)
+        table.add_row(
+            test_case_name,
+            *[""] * 3,
+            f"{success_rate}%",
+        )

-
-
-
-
-
+    def _add_metric_rows(self, table: Table, metrics_data: List[MetricData]):
+        """Add metric detail rows to the table."""
+        for metric_data in metrics_data:
+            status = self._get_metric_status(metric_data)
+            formatted_score = self._format_metric_score(metric_data)

-            success_rate = (
-                round((100 * pass_count) / (pass_count + fail_count), 2)
-                if pass_count + fail_count > 0
-                else "Cannot display metrics for component-level evals, please run 'deepeval view' to see results on Confident AI."
-            )
             table.add_row(
-                test_case_name,
                 "",
+                str(metric_data.name),
+                formatted_score,
+                status,
                 "",
-                "",
-                f"{success_rate}%",
             )

-
-
-
-            elif metric_data.success:
-                status = "[green]PASSED[/green]"
-            else:
-                status = "[red]FAILED[/red]"
+    def _add_separator_row(self, table: Table):
+        """Add empty separator row between test cases."""
+        table.add_row(*[""] * len(table.columns))

-
-
-
+    def display_results_table(
+        self, test_run: TestRun, display: TestRunResultDisplay
+    ):
+        """Display test results in a formatted table."""

-
-
-
-
-
-
-
-
-
-                status,
-                "",
-            )
+        table = Table(title="Test Results")
+        column_config = dict(justify="left")
+        column_names = [
+            "Test case",
+            "Metric",
+            "Score",
+            "Status",
+            "Overall Success Rate",
+        ]

-
-
-                "",
-                "",
-                "",
-                "",
-                "",
-            )
+        for name in column_names:
+            table.add_column(name, **column_config)

+        # Process regular test cases
+        for index, test_case in enumerate(test_run.test_cases):
+            if test_case.metrics_data is None or self._should_skip_test_case(
+                test_case, display
+            ):
+                continue
+            pass_count, fail_count = self._count_metric_results(
+                test_case.metrics_data
+            )
+            self._add_test_case_header_row(
+                table, test_case.name, pass_count, fail_count
+            )
+            self._add_metric_rows(table, test_case.metrics_data)
+
+            if index < len(test_run.test_cases) - 1:
+                self._add_separator_row(table)
+
+        # Process conversational test cases
         for index, conversational_test_case in enumerate(
             test_run.conversational_test_cases
         ):
-            if (
-                display == TestRunResultDisplay.PASSING
-                and conversational_test_case.success is False
-            ):
-                continue
-            elif (
-                display == TestRunResultDisplay.FAILING
-                and conversational_test_case.success
-            ):
+            if self._should_skip_test_case(conversational_test_case, display):
                 continue

-            pass_count = 0
-            fail_count = 0
             conversational_test_case_name = conversational_test_case.name

             if conversational_test_case.turns:
@@ -691,71 +742,26 @@ class TestRunManager:
                 console.print(
                     f"[dim]No turns recorded for {conversational_test_case_name}.[/dim]"
                 )
-
             if conversational_test_case.metrics_data is not None:
-
-
-                    pass_count += 1
-                else:
-                    fail_count += 1
-                table.add_row(
-                    conversational_test_case_name,
-                    "",
-                    "",
-                    "",
-                    f"{round((100*pass_count)/(pass_count+fail_count),2)}%",
+                pass_count, fail_count = self._count_metric_results(
+                    conversational_test_case.metrics_data
                 )
-
-
-                for metric_data in conversational_test_case.metrics_data:
-                    if metric_data.error:
-                        status = "[red]ERRORED[/red]"
-                    elif metric_data.success:
-                        status = "[green]PASSED[/green]"
-                    else:
-                        status = "[red]FAILED[/red]"
-
-                    evaluation_model = metric_data.evaluation_model
-                    if evaluation_model is None:
-                        evaluation_model = "n/a"
-
-                    if metric_data.score is not None:
-                        metric_score = round(metric_data.score, 2)
-                    else:
-                        metric_score = None
-
-                    table.add_row(
-                        "",
-                        str(metric_data.name),
-                        f"{metric_score} (threshold={metric_data.threshold}, evaluation model={evaluation_model}, reason={metric_data.reason}, error={metric_data.error})",
-                        status,
-                        "",
-                    )
-
-            if index is not len(self.test_run.conversational_test_cases) - 1:
-                table.add_row(
-                    "",
-                    "",
-                    "",
-                    "",
-                    "",
+                self._add_test_case_header_row(
+                    table, conversational_test_case.name, pass_count, fail_count
                 )
-
-
-            table.add_row(
-                "",
-                "",
-                "",
-                "",
-                "",
+                self._add_metric_rows(
+                    table, conversational_test_case.metrics_data
                 )

+            if index < len(test_run.conversational_test_cases) - 1:
+                self._add_separator_row(table)
+
+        if index < len(test_run.test_cases) - 1:
+            self._add_separator_row(table)
+
         table.add_row(
             "[bold red]Note: Use Confident AI with DeepEval to analyze failed test cases for more details[/bold red]",
-            "",
-            "",
-            "",
-            "",
+            *[""] * (len(table.columns) - 1),
         )
         print(table)
@@ -799,6 +805,7 @@ class TestRunManager:
         test_run.test_cases = initial_batch

         try:
+            test_run.prompts = None
             body = test_run.model_dump(by_alias=True, exclude_none=True)
         except AttributeError:
             # Pydantic version below 2.0
@@ -947,12 +954,28 @@ class TestRunManager:
         global_test_run_cache_manager.disable_write_cache = not bool(
             get_is_running_deepeval()
         )
-
         global_test_run_cache_manager.wrap_up_cached_test_run()

         if display_table:
             self.display_results_table(test_run, display)

+        if test_run.hyperparameters is None:
+            console.print(
+                "\n[bold yellow]⚠ WARNING:[/bold yellow] No hyperparameters logged.\n"
+                "» [bold blue][link=https://deepeval.com/docs/evaluation-prompts]Log hyperparameters[/link][/bold blue] to attribute prompts and models to your test runs.\n\n"
+                + "=" * 80
+            )
+        else:
+            if not test_run.prompts:
+                console.print(
+                    "\n[bold yellow]⚠ WARNING:[/bold yellow] No prompts logged.\n"
+                    "» [bold blue][link=https://deepeval.com/docs/evaluation-prompts]Log prompts[/link][/bold blue] to evaluate and optimize your prompt templates and models.\n\n"
+                    + "=" * 80
+                )
+            else:
+                console.print("\n[bold green]✓ Prompts Logged[/bold green]\n")
+                self._render_prompts_panels(prompts=test_run.prompts)
+
         self.save_test_run_locally()
         delete_file_if_exists(self.temp_file_path)
         if is_confident() and self.disable_request is False:
@@ -967,7 +990,7 @@ class TestRunManager:
             f"» Test Results ({test_run.test_passed + test_run.test_failed} total tests):\n",
             f" » Pass Rate: {round((test_run.test_passed / (test_run.test_passed + test_run.test_failed)) * 100, 2)}% | Passed: [bold green]{test_run.test_passed}[/bold green] | Failed: [bold red]{test_run.test_failed}[/bold red]\n\n",
             "=" * 80,
-            "\n\n»
+            "\n\n» Want to share evals with your team, or a place for your test cases to live? ❤️ 🏡\n"
             " » Run [bold]'deepeval view'[/bold] to analyze and save testing results on [rgb(106,0,255)]Confident AI[/rgb(106,0,255)].\n\n",
         )

@@ -993,5 +1016,68 @@ class TestRunManager:
             pass
         return None

+    def _render_prompts_panels(self, prompts: List[PromptData]) -> None:
+
+        def format_string(
+            v, default="[dim]None[/dim]", color: Optional[str] = None
+        ):
+            formatted_string = str(v) if v not in (None, "", []) else default
+            return (
+                f"{formatted_string}"
+                if color is None or v in (None, "", [])
+                else f"[{color}]{formatted_string}[/]"
+            )
+
+        panels = []
+        for prompt in prompts:
+            lines = []
+            p_type = (
+                "messages"
+                if prompt.messages_template
+                else ("text" if prompt.text_template else "—")
+            )
+            if p_type:
+                lines.append(f"type: {format_string(p_type, color='blue')}")
+            if prompt.output_type:
+                lines.append(
+                    f"output_type: {format_string(prompt.output_type, color='blue')}"
+                )
+            if prompt.interpolation_type:
+                lines.append(
+                    f"interpolation_type: {format_string(prompt.interpolation_type, color='blue')}"
+                )
+            if prompt.model_settings:
+                ms = prompt.model_settings
+                settings_lines = [
+                    "Model Settings:",
+                    f" – provider: {format_string(ms.provider, color='green')}",
+                    f" – name: {format_string(ms.name, color='green')}",
+                    f" – temperature: {format_string(ms.temperature, color='green')}",
+                    f" – max_tokens: {format_string(ms.max_tokens, color='green')}",
+                    f" – top_p: {format_string(ms.top_p, color='green')}",
+                    f" – frequency_penalty: {format_string(ms.frequency_penalty, color='green')}",
+                    f" – presence_penalty: {format_string(ms.presence_penalty, color='green')}",
+                    f" – stop_sequence: {format_string(ms.stop_sequence, color='green')}",
+                    f" – reasoning_effort: {format_string(ms.reasoning_effort, color='green')}",
+                    f" – verbosity: {format_string(ms.verbosity, color='green')}",
+                ]
+                lines.append("")
+                lines.extend(settings_lines)
+            title = f"{format_string(prompt.alias)}"
+            if prompt.version:
+                title += f" (v{prompt.version})"
+            body = "\n".join(lines)
+            panel = Panel(
+                body,
+                title=title,
+                title_align="left",
+                expand=False,
+                padding=(1, 6, 1, 2),
+            )
+            panels.append(panel)
+
+        if panels:
+            console.print(Columns(panels, equal=False, expand=False))
+

 global_test_run_manager = TestRunManager()
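
For orientation, here is a minimal, self-contained sketch of how the new PromptData shape and the TestRun.prompts field fit together. It uses only the fields visible in the diff above; the TestRunSketch model and the example values are illustrative stand-ins rather than deepeval's real TestRun, and messages_template/model_settings/output_type/interpolation_type are omitted because their types live in deepeval.prompt.

# Illustrative sketch only: mirrors the PromptData fields added in 3.6.8.
from typing import List, Optional

from pydantic import BaseModel, Field


class PromptData(BaseModel):
    alias: Optional[str] = None
    version: Optional[str] = None
    text_template: Optional[str] = None
    # messages_template, model_settings, output_type and interpolation_type
    # are omitted here; they depend on types imported from deepeval.prompt.


class TestRunSketch(BaseModel):
    # Stand-in for the new field on TestRun: prompts defaults to None,
    # which is what triggers the new "No prompts logged" warning path above.
    prompts: Optional[List[PromptData]] = Field(None)


run = TestRunSketch(
    prompts=[
        PromptData(
            alias="summarizer",  # hypothetical alias
            version="1",  # hypothetical version
            text_template="Summarize: {input}",
        )
    ]
)
print(run.model_dump(exclude_none=True))
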
deepeval/tracing/__init__.py
CHANGED

@@ -4,7 +4,7 @@ from .context import (
     update_retriever_span,
     update_llm_span,
 )
-from .trace_context import trace
+from .trace_context import trace, LlmSpanContext
 from .types import BaseSpan, Trace
 from .tracing import observe, trace_manager
 from .offline_evals import evaluate_thread, evaluate_trace, evaluate_span
@@ -14,6 +14,7 @@ __all__ = [
     "update_current_trace",
     "update_retriever_span",
     "update_llm_span",
+    "LlmSpanContext",
     "BaseSpan",
     "Trace",
     "observe",
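
The practical effect of this change is that LlmSpanContext is now re-exported from deepeval.tracing itself rather than only from its trace_context submodule. A minimal sketch of the two equivalent imports, assuming nothing beyond the re-export shown above:

# New in 3.6.8: import from the package root of deepeval.tracing.
from deepeval.tracing import LlmSpanContext

# Pre-existing import path; both names refer to the same class.
from deepeval.tracing.trace_context import LlmSpanContext as _LlmSpanContext

assert LlmSpanContext is _LlmSpanContext
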
deepeval/tracing/api.py
CHANGED

@@ -1,6 +1,6 @@
 from enum import Enum
 from typing import Dict, List, Optional, Union, Literal, Any
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, ConfigDict, Field

 from deepeval.test_case import ToolCall

@@ -27,6 +27,8 @@ class PromptApi(BaseModel):


 class MetricData(BaseModel):
+    model_config = ConfigDict(extra="ignore")
+
     name: str
     threshold: float
     success: bool
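
For context, ConfigDict(extra="ignore") tells Pydantic v2 explicitly to drop unrecognised keys during validation, rather than keeping them (extra="allow") or rejecting them (extra="forbid"). A standalone sketch of that behaviour; the model below is a stand-in that copies only the three fields visible above, not the full MetricData definition:

from pydantic import BaseModel, ConfigDict


class MetricDataSketch(BaseModel):
    # Same config as MetricData in 3.6.8: unknown keys are silently dropped.
    model_config = ConfigDict(extra="ignore")

    name: str
    threshold: float
    success: bool


# "futureField" is not declared on the model, so it is discarded instead of
# being stored or rejected.
m = MetricDataSketch.model_validate(
    {"name": "Faithfulness", "threshold": 0.7, "success": True, "futureField": 1}
)
print(m.model_dump())  # futureField does not appear in the output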
|