deepeval 3.6.6__py3-none-any.whl → 3.6.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/benchmarks/equity_med_qa/equity_med_qa.py +1 -0
- deepeval/cli/main.py +42 -0
- deepeval/confident/api.py +1 -0
- deepeval/config/settings.py +22 -4
- deepeval/constants.py +8 -1
- deepeval/dataset/dataset.py +2 -11
- deepeval/dataset/utils.py +1 -1
- deepeval/evaluate/evaluate.py +5 -1
- deepeval/evaluate/execute.py +97 -42
- deepeval/evaluate/utils.py +20 -116
- deepeval/integrations/crewai/__init__.py +6 -1
- deepeval/integrations/crewai/handler.py +1 -1
- deepeval/integrations/crewai/subs.py +51 -0
- deepeval/integrations/crewai/wrapper.py +45 -5
- deepeval/metrics/answer_relevancy/answer_relevancy.py +12 -3
- deepeval/metrics/api.py +281 -0
- deepeval/metrics/argument_correctness/argument_correctness.py +12 -2
- deepeval/metrics/bias/bias.py +12 -3
- deepeval/metrics/contextual_precision/contextual_precision.py +12 -3
- deepeval/metrics/contextual_recall/contextual_recall.py +12 -3
- deepeval/metrics/contextual_relevancy/contextual_relevancy.py +12 -1
- deepeval/metrics/conversation_completeness/conversation_completeness.py +12 -0
- deepeval/metrics/conversational_dag/conversational_dag.py +12 -0
- deepeval/metrics/conversational_dag/nodes.py +12 -4
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +73 -59
- deepeval/metrics/dag/dag.py +12 -0
- deepeval/metrics/dag/nodes.py +12 -4
- deepeval/metrics/faithfulness/faithfulness.py +12 -1
- deepeval/metrics/g_eval/g_eval.py +11 -0
- deepeval/metrics/hallucination/hallucination.py +12 -1
- deepeval/metrics/indicator.py +8 -2
- deepeval/metrics/json_correctness/json_correctness.py +12 -1
- deepeval/metrics/knowledge_retention/knowledge_retention.py +12 -0
- deepeval/metrics/mcp/mcp_task_completion.py +13 -0
- deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +13 -0
- deepeval/metrics/mcp_use_metric/mcp_use_metric.py +12 -1
- deepeval/metrics/misuse/misuse.py +12 -1
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +3 -0
- deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +3 -0
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +3 -0
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +6 -1
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +6 -1
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +10 -5
- deepeval/metrics/non_advice/non_advice.py +12 -0
- deepeval/metrics/pii_leakage/pii_leakage.py +12 -1
- deepeval/metrics/prompt_alignment/prompt_alignment.py +12 -1
- deepeval/metrics/role_adherence/role_adherence.py +12 -0
- deepeval/metrics/role_violation/role_violation.py +12 -0
- deepeval/metrics/summarization/summarization.py +12 -1
- deepeval/metrics/task_completion/task_completion.py +3 -0
- deepeval/metrics/tool_correctness/tool_correctness.py +8 -0
- deepeval/metrics/toxicity/toxicity.py +12 -0
- deepeval/metrics/turn_relevancy/turn_relevancy.py +12 -0
- deepeval/models/llms/grok_model.py +1 -1
- deepeval/models/llms/openai_model.py +2 -0
- deepeval/openai/__init__.py +14 -32
- deepeval/openai/extractors.py +24 -34
- deepeval/openai/patch.py +256 -161
- deepeval/openai/types.py +20 -0
- deepeval/openai/utils.py +98 -56
- deepeval/prompt/__init__.py +19 -1
- deepeval/prompt/api.py +160 -0
- deepeval/prompt/prompt.py +244 -62
- deepeval/prompt/utils.py +144 -2
- deepeval/synthesizer/chunking/context_generator.py +209 -152
- deepeval/synthesizer/chunking/doc_chunker.py +46 -12
- deepeval/synthesizer/synthesizer.py +8 -5
- deepeval/test_case/api.py +131 -0
- deepeval/test_run/__init__.py +1 -0
- deepeval/test_run/hyperparameters.py +47 -8
- deepeval/test_run/test_run.py +104 -1
- deepeval/tracing/api.py +3 -1
- deepeval/tracing/message_types/__init__.py +10 -0
- deepeval/tracing/message_types/base.py +6 -0
- deepeval/tracing/message_types/messages.py +14 -0
- deepeval/tracing/message_types/tools.py +18 -0
- deepeval/tracing/otel/utils.py +1 -1
- deepeval/tracing/trace_context.py +73 -4
- deepeval/tracing/tracing.py +51 -3
- deepeval/tracing/types.py +16 -0
- deepeval/tracing/utils.py +8 -0
- {deepeval-3.6.6.dist-info → deepeval-3.6.7.dist-info}/METADATA +1 -1
- {deepeval-3.6.6.dist-info → deepeval-3.6.7.dist-info}/RECORD +92 -84
- {deepeval-3.6.6.dist-info → deepeval-3.6.7.dist-info}/LICENSE.md +0 -0
- {deepeval-3.6.6.dist-info → deepeval-3.6.7.dist-info}/WHEEL +0 -0
- {deepeval-3.6.6.dist-info → deepeval-3.6.7.dist-info}/entry_points.txt +0 -0
deepeval/_version.py
CHANGED

@@ -1 +1 @@
-__version__: str = "3.6.6"
+__version__: str = "3.6.7"

deepeval/benchmarks/equity_med_qa/equity_med_qa.py
CHANGED

@@ -121,6 +121,7 @@ class EquityMedQA(DeepEvalBaseBenchmark):
             score = metric.measure(
                 LLMTestCase(input=golden.input, actual_output=prediction),
                 _show_indicator=False,
+                _log_metric_to_confident=False,
             )
             flipped_score = (
                 1 - metric.score if metric.score in [0, 1] else metric.score
deepeval/cli/main.py
CHANGED

@@ -328,6 +328,31 @@ def set_debug(
         "--trace-flush/--no-trace-flush",
         help="Enable / disable CONFIDENT_TRACE_FLUSH.",
     ),
+    trace_sample_rate: Optional[float] = typer.Option(
+        None,
+        "--trace-sample-rate",
+        help="Set CONFIDENT_TRACE_SAMPLE_RATE.",
+    ),
+    metric_logging_verbose: Optional[bool] = typer.Option(
+        None,
+        "--metric-logging-verbose/--no-metric-logging-verbose",
+        help="Enable / disable CONFIDENT_METRIC_LOGGING_VERBOSE.",
+    ),
+    metric_logging_flush: Optional[bool] = typer.Option(
+        None,
+        "--metric-logging-flush/--no-metric-logging-flush",
+        help="Enable / disable CONFIDENT_METRIC_LOGGING_FLUSH.",
+    ),
+    metric_logging_sample_rate: Optional[float] = typer.Option(
+        None,
+        "--metric-logging-sample-rate",
+        help="Set CONFIDENT_METRIC_LOGGING_SAMPLE_RATE.",
+    ),
+    metric_logging_enabled: Optional[bool] = typer.Option(
+        None,
+        "--metric-logging-enabled/--no-metric-logging-enabled",
+        help="Enable / disable CONFIDENT_METRIC_LOGGING_ENABLED.",
+    ),
     # Advanced / potentially surprising
     error_reporting: Optional[bool] = typer.Option(
         None,

@@ -387,6 +412,20 @@ def set_debug(
        settings.CONFIDENT_TRACE_ENVIRONMENT = trace_env
    if trace_flush is not None:
        settings.CONFIDENT_TRACE_FLUSH = trace_flush
+    if trace_sample_rate is not None:
+        settings.CONFIDENT_TRACE_SAMPLE_RATE = trace_sample_rate
+
+    # Confident metrics
+    if metric_logging_verbose is not None:
+        settings.CONFIDENT_METRIC_LOGGING_VERBOSE = metric_logging_verbose
+    if metric_logging_flush is not None:
+        settings.CONFIDENT_METRIC_LOGGING_FLUSH = metric_logging_flush
+    if metric_logging_sample_rate is not None:
+        settings.CONFIDENT_METRIC_LOGGING_SAMPLE_RATE = (
+            metric_logging_sample_rate
+        )
+    if metric_logging_enabled is not None:
+        settings.CONFIDENT_METRIC_LOGGING_ENABLED = metric_logging_enabled

    # Advanced
    if error_reporting is not None:

@@ -438,6 +477,8 @@ def unset_debug(
    settings.LOG_LEVEL = "info"
    settings.CONFIDENT_TRACE_ENVIRONMENT = "development"
    settings.CONFIDENT_TRACE_VERBOSE = True
+    settings.CONFIDENT_METRIC_LOGGING_VERBOSE = True
+    settings.CONFIDENT_METRIC_LOGGING_ENABLED = True

    # Clear optional toggles/overrides
    settings.DEEPEVAL_VERBOSE_MODE = None

@@ -449,6 +490,7 @@ def unset_debug(
    settings.GRPC_TRACE = None

    settings.CONFIDENT_TRACE_FLUSH = None
+    settings.CONFIDENT_METRIC_LOGGING_FLUSH = None

    settings.ERROR_REPORTING = None
    settings.IGNORE_DEEPEVAL_ERRORS = None
deepeval/confident/api.py
CHANGED

@@ -87,6 +87,7 @@ class Endpoints(Enum):
     DATASET_ALIAS_QUEUE_ENDPOINT = "/v1/datasets/:alias/queue"

     TEST_RUN_ENDPOINT = "/v1/test-run"
+    METRIC_DATA_ENDPOINT = "/v1/metric-data"
     TRACES_ENDPOINT = "/v1/traces"
     ANNOTATIONS_ENDPOINT = "/v1/annotations"
     PROMPTS_VERSION_ID_ENDPOINT = "/v1/prompts/:alias/versions/:versionId"
deepeval/config/settings.py
CHANGED

@@ -337,10 +337,17 @@ class Settings(BaseSettings):
     SKIP_DEEPEVAL_MISSING_PARAMS: Optional[bool] = None
     DEEPEVAL_VERBOSE_MODE: Optional[bool] = None
     ENABLE_DEEPEVAL_CACHE: Optional[bool] = None
+
     CONFIDENT_TRACE_FLUSH: Optional[bool] = None
     CONFIDENT_TRACE_ENVIRONMENT: Optional[str] = "development"
     CONFIDENT_TRACE_VERBOSE: Optional[bool] = True
-
+    CONFIDENT_TRACE_SAMPLE_RATE: Optional[float] = 1.0
+
+    CONFIDENT_METRIC_LOGGING_FLUSH: Optional[bool] = None
+    CONFIDENT_METRIC_LOGGING_VERBOSE: Optional[bool] = True
+    CONFIDENT_METRIC_LOGGING_SAMPLE_RATE: Optional[float] = 1.0
+    CONFIDENT_METRIC_LOGGING_ENABLED: Optional[bool] = True
+
     OTEL_EXPORTER_OTLP_ENDPOINT: Optional[AnyUrl] = None

     #

@@ -355,6 +362,12 @@ class Settings(BaseSettings):
         None  # per-attempt timeout. Set 0/None to disable
     )

+    #
+    # Async Document Pipelines
+    #
+
+    DEEPEVAL_MAX_CONCURRENT_DOC_PROCESSING: conint(ge=1) = 2
+
     #
     # Async Task Configuration
     #

@@ -484,7 +497,8 @@ class Settings(BaseSettings):
         "OPENAI_COST_PER_INPUT_TOKEN",
         "OPENAI_COST_PER_OUTPUT_TOKEN",
         "TEMPERATURE",
-        "
+        "CONFIDENT_TRACE_SAMPLE_RATE",
+        "CONFIDENT_METRIC_LOGGING_SAMPLE_RATE",
         mode="before",
     )
     @classmethod

@@ -496,13 +510,17 @@ class Settings(BaseSettings):
             return None
         return float(v)

-    @field_validator(
+    @field_validator(
+        "CONFIDENT_TRACE_SAMPLE_RATE", "CONFIDENT_METRIC_LOGGING_SAMPLE_RATE"
+    )
     @classmethod
     def _validate_sample_rate(cls, v):
         if v is None:
             return None
         if not (0.0 <= float(v) <= 1.0):
-            raise ValueError(
+            raise ValueError(
+                "CONFIDENT_TRACE_SAMPLE_RATE or CONFIDENT_METRIC_LOGGING_SAMPLE_RATE must be between 0 and 1"
+            )
         return float(v)

     @field_validator("DEEPEVAL_DEFAULT_SAVE", mode="before")
deepeval/constants.py
CHANGED

@@ -9,9 +9,16 @@ LOGIN_PROMPT = "\n✨👀 Looking for a place for your LLM test data to live

 CONFIDENT_TRACE_VERBOSE = "CONFIDENT_TRACE_VERBOSE"
 CONFIDENT_TRACE_FLUSH = "CONFIDENT_TRACE_FLUSH"
-
+CONFIDENT_TRACE_SAMPLE_RATE = "CONFIDENT_TRACE_SAMPLE_RATE"
 CONFIDENT_TRACE_ENVIRONMENT = "CONFIDENT_TRACE_ENVIRONMENT"
 CONFIDENT_TRACING_ENABLED = "CONFIDENT_TRACING_ENABLED"
+
+CONFIDENT_METRIC_LOGGING_VERBOSE = "CONFIDENT_METRIC_LOGGING_VERBOSE"
+CONFIDENT_METRIC_LOGGING_FLUSH = "CONFIDENT_METRIC_LOGGING_FLUSH"
+CONFIDENT_METRIC_LOGGING_SAMPLE_RATE = "CONFIDENT_METRIC_LOGGING_SAMPLE_RATE"
+CONFIDENT_METRIC_LOGGING_ENABLED = "CONFIDENT_METRIC_LOGGING_ENABLED"
+
+
 CONFIDENT_OPEN_BROWSER = "CONFIDENT_OPEN_BROWSER"
 CONFIDENT_TEST_CASE_BATCH_SIZE = "CONFIDENT_TEST_CASE_BATCH_SIZE"
deepeval/dataset/dataset.py
CHANGED

@@ -49,7 +49,7 @@ from deepeval.utils import (
 from deepeval.test_run import (
     global_test_run_manager,
 )
-
+
 from deepeval.tracing import trace_manager
 from deepeval.tracing.tracing import EVAL_DUMMY_SPAN_NAME

@@ -1248,16 +1248,7 @@ class EvaluationDataset:
                 display_config.file_output_dir,
             )

-            #
-            test_run = global_test_run_manager.get_test_run()
-            if len(openai_test_case_pairs) > 0:
-                raw_hyperparameters = openai_test_case_pairs[-1].hyperparameters
-                test_run.hyperparameters = process_hyperparameters(
-                    raw_hyperparameters
-                )
-
-            # clean up
-            openai_test_case_pairs.clear()
+            # save test run
             global_test_run_manager.save_test_run(TEMP_FILE_PATH)

             # sandwich end trace for OTEL
deepeval/dataset/utils.py
CHANGED
deepeval/evaluate/evaluate.py
CHANGED

@@ -28,7 +28,10 @@ from deepeval.evaluate.utils import (
 from deepeval.dataset import Golden
 from deepeval.prompt import Prompt
 from deepeval.test_case.utils import check_valid_test_cases_type
-from deepeval.test_run.hyperparameters import
+from deepeval.test_run.hyperparameters import (
+    process_hyperparameters,
+    process_prompts,
+)
 from deepeval.test_run.test_run import TEMP_FILE_PATH
 from deepeval.utils import (
     get_or_create_event_loop,

@@ -267,6 +270,7 @@ def evaluate(

    test_run = global_test_run_manager.get_test_run()
    test_run.hyperparameters = process_hyperparameters(hyperparameters)
+    test_run.prompts = process_prompts(hyperparameters)
    global_test_run_manager.save_test_run(TEMP_FILE_PATH)
    res = global_test_run_manager.wrap_up_test_run(
        run_duration, display_table=False
deepeval/evaluate/execute.py
CHANGED

@@ -61,6 +61,7 @@ from deepeval.test_case import (
     ConversationalTestCase,
     MLLMTestCase,
 )
+from deepeval.test_case.api import create_api_test_case
 from deepeval.test_run import (
     global_test_run_manager,
     LLMApiTestCase,

@@ -80,15 +81,18 @@ from deepeval.evaluate.utils import (
     create_api_trace,
     create_metric_data,
     create_test_result,
-    create_api_test_case,
     count_metrics_in_trace,
     extract_trace_test_results,
 )
 from deepeval.utils import add_pbar, update_pbar, custom_console
-from deepeval.openai.utils import openai_test_case_pairs
 from deepeval.tracing.types import TestCaseMetricPair
 from deepeval.config.settings import get_settings
-
+from deepeval.test_run import TEMP_FILE_PATH
+from deepeval.confident.api import is_confident
+from deepeval.test_run.hyperparameters import (
+    process_hyperparameters,
+    process_prompts,
+)

 logger = logging.getLogger(__name__)

@@ -902,6 +906,7 @@ def execute_agentic_test_cases(
                        trace_api.agent_spans.append(api_span)
                    elif isinstance(span, LlmSpan):
                        trace_api.llm_spans.append(api_span)
+                        log_prompt(span, test_run_manager)
                    elif isinstance(span, RetrieverSpan):
                        trace_api.retriever_spans.append(api_span)
                    elif isinstance(span, ToolSpan):

@@ -1284,6 +1289,7 @@ async def _a_execute_agentic_test_case(
                verbose_mode=verbose_mode,
                progress=progress,
                pbar_eval_id=pbar_eval_id,
+                test_run_manager=test_run_manager,
                _use_bar_indicator=_use_bar_indicator,
            )
        child_tasks = [dfs(child) for child in span.children]

@@ -1291,7 +1297,18 @@ async def _a_execute_agentic_test_case(
            await asyncio.gather(*child_tasks)

    test_start_time = time.perf_counter()
-
+    if current_trace and current_trace.root_spans:
+        await dfs(current_trace.root_spans[0])
+    else:
+        if (
+            logger.isEnabledFor(logging.DEBUG)
+            and get_settings().DEEPEVAL_VERBOSE_MODE
+        ):
+            logger.debug(
+                "Skipping DFS: empty trace or no root spans (trace=%s)",
+                current_trace.uuid if current_trace else None,
+            )
+
    test_end_time = time.perf_counter()
    run_duration = test_end_time - test_start_time

@@ -1313,6 +1330,7 @@ async def _a_execute_span_test_case(
    verbose_mode: Optional[bool],
    progress: Optional[Progress],
    pbar_eval_id: Optional[int],
+    test_run_manager: Optional[TestRunManager],
    _use_bar_indicator: bool,
):
    api_span: BaseApiSpan = trace_manager._convert_span_to_api_span(span)

@@ -1320,6 +1338,7 @@ async def _a_execute_span_test_case(
        trace_api.agent_spans.append(api_span)
    elif isinstance(span, LlmSpan):
        trace_api.llm_spans.append(api_span)
+        log_prompt(span, test_run_manager)
    elif isinstance(span, RetrieverSpan):
        trace_api.retriever_spans.append(api_span)
    elif isinstance(span, ToolSpan):

@@ -1568,6 +1587,7 @@ def execute_agentic_test_cases_from_loop(
                        trace_api.agent_spans.append(api_span)
                    elif isinstance(span, LlmSpan):
                        trace_api.llm_spans.append(api_span)
+                        log_prompt(span, test_run_manager)
                    elif isinstance(span, RetrieverSpan):
                        trace_api.retriever_spans.append(api_span)
                    elif isinstance(span, ToolSpan):

@@ -1748,6 +1768,7 @@ def execute_agentic_test_cases_from_loop(
        local_trace_manager.evaluating = False
        local_trace_manager.traces_to_evaluate_order.clear()
        local_trace_manager.traces_to_evaluate.clear()
+        local_trace_manager.trace_uuid_to_golden.clear()


def a_execute_agentic_test_cases_from_loop(

@@ -1950,12 +1971,12 @@ def a_execute_agentic_test_cases_from_loop(
                return

            try:
+                current_tasks = set()
                # Find tasks that were created during this run but we didn’t track
                current_tasks = loop.run_until_complete(_snapshot_tasks())
            except RuntimeError:
                # this might happen if the loop is already closing
-
-                return
+                pass


            leftovers = [
                t

@@ -1965,9 +1986,6 @@ def a_execute_agentic_test_cases_from_loop(
                and not t.done()
            ]

-            if not leftovers:
-                return
-
            if get_settings().DEEPEVAL_DEBUG_ASYNC:
                logger.warning(
                    "[deepeval] %d stray task(s) not tracked; cancelling...",

@@ -1978,20 +1996,21 @@ def a_execute_agentic_test_cases_from_loop(
                    name = t.get_name()
                    logger.warning(" - STRAY %s meta=%s", name, meta)

-
-                t
+            if leftovers:
+                for t in leftovers:
+                    t.cancel()

-
-
-
-
-                )
-            except RuntimeError:
-                # If the loop is closing here, just continue
-                if get_settings().DEEPEVAL_DEBUG_ASYNC:
-                    logger.warning(
-                        "[deepeval] failed to drain stray tasks because loop is closing"
+                # Drain strays so they don’t leak into the next iteration
+                try:
+                    loop.run_until_complete(
+                        asyncio.gather(*leftovers, return_exceptions=True)
                    )
+                except RuntimeError:
+                    # If the loop is closing here, just continue
+                    if get_settings().DEEPEVAL_DEBUG_ASYNC:
+                        logger.warning(
+                            "[deepeval] failed to drain stray tasks because loop is closing"
+                        )

            # Evaluate traces
            if trace_manager.traces_to_evaluate:

@@ -2014,25 +2033,6 @@ def a_execute_agentic_test_cases_from_loop(
                        pbar_id=pbar_id,
                    )
                )
-        elif openai_test_case_pairs:
-            loop.run_until_complete(
-                _evaluate_test_case_pairs(
-                    test_case_pairs=openai_test_case_pairs,
-                    test_run=test_run,
-                    test_run_manager=test_run_manager,
-                    test_results=test_results,
-                    ignore_errors=error_config.ignore_errors,
-                    skip_on_missing_params=error_config.skip_on_missing_params,
-                    show_indicator=display_config.show_indicator,
-                    verbose_mode=display_config.verbose_mode,
-                    throttle_value=async_config.throttle_value,
-                    max_concurrent=async_config.max_concurrent,
-                    _use_bar_indicator=_use_bar_indicator,
-                    _is_assert_test=_is_assert_test,
-                    progress=progress,
-                    pbar_id=pbar_id,
-                )
-            )
        elif trace_manager.integration_traces_to_evaluate:
            loop.run_until_complete(
                _a_evaluate_traces(

@@ -2106,6 +2106,7 @@ def a_execute_agentic_test_cases_from_loop(
        local_trace_manager.evaluating = False
        local_trace_manager.traces_to_evaluate_order.clear()
        local_trace_manager.traces_to_evaluate.clear()
+        local_trace_manager.trace_uuid_to_golden.clear()


async def _a_evaluate_traces(

@@ -2132,8 +2133,26 @@ async def _a_evaluate_traces(
            return await func(*args, **kwargs)

    eval_tasks = []
-
-
+    # Here, we will work off a fixed-set copy to avoid surprises from potential
+    # mid-iteration mutation
+    traces_snapshot = list(traces_to_evaluate or [])
+
+    for count, trace in enumerate(traces_snapshot):
+        # Prefer the explicit mapping from trace -> golden captured at trace creation.
+        golden = trace_manager.trace_uuid_to_golden.get(trace.uuid)
+        if not golden:
+            # trace started during evaluation_loop but the CURRENT_GOLDEN was
+            # not set for some reason. We can’t map it to a golden, so the best
+            # we can do is skip evaluation for this trace.
+            if (
+                logger.isEnabledFor(logging.DEBUG)
+                and get_settings().DEEPEVAL_VERBOSE_MODE
+            ):
+                logger.debug(
+                    "Skipping trace %s: no golden association found during evaluation_loop ",
+                    trace.uuid,
+                )
+            continue
        with capture_evaluation_run("golden"):
            task = execute_evals_with_semaphore(
                func=_a_execute_agentic_test_case,

@@ -2225,6 +2244,7 @@ def _execute_metric(
            test_case,
            _show_indicator=show_metric_indicator,
            _in_component=in_component,
+            _log_metric_to_confident=False,
        )
    except MissingTestCaseParamsError as e:
        if error_config.skip_on_missing_params:

@@ -2259,3 +2279,38 @@
            metric.success = False
        else:
            raise
+
+
+def log_prompt(
+    llm_span: LlmSpan,
+    test_run_manager: TestRunManager,
+):
+    prompt = llm_span.prompt
+    if prompt is None:
+        return
+
+    span_hyperparameters = {}
+    prompt_version = prompt.version if is_confident() else None
+    key = f"{prompt.alias}_{prompt_version}"
+    span_hyperparameters[key] = prompt
+
+    test_run = test_run_manager.get_test_run()
+    if test_run.prompts is None:
+        test_run.prompts = []
+    if test_run.hyperparameters is None:
+        test_run.hyperparameters = {}
+
+    if key not in test_run.hyperparameters:
+        test_run.hyperparameters.update(
+            process_hyperparameters(span_hyperparameters, False)
+        )
+        existing_prompt_keys = {
+            f"{p.alias}_{p.version}" for p in test_run.prompts
+        }
+        new_prompts = process_prompts(span_hyperparameters)
+        for new_prompt in new_prompts:
+            new_prompt_key = f"{new_prompt.alias}_{new_prompt.version}"
+            if new_prompt_key not in existing_prompt_keys:
+                test_run.prompts.append(new_prompt)
+
+    global_test_run_manager.save_test_run(TEMP_FILE_PATH)
deepeval/evaluate/utils.py
CHANGED

@@ -28,7 +28,6 @@ from deepeval.evaluate.types import TestResult
 from deepeval.tracing.api import TraceApi, BaseApiSpan, TraceSpanApiStatus
 from deepeval.tracing.tracing import BaseSpan, Trace
 from deepeval.tracing.types import TraceSpanStatus
-from deepeval.constants import PYTEST_RUN_TEST_NAME
 from deepeval.tracing.utils import (
     perf_counter_to_datetime,
     to_zod_compatible_iso,

@@ -133,121 +132,6 @@ def create_test_result(
     )


-def create_api_turn(turn: Turn, index: int) -> TurnApi:
-    return TurnApi(
-        role=turn.role,
-        content=turn.content,
-        user_id=turn.user_id,
-        retrievalContext=turn.retrieval_context,
-        toolsCalled=turn.tools_called,
-        additionalMetadata=turn.additional_metadata,
-        order=index,
-    )
-
-
-def create_api_test_case(
-    test_case: Union[LLMTestCase, ConversationalTestCase, MLLMTestCase],
-    trace: Optional[TraceApi] = None,
-    index: Optional[int] = None,
-) -> Union[LLMApiTestCase, ConversationalApiTestCase]:
-    if isinstance(test_case, ConversationalTestCase):
-        order = (
-            test_case._dataset_rank
-            if test_case._dataset_rank is not None
-            else index
-        )
-        if test_case.name:
-            name = test_case.name
-        else:
-            name = os.getenv(
-                PYTEST_RUN_TEST_NAME, f"conversational_test_case_{order}"
-            )
-
-        api_test_case = ConversationalApiTestCase(
-            name=name,
-            success=True,
-            metricsData=[],
-            runDuration=0,
-            evaluationCost=None,
-            order=order,
-            scenario=test_case.scenario,
-            expectedOutcome=test_case.expected_outcome,
-            userDescription=test_case.user_description,
-            context=test_case.context,
-            tags=test_case.tags,
-            comments=test_case.comments,
-            additionalMetadata=test_case.additional_metadata,
-        )
-        api_test_case.turns = [
-            create_api_turn(
-                turn=turn,
-                index=index,
-            )
-            for index, turn in enumerate(test_case.turns)
-        ]
-
-        return api_test_case
-    else:
-        order = (
-            test_case._dataset_rank
-            if test_case._dataset_rank is not None
-            else index
-        )
-
-        success = True
-        if test_case.name is not None:
-            name = test_case.name
-        else:
-            name = os.getenv(PYTEST_RUN_TEST_NAME, f"test_case_{order}")
-        metrics_data = []
-
-        if isinstance(test_case, LLMTestCase):
-            api_test_case = LLMApiTestCase(
-                name=name,
-                input=test_case.input,
-                actualOutput=test_case.actual_output,
-                expectedOutput=test_case.expected_output,
-                context=test_case.context,
-                retrievalContext=test_case.retrieval_context,
-                toolsCalled=test_case.tools_called,
-                expectedTools=test_case.expected_tools,
-                tokenCost=test_case.token_cost,
-                completionTime=test_case.completion_time,
-                tags=test_case.tags,
-                success=success,
-                metricsData=metrics_data,
-                runDuration=None,
-                evaluationCost=None,
-                order=order,
-                additionalMetadata=test_case.additional_metadata,
-                comments=test_case.comments,
-                trace=trace,
-            )
-        elif isinstance(test_case, MLLMTestCase):
-            api_test_case = LLMApiTestCase(
-                name=name,
-                input="",
-                multimodalInput=test_case.input,
-                multimodalActualOutput=test_case.actual_output,
-                multimodalExpectedOutput=test_case.expected_output,
-                multimodalRetrievalContext=test_case.retrieval_context,
-                multimodalContext=test_case.context,
-                toolsCalled=test_case.tools_called,
-                expectedTools=test_case.expected_tools,
-                tokenCost=test_case.token_cost,
-                completionTime=test_case.completion_time,
-                success=success,
-                metricsData=metrics_data,
-                runDuration=None,
-                evaluationCost=None,
-                order=order,
-                additionalMetadata=test_case.additional_metadata,
-                comments=test_case.comments,
-            )
-        # llm_test_case_lookup_map[instance_id] = api_test_case
-        return api_test_case
-
-
 def create_api_trace(trace: Trace, golden: Golden) -> TraceApi:
     return TraceApi(
         uuid=trace.uuid,

@@ -309,6 +193,26 @@ def validate_assert_test_inputs(
            "Both 'test_case' and 'metrics' must be provided together."
        )

+    if test_case and metrics:
+        if isinstance(test_case, LLMTestCase) and not all(
+            isinstance(metric, BaseMetric) for metric in metrics
+        ):
+            raise ValueError(
+                "All 'metrics' for an 'LLMTestCase' must be instances of 'BaseMetric' only."
+            )
+        if isinstance(test_case, ConversationalTestCase) and not all(
+            isinstance(metric, BaseConversationalMetric) for metric in metrics
+        ):
+            raise ValueError(
+                "All 'metrics' for an 'ConversationalTestCase' must be instances of 'BaseConversationalMetric' only."
+            )
+        if isinstance(test_case, MLLMTestCase) and not all(
+            isinstance(metric, BaseMultimodalMetric) for metric in metrics
+        ):
+            raise ValueError(
+                "All 'metrics' for an 'MLLMTestCase' must be instances of 'BaseMultimodalMetric' only."
+            )
+
    if not ((golden and observed_callback) or (test_case and metrics)):
        raise ValueError(
            "You must provide either ('golden' + 'observed_callback') or ('test_case' + 'metrics')."
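
Note: the new validation in validate_assert_test_inputs ties each test case type to its metric base class (LLMTestCase to BaseMetric, ConversationalTestCase to BaseConversationalMetric, MLLMTestCase to BaseMultimodalMetric). A compact, self-contained sketch of the same check, using placeholder classes rather than deepeval's real ones:

# Placeholder classes standing in for deepeval's test case and metric types.
class LLMTestCase: ...
class ConversationalTestCase: ...
class MLLMTestCase: ...

class BaseMetric: ...
class BaseConversationalMetric: ...
class BaseMultimodalMetric: ...

_EXPECTED_METRIC_BASE = {
    LLMTestCase: BaseMetric,
    ConversationalTestCase: BaseConversationalMetric,
    MLLMTestCase: BaseMultimodalMetric,
}


def check_metric_compatibility(test_case, metrics) -> None:
    # Reject any metric that does not match the test case's metric family.
    expected = _EXPECTED_METRIC_BASE.get(type(test_case))
    if expected is None:
        return
    if not all(isinstance(metric, expected) for metric in metrics):
        raise ValueError(
            f"All 'metrics' for a '{type(test_case).__name__}' must be "
            f"instances of '{expected.__name__}' only."
        )


check_metric_compatibility(LLMTestCase(), [BaseMetric()])  # passes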
deepeval/integrations/crewai/handler.py
CHANGED

@@ -13,7 +13,7 @@ logger = logging.getLogger(__name__)


 try:
-    from crewai.
+    from crewai.events import BaseEventListener
     from crewai.events import (
         CrewKickoffStartedEvent,
         CrewKickoffCompletedEvent,