deepeval 3.6.7__py3-none-any.whl → 3.6.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/config/settings.py +104 -36
- deepeval/config/utils.py +5 -0
- deepeval/dataset/dataset.py +162 -30
- deepeval/dataset/utils.py +41 -13
- deepeval/errors.py +20 -2
- deepeval/evaluate/execute.py +1662 -688
- deepeval/evaluate/types.py +1 -0
- deepeval/evaluate/utils.py +13 -3
- deepeval/integrations/crewai/__init__.py +2 -1
- deepeval/integrations/crewai/tool.py +71 -0
- deepeval/integrations/llama_index/__init__.py +0 -4
- deepeval/integrations/llama_index/handler.py +20 -21
- deepeval/integrations/pydantic_ai/instrumentator.py +125 -76
- deepeval/metrics/__init__.py +13 -0
- deepeval/metrics/base_metric.py +1 -0
- deepeval/metrics/contextual_precision/contextual_precision.py +27 -21
- deepeval/metrics/conversational_g_eval/__init__.py +3 -0
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +11 -7
- deepeval/metrics/dag/schema.py +1 -1
- deepeval/metrics/dag/templates.py +2 -2
- deepeval/metrics/goal_accuracy/__init__.py +1 -0
- deepeval/metrics/goal_accuracy/goal_accuracy.py +349 -0
- deepeval/metrics/goal_accuracy/schema.py +17 -0
- deepeval/metrics/goal_accuracy/template.py +235 -0
- deepeval/metrics/hallucination/hallucination.py +8 -8
- deepeval/metrics/indicator.py +21 -1
- deepeval/metrics/mcp/mcp_task_completion.py +7 -2
- deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +16 -6
- deepeval/metrics/mcp_use_metric/mcp_use_metric.py +2 -1
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +32 -24
- deepeval/metrics/plan_adherence/__init__.py +1 -0
- deepeval/metrics/plan_adherence/plan_adherence.py +292 -0
- deepeval/metrics/plan_adherence/schema.py +11 -0
- deepeval/metrics/plan_adherence/template.py +170 -0
- deepeval/metrics/plan_quality/__init__.py +1 -0
- deepeval/metrics/plan_quality/plan_quality.py +292 -0
- deepeval/metrics/plan_quality/schema.py +11 -0
- deepeval/metrics/plan_quality/template.py +101 -0
- deepeval/metrics/step_efficiency/__init__.py +1 -0
- deepeval/metrics/step_efficiency/schema.py +11 -0
- deepeval/metrics/step_efficiency/step_efficiency.py +234 -0
- deepeval/metrics/step_efficiency/template.py +256 -0
- deepeval/metrics/task_completion/task_completion.py +1 -0
- deepeval/metrics/tool_correctness/schema.py +6 -0
- deepeval/metrics/tool_correctness/template.py +88 -0
- deepeval/metrics/tool_correctness/tool_correctness.py +226 -22
- deepeval/metrics/tool_use/__init__.py +1 -0
- deepeval/metrics/tool_use/schema.py +19 -0
- deepeval/metrics/tool_use/template.py +220 -0
- deepeval/metrics/tool_use/tool_use.py +458 -0
- deepeval/metrics/topic_adherence/__init__.py +1 -0
- deepeval/metrics/topic_adherence/schema.py +16 -0
- deepeval/metrics/topic_adherence/template.py +162 -0
- deepeval/metrics/topic_adherence/topic_adherence.py +355 -0
- deepeval/models/embedding_models/azure_embedding_model.py +37 -36
- deepeval/models/embedding_models/local_embedding_model.py +30 -32
- deepeval/models/embedding_models/ollama_embedding_model.py +18 -20
- deepeval/models/embedding_models/openai_embedding_model.py +22 -31
- deepeval/models/llms/amazon_bedrock_model.py +20 -17
- deepeval/models/llms/openai_model.py +10 -1
- deepeval/models/retry_policy.py +103 -20
- deepeval/openai/extractors.py +61 -16
- deepeval/openai/patch.py +8 -12
- deepeval/openai/types.py +1 -1
- deepeval/openai/utils.py +108 -1
- deepeval/prompt/prompt.py +1 -0
- deepeval/prompt/utils.py +43 -14
- deepeval/simulator/conversation_simulator.py +25 -18
- deepeval/synthesizer/chunking/context_generator.py +9 -1
- deepeval/synthesizer/synthesizer.py +11 -10
- deepeval/test_case/llm_test_case.py +6 -2
- deepeval/test_run/test_run.py +190 -207
- deepeval/tracing/__init__.py +2 -1
- deepeval/tracing/otel/exporter.py +3 -4
- deepeval/tracing/otel/utils.py +23 -4
- deepeval/tracing/trace_context.py +53 -38
- deepeval/tracing/tracing.py +23 -0
- deepeval/tracing/types.py +16 -14
- deepeval/utils.py +21 -0
- {deepeval-3.6.7.dist-info → deepeval-3.6.9.dist-info}/METADATA +1 -1
- {deepeval-3.6.7.dist-info → deepeval-3.6.9.dist-info}/RECORD +85 -63
- deepeval/integrations/llama_index/agent/patched.py +0 -68
- deepeval/tracing/message_types/__init__.py +0 -10
- deepeval/tracing/message_types/base.py +0 -6
- deepeval/tracing/message_types/messages.py +0 -14
- deepeval/tracing/message_types/tools.py +0 -18
- {deepeval-3.6.7.dist-info → deepeval-3.6.9.dist-info}/LICENSE.md +0 -0
- {deepeval-3.6.7.dist-info → deepeval-3.6.9.dist-info}/WHEEL +0 -0
- {deepeval-3.6.7.dist-info → deepeval-3.6.9.dist-info}/entry_points.txt +0 -0
deepeval/evaluate/types.py
CHANGED
deepeval/evaluate/utils.py
CHANGED
@@ -5,8 +5,6 @@ import os
 import time

 from deepeval.utils import format_turn
-from deepeval.test_case.conversational_test_case import Turn
-from deepeval.test_run.api import TurnApi
 from deepeval.test_run.test_run import TestRunResultDisplay
 from deepeval.dataset import Golden
 from deepeval.metrics import (
@@ -481,6 +479,18 @@ def count_metrics_in_trace(trace: Trace) -> int:
     return sum(count_metrics_recursive(span) for span in trace.root_spans)


+def count_total_metrics_for_trace(trace: Trace) -> int:
+    """Span subtree metrics + trace-level metrics."""
+    return count_metrics_in_trace(trace=trace) + len(trace.metrics or [])
+
+
+def count_metrics_in_span_subtree(span: BaseSpan) -> int:
+    total = len(span.metrics or [])
+    for c in span.children or []:
+        total += count_metrics_in_span_subtree(c)
+    return total
+
+
 def extract_trace_test_results(trace_api: TraceApi) -> List[TestResult]:
     test_results: List[TestResult] = []
     # extract trace result
@@ -523,7 +533,7 @@ def extract_span_test_results(span_api: BaseApiSpan) -> List[TestResult]:
         test_results.append(
             TestResult(
                 name=span_api.name,
-                success=span_api.status ==
+                success=span_api.status == TraceSpanApiStatus.SUCCESS,
                 metrics_data=span_api.metrics_data,
                 input=span_api.input,
                 actual_output=span_api.output,
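The two new counting helpers are small recursive utilities: count_metrics_in_span_subtree walks one span's subtree, and count_total_metrics_for_trace adds trace-level metrics on top of the span totals. A minimal runnable sketch of the same recursion, using a stand-in dataclass instead of deepeval's BaseSpan (the stand-in type and sample metric names are illustrative assumptions):

from dataclasses import dataclass, field
from typing import List, Optional

@dataclass
class Span:  # stand-in for deepeval's BaseSpan; illustrative only
    metrics: Optional[List[str]] = None
    children: List["Span"] = field(default_factory=list)

def count_metrics_in_span_subtree(span: Span) -> int:
    # a span's own metrics, plus those of every descendant
    total = len(span.metrics or [])
    for child in span.children or []:
        total += count_metrics_in_span_subtree(child)
    return total

root = Span(metrics=["correctness"], children=[Span(metrics=["relevancy", "faithfulness"])])
print(count_metrics_in_span_subtree(root))  # 3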
deepeval/integrations/crewai/tool.py
ADDED
@@ -0,0 +1,71 @@
+import functools
+from typing import Callable
+from crewai.tools import tool as crewai_tool
+
+from deepeval.tracing.context import current_span_context
+from deepeval.tracing.types import ToolSpan
+
+
+def tool(*args, metric=None, metric_collection=None, **kwargs) -> Callable:
+    """
+    Simple wrapper around crewai.tools.tool that:
+    - prints the original function's input and output
+    - accepts additional parameters: metric and metric_collection (unused, for compatibility)
+    - remains backward compatible with CrewAI's decorator usage patterns
+    """
+    crewai_kwargs = kwargs
+
+    # Case 1: @tool (function passed directly)
+    if len(args) == 1 and callable(args[0]):
+        f = args[0]
+        tool_name = f.__name__
+
+        @functools.wraps(f)
+        def wrapped(*f_args, **f_kwargs):
+            current_span = current_span_context.get()
+            if current_span and isinstance(current_span, ToolSpan):
+                current_span.metric_collection = metric_collection
+                current_span.metrics = metric
+            result = f(*f_args, **f_kwargs)
+            return result
+
+        return crewai_tool(tool_name, **crewai_kwargs)(wrapped)
+
+    # Case 2: @tool("name")
+    if len(args) == 1 and isinstance(args[0], str):
+        tool_name = args[0]
+
+        def _decorator(f: Callable) -> Callable:
+            @functools.wraps(f)
+            def wrapped(*f_args, **f_kwargs):
+                current_span = current_span_context.get()
+                if current_span and isinstance(current_span, ToolSpan):
+                    current_span.metric_collection = metric_collection
+                    current_span.metrics = metric
+                result = f(*f_args, **f_kwargs)
+                return result
+
+            return crewai_tool(tool_name, **crewai_kwargs)(wrapped)
+
+        return _decorator
+
+    # Case 3: @tool(result_as_answer=True, ...) — kwargs only
+    if len(args) == 0:
+
+        def _decorator(f: Callable) -> Callable:
+            tool_name = f.__name__
+
+            @functools.wraps(f)
+            def wrapped(*f_args, **f_kwargs):
+                current_span = current_span_context.get()
+                if current_span and isinstance(current_span, ToolSpan):
+                    current_span.metric_collection = metric_collection
+                    current_span.metrics = metric
+                result = f(*f_args, **f_kwargs)
+                return result
+
+            return crewai_tool(tool_name, **crewai_kwargs)(wrapped)
+
+        return _decorator
+
+    raise ValueError("Invalid arguments")
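For orientation, a hedged usage sketch of the new decorator. The tool name, the collection name, and the assumption that the changed crewai/__init__.py re-exports the decorator are all illustrative, not confirmed by this diff:

from deepeval.integrations.crewai import tool  # assumed re-export via __init__.py

# Hypothetical tool; when it executes inside a deepeval ToolSpan, the
# decorator attaches the given metric collection to that span.
@tool("web_search", metric_collection="tool-quality")
def web_search(query: str) -> str:
    """Look up a query and return raw results."""
    return f"results for {query}"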
deepeval/integrations/llama_index/handler.py
CHANGED
@@ -5,6 +5,10 @@ import uuid
 from deepeval.telemetry import capture_tracing_integration
 from deepeval.tracing import trace_manager
 from deepeval.tracing.types import AgentSpan, BaseSpan, LlmSpan, TraceSpanStatus
+from deepeval.tracing.trace_context import (
+    current_llm_context,
+    current_agent_context,
+)

 try:
     from llama_index.core.instrumentation.events.base import BaseEvent
@@ -22,11 +26,6 @@ try:
         LLMChatEndEvent,
     )
     from llama_index_instrumentation.dispatcher import Dispatcher
-    from deepeval.integrations.llama_index.agent.patched import (
-        FunctionAgent as PatchedFunctionAgent,
-        ReActAgent as PatchedReActAgent,
-        CodeActAgent as PatchedCodeActAgent,
-    )
     from deepeval.integrations.llama_index.utils import (
         parse_id,
         prepare_input_llm_test_case_params,
@@ -67,6 +66,7 @@ class LLamaIndexHandler(BaseEventHandler, BaseSpanHandler):
                 ).strip()
                 input_messages.append({"role": role, "content": content})

+            llm_span_context = current_llm_context.get()
             # create the span
             llm_span = LlmSpan(
                 name="ConfidentLLMSpan",
@@ -83,6 +83,12 @@ class LLamaIndexHandler(BaseEventHandler, BaseSpanHandler):
                 ),  # check the model name not coming in this option
                 input=input_messages,
                 output="",
+                metrics=llm_span_context.metrics if llm_span_context else None,
+                metric_collection=(
+                    llm_span_context.metric_collection
+                    if llm_span_context
+                    else None
+                ),
             )
             trace_manager.add_span(llm_span)
             trace_manager.add_span_to_trace(llm_span)
@@ -144,6 +150,7 @@ class LLamaIndexHandler(BaseEventHandler, BaseSpanHandler):

         # conditions to qualify as agent start run span
         if method_name == "run":
+            agent_span_context = current_agent_context.get()
             span = AgentSpan(
                 uuid=id_,
                 status=TraceSpanStatus.IN_PROGRESS,
@@ -153,24 +160,16 @@ class LLamaIndexHandler(BaseEventHandler, BaseSpanHandler):
                 start_time=perf_counter(),
                 name="Agent",  # TODO: decide the name of the span
                 input=bound_args.arguments,
+                metrics=(
+                    agent_span_context.metrics if agent_span_context else None
+                ),
+                metric_collection=(
+                    agent_span_context.metric_collection
+                    if agent_span_context
+                    else None
+                ),
             )

-            # check if the instance is a PatchedFunctionAgent
-            if isinstance(instance, PatchedFunctionAgent):
-                span.name = "FunctionAgent"
-                span.metric_collection = instance.metric_collection
-                span.metrics = instance.metrics
-
-            if isinstance(instance, PatchedReActAgent):
-                span.name = "ReActAgent"
-                span.metric_collection = instance.metric_collection
-                span.metrics = instance.metrics
-
-            if isinstance(instance, PatchedCodeActAgent):
-                span.name = "CodeActAgent"
-                span.metric_collection = instance.metric_collection
-                span.metrics = instance.metrics
-
             # prepare input test case params for the span
             prepare_input_llm_test_case_params(
                 class_name, method_name, span, bound_args.arguments
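The handler now reads metrics and metric_collection from contextvars instead of from patched agent subclasses. A minimal runnable sketch of that read-from-context pattern; only the contextvar name and the two attributes come from the diff, while the LlmSpanContext stand-in type is an assumption:

from contextvars import ContextVar
from dataclasses import dataclass
from typing import List, Optional

@dataclass
class LlmSpanContext:  # hypothetical stand-in for deepeval's context payload
    metrics: Optional[List[object]] = None
    metric_collection: Optional[str] = None

current_llm_context: ContextVar[Optional[LlmSpanContext]] = ContextVar(
    "current_llm_context", default=None
)

def llm_span_kwargs() -> dict:
    # mirrors the handler: fall back to None when no context is set
    ctx = current_llm_context.get()
    return {
        "metrics": ctx.metrics if ctx else None,
        "metric_collection": ctx.metric_collection if ctx else None,
    }

token = current_llm_context.set(LlmSpanContext(metric_collection="llm-quality"))
print(llm_span_kwargs())  # {'metrics': None, 'metric_collection': 'llm-quality'}
current_llm_context.reset(token)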
deepeval/integrations/pydantic_ai/instrumentator.py
CHANGED
@@ -1,14 +1,19 @@
 import json
 import logging
 import os
+from time import perf_counter
 from typing import Literal, Optional, List

 from deepeval.config.settings import get_settings
 from deepeval.confident.api import get_confident_api_key
+from deepeval.metrics.base_metric import BaseMetric
 from deepeval.prompt import Prompt
 from deepeval.tracing.context import current_trace_context
 from deepeval.tracing.types import Trace
 from deepeval.tracing.otel.utils import to_hex_string
+from deepeval.tracing.tracing import trace_manager
+from deepeval.tracing.otel.utils import normalize_pydantic_ai_messages
+from deepeval.tracing.otel.exporter import ConfidentSpanExporter


 logger = logging.getLogger(__name__)
@@ -21,6 +26,7 @@ try:
     from opentelemetry.exporter.otlp.proto.http.trace_exporter import (
         OTLPSpanExporter,
     )
+    from opentelemetry.sdk.trace import ReadableSpan

     dependency_installed = True
 except ImportError as e:
@@ -48,24 +54,96 @@ def is_dependency_installed():
     return True


+from deepeval.tracing.types import AgentSpan
 from deepeval.confident.api import get_confident_api_key
 from deepeval.prompt import Prompt
 from deepeval.tracing.otel.test_exporter import test_exporter
 from deepeval.tracing.context import current_trace_context
 from deepeval.tracing.types import Trace
 from deepeval.tracing.otel.utils import to_hex_string
+from deepeval.tracing.types import TraceSpanStatus, ToolCall
+from deepeval.tracing.perf_epoch_bridge import init_clock_bridge

 # OTLP_ENDPOINT = "http://127.0.0.1:4318/v1/traces"
 OTLP_ENDPOINT = "https://otel.confident-ai.com/v1/traces"
+init_clock_bridge()  # initialize clock bridge for perf_counter() to epoch_nanos conversion
+
+
+class ConfidentInstrumentationSettings(InstrumentationSettings):
+
+    def __init__(
+        self,
+        api_key: Optional[str] = None,
+        name: Optional[str] = None,
+        thread_id: Optional[str] = None,
+        user_id: Optional[str] = None,
+        metadata: Optional[dict] = None,
+        tags: Optional[List[str]] = None,
+        metric_collection: Optional[str] = None,
+        confident_prompt: Optional[Prompt] = None,
+        llm_metric_collection: Optional[str] = None,
+        agent_metric_collection: Optional[str] = None,
+        tool_metric_collection_map: Optional[dict] = None,
+        trace_metric_collection: Optional[str] = None,
+        is_test_mode: Optional[bool] = False,
+        agent_metrics: Optional[List[BaseMetric]] = None,
+    ):
+        is_dependency_installed()
+
+        _environment = os.getenv("CONFIDENT_TRACE_ENVIRONMENT", "development")
+        if _environment and _environment in [
+            "production",
+            "staging",
+            "development",
+            "testing",
+        ]:
+            self.environment = _environment
+
+        self.tool_metric_collection_map = tool_metric_collection_map or {}
+        self.name = name
+        self.thread_id = thread_id
+        self.user_id = user_id
+        self.metadata = metadata
+        self.tags = tags
+        self.metric_collection = metric_collection
+        self.confident_prompt = confident_prompt
+        self.llm_metric_collection = llm_metric_collection
+        self.agent_metric_collection = agent_metric_collection
+        self.trace_metric_collection = trace_metric_collection
+        self.is_test_mode = is_test_mode
+        self.agent_metrics = agent_metrics
+
+        if not api_key:
+            api_key = get_confident_api_key()
+        if not api_key:
+            raise ValueError("CONFIDENT_API_KEY is not set")
+
+        trace_provider = TracerProvider()
+
+        # Pass the entire settings instance instead of individual values
+        span_interceptor = SpanInterceptor(self)
+        trace_provider.add_span_processor(span_interceptor)
+
+        if is_test_mode:
+            trace_provider.add_span_processor(BatchSpanProcessor(test_exporter))
+        else:
+            trace_provider.add_span_processor(
+                BatchSpanProcessor(
+                    OTLPSpanExporter(
+                        endpoint=OTLP_ENDPOINT,
+                        headers={"x-confident-api-key": api_key},
+                    )
+                )
+            )
+        super().__init__(tracer_provider=trace_provider)


 class SpanInterceptor(SpanProcessor):
-    def __init__(self, settings_instance):
+    def __init__(self, settings_instance: ConfidentInstrumentationSettings):
         # Keep a reference to the settings instance instead of copying values
-        self.settings
+        self.settings = settings_instance

     def on_start(self, span, parent_context):
-
         # set trace uuid
         _current_trace_context = current_trace_context.get()
         if _current_trace_context and isinstance(_current_trace_context, Trace):
@@ -151,85 +229,56 @@ class SpanInterceptor(SpanProcessor):
         )

     def on_end(self, span):
-
-
+        if self.settings.is_test_mode:
+            if span.attributes.get("confident.span.type") == "agent":

-
-
-
-    thread_id: Optional[str] = None
-    user_id: Optional[str] = None
-    metadata: Optional[dict] = None
-    tags: Optional[List[str]] = None
-    environment: Literal["production", "staging", "development", "testing"] = (
-        None
-    )
-    metric_collection: Optional[str] = None
-    confident_prompt: Optional[Prompt] = None
-    llm_metric_collection: Optional[str] = None
-    agent_metric_collection: Optional[str] = None
-    tool_metric_collection_map: dict = {}
-    trace_metric_collection: Optional[str] = None
+                def create_agent_span_for_evaluation(
+                    span: ReadableSpan,
+                ) -> AgentSpan:

-
-
-
-
-
-        user_id: Optional[str] = None,
-        metadata: Optional[dict] = None,
-        tags: Optional[List[str]] = None,
-        metric_collection: Optional[str] = None,
-        confident_prompt: Optional[Prompt] = None,
-        llm_metric_collection: Optional[str] = None,
-        agent_metric_collection: Optional[str] = None,
-        tool_metric_collection_map: Optional[dict] = None,
-        trace_metric_collection: Optional[str] = None,
-        is_test_mode: Optional[bool] = False,
-    ):
-        is_dependency_installed()
+                    agent_span = (
+                        ConfidentSpanExporter.prepare_boilerplate_base_span(
+                            span
+                        )
+                    )

-
-
-
-            "staging",
-            "development",
-            "testing",
-        ]:
-            self.environment = _environment
+                    # tools called
+                    normalized_messages = normalize_pydantic_ai_messages(span)
+                    tools_called = []

-
-
-
-
-
-
-
-
-
-
-        self.trace_metric_collection = trace_metric_collection
+                    for message in normalized_messages:
+                        for part in message.get("parts", []):
+                            if part.get("type") == "tool_call":
+                                name = part.get("name")
+                                try:
+                                    input_parameters = json.loads(
+                                        part.get("arguments")
+                                    )
+                                except Exception:
+                                    input_parameters = {}

-
-
-
-
+                                tools_called.append(
+                                    ToolCall(
+                                        name=name,
+                                        input_parameters=input_parameters,
+                                    )
+                                )

-
+                    # agent_span.tools_called = tools_called
+                    return agent_span

-
-
-        trace_provider.add_span_processor(span_interceptor)
+                agent_span = create_agent_span_for_evaluation(span)
+                agent_span.metrics = self.settings.agent_metrics

-
-
-
-
-
-                    OTLPSpanExporter(
-                        endpoint=OTLP_ENDPOINT,
-                        headers={"x-confident-api-key": api_key},
+                # create a trace for evaluation
+                trace = trace_manager.get_trace_by_uuid(agent_span.trace_uuid)
+                if not trace:
+                    trace = trace_manager.start_new_trace(
+                        trace_uuid=agent_span.trace_uuid
                     )
-
-
-
+
+                trace.root_spans.append(agent_span)
+                trace.status = TraceSpanStatus.SUCCESS
+                trace.end_time = perf_counter()
+                trace_manager.traces_to_evaluate.append(trace)
+                test_exporter.clear_span_json_list()
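A hedged construction sketch for the rewritten settings class, using only keyword arguments visible in the diff. The direct module path, the collection name, and the metric choice are illustrative assumptions, and the constructor requires a Confident API key (via api_key= or the CONFIDENT_API_KEY environment, per the code above):

from deepeval.integrations.pydantic_ai.instrumentator import (
    ConfidentInstrumentationSettings,
)
from deepeval.metrics import TaskCompletionMetric

# is_test_mode=True routes spans to the in-memory test exporter instead of OTLP
settings = ConfidentInstrumentationSettings(
    thread_id="thread-123",
    agent_metric_collection="agent-quality",
    agent_metrics=[TaskCompletionMetric()],
    is_test_mode=True,
)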
deepeval/metrics/__init__.py
CHANGED
@@ -27,6 +27,12 @@ from .tool_correctness.tool_correctness import ToolCorrectnessMetric
 from .json_correctness.json_correctness import JsonCorrectnessMetric
 from .prompt_alignment.prompt_alignment import PromptAlignmentMetric
 from .task_completion.task_completion import TaskCompletionMetric
+from .topic_adherence.topic_adherence import TopicAdherenceMetric
+from .step_efficiency.step_efficiency import StepEfficiencyMetric
+from .plan_adherence.plan_adherence import PlanAdherenceMetric
+from .plan_quality.plan_quality import PlanQualityMetric
+from .tool_use.tool_use import ToolUseMetric
+from .goal_accuracy.goal_accuracy import GoalAccuracyMetric
 from .argument_correctness.argument_correctness import ArgumentCorrectnessMetric
 from .mcp.mcp_task_completion import MCPTaskCompletionMetric
 from .mcp.multi_turn_mcp_use_metric import MultiTurnMCPUseMetric
@@ -98,6 +104,13 @@ __all__ = [
     "TaskCompletionMetric",
     "ArgumentCorrectnessMetric",
     "KnowledgeRetentionMetric",
+    # Agentic metrics
+    "TopicAdherenceMetric",
+    "StepEfficiencyMetric",
+    "PlanAdherenceMetric",
+    "PlanQualityMetric",
+    "ToolUseMetric",
+    "GoalAccuracyMetric",
     # Conversational metrics
     "TurnRelevancyMetric",
     "ConversationCompletenessMetric",
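The practical effect of this hunk is that the six new agentic metrics become importable from the package root:

from deepeval.metrics import (
    GoalAccuracyMetric,
    PlanAdherenceMetric,
    PlanQualityMetric,
    StepEfficiencyMetric,
    ToolUseMetric,
    TopicAdherenceMetric,
)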
deepeval/metrics/base_metric.py
CHANGED
deepeval/metrics/contextual_precision/contextual_precision.py
CHANGED
@@ -17,7 +17,7 @@ from deepeval.metrics.contextual_precision.template import (
     ContextualPrecisionTemplate,
 )
 from deepeval.metrics.indicator import metric_progress_indicator
-
+import deepeval.metrics.contextual_precision.schema as cpschema
 from deepeval.metrics.api import metric_data_manager


@@ -73,7 +73,7 @@ class ContextualPrecisionMetric(BaseMetric):
                 )
             )
         else:
-            self.verdicts: List[ContextualPrecisionVerdict] = (
+            self.verdicts: List[cpschema.ContextualPrecisionVerdict] = (
                 self._generate_verdicts(
                     test_case.input,
                     test_case.expected_output,
@@ -113,7 +113,7 @@ class ContextualPrecisionMetric(BaseMetric):
             _show_indicator=_show_indicator,
             _in_component=_in_component,
         ):
-            self.verdicts: List[ContextualPrecisionVerdict] = (
+            self.verdicts: List[cpschema.ContextualPrecisionVerdict] = (
                 await self._a_generate_verdicts(
                     test_case.input,
                     test_case.expected_output,
@@ -141,7 +141,7 @@ class ContextualPrecisionMetric(BaseMetric):
             return None

         retrieval_contexts_verdicts = [
-            {"verdict": verdict.verdict, "
+            {"verdict": verdict.verdict, "reason": verdict.reason}
             for verdict in self.verdicts
         ]
         prompt = self.evaluation_template.generate_reason(
@@ -152,15 +152,15 @@ class ContextualPrecisionMetric(BaseMetric):

         if self.using_native_model:
             res, cost = await self.model.a_generate(
-                prompt, schema=ContextualPrecisionScoreReason
+                prompt, schema=cpschema.ContextualPrecisionScoreReason
             )
             self.evaluation_cost += cost
             return res.reason
         else:
             try:
-                res: ContextualPrecisionScoreReason = (
+                res: cpschema.ContextualPrecisionScoreReason = (
                     await self.model.a_generate(
-                        prompt, schema=ContextualPrecisionScoreReason
+                        prompt, schema=cpschema.ContextualPrecisionScoreReason
                     )
                 )
                 return res.reason
@@ -174,7 +174,7 @@ class ContextualPrecisionMetric(BaseMetric):
             return None

         retrieval_contexts_verdicts = [
-            {"verdict": verdict.verdict, "
+            {"verdict": verdict.verdict, "reason": verdict.reason}
             for verdict in self.verdicts
         ]
         prompt = self.evaluation_template.generate_reason(
@@ -185,14 +185,16 @@ class ContextualPrecisionMetric(BaseMetric):

         if self.using_native_model:
             res, cost = self.model.generate(
-                prompt, schema=ContextualPrecisionScoreReason
+                prompt, schema=cpschema.ContextualPrecisionScoreReason
             )
             self.evaluation_cost += cost
             return res.reason
         else:
             try:
-                res: ContextualPrecisionScoreReason =
-
+                res: cpschema.ContextualPrecisionScoreReason = (
+                    self.model.generate(
+                        prompt, schema=cpschema.ContextualPrecisionScoreReason
+                    )
                 )
                 return res.reason
             except TypeError:
@@ -202,21 +204,23 @@ class ContextualPrecisionMetric(BaseMetric):

     async def _a_generate_verdicts(
         self, input: str, expected_output: str, retrieval_context: List[str]
-    ) -> List[ContextualPrecisionVerdict]:
+    ) -> List[cpschema.ContextualPrecisionVerdict]:
         prompt = self.evaluation_template.generate_verdicts(
             input=input,
             expected_output=expected_output,
             retrieval_context=retrieval_context,
         )
         if self.using_native_model:
-            res, cost = await self.model.a_generate(
+            res, cost = await self.model.a_generate(
+                prompt, schema=cpschema.Verdicts
+            )
             self.evaluation_cost += cost
             verdicts = [item for item in res.verdicts]
             return verdicts
         else:
             try:
-                res: Verdicts = await self.model.a_generate(
-                    prompt, schema=Verdicts
+                res: cpschema.Verdicts = await self.model.a_generate(
+                    prompt, schema=cpschema.Verdicts
                 )
                 verdicts = [item for item in res.verdicts]
                 return verdicts
@@ -224,34 +228,36 @@ class ContextualPrecisionMetric(BaseMetric):
                 res = await self.model.a_generate(prompt)
                 data = trimAndLoadJson(res, self)
                 verdicts = [
-                    ContextualPrecisionVerdict(**item)
+                    cpschema.ContextualPrecisionVerdict(**item)
                     for item in data["verdicts"]
                 ]
                 return verdicts

     def _generate_verdicts(
         self, input: str, expected_output: str, retrieval_context: List[str]
-    ) -> List[ContextualPrecisionVerdict]:
+    ) -> List[cpschema.ContextualPrecisionVerdict]:
         prompt = self.evaluation_template.generate_verdicts(
             input=input,
             expected_output=expected_output,
             retrieval_context=retrieval_context,
         )
         if self.using_native_model:
-            res, cost = self.model.generate(prompt, schema=Verdicts)
+            res, cost = self.model.generate(prompt, schema=cpschema.Verdicts)
             self.evaluation_cost += cost
             verdicts = [item for item in res.verdicts]
             return verdicts
         else:
             try:
-                res: Verdicts = self.model.generate(
+                res: cpschema.Verdicts = self.model.generate(
+                    prompt, schema=cpschema.Verdicts
+                )
                 verdicts = [item for item in res.verdicts]
                 return verdicts
             except TypeError:
                 res = self.model.generate(prompt)
                 data = trimAndLoadJson(res, self)
                 verdicts = [
-                    ContextualPrecisionVerdict(**item)
+                    cpschema.ContextualPrecisionVerdict(**item)
                     for item in data["verdicts"]
                 ]
                 return verdicts
@@ -288,7 +294,7 @@ class ContextualPrecisionMetric(BaseMetric):
         else:
             try:
                 self.success = self.score >= self.threshold
-            except:
+            except TypeError:
                 self.success = False
         return self.success
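The recurring change in this file swaps direct class imports for module-qualified access (cpschema.Verdicts instead of Verdicts), so the attribute lookup happens at call time and later patches to the schema module are seen everywhere. A tiny sketch of the access pattern; the field names come from the diff, while the field values are illustrative:

import deepeval.metrics.contextual_precision.schema as cpschema

# attribute lookup at call time: replacing the module's classes (e.g. in
# tests) is picked up by every site that references them through cpschema
verdict = cpschema.ContextualPrecisionVerdict(verdict="yes", reason="context was used")
print(verdict.verdict, verdict.reason)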