deepeval 3.6.6__py3-none-any.whl → 3.6.8__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.
- deepeval/_version.py +1 -1
- deepeval/benchmarks/equity_med_qa/equity_med_qa.py +1 -0
- deepeval/cli/main.py +42 -0
- deepeval/confident/api.py +1 -0
- deepeval/config/settings.py +22 -4
- deepeval/constants.py +8 -1
- deepeval/dataset/dataset.py +2 -11
- deepeval/dataset/utils.py +1 -1
- deepeval/errors.py +20 -2
- deepeval/evaluate/evaluate.py +5 -1
- deepeval/evaluate/execute.py +811 -248
- deepeval/evaluate/types.py +1 -0
- deepeval/evaluate/utils.py +33 -119
- deepeval/integrations/crewai/__init__.py +7 -1
- deepeval/integrations/crewai/handler.py +1 -1
- deepeval/integrations/crewai/subs.py +51 -0
- deepeval/integrations/crewai/tool.py +71 -0
- deepeval/integrations/crewai/wrapper.py +45 -5
- deepeval/integrations/llama_index/__init__.py +0 -4
- deepeval/integrations/llama_index/handler.py +20 -21
- deepeval/integrations/pydantic_ai/instrumentator.py +125 -76
- deepeval/metrics/__init__.py +13 -0
- deepeval/metrics/answer_relevancy/answer_relevancy.py +12 -3
- deepeval/metrics/api.py +281 -0
- deepeval/metrics/argument_correctness/argument_correctness.py +12 -2
- deepeval/metrics/base_metric.py +1 -0
- deepeval/metrics/bias/bias.py +12 -3
- deepeval/metrics/contextual_precision/contextual_precision.py +39 -24
- deepeval/metrics/contextual_recall/contextual_recall.py +12 -3
- deepeval/metrics/contextual_relevancy/contextual_relevancy.py +12 -1
- deepeval/metrics/conversation_completeness/conversation_completeness.py +12 -0
- deepeval/metrics/conversational_dag/conversational_dag.py +12 -0
- deepeval/metrics/conversational_dag/nodes.py +12 -4
- deepeval/metrics/conversational_g_eval/__init__.py +3 -0
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +84 -66
- deepeval/metrics/dag/dag.py +12 -0
- deepeval/metrics/dag/nodes.py +12 -4
- deepeval/metrics/dag/schema.py +1 -1
- deepeval/metrics/dag/templates.py +2 -2
- deepeval/metrics/faithfulness/faithfulness.py +12 -1
- deepeval/metrics/g_eval/g_eval.py +11 -0
- deepeval/metrics/goal_accuracy/__init__.py +1 -0
- deepeval/metrics/goal_accuracy/goal_accuracy.py +349 -0
- deepeval/metrics/goal_accuracy/schema.py +17 -0
- deepeval/metrics/goal_accuracy/template.py +235 -0
- deepeval/metrics/hallucination/hallucination.py +20 -9
- deepeval/metrics/indicator.py +8 -2
- deepeval/metrics/json_correctness/json_correctness.py +12 -1
- deepeval/metrics/knowledge_retention/knowledge_retention.py +12 -0
- deepeval/metrics/mcp/mcp_task_completion.py +20 -2
- deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +29 -6
- deepeval/metrics/mcp_use_metric/mcp_use_metric.py +14 -2
- deepeval/metrics/misuse/misuse.py +12 -1
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +3 -0
- deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +3 -0
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +3 -0
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +6 -1
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +38 -25
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +10 -5
- deepeval/metrics/non_advice/non_advice.py +12 -0
- deepeval/metrics/pii_leakage/pii_leakage.py +12 -1
- deepeval/metrics/plan_adherence/__init__.py +1 -0
- deepeval/metrics/plan_adherence/plan_adherence.py +292 -0
- deepeval/metrics/plan_adherence/schema.py +11 -0
- deepeval/metrics/plan_adherence/template.py +170 -0
- deepeval/metrics/plan_quality/__init__.py +1 -0
- deepeval/metrics/plan_quality/plan_quality.py +292 -0
- deepeval/metrics/plan_quality/schema.py +11 -0
- deepeval/metrics/plan_quality/template.py +101 -0
- deepeval/metrics/prompt_alignment/prompt_alignment.py +12 -1
- deepeval/metrics/role_adherence/role_adherence.py +12 -0
- deepeval/metrics/role_violation/role_violation.py +12 -0
- deepeval/metrics/step_efficiency/__init__.py +1 -0
- deepeval/metrics/step_efficiency/schema.py +11 -0
- deepeval/metrics/step_efficiency/step_efficiency.py +234 -0
- deepeval/metrics/step_efficiency/template.py +256 -0
- deepeval/metrics/summarization/summarization.py +12 -1
- deepeval/metrics/task_completion/task_completion.py +4 -0
- deepeval/metrics/tool_correctness/schema.py +6 -0
- deepeval/metrics/tool_correctness/template.py +88 -0
- deepeval/metrics/tool_correctness/tool_correctness.py +233 -21
- deepeval/metrics/tool_use/__init__.py +1 -0
- deepeval/metrics/tool_use/schema.py +19 -0
- deepeval/metrics/tool_use/template.py +220 -0
- deepeval/metrics/tool_use/tool_use.py +458 -0
- deepeval/metrics/topic_adherence/__init__.py +1 -0
- deepeval/metrics/topic_adherence/schema.py +16 -0
- deepeval/metrics/topic_adherence/template.py +162 -0
- deepeval/metrics/topic_adherence/topic_adherence.py +355 -0
- deepeval/metrics/toxicity/toxicity.py +12 -0
- deepeval/metrics/turn_relevancy/turn_relevancy.py +12 -0
- deepeval/models/embedding_models/azure_embedding_model.py +37 -36
- deepeval/models/embedding_models/local_embedding_model.py +30 -32
- deepeval/models/embedding_models/ollama_embedding_model.py +18 -20
- deepeval/models/embedding_models/openai_embedding_model.py +22 -31
- deepeval/models/llms/grok_model.py +1 -1
- deepeval/models/llms/openai_model.py +2 -0
- deepeval/openai/__init__.py +14 -32
- deepeval/openai/extractors.py +85 -50
- deepeval/openai/patch.py +258 -167
- deepeval/openai/types.py +20 -0
- deepeval/openai/utils.py +205 -56
- deepeval/prompt/__init__.py +19 -1
- deepeval/prompt/api.py +160 -0
- deepeval/prompt/prompt.py +245 -62
- deepeval/prompt/utils.py +186 -15
- deepeval/synthesizer/chunking/context_generator.py +209 -152
- deepeval/synthesizer/chunking/doc_chunker.py +46 -12
- deepeval/synthesizer/synthesizer.py +19 -15
- deepeval/test_case/api.py +131 -0
- deepeval/test_case/llm_test_case.py +6 -2
- deepeval/test_run/__init__.py +1 -0
- deepeval/test_run/hyperparameters.py +47 -8
- deepeval/test_run/test_run.py +292 -206
- deepeval/tracing/__init__.py +2 -1
- deepeval/tracing/api.py +3 -1
- deepeval/tracing/otel/exporter.py +3 -4
- deepeval/tracing/otel/utils.py +24 -5
- deepeval/tracing/trace_context.py +89 -5
- deepeval/tracing/tracing.py +74 -3
- deepeval/tracing/types.py +20 -2
- deepeval/tracing/utils.py +8 -0
- deepeval/utils.py +21 -0
- {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/METADATA +1 -1
- {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/RECORD +133 -103
- deepeval/integrations/llama_index/agent/patched.py +0 -68
- {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/LICENSE.md +0 -0
- {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/WHEEL +0 -0
- {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/entry_points.txt +0 -0
deepeval/integrations/pydantic_ai/instrumentator.py
CHANGED
@@ -1,14 +1,19 @@
 import json
 import logging
 import os
+from time import perf_counter
 from typing import Literal, Optional, List
 
 from deepeval.config.settings import get_settings
 from deepeval.confident.api import get_confident_api_key
+from deepeval.metrics.base_metric import BaseMetric
 from deepeval.prompt import Prompt
 from deepeval.tracing.context import current_trace_context
 from deepeval.tracing.types import Trace
 from deepeval.tracing.otel.utils import to_hex_string
+from deepeval.tracing.tracing import trace_manager
+from deepeval.tracing.otel.utils import normalize_pydantic_ai_messages
+from deepeval.tracing.otel.exporter import ConfidentSpanExporter
 
 
 logger = logging.getLogger(__name__)
@@ -21,6 +26,7 @@ try:
     from opentelemetry.exporter.otlp.proto.http.trace_exporter import (
         OTLPSpanExporter,
     )
+    from opentelemetry.sdk.trace import ReadableSpan
 
     dependency_installed = True
 except ImportError as e:
@@ -48,24 +54,96 @@ def is_dependency_installed():
     return True
 
 
+from deepeval.tracing.types import AgentSpan
 from deepeval.confident.api import get_confident_api_key
 from deepeval.prompt import Prompt
 from deepeval.tracing.otel.test_exporter import test_exporter
 from deepeval.tracing.context import current_trace_context
 from deepeval.tracing.types import Trace
 from deepeval.tracing.otel.utils import to_hex_string
+from deepeval.tracing.types import TraceSpanStatus, ToolCall
+from deepeval.tracing.perf_epoch_bridge import init_clock_bridge
 
 # OTLP_ENDPOINT = "http://127.0.0.1:4318/v1/traces"
 OTLP_ENDPOINT = "https://otel.confident-ai.com/v1/traces"
+init_clock_bridge()  # initialize clock bridge for perf_counter() to epoch_nanos conversion
+
+
+class ConfidentInstrumentationSettings(InstrumentationSettings):
+
+    def __init__(
+        self,
+        api_key: Optional[str] = None,
+        name: Optional[str] = None,
+        thread_id: Optional[str] = None,
+        user_id: Optional[str] = None,
+        metadata: Optional[dict] = None,
+        tags: Optional[List[str]] = None,
+        metric_collection: Optional[str] = None,
+        confident_prompt: Optional[Prompt] = None,
+        llm_metric_collection: Optional[str] = None,
+        agent_metric_collection: Optional[str] = None,
+        tool_metric_collection_map: Optional[dict] = None,
+        trace_metric_collection: Optional[str] = None,
+        is_test_mode: Optional[bool] = False,
+        agent_metrics: Optional[List[BaseMetric]] = None,
+    ):
+        is_dependency_installed()
+
+        _environment = os.getenv("CONFIDENT_TRACE_ENVIRONMENT", "development")
+        if _environment and _environment in [
+            "production",
+            "staging",
+            "development",
+            "testing",
+        ]:
+            self.environment = _environment
+
+        self.tool_metric_collection_map = tool_metric_collection_map or {}
+        self.name = name
+        self.thread_id = thread_id
+        self.user_id = user_id
+        self.metadata = metadata
+        self.tags = tags
+        self.metric_collection = metric_collection
+        self.confident_prompt = confident_prompt
+        self.llm_metric_collection = llm_metric_collection
+        self.agent_metric_collection = agent_metric_collection
+        self.trace_metric_collection = trace_metric_collection
+        self.is_test_mode = is_test_mode
+        self.agent_metrics = agent_metrics
+
+        if not api_key:
+            api_key = get_confident_api_key()
+        if not api_key:
+            raise ValueError("CONFIDENT_API_KEY is not set")
+
+        trace_provider = TracerProvider()
+
+        # Pass the entire settings instance instead of individual values
+        span_interceptor = SpanInterceptor(self)
+        trace_provider.add_span_processor(span_interceptor)
+
+        if is_test_mode:
+            trace_provider.add_span_processor(BatchSpanProcessor(test_exporter))
+        else:
+            trace_provider.add_span_processor(
+                BatchSpanProcessor(
+                    OTLPSpanExporter(
+                        endpoint=OTLP_ENDPOINT,
+                        headers={"x-confident-api-key": api_key},
+                    )
+                )
+            )
+        super().__init__(tracer_provider=trace_provider)
 
 
 class SpanInterceptor(SpanProcessor):
-    def __init__(self, settings_instance):
+    def __init__(self, settings_instance: ConfidentInstrumentationSettings):
         # Keep a reference to the settings instance instead of copying values
-        self.settings
+        self.settings = settings_instance
 
     def on_start(self, span, parent_context):
-
         # set trace uuid
         _current_trace_context = current_trace_context.get()
         if _current_trace_context and isinstance(_current_trace_context, Trace):
@@ -151,85 +229,56 @@ class SpanInterceptor(SpanProcessor):
         )
 
     def on_end(self, span):
-
-
+        if self.settings.is_test_mode:
+            if span.attributes.get("confident.span.type") == "agent":
 
-
-
-
-    thread_id: Optional[str] = None
-    user_id: Optional[str] = None
-    metadata: Optional[dict] = None
-    tags: Optional[List[str]] = None
-    environment: Literal["production", "staging", "development", "testing"] = (
-        None
-    )
-    metric_collection: Optional[str] = None
-    confident_prompt: Optional[Prompt] = None
-    llm_metric_collection: Optional[str] = None
-    agent_metric_collection: Optional[str] = None
-    tool_metric_collection_map: dict = {}
-    trace_metric_collection: Optional[str] = None
+                def create_agent_span_for_evaluation(
+                    span: ReadableSpan,
+                ) -> AgentSpan:
 
-
-
-
-
-
-        user_id: Optional[str] = None,
-        metadata: Optional[dict] = None,
-        tags: Optional[List[str]] = None,
-        metric_collection: Optional[str] = None,
-        confident_prompt: Optional[Prompt] = None,
-        llm_metric_collection: Optional[str] = None,
-        agent_metric_collection: Optional[str] = None,
-        tool_metric_collection_map: Optional[dict] = None,
-        trace_metric_collection: Optional[str] = None,
-        is_test_mode: Optional[bool] = False,
-    ):
-        is_dependency_installed()
+                    agent_span = (
+                        ConfidentSpanExporter.prepare_boilerplate_base_span(
+                            span
+                        )
+                    )
 
-
-
-
-            "staging",
-            "development",
-            "testing",
-        ]:
-            self.environment = _environment
+                    # tools called
+                    normalized_messages = normalize_pydantic_ai_messages(span)
+                    tools_called = []
 
-
-
-
-
-
-
-
-
-
-
-        self.trace_metric_collection = trace_metric_collection
+                    for message in normalized_messages:
+                        for part in message.get("parts", []):
+                            if part.get("type") == "tool_call":
+                                name = part.get("name")
+                                try:
+                                    input_parameters = json.loads(
+                                        part.get("arguments")
+                                    )
+                                except Exception:
+                                    input_parameters = {}
 
-
-
-
-
+                                tools_called.append(
+                                    ToolCall(
+                                        name=name,
+                                        input_parameters=input_parameters,
+                                    )
+                                )
 
-
+                    # agent_span.tools_called = tools_called
+                    return agent_span
 
-
-
-        trace_provider.add_span_processor(span_interceptor)
+                agent_span = create_agent_span_for_evaluation(span)
+                agent_span.metrics = self.settings.agent_metrics
 
-
-
-
-
-
-                    OTLPSpanExporter(
-                        endpoint=OTLP_ENDPOINT,
-                        headers={"x-confident-api-key": api_key},
+                # create a trace for evaluation
+                trace = trace_manager.get_trace_by_uuid(agent_span.trace_uuid)
+                if not trace:
+                    trace = trace_manager.start_new_trace(
+                        trace_uuid=agent_span.trace_uuid
                     )
-
-
-
+
+                trace.root_spans.append(agent_span)
+                trace.status = TraceSpanStatus.SUCCESS
+                trace.end_time = perf_counter()
+                trace_manager.traces_to_evaluate.append(trace)
+                test_exporter.clear_span_json_list()
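Taken together, the new `ConfidentInstrumentationSettings` builds its own `TracerProvider`, registers the `SpanInterceptor`, and attaches either the OTLP exporter (normal mode) or the in-memory `test_exporter` (`is_test_mode=True`); `on_end` now converts finished agent spans into deepeval `AgentSpan`/`Trace` objects for local evaluation. A minimal usage sketch; the pydantic-ai `instrument=` wiring and all names/IDs below are illustrative assumptions, not taken from this diff:

# Sketch only: assumes pydantic-ai accepts these settings via Agent's
# `instrument=` argument and that CONFIDENT_API_KEY is set in the
# environment; names and IDs below are illustrative.
from pydantic_ai import Agent

from deepeval.integrations.pydantic_ai.instrumentator import (
    ConfidentInstrumentationSettings,
)

settings = ConfidentInstrumentationSettings(
    name="demo-trace",        # illustrative trace name
    thread_id="thread-1",     # illustrative conversation id
    tags=["pydantic-ai"],
    agent_metrics=None,       # optionally a list of BaseMetric instances
)

agent = Agent("openai:gpt-4o", instrument=settings)
result = agent.run_sync("What does deepeval do?")
print(result)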
deepeval/metrics/__init__.py
CHANGED
@@ -27,6 +27,12 @@ from .tool_correctness.tool_correctness import ToolCorrectnessMetric
 from .json_correctness.json_correctness import JsonCorrectnessMetric
 from .prompt_alignment.prompt_alignment import PromptAlignmentMetric
 from .task_completion.task_completion import TaskCompletionMetric
+from .topic_adherence.topic_adherence import TopicAdherenceMetric
+from .step_efficiency.step_efficiency import StepEfficiencyMetric
+from .plan_adherence.plan_adherence import PlanAdherenceMetric
+from .plan_quality.plan_quality import PlanQualityMetric
+from .tool_use.tool_use import ToolUseMetric
+from .goal_accuracy.goal_accuracy import GoalAccuracyMetric
 from .argument_correctness.argument_correctness import ArgumentCorrectnessMetric
 from .mcp.mcp_task_completion import MCPTaskCompletionMetric
 from .mcp.multi_turn_mcp_use_metric import MultiTurnMCPUseMetric
@@ -98,6 +104,13 @@ __all__ = [
     "TaskCompletionMetric",
     "ArgumentCorrectnessMetric",
     "KnowledgeRetentionMetric",
+    # Agentic metrics
+    "TopicAdherenceMetric",
+    "StepEfficiencyMetric",
+    "PlanAdherenceMetric",
+    "PlanQualityMetric",
+    "ToolUseMetric",
+    "GoalAccuracyMetric",
     # Conversational metrics
     "TurnRelevancyMetric",
    "ConversationCompletenessMetric",
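The six new agentic metrics are now exported from the package root, so they can be imported directly. A quick import check (constructor arguments are defined in each metric's own module, added elsewhere in this diff, and are not assumed here):

from deepeval.metrics import (
    GoalAccuracyMetric,
    PlanAdherenceMetric,
    PlanQualityMetric,
    StepEfficiencyMetric,
    ToolUseMetric,
    TopicAdherenceMetric,
)

# All six resolve from the package root after this change.
print(TopicAdherenceMetric.__module__)  # deepeval.metrics.topic_adherence.topic_adherence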
deepeval/metrics/answer_relevancy/answer_relevancy.py
CHANGED
@@ -16,6 +16,7 @@ from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.answer_relevancy.template import AnswerRelevancyTemplate
 from deepeval.metrics.indicator import metric_progress_indicator
 from deepeval.metrics.answer_relevancy.schema import *
+from deepeval.metrics.api import metric_data_manager
 
 
 class AnswerRelevancyMetric(BaseMetric):
@@ -50,8 +51,8 @@ class AnswerRelevancyMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
-
         check_llm_test_case_params(test_case, self._required_params, self)
 
         self.evaluation_cost = 0 if self.using_native_model else None
@@ -65,6 +66,7 @@ class AnswerRelevancyMetric(BaseMetric):
                     test_case,
                     _show_indicator=False,
                     _in_component=_in_component,
+                    _log_metric_to_confident=_log_metric_to_confident,
                 )
             )
         else:
@@ -85,6 +87,10 @@ class AnswerRelevancyMetric(BaseMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
 
             return self.score
 
@@ -93,8 +99,8 @@ class AnswerRelevancyMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
-
         check_llm_test_case_params(test_case, self._required_params, self)
 
         self.evaluation_cost = 0 if self.using_native_model else None
@@ -121,7 +127,10 @@ class AnswerRelevancyMetric(BaseMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
-
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
             return self.score
 
     async def _a_generate_reason(self, input: str) -> str:
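This pattern repeats across most metric files in this release: `measure()` and `a_measure()` gain a `_log_metric_to_confident` flag, and on completion the metric posts its `MetricData` through the new `metric_data_manager` (see `deepeval/metrics/api.py` below). A sketch of the flag's effect; the test case content is illustrative, and the flag is internal (leading underscore), presumably so orchestrators can suppress duplicate posting:

from deepeval.metrics import AnswerRelevancyMetric
from deepeval.test_case import LLMTestCase

test_case = LLMTestCase(
    input="What is the capital of France?",
    actual_output="Paris is the capital of France.",
)

metric = AnswerRelevancyMetric()
# Default: the metric result is queued for posting to Confident AI
# (only when metric logging is enabled and an API key is configured).
metric.measure(test_case)
# Opt out of posting for this call:
metric.measure(test_case, _log_metric_to_confident=False)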
deepeval/metrics/api.py
ADDED
@@ -0,0 +1,281 @@
+from typing import Optional, Set, Any, Dict, List, Union
+import threading
+import asyncio
+import queue
+import atexit
+from time import perf_counter
+from enum import Enum
+from pydantic import Field
+from rich.console import Console
+
+from deepeval.confident.api import Api, HttpMethods, Endpoints, is_confident
+from deepeval.constants import (
+    CONFIDENT_METRIC_LOGGING_FLUSH,
+    CONFIDENT_METRIC_LOGGING_VERBOSE,
+)
+from deepeval.metrics.base_metric import BaseConversationalMetric, BaseMetric
+from deepeval.test_case.conversational_test_case import ConversationalTestCase
+from deepeval.test_case.llm_test_case import LLMTestCase
+from deepeval.test_case.api import create_api_test_case
+from deepeval.test_run.api import LLMApiTestCase, ConversationalApiTestCase
+from deepeval.tracing.api import MetricData
+from deepeval.config.settings import get_settings
+
+
+class MetricWorkerStatus(Enum):
+    SUCCESS = "success"
+    FAILURE = "failure"
+    WARNING = "warning"
+
+
+class ApiMetricData(MetricData):
+    llm_test_case: Optional[LLMApiTestCase] = Field(None, alias="llmTestCase")
+    conversational_test_case: Optional[ConversationalApiTestCase] = Field(
+        None, alias="conversationalTestCase"
+    )
+
+
+class MetricDataManager:
+    """Manager for posting metric data asynchronously in background thread."""
+
+    def __init__(self):
+        settings = get_settings()
+        # Initialize queue and worker thread for metric posting
+        self._metric_queue = queue.Queue()
+        self._worker_thread = None
+        self._min_interval = 0.2  # Minimum time between API calls (seconds)
+        self._last_post_time = 0
+        self._in_flight_tasks: Set[asyncio.Task[Any]] = set()
+        self._flush_enabled = bool(settings.CONFIDENT_METRIC_LOGGING_FLUSH)
+        self._daemon = not self._flush_enabled
+        self._thread_lock = threading.Lock()
+        self.metric_logging_enabled = bool(
+            settings.CONFIDENT_METRIC_LOGGING_ENABLED
+        )
+
+        # Register an exit handler to warn about unprocessed metrics
+        atexit.register(self._warn_on_exit)
+
+    def post_metric_if_enabled(
+        self,
+        metric: Union[BaseMetric, BaseConversationalMetric],
+        test_case: Optional[Union[LLMTestCase, ConversationalTestCase]] = None,
+    ):
+        """Post metric data asynchronously in a background thread."""
+        if not self.metric_logging_enabled or not is_confident():
+            return
+
+        from deepeval.evaluate.utils import create_metric_data
+
+        metric_data = create_metric_data(metric)
+        api_metric_data = ApiMetricData(
+            **metric_data.model_dump(by_alias=True, exclude_none=True)
+        )
+
+        if isinstance(test_case, LLMTestCase):
+            api_metric_data.llm_test_case = create_api_test_case(test_case)
+        elif isinstance(test_case, ConversationalTestCase):
+            api_metric_data.conversational_test_case = create_api_test_case(
+                test_case
+            )
+
+        self._ensure_worker_thread_running()
+        self._metric_queue.put(api_metric_data)
+
+    def _warn_on_exit(self):
+        """Warn if there are unprocessed metrics on exit."""
+        queue_size = self._metric_queue.qsize()
+        in_flight = len(self._in_flight_tasks)
+        remaining_tasks = queue_size + in_flight
+
+        if not self._flush_enabled and remaining_tasks > 0:
+            self._print_metric_data_status(
+                metric_worker_status=MetricWorkerStatus.WARNING,
+                message=f"Exiting with {queue_size + in_flight} abandoned metric(s).",
+                description=f"Set {CONFIDENT_METRIC_LOGGING_FLUSH}=1 as an environment variable to flush remaining metrics to Confident AI.",
+            )
+
+    def _ensure_worker_thread_running(self):
+        """Ensure the background worker thread is running."""
+        with self._thread_lock:
+            if (
+                self._worker_thread is None
+                or not self._worker_thread.is_alive()
+            ):
+                self._worker_thread = threading.Thread(
+                    target=self._process_metric_queue,
+                    daemon=self._daemon,
+                )
+                self._worker_thread.start()
+
+    def _print_metric_data_status(
+        self,
+        metric_worker_status: MetricWorkerStatus,
+        message: str,
+        description: Optional[str] = None,
+    ):
+        """Print metric data worker status messages."""
+        if getattr(get_settings(), CONFIDENT_METRIC_LOGGING_VERBOSE, False):
+            console = Console()
+            message_prefix = "[dim][Confident AI Metric Data Log][/dim]"
+            if metric_worker_status == MetricWorkerStatus.SUCCESS:
+                message = f"[green]{message}[/green]"
+            elif metric_worker_status == MetricWorkerStatus.FAILURE:
+                message = f"[red]{message}[/red]"
+            elif metric_worker_status == MetricWorkerStatus.WARNING:
+                message = f"[yellow]{message}[/yellow]"
+
+            if bool(CONFIDENT_METRIC_LOGGING_VERBOSE):
+                if description:
+                    message += f": {description}"
+
+            console.print(
+                message_prefix,
+                message,
+                f"\nTo disable dev logging, set {CONFIDENT_METRIC_LOGGING_VERBOSE}=0 as an environment variable.",
+            )
+
+    def _process_metric_queue(self):
+        """Worker thread function that processes the metric queue."""
+        import threading
+
+        main_thr = threading.main_thread()
+
+        # Create a new event loop
+        loop = asyncio.new_event_loop()
+        asyncio.set_event_loop(loop)
+
+        # Buffer for payloads that need to be sent after main exits
+        remaining_metric_request_bodies: List[Dict[str, Any]] = []
+
+        async def _a_send_metric(metric_data: ApiMetricData):
+            nonlocal remaining_metric_request_bodies
+            try:
+                # Build API object & payload
+                try:
+                    body = metric_data.model_dump(
+                        by_alias=True,
+                        exclude_none=True,
+                    )
+                except AttributeError:
+                    # Pydantic version below 2.0
+                    body = metric_data.dict(by_alias=True, exclude_none=True)
+
+                # If the main thread is still alive, send now
+                if main_thr.is_alive():
+                    api = Api()
+                    _, _ = await api.a_send_request(
+                        method=HttpMethods.POST,
+                        endpoint=Endpoints.METRIC_DATA_ENDPOINT,
+                        body=body,
+                    )
+                    queue_size = self._metric_queue.qsize()
+                    in_flight = len(self._in_flight_tasks)
+                    status = f"({queue_size} metric{'s' if queue_size!=1 else ''} remaining in queue, {in_flight} in flight)"
+                    self._print_metric_data_status(
+                        metric_worker_status=MetricWorkerStatus.SUCCESS,
+                        message=f"Successfully posted metric data {status}",
+                    )
+                elif self._flush_enabled:
+                    # Main thread gone → to be flushed
+                    remaining_metric_request_bodies.append(body)
+
+            except Exception as e:
+                queue_size = self._metric_queue.qsize()
+                in_flight = len(self._in_flight_tasks)
+                status = f"({queue_size} metric{'s' if queue_size!=1 else ''} remaining in queue, {in_flight} in flight)"
+                self._print_metric_data_status(
+                    metric_worker_status=MetricWorkerStatus.FAILURE,
+                    message=f"Error posting metric data {status}",
+                    description=str(e),
+                )
+            finally:
+                task = asyncio.current_task()
+                if task:
+                    self._in_flight_tasks.discard(task)
+
+        async def async_worker():
+            # Continue while user code is running or work remains
+            while (
+                main_thr.is_alive()
+                or not self._metric_queue.empty()
+                or self._in_flight_tasks
+            ):
+                try:
+                    metric_data = self._metric_queue.get(
+                        block=True, timeout=1.0
+                    )
+
+                    # Rate-limit
+                    now = perf_counter()
+                    elapsed = now - self._last_post_time
+                    if elapsed < self._min_interval:
+                        await asyncio.sleep(self._min_interval - elapsed)
+                    self._last_post_time = perf_counter()
+
+                    # Schedule async send
+                    task = asyncio.create_task(_a_send_metric(metric_data))
+                    self._in_flight_tasks.add(task)
+                    self._metric_queue.task_done()
+
+                except queue.Empty:
+                    await asyncio.sleep(0.1)
+                    continue
+                except Exception as e:
+                    self._print_metric_data_status(
+                        message="Error in metric worker",
+                        metric_worker_status=MetricWorkerStatus.FAILURE,
+                        description=str(e),
+                    )
+                    await asyncio.sleep(1.0)
+
+        try:
+            loop.run_until_complete(async_worker())
+        finally:
+            # Drain any pending tasks
+            pending = asyncio.all_tasks(loop=loop)
+            if pending:
+                loop.run_until_complete(
+                    asyncio.gather(*pending, return_exceptions=True)
+                )
+            self._flush_metrics(remaining_metric_request_bodies)
+            loop.run_until_complete(loop.shutdown_asyncgens())
+            loop.close()
+
+    def _flush_metrics(
+        self, remaining_metric_request_bodies: List[Dict[str, Any]]
+    ):
+        """Flush remaining metrics synchronously."""
+        if not remaining_metric_request_bodies:
+            return
+
+        self._print_metric_data_status(
+            MetricWorkerStatus.WARNING,
+            message=f"Flushing {len(remaining_metric_request_bodies)} remaining metric(s)",
+        )
+
+        for body in remaining_metric_request_bodies:
+            try:
+                api = Api()
+                _, link = api.send_request(
+                    method=HttpMethods.POST,
+                    endpoint=Endpoints.METRIC_DATA_ENDPOINT,
+                    body=body,
+                )
+                qs = self._metric_queue.qsize()
+                self._print_metric_data_status(
+                    metric_worker_status=MetricWorkerStatus.SUCCESS,
+                    message=f"Successfully posted metric data ({qs} metrics remaining in queue, 1 in flight)",
+                    description=link,
+                )
+            except Exception as e:
+                qs = self._metric_queue.qsize()
+                self._print_metric_data_status(
+                    metric_worker_status=MetricWorkerStatus.FAILURE,
+                    message="Error flushing remaining metric(s)",
+                    description=str(e),
+                )
+
+
+# Global metric manager instance
+metric_data_manager = MetricDataManager()
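`MetricDataManager` is the heart of the new metric logging: posts are queued from the caller's thread, and a single background thread runs an asyncio loop that rate-limits sends (`_min_interval` = 0.2 s), tracks in-flight tasks, and either warns about or flushes leftovers at exit depending on `CONFIDENT_METRIC_LOGGING_FLUSH`. A stripped-down, self-contained sketch of the same queue-plus-event-loop pattern (the `send` coroutine is a stand-in for `Api.a_send_request`; names and payloads here are illustrative):

import asyncio
import queue
import threading
from time import perf_counter

MIN_INTERVAL = 0.2  # seconds between posts, mirroring _min_interval
payloads: "queue.Queue[dict]" = queue.Queue()

async def send(body: dict) -> None:
    # Stand-in for Api().a_send_request(...); just simulates latency.
    await asyncio.sleep(0.05)
    print("posted", body)

def worker() -> None:
    main_thr = threading.main_thread()
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    in_flight: set = set()
    last_post = 0.0

    async def run() -> None:
        nonlocal last_post
        # Same loop condition as async_worker(): keep going while user
        # code is running or queued/in-flight work remains.
        while main_thr.is_alive() or not payloads.empty() or in_flight:
            try:
                body = payloads.get(block=True, timeout=1.0)
            except queue.Empty:
                await asyncio.sleep(0.1)  # let in-flight sends progress
                continue
            elapsed = perf_counter() - last_post
            if elapsed < MIN_INTERVAL:  # rate-limit successive posts
                await asyncio.sleep(MIN_INTERVAL - elapsed)
            last_post = perf_counter()
            task = loop.create_task(send(body))
            in_flight.add(task)
            task.add_done_callback(in_flight.discard)
            payloads.task_done()

    try:
        loop.run_until_complete(run())
    finally:
        # Drain sends that were still in flight, like the original.
        pending = asyncio.all_tasks(loop=loop)
        if pending:
            loop.run_until_complete(
                asyncio.gather(*pending, return_exceptions=True)
            )
        loop.close()

threading.Thread(target=worker, daemon=False).start()
for i in range(3):
    payloads.put({"metric": "answer_relevancy", "attempt": i})

Note the design choice in the real class: the worker thread is daemonized only when flushing is disabled (`self._daemon = not self._flush_enabled`), so the interpreter can exit without joining it; with `CONFIDENT_METRIC_LOGGING_FLUSH` enabled, the non-daemon thread re-sends leftover payloads synchronously via `send_request` before shutdown.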
deepeval/metrics/argument_correctness/argument_correctness.py
CHANGED
@@ -19,6 +19,7 @@ from deepeval.metrics.argument_correctness.template import (
 )
 from deepeval.metrics.indicator import metric_progress_indicator
 from deepeval.metrics.argument_correctness.schema import *
+from deepeval.metrics.api import metric_data_manager
 
 
 class ArgumentCorrectnessMetric(BaseMetric):
@@ -53,6 +54,7 @@ class ArgumentCorrectnessMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
 
         check_llm_test_case_params(test_case, self._required_params, self)
@@ -68,6 +70,7 @@ class ArgumentCorrectnessMetric(BaseMetric):
                     test_case,
                     _show_indicator=False,
                     _in_component=_in_component,
+                    _log_metric_to_confident=_log_metric_to_confident,
                 )
             )
         else:
@@ -91,7 +94,10 @@ class ArgumentCorrectnessMetric(BaseMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
-
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
             return self.score
 
     async def a_measure(
@@ -99,6 +105,7 @@ class ArgumentCorrectnessMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
 
        check_llm_test_case_params(test_case, self._required_params, self)
@@ -130,7 +137,10 @@ class ArgumentCorrectnessMetric(BaseMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
-
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
             return self.score
 
     async def _a_generate_reason(self, input: str) -> str:
deepeval/metrics/base_metric.py
CHANGED