deepeval 3.6.8__py3-none-any.whl → 3.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/anthropic/__init__.py +19 -0
- deepeval/anthropic/extractors.py +94 -0
- deepeval/anthropic/patch.py +169 -0
- deepeval/anthropic/utils.py +225 -0
- deepeval/benchmarks/drop/drop.py +40 -14
- deepeval/benchmarks/ifeval/ifeval.py +2 -2
- deepeval/confident/types.py +4 -2
- deepeval/config/settings.py +258 -47
- deepeval/config/settings_manager.py +4 -0
- deepeval/config/utils.py +5 -0
- deepeval/dataset/dataset.py +162 -30
- deepeval/dataset/utils.py +41 -13
- deepeval/evaluate/execute.py +1099 -633
- deepeval/integrations/crewai/handler.py +36 -0
- deepeval/integrations/langchain/callback.py +27 -2
- deepeval/integrations/llama_index/handler.py +58 -4
- deepeval/integrations/llama_index/utils.py +24 -0
- deepeval/metrics/__init__.py +5 -0
- deepeval/metrics/exact_match/__init__.py +0 -0
- deepeval/metrics/exact_match/exact_match.py +94 -0
- deepeval/metrics/indicator.py +21 -1
- deepeval/metrics/pattern_match/__init__.py +0 -0
- deepeval/metrics/pattern_match/pattern_match.py +103 -0
- deepeval/metrics/task_completion/task_completion.py +9 -2
- deepeval/model_integrations/__init__.py +0 -0
- deepeval/model_integrations/utils.py +116 -0
- deepeval/models/base_model.py +3 -1
- deepeval/models/llms/amazon_bedrock_model.py +20 -17
- deepeval/models/llms/openai_model.py +10 -1
- deepeval/models/retry_policy.py +103 -20
- deepeval/openai/__init__.py +3 -1
- deepeval/openai/extractors.py +2 -2
- deepeval/openai/utils.py +7 -31
- deepeval/prompt/api.py +11 -10
- deepeval/prompt/prompt.py +5 -4
- deepeval/simulator/conversation_simulator.py +25 -18
- deepeval/synthesizer/chunking/context_generator.py +9 -1
- deepeval/telemetry.py +3 -3
- deepeval/test_case/llm_test_case.py +3 -2
- deepeval/test_run/api.py +3 -2
- deepeval/test_run/cache.py +4 -3
- deepeval/test_run/test_run.py +24 -5
- deepeval/tracing/api.py +11 -10
- deepeval/tracing/otel/exporter.py +11 -0
- deepeval/tracing/patchers.py +102 -1
- deepeval/tracing/trace_context.py +13 -4
- deepeval/tracing/tracing.py +10 -1
- deepeval/tracing/types.py +8 -8
- deepeval/tracing/utils.py +9 -0
- deepeval/utils.py +44 -2
- {deepeval-3.6.8.dist-info → deepeval-3.7.0.dist-info}/METADATA +2 -2
- {deepeval-3.6.8.dist-info → deepeval-3.7.0.dist-info}/RECORD +57 -47
- /deepeval/{openai → model_integrations}/types.py +0 -0
- {deepeval-3.6.8.dist-info → deepeval-3.7.0.dist-info}/LICENSE.md +0 -0
- {deepeval-3.6.8.dist-info → deepeval-3.7.0.dist-info}/WHEEL +0 -0
- {deepeval-3.6.8.dist-info → deepeval-3.7.0.dist-info}/entry_points.txt +0 -0

deepeval/integrations/crewai/handler.py
CHANGED

@@ -23,6 +23,8 @@ try:
         AgentExecutionCompletedEvent,
         ToolUsageStartedEvent,
         ToolUsageFinishedEvent,
+        KnowledgeRetrievalStartedEvent,
+        KnowledgeRetrievalCompletedEvent,
     )

     crewai_installed = True

@@ -69,6 +71,14 @@ class CrewAIEventsListener(BaseEventListener):

         return execution_id

+    @staticmethod
+    def get_knowledge_execution_id(source, event) -> str:
+        source_id = id(source)
+        agent_id = id(event.agent) if hasattr(event, "agent") else "unknown"
+        execution_id = f"_knowledge_{source_id}_{agent_id}"
+
+        return execution_id
+
     def setup_listeners(self, crewai_event_bus):
         @crewai_event_bus.on(CrewKickoffStartedEvent)
         def on_crew_started(source, event: CrewKickoffStartedEvent):

@@ -161,6 +171,32 @@ class CrewAIEventsListener(BaseEventListener):
                     current_span.output = event.output
                 observer.__exit__(None, None, None)

+        @crewai_event_bus.on(KnowledgeRetrievalStartedEvent)
+        def on_knowledge_started(source, event: KnowledgeRetrievalStartedEvent):
+            observer = Observer(
+                span_type="tool",
+                func_name="knowledge_retrieval",
+                function_kwargs={},
+            )
+            self.span_observers[
+                self.get_knowledge_execution_id(source, event)
+            ] = observer
+            observer.__enter__()
+
+        @crewai_event_bus.on(KnowledgeRetrievalCompletedEvent)
+        def on_knowledge_completed(
+            source, event: KnowledgeRetrievalCompletedEvent
+        ):
+            observer = self.span_observers.pop(
+                self.get_knowledge_execution_id(source, event)
+            )
+            if observer:
+                current_span = current_span_context.get()
+                if current_span:
+                    current_span.input = event.query
+                    current_span.output = event.retrieved_knowledge
+                observer.__exit__(None, None, None)
+

 def instrument_crewai(api_key: Optional[str] = None):
     is_crewai_installed()
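The new knowledge-retrieval listeners are registered when the CrewAI integration is instrumented. A minimal sketch of enabling it, assuming `instrument_crewai` is importable from `deepeval.integrations.crewai` as in earlier releases (the API key value is a placeholder and the crew setup itself is omitted):

```python
from deepeval.integrations.crewai import instrument_crewai

# Patches CrewAI's event bus so crew, agent, tool, and (new in 3.7.0)
# knowledge-retrieval events are recorded as deepeval tracing spans.
instrument_crewai(api_key="<CONFIDENT_API_KEY>")  # api_key is optional

# Kick off a crew as usual afterwards; knowledge retrievals now appear
# as "knowledge_retrieval" tool spans on the resulting trace.
```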
deepeval/integrations/langchain/callback.py
CHANGED

@@ -1,12 +1,15 @@
 from typing import Any, Optional, List, Dict
 from uuid import UUID
 from time import perf_counter
-
+
+from deepeval.tracing.context import current_span_context, current_trace_context
+from deepeval.test_case.llm_test_case import ToolCall
 from deepeval.tracing.types import (
     LlmOutput,
     LlmToolCall,
 )
 from deepeval.metrics import BaseMetric
+from deepeval.tracing.utils import prepare_tool_call_input_parameters

 try:
     from langchain_core.callbacks.base import BaseCallbackHandler

@@ -266,12 +269,34 @@ class CallbackHandler(BaseCallbackHandler):
         parent_run_id: Optional[UUID] = None,
         **kwargs: Any,  # un-logged kwargs
     ) -> Any:
-
         uuid_str = str(run_id)
         tool_span: ToolSpan = trace_manager.get_span_by_uuid(uuid_str)
         tool_span.output = output
         exit_current_context(uuid_str=uuid_str)

+        # set the tools called in the parent span as well as on the trace level
+        tool_call = ToolCall(
+            name=tool_span.name,
+            description=tool_span.description,
+            output=output,
+            input_parameters=prepare_tool_call_input_parameters(
+                tool_span.input
+            ),
+        )
+        parent_span = current_span_context.get()
+        if parent_span:
+            if parent_span.tools_called is None:
+                parent_span.tools_called = []
+
+            parent_span.tools_called.append(tool_call)
+
+        trace = current_trace_context.get()
+        if trace:
+            if trace.tools_called is None:
+                trace.tools_called = []
+
+            trace.tools_called.append(tool_call)
+
     def on_tool_error(
         self,
         error: BaseException,
deepeval/integrations/llama_index/handler.py
CHANGED

@@ -2,13 +2,28 @@ from typing import Any, Dict, Optional
 import inspect
 from time import perf_counter
 import uuid
+
+from llama_index.core.agent.workflow.workflow_events import (
+    AgentWorkflowStartEvent,
+)
+from deepeval.integrations.llama_index.utils import (
+    extract_output_from_llm_chat_end_event,
+)
 from deepeval.telemetry import capture_tracing_integration
 from deepeval.tracing import trace_manager
-from deepeval.tracing.types import
+from deepeval.tracing.types import (
+    ToolSpan,
+    AgentSpan,
+    BaseSpan,
+    LlmSpan,
+    TraceSpanStatus,
+)
 from deepeval.tracing.trace_context import (
     current_llm_context,
     current_agent_context,
 )
+from deepeval.test_case import ToolCall
+from deepeval.tracing.utils import make_json_serializable

 try:
     from llama_index.core.instrumentation.events.base import BaseEvent

@@ -89,6 +104,7 @@ class LLamaIndexHandler(BaseEventHandler, BaseSpanHandler):
                 if llm_span_context
                 else None
             ),
+            prompt=llm_span_context.prompt if llm_span_context else None,
         )
         trace_manager.add_span(llm_span)
         trace_manager.add_span_to_trace(llm_span)

@@ -106,7 +122,9 @@ class LLamaIndexHandler(BaseEventHandler, BaseSpanHandler):
             llm_span.status = TraceSpanStatus.SUCCESS
             llm_span.end_time = perf_counter()
             llm_span.input = llm_span.input
-            llm_span.output =
+            llm_span.output = extract_output_from_llm_chat_end_event(
+                event
+            )
             trace_manager.remove_span(llm_span.uuid)
             del self.open_ai_astream_to_llm_span_map[event.span_id]

@@ -151,6 +169,14 @@ class LLamaIndexHandler(BaseEventHandler, BaseSpanHandler):
         # conditions to qualify as agent start run span
         if method_name == "run":
             agent_span_context = current_agent_context.get()
+            start_event = bound_args.arguments.get("start_event")
+
+            if start_event and isinstance(start_event, AgentWorkflowStartEvent):
+                input = start_event.model_dump()
+
+            else:
+                input = bound_args.arguments
+
             span = AgentSpan(
                 uuid=id_,
                 status=TraceSpanStatus.IN_PROGRESS,

@@ -159,7 +185,7 @@ class LLamaIndexHandler(BaseEventHandler, BaseSpanHandler):
                 parent_uuid=parent_span_id,
                 start_time=perf_counter(),
                 name="Agent",  # TODO: decide the name of the span
-                input=
+                input=input,
                 metrics=(
                     agent_span_context.metrics if agent_span_context else None
                 ),

@@ -169,7 +195,17 @@ class LLamaIndexHandler(BaseEventHandler, BaseSpanHandler):
                     else None
                 ),
             )
-
+        elif method_name == "acall":
+            span = ToolSpan(
+                uuid=id_,
+                status=TraceSpanStatus.IN_PROGRESS,
+                children=[],
+                trace_uuid=trace_uuid,
+                parent_uuid=parent_span_id,
+                start_time=perf_counter(),
+                input=bound_args.arguments,
+                name="Tool",
+            )
         # prepare input test case params for the span
         prepare_input_llm_test_case_params(
             class_name, method_name, span, bound_args.arguments

@@ -192,10 +228,28 @@ class LLamaIndexHandler(BaseEventHandler, BaseSpanHandler):
         if base_span is None:
             return None

+        class_name, method_name = parse_id(id_)
+        if method_name == "call_tool":
+            output_json = make_json_serializable(result)
+            if output_json and isinstance(output_json, dict):
+                if base_span.tools_called is None:
+                    base_span.tools_called = []
+                base_span.tools_called.append(
+                    ToolCall(
+                        name=output_json.get("tool_name", "Tool"),
+                        input_parameters=output_json.get("tool_kwargs", {}),
+                        output=output_json.get("tool_output", {}),
+                    )
+                )
         base_span.end_time = perf_counter()
         base_span.status = TraceSpanStatus.SUCCESS
         base_span.output = result

+        if isinstance(base_span, ToolSpan):
+            result_json = make_json_serializable(result)
+            if result_json and isinstance(result_json, dict):
+                base_span.name = result_json.get("tool_name", "Tool")
+
         if base_span.llm_test_case:
             class_name, method_name = parse_id(id_)
             prepare_output_llm_test_case_params(
deepeval/integrations/llama_index/utils.py
CHANGED

@@ -1,3 +1,4 @@
+from llama_index.core.instrumentation.events.llm import LLMChatEndEvent
 from deepeval.test_case.llm_test_case import LLMTestCase, ToolCall
 from deepeval.tracing.types import BaseSpan
 from typing import Any

@@ -81,3 +82,26 @@ def prepare_output_llm_test_case_params(
         )

     span.llm_test_case.tools_called = tool_calls
+
+
+def extract_output_from_llm_chat_end_event(event: LLMChatEndEvent) -> list:
+    messages = []
+    for msg in event.response.message.blocks:
+        if msg.block_type == "text":
+            messages.append(
+                {
+                    "role": event.response.message.role.value,
+                    "content": msg.text,
+                }
+            )
+        elif msg.block_type == "tool_call":
+            messages.append(
+                {
+                    "name": msg.tool_name,
+                    "input_parameters": msg.tool_kwargs,
+                    "id": msg.tool_call_id,
+                }
+            )
+        else:
+            messages.append(msg.model_dump())
+    return messages
deepeval/metrics/__init__.py
CHANGED

@@ -8,6 +8,8 @@ from .base_metric import (
 from .dag.dag import DAGMetric, DeepAcyclicGraph
 from .conversational_dag.conversational_dag import ConversationalDAGMetric
 from .bias.bias import BiasMetric
+from .exact_match.exact_match import ExactMatchMetric
+from .pattern_match.pattern_match import PatternMatchMetric
 from .toxicity.toxicity import ToxicityMetric
 from .pii_leakage.pii_leakage import PIILeakageMetric
 from .non_advice.non_advice import NonAdviceMetric

@@ -69,6 +71,9 @@ __all__ = [
     "BaseConversationalMetric",
     "BaseMultimodalMetric",
     "BaseArenaMetric",
+    # Non-LLM metrics
+    "ExactMatchMetric",
+    "PatternMatchMetric",
     # Core metrics
     "GEval",
     "ArenaGEval",
deepeval/metrics/exact_match/__init__.py
File without changes

deepeval/metrics/exact_match/exact_match.py

@@ -0,0 +1,94 @@
+from typing import List
+
+from deepeval.metrics.indicator import metric_progress_indicator
+from deepeval.metrics.utils import (
+    check_llm_test_case_params,
+    construct_verbose_logs,
+)
+from deepeval.metrics.api import metric_data_manager
+from deepeval.metrics import BaseMetric
+from deepeval.test_case import LLMTestCase, LLMTestCaseParams
+
+
+class ExactMatchMetric(BaseMetric):
+    _required_params: List[LLMTestCaseParams] = [
+        LLMTestCaseParams.INPUT,
+        LLMTestCaseParams.ACTUAL_OUTPUT,
+        LLMTestCaseParams.EXPECTED_OUTPUT,
+    ]
+
+    def __init__(
+        self,
+        threshold: float = 1,
+        verbose_mode: bool = False,
+    ):
+        self.threshold = threshold
+        self.verbose_mode = verbose_mode
+
+    def measure(
+        self,
+        test_case: LLMTestCase,
+        _show_indicator: bool = True,
+        _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
+    ) -> float:
+        check_llm_test_case_params(test_case, self._required_params, self)
+
+        with metric_progress_indicator(
+            self, _show_indicator=_show_indicator, _in_component=_in_component
+        ):
+            expected = test_case.expected_output.strip()
+            actual = test_case.actual_output.strip()
+
+            if expected == actual:
+                self.score = self.precision = self.recall = self.f1 = 1.0
+                self.reason = (
+                    "The actual and expected outputs are exact matches."
+                )
+            else:
+                self.score = self.precision = self.recall = self.f1 = 0.0
+                self.reason = "The actual and expected outputs are different."
+
+            self.success = self.score >= self.threshold
+
+            if self.verbose_mode:
+                self.verbose_logs = construct_verbose_logs(
+                    self,
+                    steps=[
+                        f"Score: {self.score:.2f}",
+                        f"Reason: {self.reason}",
+                    ],
+                )
+
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
+
+            return self.score
+
+    async def a_measure(
+        self,
+        test_case: LLMTestCase,
+        _show_indicator: bool = True,
+        _in_component: bool = False,
+    ) -> float:
+        return self.measure(
+            test_case,
+            _show_indicator=_show_indicator,
+            _in_component=_in_component,
+        )
+
+    def is_successful(self) -> bool:
+        if self.error is not None:
+            self.success = False
+        else:
+            try:
+                self.success = self.score >= self.threshold
+            except:
+                self.success = False
+        return self.success
+
+    @property
+    def __name__(self):
+        return "Exact Match"
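A minimal usage sketch for the new metric, based on the constructor and `measure` signature shown above (the test case values are illustrative):

```python
from deepeval.metrics import ExactMatchMetric
from deepeval.test_case import LLMTestCase

test_case = LLMTestCase(
    input="What is the capital of France?",
    actual_output="Paris",
    expected_output="Paris",
)

# Deterministic, non-LLM metric: the score is 1.0 when the stripped
# actual and expected outputs are identical, otherwise 0.0.
metric = ExactMatchMetric(threshold=1, verbose_mode=True)
score = metric.measure(test_case)
print(score, metric.reason)
```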
deepeval/metrics/indicator.py
CHANGED

@@ -18,6 +18,10 @@ from deepeval.test_run.cache import CachedTestCase, Cache
 from deepeval.telemetry import capture_metric_type
 from deepeval.utils import update_pbar

+import logging
+
+logger = logging.getLogger(__name__)
+

 def format_metric_description(
     metric: Union[BaseMetric, BaseConversationalMetric, BaseArenaMetric],

@@ -43,7 +47,7 @@ def metric_progress_indicator(
     _show_indicator: bool = True,
     _in_component: bool = False,
 ):
-    captured_async_mode = False if async_mode
+    captured_async_mode = False if async_mode is None else async_mode
     with capture_metric_type(
         metric.__name__,
         async_mode=captured_async_mode,

@@ -250,6 +254,21 @@ async def safe_a_measure(
                 _log_metric_to_confident=False,
             )
             update_pbar(progress, pbar_eval_id)
+
+    except asyncio.CancelledError:
+        logger.info("caught asyncio.CancelledError")
+
+        # treat cancellation as a timeout so we still emit a MetricData
+        metric.error = (
+            "Timed out/cancelled while evaluating metric. "
+            "Increase DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE or set "
+            "DEEPEVAL_LOG_STACK_TRACES=1 for full traceback."
+        )
+        metric.success = False
+
+        if not ignore_errors:
+            raise
+
     except MissingTestCaseParamsError as e:
         if skip_on_missing_params:
             metric.skipped = True

@@ -277,5 +296,6 @@ async def safe_a_measure(
         if ignore_errors:
             metric.error = str(e)
             metric.success = False  # Assuming you want to set success to False
+            logger.info("a metric was marked as errored")
         else:
             raise
deepeval/metrics/pattern_match/__init__.py
File without changes

deepeval/metrics/pattern_match/pattern_match.py

@@ -0,0 +1,103 @@
+import re
+from typing import List
+
+from deepeval.metrics.indicator import metric_progress_indicator
+from deepeval.metrics.utils import (
+    check_llm_test_case_params,
+    construct_verbose_logs,
+)
+from deepeval.metrics.api import metric_data_manager
+from deepeval.metrics import BaseMetric
+from deepeval.test_case import LLMTestCase, LLMTestCaseParams
+
+
+class PatternMatchMetric(BaseMetric):
+    _required_params: List[LLMTestCaseParams] = [
+        LLMTestCaseParams.INPUT,
+        LLMTestCaseParams.ACTUAL_OUTPUT,
+    ]
+
+    def __init__(
+        self,
+        pattern: str,
+        ignore_case: bool = False,
+        threshold: float = 1.0,
+        verbose_mode: bool = False,
+    ):
+        self.pattern = pattern.strip()
+        self.ignore_case = ignore_case
+        self.verbose_mode = verbose_mode
+        self.threshold = threshold
+
+        flags = re.IGNORECASE if ignore_case else 0
+        try:
+            self._compiled_pattern = re.compile(self.pattern, flags)
+        except re.error as e:
+            raise ValueError(f"Invalid regex pattern: {pattern} — {e}")
+
+    def measure(
+        self,
+        test_case: LLMTestCase,
+        _show_indicator: bool = True,
+        _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
+    ) -> float:
+        check_llm_test_case_params(test_case, self._required_params, self)
+
+        with metric_progress_indicator(
+            self, _show_indicator=_show_indicator, _in_component=_in_component
+        ):
+            actual = test_case.actual_output.strip()
+            full_match = self._compiled_pattern.fullmatch(actual)
+
+            self.score = 1.0 if full_match else 0.0
+            self.reason = (
+                f"The actual output fully matches the pattern."
+                if full_match
+                else f"The actual output does not match the pattern."
+            )
+            self.success = self.score >= self.threshold
+
+            if self.verbose_mode:
+                self.verbose_logs = construct_verbose_logs(
+                    self,
+                    steps=[
+                        f"Pattern: {self.pattern}",
+                        f"Actual: {actual}",
+                        f"Score: {self.score:.2f}",
+                        f"Reason: {self.reason}",
+                    ],
+                )
+
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
+
+            return self.score
+
+    async def a_measure(
+        self,
+        test_case: LLMTestCase,
+        _show_indicator: bool = True,
+        _in_component: bool = False,
+    ) -> float:
+        return self.measure(
+            test_case,
+            _show_indicator=_show_indicator,
+            _in_component=_in_component,
+        )
+
+    def is_successful(self) -> bool:
+        if self.error is not None:
+            self.success = False
+        else:
+            try:
+                self.success = self.score >= self.threshold
+            except:
+                self.success = False
+        return self.success
+
+    @property
+    def __name__(self):
+        return "Pattern Match"
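As with `ExactMatchMetric`, a short usage sketch based on the signatures above (the pattern and outputs are illustrative):

```python
from deepeval.metrics import PatternMatchMetric
from deepeval.test_case import LLMTestCase

test_case = LLMTestCase(
    input="Give me an order ID.",
    actual_output="ORD-12345",
)

# Scores 1.0 only when the stripped actual output fully matches the regex.
metric = PatternMatchMetric(pattern=r"ORD-\d{5}", ignore_case=True)
score = metric.measure(test_case)
print(score, metric.reason)
```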
deepeval/metrics/task_completion/task_completion.py
CHANGED

@@ -36,6 +36,11 @@ class TaskCompletionMetric(BaseMetric):
         strict_mode: bool = False,
         verbose_mode: bool = False,
     ):
+        if task is None:
+            self._is_task_provided = False
+        else:
+            self._is_task_provided = True
+
         self.task = task
         self.threshold = 1 if strict_mode else threshold
         self.model, self.using_native_model = initialize_model(model)

@@ -73,7 +78,8 @@ class TaskCompletionMetric(BaseMetric):
                 )
             else:
                 task, self.outcome = self._extract_task_and_outcome(test_case)
-
+                if self.task is None or not self._is_task_provided:
+                    self.task = task
             self.verdict, self.reason = self._generate_verdicts()
             self.score = self._calculate_score()
             self.success = self.score >= self.threshold

@@ -108,7 +114,8 @@ class TaskCompletionMetric(BaseMetric):
                 task, self.outcome = await self._a_extract_task_and_outcome(
                     test_case
                 )
-
+                if self.task is None or not self._is_task_provided:
+                    self.task = task
             self.verdict, self.reason = await self._a_generate_verdicts()
             self.score = self._calculate_score()
             self.success = self.score >= self.threshold
deepeval/model_integrations/__init__.py
File without changes

deepeval/model_integrations/utils.py

@@ -0,0 +1,116 @@
+import json
+import uuid
+from typing import Any, List, Optional
+
+from deepeval.model_integrations.types import InputParameters, OutputParameters
+from deepeval.test_case.llm_test_case import ToolCall
+from deepeval.tracing.context import (
+    current_span_context,
+    current_trace_context,
+    update_current_span,
+    update_llm_span,
+)
+from deepeval.tracing.trace_context import current_llm_context
+from deepeval.tracing.types import ToolSpan, TraceSpanStatus
+from deepeval.utils import shorten, len_long
+
+
+def _update_all_attributes(
+    input_parameters: InputParameters,
+    output_parameters: OutputParameters,
+    expected_tools: List[ToolCall],
+    expected_output: str,
+    context: List[str],
+    retrieval_context: List[str],
+):
+    """Update span and trace attributes with input/output parameters."""
+    update_current_span(
+        input=input_parameters.input or input_parameters.messages or "NA",
+        output=output_parameters.output or "NA",
+        tools_called=output_parameters.tools_called,
+        # attributes to be added
+        expected_output=expected_output,
+        expected_tools=expected_tools,
+        context=context,
+        retrieval_context=retrieval_context,
+    )
+
+    llm_context = current_llm_context.get()
+
+    update_llm_span(
+        input_token_count=output_parameters.prompt_tokens,
+        output_token_count=output_parameters.completion_tokens,
+        prompt=llm_context.prompt,
+    )
+
+    if output_parameters.tools_called:
+        create_child_tool_spans(output_parameters)
+
+    __update_input_and_output_of_current_trace(
+        input_parameters, output_parameters
+    )
+
+
+def __update_input_and_output_of_current_trace(
+    input_parameters: InputParameters, output_parameters: OutputParameters
+):
+
+    current_trace = current_trace_context.get()
+    if current_trace:
+        if current_trace.input is None:
+            current_trace.input = (
+                input_parameters.input or input_parameters.messages
+            )
+        if current_trace.output is None:
+            current_trace.output = output_parameters.output
+
+    return
+
+
+def create_child_tool_spans(output_parameters: OutputParameters):
+    if output_parameters.tools_called is None:
+        return
+
+    current_span = current_span_context.get()
+    for tool_called in output_parameters.tools_called:
+        tool_span = ToolSpan(
+            **{
+                "uuid": str(uuid.uuid4()),
+                "trace_uuid": current_span.trace_uuid,
+                "parent_uuid": current_span.uuid,
+                "start_time": current_span.start_time,
+                "end_time": current_span.start_time,
+                "status": TraceSpanStatus.SUCCESS,
+                "children": [],
+                "name": tool_called.name,
+                "input": tool_called.input_parameters,
+                "output": None,
+                "metrics": None,
+                "description": tool_called.description,
+            }
+        )
+        current_span.children.append(tool_span)
+
+
+_URL_MAX = 200
+_JSON_MAX = max(
+    len_long(), 400
+)  # <- make this bigger by increasing DEEPEVAL_MAXLEN_LONG above 400
+
+
+def compact_dump(value: Any) -> str:
+    try:
+        dumped = json.dumps(
+            value, ensure_ascii=False, default=str, separators=(",", ":")
+        )
+    except Exception:
+        dumped = repr(value)
+    return shorten(dumped, max_len=_JSON_MAX)
+
+
+def fmt_url(url: Optional[str]) -> str:
+    if not url:
+        return ""
+    if url.startswith("data:"):
+        return "[data-uri]"
+    return shorten(url, max_len=_URL_MAX)
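A brief sketch of how the new truncation helpers behave, assuming `shorten` and `len_long` honor the `DEEPEVAL_MAXLEN_LONG` setting as the inline comment above suggests (the sample values are illustrative):

```python
from deepeval.model_integrations.utils import compact_dump, fmt_url

payload = {"messages": [{"role": "user", "content": "hi " * 500}]}

# Compact JSON dump (no spaces around separators), shortened to _JSON_MAX chars.
print(compact_dump(payload))

# Data URIs collapse to a placeholder; long URLs are shortened to _URL_MAX chars.
print(fmt_url("data:image/png;base64,iVBORw0KGgo..."))  # -> "[data-uri]"
print(fmt_url("https://example.com/" + "a" * 300))
```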
deepeval/models/base_model.py
CHANGED

@@ -68,7 +68,9 @@ class DeepEvalBaseLLM(ABC):
         Returns:
             A list of strings.
         """
-        raise
+        raise NotImplementedError(
+            "batch_generate is not implemented for this model"
+        )

     @abstractmethod
     def get_model_name(self, *args, **kwargs) -> str: