deepeval 3.6.9__py3-none-any.whl → 3.7.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/__init__.py +0 -4
- deepeval/_version.py +1 -1
- deepeval/anthropic/__init__.py +19 -0
- deepeval/anthropic/extractors.py +94 -0
- deepeval/anthropic/patch.py +169 -0
- deepeval/anthropic/utils.py +225 -0
- deepeval/benchmarks/drop/drop.py +40 -14
- deepeval/benchmarks/ifeval/ifeval.py +2 -2
- deepeval/cli/main.py +7 -0
- deepeval/confident/api.py +6 -1
- deepeval/confident/types.py +4 -2
- deepeval/config/settings.py +159 -11
- deepeval/config/settings_manager.py +4 -0
- deepeval/evaluate/compare.py +215 -4
- deepeval/evaluate/types.py +6 -0
- deepeval/evaluate/utils.py +30 -0
- deepeval/integrations/crewai/handler.py +36 -0
- deepeval/integrations/langchain/callback.py +27 -2
- deepeval/integrations/llama_index/handler.py +58 -4
- deepeval/integrations/llama_index/utils.py +24 -0
- deepeval/key_handler.py +1 -0
- deepeval/metrics/__init__.py +5 -0
- deepeval/metrics/arena_g_eval/arena_g_eval.py +5 -1
- deepeval/metrics/arena_g_eval/utils.py +5 -5
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +9 -18
- deepeval/metrics/exact_match/__init__.py +0 -0
- deepeval/metrics/exact_match/exact_match.py +94 -0
- deepeval/metrics/g_eval/g_eval.py +5 -1
- deepeval/metrics/g_eval/utils.py +1 -1
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +5 -1
- deepeval/metrics/pattern_match/__init__.py +0 -0
- deepeval/metrics/pattern_match/pattern_match.py +103 -0
- deepeval/metrics/task_completion/task_completion.py +9 -2
- deepeval/metrics/utils.py +1 -1
- deepeval/model_integrations/__init__.py +0 -0
- deepeval/model_integrations/utils.py +116 -0
- deepeval/models/base_model.py +3 -1
- deepeval/models/llms/gemini_model.py +27 -5
- deepeval/openai/__init__.py +3 -1
- deepeval/openai/extractors.py +2 -2
- deepeval/openai/utils.py +7 -31
- deepeval/openai_agents/callback_handler.py +12 -3
- deepeval/prompt/api.py +11 -10
- deepeval/prompt/prompt.py +27 -15
- deepeval/simulator/template.py +1 -1
- deepeval/telemetry.py +3 -3
- deepeval/test_case/__init__.py +2 -1
- deepeval/test_case/arena_test_case.py +15 -4
- deepeval/test_case/llm_test_case.py +3 -2
- deepeval/test_case/mllm_test_case.py +45 -22
- deepeval/test_run/api.py +3 -2
- deepeval/test_run/cache.py +35 -13
- deepeval/test_run/hyperparameters.py +5 -1
- deepeval/test_run/test_run.py +52 -14
- deepeval/tracing/api.py +11 -10
- deepeval/tracing/otel/exporter.py +11 -0
- deepeval/tracing/patchers.py +102 -1
- deepeval/tracing/trace_context.py +13 -4
- deepeval/tracing/tracing.py +11 -2
- deepeval/tracing/types.py +8 -8
- deepeval/tracing/utils.py +9 -0
- deepeval/utils.py +48 -2
- {deepeval-3.6.9.dist-info → deepeval-3.7.1.dist-info}/METADATA +3 -3
- {deepeval-3.6.9.dist-info → deepeval-3.7.1.dist-info}/RECORD +68 -58
- /deepeval/{openai → model_integrations}/types.py +0 -0
- {deepeval-3.6.9.dist-info → deepeval-3.7.1.dist-info}/LICENSE.md +0 -0
- {deepeval-3.6.9.dist-info → deepeval-3.7.1.dist-info}/WHEEL +0 -0
- {deepeval-3.6.9.dist-info → deepeval-3.7.1.dist-info}/entry_points.txt +0 -0
deepeval/integrations/langchain/callback.py CHANGED
@@ -1,12 +1,15 @@
 from typing import Any, Optional, List, Dict
 from uuid import UUID
 from time import perf_counter
-
+
+from deepeval.tracing.context import current_span_context, current_trace_context
+from deepeval.test_case.llm_test_case import ToolCall
 from deepeval.tracing.types import (
     LlmOutput,
     LlmToolCall,
 )
 from deepeval.metrics import BaseMetric
+from deepeval.tracing.utils import prepare_tool_call_input_parameters
 
 try:
     from langchain_core.callbacks.base import BaseCallbackHandler
@@ -266,12 +269,34 @@ class CallbackHandler(BaseCallbackHandler):
         parent_run_id: Optional[UUID] = None,
         **kwargs: Any,  # un-logged kwargs
     ) -> Any:
-
         uuid_str = str(run_id)
         tool_span: ToolSpan = trace_manager.get_span_by_uuid(uuid_str)
         tool_span.output = output
         exit_current_context(uuid_str=uuid_str)
 
+        # set the tools called in the parent span as well as on the trace level
+        tool_call = ToolCall(
+            name=tool_span.name,
+            description=tool_span.description,
+            output=output,
+            input_parameters=prepare_tool_call_input_parameters(
+                tool_span.input
+            ),
+        )
+        parent_span = current_span_context.get()
+        if parent_span:
+            if parent_span.tools_called is None:
+                parent_span.tools_called = []
+
+            parent_span.tools_called.append(tool_call)
+
+        trace = current_trace_context.get()
+        if trace:
+            if trace.tools_called is None:
+                trace.tools_called = []
+
+            trace.tools_called.append(tool_call)
+
     def on_tool_error(
         self,
         error: BaseException,
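Note: a hypothetical wiring sketch for the change above. The import path and bare constructor call are assumptions taken from this diff's file layout, not verified against the released package; any LangChain runnable or agent executor would stand in for agent_executor.

from deepeval.integrations.langchain.callback import CallbackHandler  # path assumed from this diff

handler = CallbackHandler()  # constructor options, if any, are omitted here
# callbacks= is standard LangChain run config:
# result = agent_executor.invoke(
#     {"input": "What is the weather in Paris?"},
#     config={"callbacks": [handler]},
# )
# After a tool-using run, on_tool_end should have appended ToolCall entries to the
# parent span's and the trace's tools_called lists.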
deepeval/integrations/llama_index/handler.py CHANGED
@@ -2,13 +2,28 @@ from typing import Any, Dict, Optional
 import inspect
 from time import perf_counter
 import uuid
+
+from llama_index.core.agent.workflow.workflow_events import (
+    AgentWorkflowStartEvent,
+)
+from deepeval.integrations.llama_index.utils import (
+    extract_output_from_llm_chat_end_event,
+)
 from deepeval.telemetry import capture_tracing_integration
 from deepeval.tracing import trace_manager
-from deepeval.tracing.types import
+from deepeval.tracing.types import (
+    ToolSpan,
+    AgentSpan,
+    BaseSpan,
+    LlmSpan,
+    TraceSpanStatus,
+)
 from deepeval.tracing.trace_context import (
     current_llm_context,
     current_agent_context,
 )
+from deepeval.test_case import ToolCall
+from deepeval.tracing.utils import make_json_serializable
 
 try:
     from llama_index.core.instrumentation.events.base import BaseEvent
@@ -89,6 +104,7 @@ class LLamaIndexHandler(BaseEventHandler, BaseSpanHandler):
                 if llm_span_context
                 else None
             ),
+            prompt=llm_span_context.prompt if llm_span_context else None,
         )
         trace_manager.add_span(llm_span)
         trace_manager.add_span_to_trace(llm_span)
@@ -106,7 +122,9 @@ class LLamaIndexHandler(BaseEventHandler, BaseSpanHandler):
             llm_span.status = TraceSpanStatus.SUCCESS
             llm_span.end_time = perf_counter()
             llm_span.input = llm_span.input
-            llm_span.output =
+            llm_span.output = extract_output_from_llm_chat_end_event(
+                event
+            )
             trace_manager.remove_span(llm_span.uuid)
             del self.open_ai_astream_to_llm_span_map[event.span_id]
 
@@ -151,6 +169,14 @@ class LLamaIndexHandler(BaseEventHandler, BaseSpanHandler):
         # conditions to qualify as agent start run span
         if method_name == "run":
             agent_span_context = current_agent_context.get()
+            start_event = bound_args.arguments.get("start_event")
+
+            if start_event and isinstance(start_event, AgentWorkflowStartEvent):
+                input = start_event.model_dump()
+
+            else:
+                input = bound_args.arguments
+
             span = AgentSpan(
                 uuid=id_,
                 status=TraceSpanStatus.IN_PROGRESS,
@@ -159,7 +185,7 @@ class LLamaIndexHandler(BaseEventHandler, BaseSpanHandler):
                 parent_uuid=parent_span_id,
                 start_time=perf_counter(),
                 name="Agent",  # TODO: decide the name of the span
-                input=
+                input=input,
                 metrics=(
                     agent_span_context.metrics if agent_span_context else None
                 ),
@@ -169,7 +195,17 @@ class LLamaIndexHandler(BaseEventHandler, BaseSpanHandler):
                     else None
                 ),
             )
-
+        elif method_name == "acall":
+            span = ToolSpan(
+                uuid=id_,
+                status=TraceSpanStatus.IN_PROGRESS,
+                children=[],
+                trace_uuid=trace_uuid,
+                parent_uuid=parent_span_id,
+                start_time=perf_counter(),
+                input=bound_args.arguments,
+                name="Tool",
+            )
         # prepare input test case params for the span
         prepare_input_llm_test_case_params(
             class_name, method_name, span, bound_args.arguments
@@ -192,10 +228,28 @@ class LLamaIndexHandler(BaseEventHandler, BaseSpanHandler):
         if base_span is None:
             return None
 
+        class_name, method_name = parse_id(id_)
+        if method_name == "call_tool":
+            output_json = make_json_serializable(result)
+            if output_json and isinstance(output_json, dict):
+                if base_span.tools_called is None:
+                    base_span.tools_called = []
+                base_span.tools_called.append(
+                    ToolCall(
+                        name=output_json.get("tool_name", "Tool"),
+                        input_parameters=output_json.get("tool_kwargs", {}),
+                        output=output_json.get("tool_output", {}),
+                    )
+                )
         base_span.end_time = perf_counter()
        base_span.status = TraceSpanStatus.SUCCESS
         base_span.output = result
 
+        if isinstance(base_span, ToolSpan):
+            result_json = make_json_serializable(result)
+            if result_json and isinstance(result_json, dict):
+                base_span.name = result_json.get("tool_name", "Tool")
+
         if base_span.llm_test_case:
             class_name, method_name = parse_id(id_)
             prepare_output_llm_test_case_params(
deepeval/integrations/llama_index/utils.py CHANGED
@@ -1,3 +1,4 @@
+from llama_index.core.instrumentation.events.llm import LLMChatEndEvent
 from deepeval.test_case.llm_test_case import LLMTestCase, ToolCall
 from deepeval.tracing.types import BaseSpan
 from typing import Any
@@ -81,3 +82,26 @@ def prepare_output_llm_test_case_params(
     )
 
     span.llm_test_case.tools_called = tool_calls
+
+
+def extract_output_from_llm_chat_end_event(event: LLMChatEndEvent) -> list:
+    messages = []
+    for msg in event.response.message.blocks:
+        if msg.block_type == "text":
+            messages.append(
+                {
+                    "role": event.response.message.role.value,
+                    "content": msg.text,
+                }
+            )
+        elif msg.block_type == "tool_call":
+            messages.append(
+                {
+                    "name": msg.tool_name,
+                    "input_parameters": msg.tool_kwargs,
+                    "id": msg.tool_call_id,
+                }
+            )
+        else:
+            messages.append(msg.model_dump())
+    return messages
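Note: a minimal, runnable sketch of the message shape that extract_output_from_llm_chat_end_event produces. The block classes and to_messages helper below are stand-ins for illustration only, not the llama_index API; the branching mirrors the new function above minus the event wrapper.

from dataclasses import dataclass, field

@dataclass
class _TextBlock:                      # stand-in for a llama_index text block
    text: str
    block_type: str = "text"

@dataclass
class _ToolCallBlock:                  # stand-in for a llama_index tool-call block
    tool_name: str
    tool_kwargs: dict = field(default_factory=dict)
    tool_call_id: str = ""
    block_type: str = "tool_call"

def to_messages(role: str, blocks: list) -> list:
    # same text / tool_call branching as the new extractor
    messages = []
    for blk in blocks:
        if blk.block_type == "text":
            messages.append({"role": role, "content": blk.text})
        elif blk.block_type == "tool_call":
            messages.append(
                {"name": blk.tool_name, "input_parameters": blk.tool_kwargs, "id": blk.tool_call_id}
            )
    return messages

print(to_messages("assistant", [_TextBlock("Paris"), _ToolCallBlock("search", {"q": "capital of France"}, "call_1")]))
# [{'role': 'assistant', 'content': 'Paris'},
#  {'name': 'search', 'input_parameters': {'q': 'capital of France'}, 'id': 'call_1'}]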
deepeval/key_handler.py CHANGED
@@ -70,6 +70,7 @@ class ModelKeyValues(Enum):
     GOOGLE_GENAI_USE_VERTEXAI = "GOOGLE_GENAI_USE_VERTEXAI"
     GOOGLE_CLOUD_PROJECT = "GOOGLE_CLOUD_PROJECT"
     GOOGLE_CLOUD_LOCATION = "GOOGLE_CLOUD_LOCATION"
+    GOOGLE_SERVICE_ACCOUNT_KEY = "GOOGLE_SERVICE_ACCOUNT_KEY"
     # LiteLLM
     USE_LITELLM = "USE_LITELLM"
     LITELLM_MODEL_NAME = "LITELLM_MODEL_NAME"
deepeval/metrics/__init__.py CHANGED
@@ -8,6 +8,8 @@ from .base_metric import (
 from .dag.dag import DAGMetric, DeepAcyclicGraph
 from .conversational_dag.conversational_dag import ConversationalDAGMetric
 from .bias.bias import BiasMetric
+from .exact_match.exact_match import ExactMatchMetric
+from .pattern_match.pattern_match import PatternMatchMetric
 from .toxicity.toxicity import ToxicityMetric
 from .pii_leakage.pii_leakage import PIILeakageMetric
 from .non_advice.non_advice import NonAdviceMetric
@@ -69,6 +71,9 @@ __all__ = [
     "BaseConversationalMetric",
     "BaseMultimodalMetric",
     "BaseArenaMetric",
+    # Non-LLM metrics
+    "ExactMatchMetric",
+    "PatternMatchMetric",
     # Core metrics
     "GEval",
     "ArenaGEval",
deepeval/metrics/arena_g_eval/arena_g_eval.py CHANGED
@@ -46,7 +46,11 @@ class ArenaGEval(BaseArenaMetric):
         self.criteria = criteria
         self.model, self.using_native_model = initialize_model(model)
         self.evaluation_model = self.model.get_model_name()
-        self.evaluation_steps =
+        self.evaluation_steps = (
+            evaluation_steps
+            if evaluation_steps and len(evaluation_steps) > 0
+            else None
+        )
         self.async_mode = async_mode
         self.verbose_mode = verbose_mode
         self._include_g_eval_suffix = _include_g_eval_suffix
deepeval/metrics/arena_g_eval/utils.py CHANGED
@@ -89,10 +89,10 @@ class FormattedArenaTestCase:
 def format_arena_test_case(
     evaluation_params: List[LLMTestCaseParams], test_case: ArenaTestCase
 ) -> Tuple[FormattedArenaTestCase, Dict[str, str]]:
-    case = next(iter(test_case.contestants
+    case = next(iter([case.test_case for case in test_case.contestants]))
 
     # Create dummy name mapping
-    real_names = list(test_case.contestants
+    real_names = list([case.name for case in test_case.contestants])
     available_fake_names = FAKE_NAMES.copy()
     random.shuffle(available_fake_names)
 
@@ -119,10 +119,10 @@ def format_arena_test_case(
             else None
         ),
         contestants={
-            contestant: construct_formatted_llm_test_case(
-                evaluation_params, test_case
+            contestant.name: construct_formatted_llm_test_case(
+                evaluation_params, contestant.test_case
             )
-            for contestant
+            for contestant in test_case.contestants
        },
         dummy_to_real_names=dummy_to_real_names,
     )
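Note: the rewritten lines above imply that ArenaTestCase.contestants now yields objects exposing .name and .test_case (the companion change in deepeval/test_case/arena_test_case.py, +15 -4, is not shown here). Illustration only, using a stand-in object rather than whatever contestant class the package actually ships:

from types import SimpleNamespace
from deepeval.test_case import LLMTestCase

contestants = [
    SimpleNamespace(name="model_a", test_case=LLMTestCase(input="Hi", actual_output="Hello!")),
    SimpleNamespace(name="model_b", test_case=LLMTestCase(input="Hi", actual_output="Hey there!")),
]

real_names = [c.name for c in contestants]                  # what format_arena_test_case now collects
first_case = next(iter(c.test_case for c in contestants))   # mirrors the rewritten next(iter(...)) line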
deepeval/metrics/conversational_g_eval/conversational_g_eval.py CHANGED
@@ -9,6 +9,8 @@ from deepeval.metrics.g_eval.utils import (
     construct_conversational_g_eval_turn_params_string,
     construct_non_turns_test_case_string,
     format_rubrics,
+    validate_and_sort_rubrics,
+    validate_criteria_and_evaluation_steps,
 )
 from deepeval.test_case import (
     TurnParams,
@@ -63,27 +65,16 @@ class ConversationalGEval(BaseConversationalMetric):
 
         self.evaluation_params = evaluation_params
 
-
-        if criteria is None and evaluation_steps is None:
-            raise ValueError(
-                "Either 'criteria' or 'evaluation_steps' must be provided."
-            )
-
-        # Check if criteria is provided, it cannot be an empty string
-        if criteria is not None and not criteria.strip():
-            raise ValueError("Criteria provided cannot be an empty string.")
-
-        # Check if evaluation_steps is provided, it cannot be an empty list
-        if evaluation_steps is not None and len(evaluation_steps) == 0:
-            raise ValueError(
-                "'evaluation_steps' must not be an empty list. Either omit evaluation steps or include a non-empty list of steps."
-            )
-
+        validate_criteria_and_evaluation_steps(criteria, evaluation_steps)
         self.criteria = criteria
-        self.rubric = rubric
+        self.rubric = validate_and_sort_rubrics(rubric)
         self.model, self.using_native_model = initialize_model(model)
         self.evaluation_model = self.model.get_model_name()
-        self.evaluation_steps =
+        self.evaluation_steps = (
+            evaluation_steps
+            if evaluation_steps and len(evaluation_steps) > 0
+            else None
+        )
         self.threshold = 1 if strict_mode else threshold
         self.strict_mode = strict_mode
         self.async_mode = async_mode
deepeval/metrics/exact_match/__init__.py ADDED
File without changes
deepeval/metrics/exact_match/exact_match.py ADDED
@@ -0,0 +1,94 @@
+from typing import List
+
+from deepeval.metrics.indicator import metric_progress_indicator
+from deepeval.metrics.utils import (
+    check_llm_test_case_params,
+    construct_verbose_logs,
+)
+from deepeval.metrics.api import metric_data_manager
+from deepeval.metrics import BaseMetric
+from deepeval.test_case import LLMTestCase, LLMTestCaseParams
+
+
+class ExactMatchMetric(BaseMetric):
+    _required_params: List[LLMTestCaseParams] = [
+        LLMTestCaseParams.INPUT,
+        LLMTestCaseParams.ACTUAL_OUTPUT,
+        LLMTestCaseParams.EXPECTED_OUTPUT,
+    ]
+
+    def __init__(
+        self,
+        threshold: float = 1,
+        verbose_mode: bool = False,
+    ):
+        self.threshold = threshold
+        self.verbose_mode = verbose_mode
+
+    def measure(
+        self,
+        test_case: LLMTestCase,
+        _show_indicator: bool = True,
+        _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
+    ) -> float:
+        check_llm_test_case_params(test_case, self._required_params, self)
+
+        with metric_progress_indicator(
+            self, _show_indicator=_show_indicator, _in_component=_in_component
+        ):
+            expected = test_case.expected_output.strip()
+            actual = test_case.actual_output.strip()
+
+            if expected == actual:
+                self.score = self.precision = self.recall = self.f1 = 1.0
+                self.reason = (
+                    "The actual and expected outputs are exact matches."
+                )
+            else:
+                self.score = self.precision = self.recall = self.f1 = 0.0
+                self.reason = "The actual and expected outputs are different."
+
+            self.success = self.score >= self.threshold
+
+            if self.verbose_mode:
+                self.verbose_logs = construct_verbose_logs(
+                    self,
+                    steps=[
+                        f"Score: {self.score:.2f}",
+                        f"Reason: {self.reason}",
+                    ],
+                )
+
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
+
+            return self.score
+
+    async def a_measure(
+        self,
+        test_case: LLMTestCase,
+        _show_indicator: bool = True,
+        _in_component: bool = False,
+    ) -> float:
+        return self.measure(
+            test_case,
+            _show_indicator=_show_indicator,
+            _in_component=_in_component,
+        )
+
+    def is_successful(self) -> bool:
+        if self.error is not None:
+            self.success = False
+        else:
+            try:
+                self.success = self.score >= self.threshold
+            except:
+                self.success = False
+        return self.success
+
+    @property
+    def __name__(self):
+        return "Exact Match"
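Note: a short usage sketch based on the new file above. The metric compares the stripped expected and actual outputs and scores 1.0 or 0.0, so no LLM is involved; the inputs below are illustrative.

from deepeval.metrics import ExactMatchMetric
from deepeval.test_case import LLMTestCase

metric = ExactMatchMetric(threshold=1)
test_case = LLMTestCase(
    input="What is 2 + 2?",
    actual_output="4",
    expected_output="4 ",   # surrounding whitespace is stripped before comparison
)
print(metric.measure(test_case))   # 1.0
print(metric.is_successful())      # True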
deepeval/metrics/g_eval/g_eval.py CHANGED
@@ -61,7 +61,11 @@ class GEval(BaseMetric):
         self.score_range_span = self.score_range[1] - self.score_range[0]
         self.model, self.using_native_model = initialize_model(model)
         self.evaluation_model = self.model.get_model_name()
-        self.evaluation_steps =
+        self.evaluation_steps = (
+            evaluation_steps
+            if evaluation_steps and len(evaluation_steps) > 0
+            else None
+        )
         self.threshold = 1 if strict_mode else threshold
         self.top_logprobs = top_logprobs
         self.strict_mode = strict_mode
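Note: the same normalization appears in ArenaGEval and ConversationalGEval above and in MultimodalGEval below. In practice an empty evaluation_steps list now behaves like omitting the argument, so GEval appears to fall back to its criteria-based step generation. Sketch (argument values are illustrative):

from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCaseParams

metric = GEval(
    name="Correctness",
    criteria="Determine whether the actual output is factually correct.",
    evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT],
    evaluation_steps=[],  # normalized to None by the change above
)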
deepeval/metrics/g_eval/utils.py CHANGED
@@ -77,7 +77,7 @@ def validate_criteria_and_evaluation_steps(
 def validate_and_sort_rubrics(
     rubrics: Optional[List[Rubric]] = None,
 ) -> Optional[List[Rubric]]:
-    if rubrics is None:
+    if rubrics is None or len(rubrics) == 0:
         return None
 
     # Sort rubrics by start of range
deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py CHANGED
@@ -64,7 +64,11 @@ class MultimodalGEval(BaseMultimodalMetric):
         self.rubric = validate_and_sort_rubrics(rubric)
         self.model, self.using_native_model = initialize_multimodal_model(model)
         self.evaluation_model = self.model.get_model_name()
-        self.evaluation_steps =
+        self.evaluation_steps = (
+            evaluation_steps
+            if evaluation_steps and len(evaluation_steps) > 0
+            else None
+        )
         self.threshold = 1 if strict_mode else threshold
         self.top_logprobs = top_logprobs
         self.strict_mode = strict_mode
deepeval/metrics/pattern_match/__init__.py ADDED
File without changes
deepeval/metrics/pattern_match/pattern_match.py ADDED
@@ -0,0 +1,103 @@
+import re
+from typing import List
+
+from deepeval.metrics.indicator import metric_progress_indicator
+from deepeval.metrics.utils import (
+    check_llm_test_case_params,
+    construct_verbose_logs,
+)
+from deepeval.metrics.api import metric_data_manager
+from deepeval.metrics import BaseMetric
+from deepeval.test_case import LLMTestCase, LLMTestCaseParams
+
+
+class PatternMatchMetric(BaseMetric):
+    _required_params: List[LLMTestCaseParams] = [
+        LLMTestCaseParams.INPUT,
+        LLMTestCaseParams.ACTUAL_OUTPUT,
+    ]
+
+    def __init__(
+        self,
+        pattern: str,
+        ignore_case: bool = False,
+        threshold: float = 1.0,
+        verbose_mode: bool = False,
+    ):
+        self.pattern = pattern.strip()
+        self.ignore_case = ignore_case
+        self.verbose_mode = verbose_mode
+        self.threshold = threshold
+
+        flags = re.IGNORECASE if ignore_case else 0
+        try:
+            self._compiled_pattern = re.compile(self.pattern, flags)
+        except re.error as e:
+            raise ValueError(f"Invalid regex pattern: {pattern} — {e}")
+
+    def measure(
+        self,
+        test_case: LLMTestCase,
+        _show_indicator: bool = True,
+        _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
+    ) -> float:
+        check_llm_test_case_params(test_case, self._required_params, self)
+
+        with metric_progress_indicator(
+            self, _show_indicator=_show_indicator, _in_component=_in_component
+        ):
+            actual = test_case.actual_output.strip()
+            full_match = self._compiled_pattern.fullmatch(actual)
+
+            self.score = 1.0 if full_match else 0.0
+            self.reason = (
+                f"The actual output fully matches the pattern."
+                if full_match
+                else f"The actual output does not match the pattern."
+            )
+            self.success = self.score >= self.threshold
+
+            if self.verbose_mode:
+                self.verbose_logs = construct_verbose_logs(
+                    self,
+                    steps=[
+                        f"Pattern: {self.pattern}",
+                        f"Actual: {actual}",
+                        f"Score: {self.score:.2f}",
+                        f"Reason: {self.reason}",
+                    ],
+                )
+
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
+
+            return self.score
+
+    async def a_measure(
+        self,
+        test_case: LLMTestCase,
+        _show_indicator: bool = True,
+        _in_component: bool = False,
+    ) -> float:
+        return self.measure(
+            test_case,
+            _show_indicator=_show_indicator,
+            _in_component=_in_component,
+        )
+
+    def is_successful(self) -> bool:
+        if self.error is not None:
+            self.success = False
+        else:
+            try:
+                self.success = self.score >= self.threshold
+            except:
+                self.success = False
+        return self.success
+
+    @property
+    def __name__(self):
+        return "Pattern Match"
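Note: a usage sketch for the new metric above. The score is 1.0 only when the stripped actual output fully matches the compiled regex; the pattern and test case below are illustrative.

from deepeval.metrics import PatternMatchMetric
from deepeval.test_case import LLMTestCase

metric = PatternMatchMetric(pattern=r"ORDER-\d{6}", ignore_case=True)
test_case = LLMTestCase(
    input="Give me my order id.",
    actual_output="order-123456",
)
print(metric.measure(test_case))   # 1.0, a full match under re.IGNORECASE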
deepeval/metrics/task_completion/task_completion.py CHANGED
@@ -36,6 +36,11 @@ class TaskCompletionMetric(BaseMetric):
         strict_mode: bool = False,
         verbose_mode: bool = False,
     ):
+        if task is None:
+            self._is_task_provided = False
+        else:
+            self._is_task_provided = True
+
         self.task = task
         self.threshold = 1 if strict_mode else threshold
         self.model, self.using_native_model = initialize_model(model)
@@ -73,7 +78,8 @@ class TaskCompletionMetric(BaseMetric):
             )
             else:
                 task, self.outcome = self._extract_task_and_outcome(test_case)
-
+                if self.task is None or not self._is_task_provided:
+                    self.task = task
             self.verdict, self.reason = self._generate_verdicts()
             self.score = self._calculate_score()
             self.success = self.score >= self.threshold
@@ -108,7 +114,8 @@ class TaskCompletionMetric(BaseMetric):
                 task, self.outcome = await self._a_extract_task_and_outcome(
                     test_case
                 )
-
+                if self.task is None or not self._is_task_provided:
+                    self.task = task
             self.verdict, self.reason = await self._a_generate_verdicts()
             self.score = self._calculate_score()
             self.success = self.score >= self.threshold
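Note: reading the new _is_task_provided flag together with the guarded assignments, a task the user passes in is never overwritten by the extracted one, while an omitted task is re-extracted and assigned for every measured test case. Sketch (task text and threshold are illustrative):

from deepeval.metrics import TaskCompletionMetric

# task omitted: the metric extracts and assigns the task per test case
auto_task = TaskCompletionMetric(threshold=0.7)

# task supplied: the provided task is kept across measure() calls
fixed_task = TaskCompletionMetric(task="Book a table for two at 7pm", threshold=0.7)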
deepeval/metrics/utils.py CHANGED
@@ -270,7 +270,7 @@ def check_arena_test_case_params(
             f"Expected ArenaTestCase, got {type(arena_test_case).__name__}"
         )
 
-    cases =
+    cases = [contestant.test_case for contestant in arena_test_case.contestants]
     ref_input = cases[0].input
     for case in cases[1:]:
         if case.input != ref_input:
File without changes