deepeval 3.6.9__py3-none-any.whl → 3.7.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68)
  1. deepeval/__init__.py +0 -4
  2. deepeval/_version.py +1 -1
  3. deepeval/anthropic/__init__.py +19 -0
  4. deepeval/anthropic/extractors.py +94 -0
  5. deepeval/anthropic/patch.py +169 -0
  6. deepeval/anthropic/utils.py +225 -0
  7. deepeval/benchmarks/drop/drop.py +40 -14
  8. deepeval/benchmarks/ifeval/ifeval.py +2 -2
  9. deepeval/cli/main.py +7 -0
  10. deepeval/confident/api.py +6 -1
  11. deepeval/confident/types.py +4 -2
  12. deepeval/config/settings.py +159 -11
  13. deepeval/config/settings_manager.py +4 -0
  14. deepeval/evaluate/compare.py +215 -4
  15. deepeval/evaluate/types.py +6 -0
  16. deepeval/evaluate/utils.py +30 -0
  17. deepeval/integrations/crewai/handler.py +36 -0
  18. deepeval/integrations/langchain/callback.py +27 -2
  19. deepeval/integrations/llama_index/handler.py +58 -4
  20. deepeval/integrations/llama_index/utils.py +24 -0
  21. deepeval/key_handler.py +1 -0
  22. deepeval/metrics/__init__.py +5 -0
  23. deepeval/metrics/arena_g_eval/arena_g_eval.py +5 -1
  24. deepeval/metrics/arena_g_eval/utils.py +5 -5
  25. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +9 -18
  26. deepeval/metrics/exact_match/__init__.py +0 -0
  27. deepeval/metrics/exact_match/exact_match.py +94 -0
  28. deepeval/metrics/g_eval/g_eval.py +5 -1
  29. deepeval/metrics/g_eval/utils.py +1 -1
  30. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +5 -1
  31. deepeval/metrics/pattern_match/__init__.py +0 -0
  32. deepeval/metrics/pattern_match/pattern_match.py +103 -0
  33. deepeval/metrics/task_completion/task_completion.py +9 -2
  34. deepeval/metrics/utils.py +1 -1
  35. deepeval/model_integrations/__init__.py +0 -0
  36. deepeval/model_integrations/utils.py +116 -0
  37. deepeval/models/base_model.py +3 -1
  38. deepeval/models/llms/gemini_model.py +27 -5
  39. deepeval/openai/__init__.py +3 -1
  40. deepeval/openai/extractors.py +2 -2
  41. deepeval/openai/utils.py +7 -31
  42. deepeval/openai_agents/callback_handler.py +12 -3
  43. deepeval/prompt/api.py +11 -10
  44. deepeval/prompt/prompt.py +27 -15
  45. deepeval/simulator/template.py +1 -1
  46. deepeval/telemetry.py +3 -3
  47. deepeval/test_case/__init__.py +2 -1
  48. deepeval/test_case/arena_test_case.py +15 -4
  49. deepeval/test_case/llm_test_case.py +3 -2
  50. deepeval/test_case/mllm_test_case.py +45 -22
  51. deepeval/test_run/api.py +3 -2
  52. deepeval/test_run/cache.py +35 -13
  53. deepeval/test_run/hyperparameters.py +5 -1
  54. deepeval/test_run/test_run.py +52 -14
  55. deepeval/tracing/api.py +11 -10
  56. deepeval/tracing/otel/exporter.py +11 -0
  57. deepeval/tracing/patchers.py +102 -1
  58. deepeval/tracing/trace_context.py +13 -4
  59. deepeval/tracing/tracing.py +11 -2
  60. deepeval/tracing/types.py +8 -8
  61. deepeval/tracing/utils.py +9 -0
  62. deepeval/utils.py +48 -2
  63. {deepeval-3.6.9.dist-info → deepeval-3.7.1.dist-info}/METADATA +3 -3
  64. {deepeval-3.6.9.dist-info → deepeval-3.7.1.dist-info}/RECORD +68 -58
  65. /deepeval/{openai → model_integrations}/types.py +0 -0
  66. {deepeval-3.6.9.dist-info → deepeval-3.7.1.dist-info}/LICENSE.md +0 -0
  67. {deepeval-3.6.9.dist-info → deepeval-3.7.1.dist-info}/WHEEL +0 -0
  68. {deepeval-3.6.9.dist-info → deepeval-3.7.1.dist-info}/entry_points.txt +0 -0
deepeval/integrations/langchain/callback.py CHANGED
@@ -1,12 +1,15 @@
 from typing import Any, Optional, List, Dict
 from uuid import UUID
 from time import perf_counter
-from deepeval.tracing.context import current_trace_context
+
+from deepeval.tracing.context import current_span_context, current_trace_context
+from deepeval.test_case.llm_test_case import ToolCall
 from deepeval.tracing.types import (
     LlmOutput,
     LlmToolCall,
 )
 from deepeval.metrics import BaseMetric
+from deepeval.tracing.utils import prepare_tool_call_input_parameters
 
 try:
     from langchain_core.callbacks.base import BaseCallbackHandler
@@ -266,12 +269,34 @@ class CallbackHandler(BaseCallbackHandler):
         parent_run_id: Optional[UUID] = None,
         **kwargs: Any, # un-logged kwargs
     ) -> Any:
-
         uuid_str = str(run_id)
         tool_span: ToolSpan = trace_manager.get_span_by_uuid(uuid_str)
         tool_span.output = output
         exit_current_context(uuid_str=uuid_str)
 
+        # set the tools called in the parent span as well as on the trace level
+        tool_call = ToolCall(
+            name=tool_span.name,
+            description=tool_span.description,
+            output=output,
+            input_parameters=prepare_tool_call_input_parameters(
+                tool_span.input
+            ),
+        )
+        parent_span = current_span_context.get()
+        if parent_span:
+            if parent_span.tools_called is None:
+                parent_span.tools_called = []
+
+            parent_span.tools_called.append(tool_call)
+
+        trace = current_trace_context.get()
+        if trace:
+            if trace.tools_called is None:
+                trace.tools_called = []
+
+            trace.tools_called.append(tool_call)
+
     def on_tool_error(
         self,
         error: BaseException,
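With the two hunks above, every completed LangChain tool run is also recorded as a ToolCall on the enclosing span and on the trace. A minimal sketch of the record that gets appended, with illustrative values (the real fields are populated from the ToolSpan as shown in on_tool_end above):

    from deepeval.test_case.llm_test_case import ToolCall

    # Illustrative only: mirrors the construction in on_tool_end above.
    tool_call = ToolCall(
        name="search_docs",                     # tool_span.name
        description="Searches the doc index",   # tool_span.description
        input_parameters={"query": "pricing"},  # prepare_tool_call_input_parameters(tool_span.input)
        output="Top 3 matching documents ...",  # tool output
    )
    # Appended to parent_span.tools_called and trace.tools_called,
    # creating each list first when it is still None.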
deepeval/integrations/llama_index/handler.py CHANGED
@@ -2,13 +2,28 @@ from typing import Any, Dict, Optional
 import inspect
 from time import perf_counter
 import uuid
+
+from llama_index.core.agent.workflow.workflow_events import (
+    AgentWorkflowStartEvent,
+)
+from deepeval.integrations.llama_index.utils import (
+    extract_output_from_llm_chat_end_event,
+)
 from deepeval.telemetry import capture_tracing_integration
 from deepeval.tracing import trace_manager
-from deepeval.tracing.types import AgentSpan, BaseSpan, LlmSpan, TraceSpanStatus
+from deepeval.tracing.types import (
+    ToolSpan,
+    AgentSpan,
+    BaseSpan,
+    LlmSpan,
+    TraceSpanStatus,
+)
 from deepeval.tracing.trace_context import (
     current_llm_context,
     current_agent_context,
 )
+from deepeval.test_case import ToolCall
+from deepeval.tracing.utils import make_json_serializable
 
 try:
     from llama_index.core.instrumentation.events.base import BaseEvent
@@ -89,6 +104,7 @@ class LLamaIndexHandler(BaseEventHandler, BaseSpanHandler):
                     if llm_span_context
                     else None
                 ),
+                prompt=llm_span_context.prompt if llm_span_context else None,
             )
             trace_manager.add_span(llm_span)
             trace_manager.add_span_to_trace(llm_span)
@@ -106,7 +122,9 @@ class LLamaIndexHandler(BaseEventHandler, BaseSpanHandler):
             llm_span.status = TraceSpanStatus.SUCCESS
             llm_span.end_time = perf_counter()
             llm_span.input = llm_span.input
-            llm_span.output = event.response.message.blocks[0].text
+            llm_span.output = extract_output_from_llm_chat_end_event(
+                event
+            )
             trace_manager.remove_span(llm_span.uuid)
             del self.open_ai_astream_to_llm_span_map[event.span_id]
 
@@ -151,6 +169,14 @@ class LLamaIndexHandler(BaseEventHandler, BaseSpanHandler):
         # conditions to qualify as agent start run span
         if method_name == "run":
             agent_span_context = current_agent_context.get()
+            start_event = bound_args.arguments.get("start_event")
+
+            if start_event and isinstance(start_event, AgentWorkflowStartEvent):
+                input = start_event.model_dump()
+
+            else:
+                input = bound_args.arguments
+
             span = AgentSpan(
                 uuid=id_,
                 status=TraceSpanStatus.IN_PROGRESS,
@@ -159,7 +185,7 @@ class LLamaIndexHandler(BaseEventHandler, BaseSpanHandler):
                 parent_uuid=parent_span_id,
                 start_time=perf_counter(),
                 name="Agent", # TODO: decide the name of the span
-                input=bound_args.arguments,
+                input=input,
                 metrics=(
                     agent_span_context.metrics if agent_span_context else None
                 ),
@@ -169,7 +195,17 @@ class LLamaIndexHandler(BaseEventHandler, BaseSpanHandler):
                     else None
                 ),
             )
-
+        elif method_name == "acall":
+            span = ToolSpan(
+                uuid=id_,
+                status=TraceSpanStatus.IN_PROGRESS,
+                children=[],
+                trace_uuid=trace_uuid,
+                parent_uuid=parent_span_id,
+                start_time=perf_counter(),
+                input=bound_args.arguments,
+                name="Tool",
+            )
         # prepare input test case params for the span
         prepare_input_llm_test_case_params(
             class_name, method_name, span, bound_args.arguments
@@ -192,10 +228,28 @@ class LLamaIndexHandler(BaseEventHandler, BaseSpanHandler):
         if base_span is None:
             return None
 
+        class_name, method_name = parse_id(id_)
+        if method_name == "call_tool":
+            output_json = make_json_serializable(result)
+            if output_json and isinstance(output_json, dict):
+                if base_span.tools_called is None:
+                    base_span.tools_called = []
+                base_span.tools_called.append(
+                    ToolCall(
+                        name=output_json.get("tool_name", "Tool"),
+                        input_parameters=output_json.get("tool_kwargs", {}),
+                        output=output_json.get("tool_output", {}),
+                    )
+                )
         base_span.end_time = perf_counter()
         base_span.status = TraceSpanStatus.SUCCESS
         base_span.output = result
 
+        if isinstance(base_span, ToolSpan):
+            result_json = make_json_serializable(result)
+            if result_json and isinstance(result_json, dict):
+                base_span.name = result_json.get("tool_name", "Tool")
+
         if base_span.llm_test_case:
             class_name, method_name = parse_id(id_)
             prepare_output_llm_test_case_params(
deepeval/integrations/llama_index/utils.py CHANGED
@@ -1,3 +1,4 @@
+from llama_index.core.instrumentation.events.llm import LLMChatEndEvent
 from deepeval.test_case.llm_test_case import LLMTestCase, ToolCall
 from deepeval.tracing.types import BaseSpan
 from typing import Any
@@ -81,3 +82,26 @@ def prepare_output_llm_test_case_params(
     )
 
     span.llm_test_case.tools_called = tool_calls
+
+
+def extract_output_from_llm_chat_end_event(event: LLMChatEndEvent) -> list:
+    messages = []
+    for msg in event.response.message.blocks:
+        if msg.block_type == "text":
+            messages.append(
+                {
+                    "role": event.response.message.role.value,
+                    "content": msg.text,
+                }
+            )
+        elif msg.block_type == "tool_call":
+            messages.append(
+                {
+                    "name": msg.tool_name,
+                    "input_parameters": msg.tool_kwargs,
+                    "id": msg.tool_call_id,
+                }
+            )
+        else:
+            messages.append(msg.model_dump())
+    return messages
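For reference, extract_output_from_llm_chat_end_event flattens the response blocks into a list of dicts, so an LLM span's output can now carry both text and tool-call blocks instead of only the first text block. A response with one text block and one tool-call block would roughly produce (illustrative values):

    # Approximate shape of the returned list, based on the branches above (values are made up):
    example_output = [
        {"role": "assistant", "content": "The capital of France is Paris."},
        {"name": "get_weather", "input_parameters": {"city": "Paris"}, "id": "call_123"},
    ]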
deepeval/key_handler.py CHANGED
@@ -70,6 +70,7 @@ class ModelKeyValues(Enum):
     GOOGLE_GENAI_USE_VERTEXAI = "GOOGLE_GENAI_USE_VERTEXAI"
     GOOGLE_CLOUD_PROJECT = "GOOGLE_CLOUD_PROJECT"
     GOOGLE_CLOUD_LOCATION = "GOOGLE_CLOUD_LOCATION"
+    GOOGLE_SERVICE_ACCOUNT_KEY = "GOOGLE_SERVICE_ACCOUNT_KEY"
     # LiteLLM
     USE_LITELLM = "USE_LITELLM"
     LITELLM_MODEL_NAME = "LITELLM_MODEL_NAME"
deepeval/metrics/__init__.py CHANGED
@@ -8,6 +8,8 @@ from .base_metric import (
 from .dag.dag import DAGMetric, DeepAcyclicGraph
 from .conversational_dag.conversational_dag import ConversationalDAGMetric
 from .bias.bias import BiasMetric
+from .exact_match.exact_match import ExactMatchMetric
+from .pattern_match.pattern_match import PatternMatchMetric
 from .toxicity.toxicity import ToxicityMetric
 from .pii_leakage.pii_leakage import PIILeakageMetric
 from .non_advice.non_advice import NonAdviceMetric
@@ -69,6 +71,9 @@ __all__ = [
     "BaseConversationalMetric",
     "BaseMultimodalMetric",
     "BaseArenaMetric",
+    # Non-LLM metrics
+    "ExactMatchMetric",
+    "PatternMatchMetric",
     # Core metrics
     "GEval",
     "ArenaGEval",
deepeval/metrics/arena_g_eval/arena_g_eval.py CHANGED
@@ -46,7 +46,11 @@ class ArenaGEval(BaseArenaMetric):
         self.criteria = criteria
         self.model, self.using_native_model = initialize_model(model)
         self.evaluation_model = self.model.get_model_name()
-        self.evaluation_steps = evaluation_steps
+        self.evaluation_steps = (
+            evaluation_steps
+            if evaluation_steps and len(evaluation_steps) > 0
+            else None
+        )
         self.async_mode = async_mode
         self.verbose_mode = verbose_mode
         self._include_g_eval_suffix = _include_g_eval_suffix
deepeval/metrics/arena_g_eval/utils.py CHANGED
@@ -89,10 +89,10 @@ class FormattedArenaTestCase:
 def format_arena_test_case(
     evaluation_params: List[LLMTestCaseParams], test_case: ArenaTestCase
 ) -> Tuple[FormattedArenaTestCase, Dict[str, str]]:
-    case = next(iter(test_case.contestants.values()))
+    case = next(iter([case.test_case for case in test_case.contestants]))
 
     # Create dummy name mapping
-    real_names = list(test_case.contestants.keys())
+    real_names = list([case.name for case in test_case.contestants])
     available_fake_names = FAKE_NAMES.copy()
     random.shuffle(available_fake_names)
 
@@ -119,10 +119,10 @@ def format_arena_test_case(
             else None
         ),
         contestants={
-            contestant: construct_formatted_llm_test_case(
-                evaluation_params, test_case
+            contestant.name: construct_formatted_llm_test_case(
+                evaluation_params, contestant.test_case
             )
-            for contestant, test_case in test_case.contestants.items()
+            for contestant in test_case.contestants
         },
         dummy_to_real_names=dummy_to_real_names,
     )
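These changes (together with the deepeval/metrics/utils.py hunk further below) track an ArenaTestCase API shift: contestants are no longer a name-to-test-case dict but a sequence of objects exposing .name and .test_case. The actual container class lives in deepeval/test_case/arena_test_case.py and is not shown in this diff, so the sketch below uses a hypothetical stand-in purely to illustrate the shape:

    from dataclasses import dataclass
    from deepeval.test_case import LLMTestCase

    @dataclass
    class ContestantSketch:  # hypothetical stand-in for the real contestant model
        name: str
        test_case: LLMTestCase

    contestants = [
        ContestantSketch(name="gpt-4o", test_case=LLMTestCase(input="Hi", actual_output="Hello!")),
        ContestantSketch(name="claude", test_case=LLMTestCase(input="Hi", actual_output="Hey there!")),
    ]
    # format_arena_test_case and check_arena_test_case_params now read
    # contestant.name and contestant.test_case instead of dict keys/values.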
deepeval/metrics/conversational_g_eval/conversational_g_eval.py CHANGED
@@ -9,6 +9,8 @@ from deepeval.metrics.g_eval.utils import (
     construct_conversational_g_eval_turn_params_string,
     construct_non_turns_test_case_string,
     format_rubrics,
+    validate_and_sort_rubrics,
+    validate_criteria_and_evaluation_steps,
 )
 from deepeval.test_case import (
     TurnParams,
@@ -63,27 +65,16 @@ class ConversationalGEval(BaseConversationalMetric):
 
         self.evaluation_params = evaluation_params
 
-        # Check if both criteria and evaluation_steps are not None at the same time
-        if criteria is None and evaluation_steps is None:
-            raise ValueError(
-                "Either 'criteria' or 'evaluation_steps' must be provided."
-            )
-
-        # Check if criteria is provided, it cannot be an empty string
-        if criteria is not None and not criteria.strip():
-            raise ValueError("Criteria provided cannot be an empty string.")
-
-        # Check if evaluation_steps is provided, it cannot be an empty list
-        if evaluation_steps is not None and len(evaluation_steps) == 0:
-            raise ValueError(
-                "'evaluation_steps' must not be an empty list. Either omit evaluation steps or include a non-empty list of steps."
-            )
-
+        validate_criteria_and_evaluation_steps(criteria, evaluation_steps)
         self.criteria = criteria
-        self.rubric = rubric
+        self.rubric = validate_and_sort_rubrics(rubric)
         self.model, self.using_native_model = initialize_model(model)
         self.evaluation_model = self.model.get_model_name()
-        self.evaluation_steps = evaluation_steps
+        self.evaluation_steps = (
+            evaluation_steps
+            if evaluation_steps and len(evaluation_steps) > 0
+            else None
+        )
         self.threshold = 1 if strict_mode else threshold
         self.strict_mode = strict_mode
         self.async_mode = async_mode
deepeval/metrics/exact_match/__init__.py
File without changes
deepeval/metrics/exact_match/exact_match.py ADDED
@@ -0,0 +1,94 @@
+from typing import List
+
+from deepeval.metrics.indicator import metric_progress_indicator
+from deepeval.metrics.utils import (
+    check_llm_test_case_params,
+    construct_verbose_logs,
+)
+from deepeval.metrics.api import metric_data_manager
+from deepeval.metrics import BaseMetric
+from deepeval.test_case import LLMTestCase, LLMTestCaseParams
+
+
+class ExactMatchMetric(BaseMetric):
+    _required_params: List[LLMTestCaseParams] = [
+        LLMTestCaseParams.INPUT,
+        LLMTestCaseParams.ACTUAL_OUTPUT,
+        LLMTestCaseParams.EXPECTED_OUTPUT,
+    ]
+
+    def __init__(
+        self,
+        threshold: float = 1,
+        verbose_mode: bool = False,
+    ):
+        self.threshold = threshold
+        self.verbose_mode = verbose_mode
+
+    def measure(
+        self,
+        test_case: LLMTestCase,
+        _show_indicator: bool = True,
+        _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
+    ) -> float:
+        check_llm_test_case_params(test_case, self._required_params, self)
+
+        with metric_progress_indicator(
+            self, _show_indicator=_show_indicator, _in_component=_in_component
+        ):
+            expected = test_case.expected_output.strip()
+            actual = test_case.actual_output.strip()
+
+            if expected == actual:
+                self.score = self.precision = self.recall = self.f1 = 1.0
+                self.reason = (
+                    "The actual and expected outputs are exact matches."
+                )
+            else:
+                self.score = self.precision = self.recall = self.f1 = 0.0
+                self.reason = "The actual and expected outputs are different."
+
+            self.success = self.score >= self.threshold
+
+            if self.verbose_mode:
+                self.verbose_logs = construct_verbose_logs(
+                    self,
+                    steps=[
+                        f"Score: {self.score:.2f}",
+                        f"Reason: {self.reason}",
+                    ],
+                )
+
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
+
+            return self.score
+
+    async def a_measure(
+        self,
+        test_case: LLMTestCase,
+        _show_indicator: bool = True,
+        _in_component: bool = False,
+    ) -> float:
+        return self.measure(
+            test_case,
+            _show_indicator=_show_indicator,
+            _in_component=_in_component,
+        )
+
+    def is_successful(self) -> bool:
+        if self.error is not None:
+            self.success = False
+        else:
+            try:
+                self.success = self.score >= self.threshold
+            except:
+                self.success = False
+        return self.success
+
+    @property
+    def __name__(self):
+        return "Exact Match"
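A minimal usage sketch for the new metric, assuming it is imported from the package root as exported in deepeval/metrics/__init__.py above (test case values are illustrative):

    from deepeval.metrics import ExactMatchMetric
    from deepeval.test_case import LLMTestCase

    metric = ExactMatchMetric(threshold=1, verbose_mode=True)
    test_case = LLMTestCase(
        input="What is 2 + 2?",
        actual_output="4",
        expected_output="4",
    )
    score = metric.measure(test_case)  # 1.0 only when the stripped strings match exactly

No LLM judge is involved; the comparison is a plain string equality after stripping whitespace.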
deepeval/metrics/g_eval/g_eval.py CHANGED
@@ -61,7 +61,11 @@ class GEval(BaseMetric):
         self.score_range_span = self.score_range[1] - self.score_range[0]
         self.model, self.using_native_model = initialize_model(model)
         self.evaluation_model = self.model.get_model_name()
-        self.evaluation_steps = evaluation_steps
+        self.evaluation_steps = (
+            evaluation_steps
+            if evaluation_steps and len(evaluation_steps) > 0
+            else None
+        )
         self.threshold = 1 if strict_mode else threshold
         self.top_logprobs = top_logprobs
         self.strict_mode = strict_mode
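The same guard is applied in ArenaGEval above and in ConversationalGEval and MultimodalGEval nearby: a falsy evaluation_steps value is now stored as None rather than as an empty list. A standalone illustration of the expression (not the deepeval API itself, just the normalization logic copied from the assignment):

    def normalize_steps(evaluation_steps):
        # Same expression as the assignment above: None or empty list -> None
        return (
            evaluation_steps
            if evaluation_steps and len(evaluation_steps) > 0
            else None
        )

    assert normalize_steps([]) is None
    assert normalize_steps(None) is None
    assert normalize_steps(["Check the facts"]) == ["Check the facts"]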
deepeval/metrics/g_eval/utils.py CHANGED
@@ -77,7 +77,7 @@ def validate_criteria_and_evaluation_steps(
 def validate_and_sort_rubrics(
     rubrics: Optional[List[Rubric]] = None,
 ) -> Optional[List[Rubric]]:
-    if rubrics is None:
+    if rubrics is None or len(rubrics) == 0:
         return None
 
     # Sort rubrics by start of range
deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py CHANGED
@@ -64,7 +64,11 @@ class MultimodalGEval(BaseMultimodalMetric):
         self.rubric = validate_and_sort_rubrics(rubric)
         self.model, self.using_native_model = initialize_multimodal_model(model)
         self.evaluation_model = self.model.get_model_name()
-        self.evaluation_steps = evaluation_steps
+        self.evaluation_steps = (
+            evaluation_steps
+            if evaluation_steps and len(evaluation_steps) > 0
+            else None
+        )
         self.threshold = 1 if strict_mode else threshold
         self.top_logprobs = top_logprobs
         self.strict_mode = strict_mode
deepeval/metrics/pattern_match/__init__.py
File without changes
deepeval/metrics/pattern_match/pattern_match.py ADDED
@@ -0,0 +1,103 @@
+import re
+from typing import List
+
+from deepeval.metrics.indicator import metric_progress_indicator
+from deepeval.metrics.utils import (
+    check_llm_test_case_params,
+    construct_verbose_logs,
+)
+from deepeval.metrics.api import metric_data_manager
+from deepeval.metrics import BaseMetric
+from deepeval.test_case import LLMTestCase, LLMTestCaseParams
+
+
+class PatternMatchMetric(BaseMetric):
+    _required_params: List[LLMTestCaseParams] = [
+        LLMTestCaseParams.INPUT,
+        LLMTestCaseParams.ACTUAL_OUTPUT,
+    ]
+
+    def __init__(
+        self,
+        pattern: str,
+        ignore_case: bool = False,
+        threshold: float = 1.0,
+        verbose_mode: bool = False,
+    ):
+        self.pattern = pattern.strip()
+        self.ignore_case = ignore_case
+        self.verbose_mode = verbose_mode
+        self.threshold = threshold
+
+        flags = re.IGNORECASE if ignore_case else 0
+        try:
+            self._compiled_pattern = re.compile(self.pattern, flags)
+        except re.error as e:
+            raise ValueError(f"Invalid regex pattern: {pattern} — {e}")
+
+    def measure(
+        self,
+        test_case: LLMTestCase,
+        _show_indicator: bool = True,
+        _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
+    ) -> float:
+        check_llm_test_case_params(test_case, self._required_params, self)
+
+        with metric_progress_indicator(
+            self, _show_indicator=_show_indicator, _in_component=_in_component
+        ):
+            actual = test_case.actual_output.strip()
+            full_match = self._compiled_pattern.fullmatch(actual)
+
+            self.score = 1.0 if full_match else 0.0
+            self.reason = (
+                f"The actual output fully matches the pattern."
+                if full_match
+                else f"The actual output does not match the pattern."
+            )
+            self.success = self.score >= self.threshold
+
+            if self.verbose_mode:
+                self.verbose_logs = construct_verbose_logs(
+                    self,
+                    steps=[
+                        f"Pattern: {self.pattern}",
+                        f"Actual: {actual}",
+                        f"Score: {self.score:.2f}",
+                        f"Reason: {self.reason}",
+                    ],
+                )
+
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
+
+            return self.score
+
+    async def a_measure(
+        self,
+        test_case: LLMTestCase,
+        _show_indicator: bool = True,
+        _in_component: bool = False,
+    ) -> float:
+        return self.measure(
+            test_case,
+            _show_indicator=_show_indicator,
+            _in_component=_in_component,
+        )
+
+    def is_successful(self) -> bool:
+        if self.error is not None:
+            self.success = False
+        else:
+            try:
+                self.success = self.score >= self.threshold
+            except:
+                self.success = False
+        return self.success
+
+    @property
+    def __name__(self):
+        return "Pattern Match"
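A minimal usage sketch, assuming the metric is imported from the package root as exported above. Note that scoring uses re.fullmatch, so the pattern must cover the entire (stripped) actual output (values are illustrative):

    from deepeval.metrics import PatternMatchMetric
    from deepeval.test_case import LLMTestCase

    metric = PatternMatchMetric(pattern=r"ORDER-\d{6}", ignore_case=False)
    test_case = LLMTestCase(
        input="Give me my order reference.",
        actual_output="ORDER-004217",
    )
    score = metric.measure(test_case)  # 1.0: the whole output matches the pattern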
deepeval/metrics/task_completion/task_completion.py CHANGED
@@ -36,6 +36,11 @@ class TaskCompletionMetric(BaseMetric):
         strict_mode: bool = False,
         verbose_mode: bool = False,
     ):
+        if task is None:
+            self._is_task_provided = False
+        else:
+            self._is_task_provided = True
+
         self.task = task
         self.threshold = 1 if strict_mode else threshold
         self.model, self.using_native_model = initialize_model(model)
@@ -73,7 +78,8 @@ class TaskCompletionMetric(BaseMetric):
             )
         else:
             task, self.outcome = self._extract_task_and_outcome(test_case)
-            self.task = task if self.task is None else self.task
+            if self.task is None or not self._is_task_provided:
+                self.task = task
             self.verdict, self.reason = self._generate_verdicts()
             self.score = self._calculate_score()
             self.success = self.score >= self.threshold
@@ -108,7 +114,8 @@ class TaskCompletionMetric(BaseMetric):
             task, self.outcome = await self._a_extract_task_and_outcome(
                 test_case
            )
-            self.task = task if self.task is None else self.task
+            if self.task is None or not self._is_task_provided:
+                self.task = task
             self.verdict, self.reason = await self._a_generate_verdicts()
             self.score = self._calculate_score()
             self.success = self.score >= self.threshold
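The new _is_task_provided flag changes how the task is reused across calls: when no task is passed to the constructor, each measure()/a_measure() call now re-extracts the task from the test case instead of keeping the task extracted on the first call, while a user-provided task is still kept. A hedged sketch of the intended behavior (test case contents are illustrative, and running it requires a configured judge model as with any TaskCompletion evaluation):

    from deepeval.metrics import TaskCompletionMetric
    from deepeval.test_case import LLMTestCase, ToolCall

    tc_weather = LLMTestCase(
        input="What's the weather in Paris?",
        actual_output="It is 18°C and sunny in Paris.",
        tools_called=[ToolCall(name="get_weather", input_parameters={"city": "Paris"})],
    )
    tc_flight = LLMTestCase(
        input="Book me a flight to Tokyo.",
        actual_output="Your flight to Tokyo is booked.",
        tools_called=[ToolCall(name="book_flight", input_parameters={"destination": "Tokyo"})],
    )

    metric = TaskCompletionMetric()   # no task passed -> _is_task_provided is False
    metric.measure(tc_weather)        # task extracted from this test case
    metric.measure(tc_flight)         # task re-extracted here instead of reusing the first one

    pinned = TaskCompletionMetric(task="Book a flight to Tokyo")
    pinned.measure(tc_flight)         # user-provided task is kept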
deepeval/metrics/utils.py CHANGED
@@ -270,7 +270,7 @@ def check_arena_test_case_params(
             f"Expected ArenaTestCase, got {type(arena_test_case).__name__}"
         )
 
-    cases = list(arena_test_case.contestants.values())
+    cases = [contestant.test_case for contestant in arena_test_case.contestants]
     ref_input = cases[0].input
     for case in cases[1:]:
         if case.input != ref_input:
deepeval/model_integrations/__init__.py
File without changes