deepeval 3.6.9__py3-none-any.whl → 3.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. deepeval/_version.py +1 -1
  2. deepeval/anthropic/__init__.py +19 -0
  3. deepeval/anthropic/extractors.py +94 -0
  4. deepeval/anthropic/patch.py +169 -0
  5. deepeval/anthropic/utils.py +225 -0
  6. deepeval/benchmarks/drop/drop.py +40 -14
  7. deepeval/benchmarks/ifeval/ifeval.py +2 -2
  8. deepeval/confident/types.py +4 -2
  9. deepeval/config/settings.py +154 -11
  10. deepeval/config/settings_manager.py +4 -0
  11. deepeval/integrations/crewai/handler.py +36 -0
  12. deepeval/integrations/langchain/callback.py +27 -2
  13. deepeval/integrations/llama_index/handler.py +58 -4
  14. deepeval/integrations/llama_index/utils.py +24 -0
  15. deepeval/metrics/__init__.py +5 -0
  16. deepeval/metrics/exact_match/__init__.py +0 -0
  17. deepeval/metrics/exact_match/exact_match.py +94 -0
  18. deepeval/metrics/pattern_match/__init__.py +0 -0
  19. deepeval/metrics/pattern_match/pattern_match.py +103 -0
  20. deepeval/metrics/task_completion/task_completion.py +9 -2
  21. deepeval/model_integrations/__init__.py +0 -0
  22. deepeval/model_integrations/utils.py +116 -0
  23. deepeval/models/base_model.py +3 -1
  24. deepeval/openai/__init__.py +3 -1
  25. deepeval/openai/extractors.py +2 -2
  26. deepeval/openai/utils.py +7 -31
  27. deepeval/prompt/api.py +11 -10
  28. deepeval/prompt/prompt.py +5 -4
  29. deepeval/telemetry.py +3 -3
  30. deepeval/test_case/llm_test_case.py +3 -2
  31. deepeval/test_run/api.py +3 -2
  32. deepeval/test_run/cache.py +4 -3
  33. deepeval/test_run/test_run.py +24 -5
  34. deepeval/tracing/api.py +11 -10
  35. deepeval/tracing/otel/exporter.py +11 -0
  36. deepeval/tracing/patchers.py +102 -1
  37. deepeval/tracing/trace_context.py +13 -4
  38. deepeval/tracing/tracing.py +10 -1
  39. deepeval/tracing/types.py +8 -8
  40. deepeval/tracing/utils.py +9 -0
  41. deepeval/utils.py +44 -2
  42. {deepeval-3.6.9.dist-info → deepeval-3.7.0.dist-info}/METADATA +2 -2
  43. {deepeval-3.6.9.dist-info → deepeval-3.7.0.dist-info}/RECORD +47 -37
  44. /deepeval/{openai → model_integrations}/types.py +0 -0
  45. {deepeval-3.6.9.dist-info → deepeval-3.7.0.dist-info}/LICENSE.md +0 -0
  46. {deepeval-3.6.9.dist-info → deepeval-3.7.0.dist-info}/WHEEL +0 -0
  47. {deepeval-3.6.9.dist-info → deepeval-3.7.0.dist-info}/entry_points.txt +0 -0
deepeval/config/settings.py

@@ -9,10 +9,13 @@ Central config for DeepEval.
 type coercion.
 """

+import hashlib
+import json
 import logging
 import math
 import os
 import re
+import threading

 from dotenv import dotenv_values
 from pathlib import Path
@@ -22,6 +25,7 @@ from pydantic import (
     confloat,
     conint,
     field_validator,
+    model_validator,
     SecretStr,
 )
 from pydantic_settings import BaseSettings, SettingsConfigDict
@@ -39,6 +43,13 @@ from deepeval.constants import SUPPORTED_PROVIDER_SLUGS, slugify
 logger = logging.getLogger(__name__)
 _SAVE_RE = re.compile(r"^(?P<scheme>dotenv)(?::(?P<path>.+))?$")

+# settings that were converted to computed fields with override counterparts
+_DEPRECATED_TO_OVERRIDE = {
+    "DEEPEVAL_PER_TASK_TIMEOUT_SECONDS": "DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE",
+    "DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS": "DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS_OVERRIDE",
+    "DEEPEVAL_TASK_GATHER_BUFFER_SECONDS": "DEEPEVAL_TASK_GATHER_BUFFER_SECONDS_OVERRIDE",
+}
+

 def _find_legacy_enum(env_key: str):
     from deepeval.key_handler import (
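These deprecated keys are mirrored into their *_OVERRIDE counterparts by a model_validator further down in this diff. A minimal sketch of the intended behavior (hedged: assumes a fresh process where the override is not already set, and that pydantic coerces the string "120" to a number on assignment):

import os
from deepeval.config.settings import reset_settings

os.environ["DEEPEVAL_PER_TASK_TIMEOUT_SECONDS"] = "120"  # deprecated key
settings = reset_settings()
# a deprecation warning is logged, and the value lands on the override:
assert settings.DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE == 120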
@@ -690,12 +701,119 @@ class Settings(BaseSettings):
                 "CRITICAL, NOTSET, or a numeric logging level."
             )

+    @field_validator("DEEPEVAL_TELEMETRY_OPT_OUT", mode="before")
+    @classmethod
+    def _apply_telemetry_enabled_alias(cls, v):
+        """
+        Precedence (most secure):
+        - Any OFF signal wins if both are set:
+            - DEEPEVAL_TELEMETRY_OPT_OUT = truthy -> OFF
+            - DEEPEVAL_TELEMETRY_ENABLED = falsy -> OFF
+        - Else, ON signal:
+            - DEEPEVAL_TELEMETRY_OPT_OUT = falsy -> ON
+            - DEEPEVAL_TELEMETRY_ENABLED = truthy -> ON
+        - Else None (unset) -> ON
+        """
+
+        def normalize(x):
+            if x is None:
+                return None
+            s = str(x).strip()
+            return None if s == "" else parse_bool(s, default=False)
+
+        new_opt_out = normalize(v)  # True means OFF, False means ON
+        legacy_enabled = normalize(
+            os.getenv("DEEPEVAL_TELEMETRY_ENABLED")
+        )  # True means ON, False means OFF
+
+        off_signal = (new_opt_out is True) or (legacy_enabled is False)
+        on_signal = (new_opt_out is False) or (legacy_enabled is True)
+
+        # Conflict: simultaneous OFF and ON signals
+        if off_signal and on_signal:
+            # Only warn if verbose or debug
+            if parse_bool(
+                os.getenv("DEEPEVAL_VERBOSE_MODE"), default=False
+            ) or logger.isEnabledFor(logging.DEBUG):
+                logger.warning(
+                    "Conflicting telemetry flags detected: DEEPEVAL_TELEMETRY_OPT_OUT=%r, "
+                    "DEEPEVAL_TELEMETRY_ENABLED=%r. Defaulting to OFF.",
+                    new_opt_out,
+                    legacy_enabled,
+                )
+            return True  # OFF wins
+
+        # Clear winner
+        if off_signal:
+            return True  # OFF
+        if on_signal:
+            return False  # ON
+
+        # Unset means ON
+        return False
+
+    @model_validator(mode="after")
+    def _apply_deprecated_computed_env_aliases(self):
+        """
+        Backwards compatibility courtesy:
+        - If users still set a deprecated computed field in the environment,
+          emit a deprecation warning and mirror its value into the matching
+          *_OVERRIDE field (unless the override is already set).
+        - Override always wins if both are present.
+        """
+        for old_key, override_key in _DEPRECATED_TO_OVERRIDE.items():
+            raw = os.getenv(old_key)
+            if raw is None or str(raw).strip() == "":
+                continue
+
+            # if override already set, ignore the deprecated one but log a warning
+            if getattr(self, override_key) is not None:
+                logger.warning(
+                    "Config deprecation: %s is deprecated and was ignored because %s "
+                    "is already set. Please remove %s and use %s going forward.",
+                    old_key,
+                    override_key,
+                    old_key,
+                    override_key,
+                )
+                continue
+
+            # apply the deprecated value into the override field.
+            try:
+                # let pydantic coerce the string to the target type on assignment
+                setattr(self, override_key, raw)
+                logger.warning(
+                    "Config deprecation: %s is deprecated. Its value (%r) was applied to %s. "
+                    "Please migrate to %s and remove %s from your environment.",
+                    old_key,
+                    raw,
+                    override_key,
+                    override_key,
+                    old_key,
+                )
+            except Exception as e:
+                # do not let exception bubble up, just warn
+                logger.warning(
+                    "Config deprecation: %s is deprecated and could not be applied to %s "
+                    "(value=%r): %s",
+                    old_key,
+                    override_key,
+                    raw,
+                    e,
+                )
+        return self
+
     #######################
     # Persistence support #
     #######################
     class _SettingsEditCtx:
+        # TODO: will generate this list in future PR
         COMPUTED_FIELDS: frozenset[str] = frozenset(
-            {"DEEPEVAL_PER_TASK_TIMEOUT_SECONDS"}
+            {
+                "DEEPEVAL_PER_TASK_TIMEOUT_SECONDS",
+                "DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS",
+                "DEEPEVAL_TASK_GATHER_BUFFER_SECONDS",
+            }
         )

         def __init__(
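Taken together, the two validators resolve the legacy and new telemetry flags, with OFF winning any conflict. A minimal sketch of the precedence (hedged: relies on reset_settings re-reading the environment, as implemented later in this diff):

import os
from deepeval.config.settings import reset_settings

# OPT_OUT falsy signals ON; legacy ENABLED falsy signals OFF: a conflict.
os.environ["DEEPEVAL_TELEMETRY_OPT_OUT"] = "0"
os.environ["DEEPEVAL_TELEMETRY_ENABLED"] = "0"
settings = reset_settings()
assert settings.DEEPEVAL_TELEMETRY_OPT_OUT is True  # OFF wins on conflict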
@@ -880,16 +998,39 @@ class Settings(BaseSettings):


 _settings_singleton: Optional[Settings] = None
+_settings_env_fingerprint: "str | None" = None
+_settings_lock = threading.RLock()
+
+
+def _calc_env_fingerprint() -> str:
+    env = os.environ.copy()
+    # must hash in a stable order.
+    keys = sorted(
+        key
+        for key in Settings.model_fields.keys()
+        if key != "_DEPRECATED_TELEMETRY_ENABLED"  # exclude deprecated
+    )
+    # encode as triples: (key, present?, value)
+    items = [(k, k in env, env.get(k)) for k in keys]
+    payload = json.dumps(items, ensure_ascii=False, separators=(",", ":"))
+    return hashlib.sha256(payload.encode("utf-8")).hexdigest()


 def get_settings() -> Settings:
-    global _settings_singleton
-    if _settings_singleton is None:
-        _settings_singleton = Settings()
-        from deepeval.config.logging import apply_deepeval_log_level
+    global _settings_singleton, _settings_env_fingerprint
+    fingerprint = _calc_env_fingerprint()
+
+    with _settings_lock:
+        if (
+            _settings_singleton is None
+            or _settings_env_fingerprint != fingerprint
+        ):
+            _settings_singleton = Settings()
+            _settings_env_fingerprint = fingerprint
+            from deepeval.config.logging import apply_deepeval_log_level

-        apply_deepeval_log_level()
-    return _settings_singleton
+            apply_deepeval_log_level()
+        return _settings_singleton


 def reset_settings(*, reload_dotenv: bool = False) -> Settings:
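get_settings() is now a thread-safe, environment-keyed cache: the singleton is rebuilt whenever the fingerprint of declared settings fields in os.environ changes. A minimal sketch (hedged: assumes DEEPEVAL_VERBOSE_MODE is one of the declared Settings fields, so it participates in the fingerprint):

import os
from deepeval.config.settings import get_settings

s1 = get_settings()
assert get_settings() is s1             # unchanged env -> cached instance
os.environ["DEEPEVAL_VERBOSE_MODE"] = "1"
assert get_settings() is not s1         # fingerprint changed -> rebuilt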
@@ -905,8 +1046,10 @@ def reset_settings(*, reload_dotenv: bool = False) -> Settings:
     Returns:
         The fresh Settings instance.
     """
-    global _settings_singleton
-    if reload_dotenv:
-        autoload_dotenv()
-    _settings_singleton = None
+    global _settings_singleton, _settings_env_fingerprint
+    with _settings_lock:
+        if reload_dotenv:
+            autoload_dotenv()
+        _settings_singleton = None
+        _settings_env_fingerprint = None
     return get_settings()
deepeval/config/settings_manager.py

@@ -4,6 +4,7 @@ dotenv file. Also syncs os.environ, handles unsets, and warns on unknown fields.
 Primary entrypoint: update_settings_and_persist.
 """

+import json
 import logging
 import os

@@ -33,6 +34,9 @@ def _normalize_for_env(val: Any) -> Optional[str]:
         return val.get_secret_value()
     if isinstance(val, bool):
         return bool_to_env_str(val)
+    # encode sequences as JSON so Settings can parse them back reliably.
+    if isinstance(val, (list, tuple, set)):
+        return json.dumps(list(val))
     return str(val)


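A quick illustration of the new branch (hedged: _normalize_for_env is a private helper, called directly here only for demonstration; note that set inputs serialize in arbitrary order):

from deepeval.config.settings_manager import _normalize_for_env

assert _normalize_for_env(["a", "b"]) == '["a", "b"]'  # JSON, parseable again
assert _normalize_for_env(("x",)) == '["x"]'           # tuples become lists
assert _normalize_for_env(3.5) == "3.5"                # fallback: plain str()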
deepeval/integrations/crewai/handler.py

@@ -23,6 +23,8 @@ try:
         AgentExecutionCompletedEvent,
         ToolUsageStartedEvent,
         ToolUsageFinishedEvent,
+        KnowledgeRetrievalStartedEvent,
+        KnowledgeRetrievalCompletedEvent,
     )

     crewai_installed = True
@@ -69,6 +71,14 @@ class CrewAIEventsListener(BaseEventListener):

        return execution_id

+    @staticmethod
+    def get_knowledge_execution_id(source, event) -> str:
+        source_id = id(source)
+        agent_id = id(event.agent) if hasattr(event, "agent") else "unknown"
+        execution_id = f"_knowledge_{source_id}_{agent_id}"
+
+        return execution_id
+
     def setup_listeners(self, crewai_event_bus):
         @crewai_event_bus.on(CrewKickoffStartedEvent)
         def on_crew_started(source, event: CrewKickoffStartedEvent):
@@ -161,6 +171,32 @@ class CrewAIEventsListener(BaseEventListener):
                     current_span.output = event.output
                 observer.__exit__(None, None, None)

+        @crewai_event_bus.on(KnowledgeRetrievalStartedEvent)
+        def on_knowledge_started(source, event: KnowledgeRetrievalStartedEvent):
+            observer = Observer(
+                span_type="tool",
+                func_name="knowledge_retrieval",
+                function_kwargs={},
+            )
+            self.span_observers[
+                self.get_knowledge_execution_id(source, event)
+            ] = observer
+            observer.__enter__()
+
+        @crewai_event_bus.on(KnowledgeRetrievalCompletedEvent)
+        def on_knowledge_completed(
+            source, event: KnowledgeRetrievalCompletedEvent
+        ):
+            observer = self.span_observers.pop(
+                self.get_knowledge_execution_id(source, event)
+            )
+            if observer:
+                current_span = current_span_context.get()
+                if current_span:
+                    current_span.input = event.query
+                    current_span.output = event.retrieved_knowledge
+                observer.__exit__(None, None, None)
+

 def instrument_crewai(api_key: Optional[str] = None):
     is_crewai_installed()
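With the two new listeners registered, CrewAI knowledge lookups surface as spans. A minimal sketch (hedged: the crew itself is elided; only instrument_crewai and the span shape come from this diff):

from deepeval.integrations.crewai.handler import instrument_crewai

instrument_crewai()  # registers CrewAIEventsListener on the CrewAI event bus
# Any subsequent crew.kickoff() that retrieves knowledge now emits a "tool"
# span named "knowledge_retrieval", with event.query as its input and
# event.retrieved_knowledge as its output.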
deepeval/integrations/langchain/callback.py

@@ -1,12 +1,15 @@
 from typing import Any, Optional, List, Dict
 from uuid import UUID
 from time import perf_counter
-from deepeval.tracing.context import current_trace_context
+
+from deepeval.tracing.context import current_span_context, current_trace_context
+from deepeval.test_case.llm_test_case import ToolCall
 from deepeval.tracing.types import (
     LlmOutput,
     LlmToolCall,
 )
 from deepeval.metrics import BaseMetric
+from deepeval.tracing.utils import prepare_tool_call_input_parameters

 try:
     from langchain_core.callbacks.base import BaseCallbackHandler
@@ -266,12 +269,34 @@ class CallbackHandler(BaseCallbackHandler):
         parent_run_id: Optional[UUID] = None,
         **kwargs: Any,  # un-logged kwargs
     ) -> Any:
-
         uuid_str = str(run_id)
         tool_span: ToolSpan = trace_manager.get_span_by_uuid(uuid_str)
         tool_span.output = output
         exit_current_context(uuid_str=uuid_str)

+        # set the tools called in the parent span as well as on the trace level
+        tool_call = ToolCall(
+            name=tool_span.name,
+            description=tool_span.description,
+            output=output,
+            input_parameters=prepare_tool_call_input_parameters(
+                tool_span.input
+            ),
+        )
+        parent_span = current_span_context.get()
+        if parent_span:
+            if parent_span.tools_called is None:
+                parent_span.tools_called = []
+
+            parent_span.tools_called.append(tool_call)
+
+        trace = current_trace_context.get()
+        if trace:
+            if trace.tools_called is None:
+                trace.tools_called = []
+
+            trace.tools_called.append(tool_call)
+
     def on_tool_error(
         self,
         error: BaseException,
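The upshot: every finished tool run is now mirrored upward as a ToolCall on both the enclosing span and the active trace. A minimal sketch (hedged: handler construction arguments and the chain wiring are elided; CallbackHandler and the ToolCall fields come from this diff):

from deepeval.integrations.langchain.callback import CallbackHandler

handler = CallbackHandler()
# chain.invoke({"input": "..."}, config={"callbacks": [handler]})
# After each on_tool_end, the parent span and the trace both gain a
# ToolCall(name=..., description=..., output=..., input_parameters=...).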
deepeval/integrations/llama_index/handler.py

@@ -2,13 +2,28 @@ from typing import Any, Dict, Optional
 import inspect
 from time import perf_counter
 import uuid
+
+from llama_index.core.agent.workflow.workflow_events import (
+    AgentWorkflowStartEvent,
+)
+from deepeval.integrations.llama_index.utils import (
+    extract_output_from_llm_chat_end_event,
+)
 from deepeval.telemetry import capture_tracing_integration
 from deepeval.tracing import trace_manager
-from deepeval.tracing.types import AgentSpan, BaseSpan, LlmSpan, TraceSpanStatus
+from deepeval.tracing.types import (
+    ToolSpan,
+    AgentSpan,
+    BaseSpan,
+    LlmSpan,
+    TraceSpanStatus,
+)
 from deepeval.tracing.trace_context import (
     current_llm_context,
     current_agent_context,
 )
+from deepeval.test_case import ToolCall
+from deepeval.tracing.utils import make_json_serializable

 try:
     from llama_index.core.instrumentation.events.base import BaseEvent
@@ -89,6 +104,7 @@ class LLamaIndexHandler(BaseEventHandler, BaseSpanHandler):
                     if llm_span_context
                     else None
                 ),
+                prompt=llm_span_context.prompt if llm_span_context else None,
             )
             trace_manager.add_span(llm_span)
             trace_manager.add_span_to_trace(llm_span)
@@ -106,7 +122,9 @@
                 llm_span.status = TraceSpanStatus.SUCCESS
                 llm_span.end_time = perf_counter()
                 llm_span.input = llm_span.input
-                llm_span.output = event.response.message.blocks[0].text
+                llm_span.output = extract_output_from_llm_chat_end_event(
+                    event
+                )
                 trace_manager.remove_span(llm_span.uuid)
                 del self.open_ai_astream_to_llm_span_map[event.span_id]

@@ -151,6 +169,14 @@
         # conditions to qualify as agent start run span
         if method_name == "run":
             agent_span_context = current_agent_context.get()
+            start_event = bound_args.arguments.get("start_event")
+
+            if start_event and isinstance(start_event, AgentWorkflowStartEvent):
+                input = start_event.model_dump()
+
+            else:
+                input = bound_args.arguments
+
             span = AgentSpan(
                 uuid=id_,
                 status=TraceSpanStatus.IN_PROGRESS,
@@ -159,7 +185,7 @@
                 parent_uuid=parent_span_id,
                 start_time=perf_counter(),
                 name="Agent",  # TODO: decide the name of the span
-                input=bound_args.arguments,
+                input=input,
                 metrics=(
                     agent_span_context.metrics if agent_span_context else None
                 ),
@@ -169,7 +195,17 @@
                     else None
                 ),
             )
-
+        elif method_name == "acall":
+            span = ToolSpan(
+                uuid=id_,
+                status=TraceSpanStatus.IN_PROGRESS,
+                children=[],
+                trace_uuid=trace_uuid,
+                parent_uuid=parent_span_id,
+                start_time=perf_counter(),
+                input=bound_args.arguments,
+                name="Tool",
+            )
         # prepare input test case params for the span
         prepare_input_llm_test_case_params(
             class_name, method_name, span, bound_args.arguments
@@ -192,10 +228,28 @@
         if base_span is None:
             return None

+        class_name, method_name = parse_id(id_)
+        if method_name == "call_tool":
+            output_json = make_json_serializable(result)
+            if output_json and isinstance(output_json, dict):
+                if base_span.tools_called is None:
+                    base_span.tools_called = []
+                base_span.tools_called.append(
+                    ToolCall(
+                        name=output_json.get("tool_name", "Tool"),
+                        input_parameters=output_json.get("tool_kwargs", {}),
+                        output=output_json.get("tool_output", {}),
+                    )
+                )
         base_span.end_time = perf_counter()
         base_span.status = TraceSpanStatus.SUCCESS
         base_span.output = result

+        if isinstance(base_span, ToolSpan):
+            result_json = make_json_serializable(result)
+            if result_json and isinstance(result_json, dict):
+                base_span.name = result_json.get("tool_name", "Tool")
+
         if base_span.llm_test_case:
             class_name, method_name = parse_id(id_)
             prepare_output_llm_test_case_params(
deepeval/integrations/llama_index/utils.py

@@ -1,3 +1,4 @@
+from llama_index.core.instrumentation.events.llm import LLMChatEndEvent
 from deepeval.test_case.llm_test_case import LLMTestCase, ToolCall
 from deepeval.tracing.types import BaseSpan
 from typing import Any
@@ -81,3 +82,26 @@ def prepare_output_llm_test_case_params(
         )

     span.llm_test_case.tools_called = tool_calls
+
+
+def extract_output_from_llm_chat_end_event(event: LLMChatEndEvent) -> list:
+    messages = []
+    for msg in event.response.message.blocks:
+        if msg.block_type == "text":
+            messages.append(
+                {
+                    "role": event.response.message.role.value,
+                    "content": msg.text,
+                }
+            )
+        elif msg.block_type == "tool_call":
+            messages.append(
+                {
+                    "name": msg.tool_name,
+                    "input_parameters": msg.tool_kwargs,
+                    "id": msg.tool_call_id,
+                }
+            )
+        else:
+            messages.append(msg.model_dump())
+    return messages
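For reference, a sketch of the list this helper produces (illustrative values only; the keys mirror the three branches above):

messages = extract_output_from_llm_chat_end_event(event)
# e.g.:
# [
#     {"role": "assistant", "content": "The answer is 4."},
#     {"name": "search", "input_parameters": {"q": "..."}, "id": "call_1"},
# ]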
deepeval/metrics/__init__.py

@@ -8,6 +8,8 @@ from .base_metric import (
 from .dag.dag import DAGMetric, DeepAcyclicGraph
 from .conversational_dag.conversational_dag import ConversationalDAGMetric
 from .bias.bias import BiasMetric
+from .exact_match.exact_match import ExactMatchMetric
+from .pattern_match.pattern_match import PatternMatchMetric
 from .toxicity.toxicity import ToxicityMetric
 from .pii_leakage.pii_leakage import PIILeakageMetric
 from .non_advice.non_advice import NonAdviceMetric
@@ -69,6 +71,9 @@ __all__ = [
     "BaseConversationalMetric",
     "BaseMultimodalMetric",
     "BaseArenaMetric",
+    # Non-LLM metrics
+    "ExactMatchMetric",
+    "PatternMatchMetric",
     # Core metrics
     "GEval",
     "ArenaGEval",
deepeval/metrics/exact_match/__init__.py: File without changes (new empty file)
deepeval/metrics/exact_match/exact_match.py

@@ -0,0 +1,94 @@
+from typing import List
+
+from deepeval.metrics.indicator import metric_progress_indicator
+from deepeval.metrics.utils import (
+    check_llm_test_case_params,
+    construct_verbose_logs,
+)
+from deepeval.metrics.api import metric_data_manager
+from deepeval.metrics import BaseMetric
+from deepeval.test_case import LLMTestCase, LLMTestCaseParams
+
+
+class ExactMatchMetric(BaseMetric):
+    _required_params: List[LLMTestCaseParams] = [
+        LLMTestCaseParams.INPUT,
+        LLMTestCaseParams.ACTUAL_OUTPUT,
+        LLMTestCaseParams.EXPECTED_OUTPUT,
+    ]
+
+    def __init__(
+        self,
+        threshold: float = 1,
+        verbose_mode: bool = False,
+    ):
+        self.threshold = threshold
+        self.verbose_mode = verbose_mode
+
+    def measure(
+        self,
+        test_case: LLMTestCase,
+        _show_indicator: bool = True,
+        _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
+    ) -> float:
+        check_llm_test_case_params(test_case, self._required_params, self)
+
+        with metric_progress_indicator(
+            self, _show_indicator=_show_indicator, _in_component=_in_component
+        ):
+            expected = test_case.expected_output.strip()
+            actual = test_case.actual_output.strip()
+
+            if expected == actual:
+                self.score = self.precision = self.recall = self.f1 = 1.0
+                self.reason = (
+                    "The actual and expected outputs are exact matches."
+                )
+            else:
+                self.score = self.precision = self.recall = self.f1 = 0.0
+                self.reason = "The actual and expected outputs are different."
+
+            self.success = self.score >= self.threshold
+
+            if self.verbose_mode:
+                self.verbose_logs = construct_verbose_logs(
+                    self,
+                    steps=[
+                        f"Score: {self.score:.2f}",
+                        f"Reason: {self.reason}",
+                    ],
+                )
+
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
+
+            return self.score
+
+    async def a_measure(
+        self,
+        test_case: LLMTestCase,
+        _show_indicator: bool = True,
+        _in_component: bool = False,
+    ) -> float:
+        return self.measure(
+            test_case,
+            _show_indicator=_show_indicator,
+            _in_component=_in_component,
+        )
+
+    def is_successful(self) -> bool:
+        if self.error is not None:
+            self.success = False
+        else:
+            try:
+                self.success = self.score >= self.threshold
+            except:
+                self.success = False
+        return self.success
+
+    @property
+    def __name__(self):
+        return "Exact Match"
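A minimal usage sketch for the new metric (hedged: the test case values are invented; the import path follows the deepeval/metrics/__init__.py change above):

from deepeval.metrics import ExactMatchMetric
from deepeval.test_case import LLMTestCase

metric = ExactMatchMetric(threshold=1)
test_case = LLMTestCase(
    input="What is 2 + 2?",
    actual_output=" 4 ",  # surrounding whitespace is stripped before comparing
    expected_output="4",
)
metric.measure(test_case)
assert metric.score == 1.0 and metric.is_successful()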
deepeval/metrics/pattern_match/__init__.py: File without changes (new empty file)