deepeval 3.6.9__py3-none-any.whl → 3.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/anthropic/__init__.py +19 -0
- deepeval/anthropic/extractors.py +94 -0
- deepeval/anthropic/patch.py +169 -0
- deepeval/anthropic/utils.py +225 -0
- deepeval/benchmarks/drop/drop.py +40 -14
- deepeval/benchmarks/ifeval/ifeval.py +2 -2
- deepeval/confident/types.py +4 -2
- deepeval/config/settings.py +154 -11
- deepeval/config/settings_manager.py +4 -0
- deepeval/integrations/crewai/handler.py +36 -0
- deepeval/integrations/langchain/callback.py +27 -2
- deepeval/integrations/llama_index/handler.py +58 -4
- deepeval/integrations/llama_index/utils.py +24 -0
- deepeval/metrics/__init__.py +5 -0
- deepeval/metrics/exact_match/__init__.py +0 -0
- deepeval/metrics/exact_match/exact_match.py +94 -0
- deepeval/metrics/pattern_match/__init__.py +0 -0
- deepeval/metrics/pattern_match/pattern_match.py +103 -0
- deepeval/metrics/task_completion/task_completion.py +9 -2
- deepeval/model_integrations/__init__.py +0 -0
- deepeval/model_integrations/utils.py +116 -0
- deepeval/models/base_model.py +3 -1
- deepeval/openai/__init__.py +3 -1
- deepeval/openai/extractors.py +2 -2
- deepeval/openai/utils.py +7 -31
- deepeval/prompt/api.py +11 -10
- deepeval/prompt/prompt.py +5 -4
- deepeval/telemetry.py +3 -3
- deepeval/test_case/llm_test_case.py +3 -2
- deepeval/test_run/api.py +3 -2
- deepeval/test_run/cache.py +4 -3
- deepeval/test_run/test_run.py +24 -5
- deepeval/tracing/api.py +11 -10
- deepeval/tracing/otel/exporter.py +11 -0
- deepeval/tracing/patchers.py +102 -1
- deepeval/tracing/trace_context.py +13 -4
- deepeval/tracing/tracing.py +10 -1
- deepeval/tracing/types.py +8 -8
- deepeval/tracing/utils.py +9 -0
- deepeval/utils.py +44 -2
- {deepeval-3.6.9.dist-info → deepeval-3.7.0.dist-info}/METADATA +2 -2
- {deepeval-3.6.9.dist-info → deepeval-3.7.0.dist-info}/RECORD +47 -37
- /deepeval/{openai → model_integrations}/types.py +0 -0
- {deepeval-3.6.9.dist-info → deepeval-3.7.0.dist-info}/LICENSE.md +0 -0
- {deepeval-3.6.9.dist-info → deepeval-3.7.0.dist-info}/WHEEL +0 -0
- {deepeval-3.6.9.dist-info → deepeval-3.7.0.dist-info}/entry_points.txt +0 -0
deepeval/config/settings.py
CHANGED
@@ -9,10 +9,13 @@ Central config for DeepEval.
 type coercion.
 """

+import hashlib
+import json
 import logging
 import math
 import os
 import re
+import threading

 from dotenv import dotenv_values
 from pathlib import Path
@@ -22,6 +25,7 @@ from pydantic import (
     confloat,
     conint,
     field_validator,
+    model_validator,
     SecretStr,
 )
 from pydantic_settings import BaseSettings, SettingsConfigDict
@@ -39,6 +43,13 @@ from deepeval.constants import SUPPORTED_PROVIDER_SLUGS, slugify
 logger = logging.getLogger(__name__)
 _SAVE_RE = re.compile(r"^(?P<scheme>dotenv)(?::(?P<path>.+))?$")

+# settings that were converted to computed fields with override counterparts
+_DEPRECATED_TO_OVERRIDE = {
+    "DEEPEVAL_PER_TASK_TIMEOUT_SECONDS": "DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE",
+    "DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS": "DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS_OVERRIDE",
+    "DEEPEVAL_TASK_GATHER_BUFFER_SECONDS": "DEEPEVAL_TASK_GATHER_BUFFER_SECONDS_OVERRIDE",
+}
+

 def _find_legacy_enum(env_key: str):
     from deepeval.key_handler import (
@@ -690,12 +701,119 @@ class Settings(BaseSettings):
                 "CRITICAL, NOTSET, or a numeric logging level."
             )

+    @field_validator("DEEPEVAL_TELEMETRY_OPT_OUT", mode="before")
+    @classmethod
+    def _apply_telemetry_enabled_alias(cls, v):
+        """
+        Precedence (most secure):
+        - Any OFF signal wins if both are set:
+            - DEEPEVAL_TELEMETRY_OPT_OUT = truthy -> OFF
+            - DEEPEVAL_TELEMETRY_ENABLED = falsy -> OFF
+        - Else, ON signal:
+            - DEEPEVAL_TELEMETRY_OPT_OUT = falsy -> ON
+            - DEEPEVAL_TELEMETRY_ENABLED = truthy -> ON
+        - Else None (unset) -> ON
+        """
+
+        def normalize(x):
+            if x is None:
+                return None
+            s = str(x).strip()
+            return None if s == "" else parse_bool(s, default=False)
+
+        new_opt_out = normalize(v)  # True means OFF, False means ON
+        legacy_enabled = normalize(
+            os.getenv("DEEPEVAL_TELEMETRY_ENABLED")
+        )  # True means ON, False means OFF
+
+        off_signal = (new_opt_out is True) or (legacy_enabled is False)
+        on_signal = (new_opt_out is False) or (legacy_enabled is True)
+
+        # Conflict: simultaneous OFF and ON signals
+        if off_signal and on_signal:
+            # Only warn if verbose or debug
+            if parse_bool(
+                os.getenv("DEEPEVAL_VERBOSE_MODE"), default=False
+            ) or logger.isEnabledFor(logging.DEBUG):
+                logger.warning(
+                    "Conflicting telemetry flags detected: DEEPEVAL_TELEMETRY_OPT_OUT=%r, "
+                    "DEEPEVAL_TELEMETRY_ENABLED=%r. Defaulting to OFF.",
+                    new_opt_out,
+                    legacy_enabled,
+                )
+            return True  # OFF wins
+
+        # Clear winner
+        if off_signal:
+            return True  # OFF
+        if on_signal:
+            return False  # ON
+
+        # Unset means ON
+        return False
+
+    @model_validator(mode="after")
+    def _apply_deprecated_computed_env_aliases(self):
+        """
+        Backwards compatibility courtesy:
+        - If users still set a deprecated computed field in the environment,
+          emit a deprecation warning and mirror its value into the matching
+          *_OVERRIDE field (unless the override is already set).
+        - Override always wins if both are present.
+        """
+        for old_key, override_key in _DEPRECATED_TO_OVERRIDE.items():
+            raw = os.getenv(old_key)
+            if raw is None or str(raw).strip() == "":
+                continue
+
+            # if override already set, ignore the deprecated one but log a warning
+            if getattr(self, override_key) is not None:
+                logger.warning(
+                    "Config deprecation: %s is deprecated and was ignored because %s "
+                    "is already set. Please remove %s and use %s going forward.",
+                    old_key,
+                    override_key,
+                    old_key,
+                    override_key,
+                )
+                continue
+
+            # apply the deprecated value into the override field.
+            try:
+                # let pydantic coerce the string to the target type on assignment
+                setattr(self, override_key, raw)
+                logger.warning(
+                    "Config deprecation: %s is deprecated. Its value (%r) was applied to %s. "
+                    "Please migrate to %s and remove %s from your environment.",
+                    old_key,
+                    raw,
+                    override_key,
+                    override_key,
+                    old_key,
+                )
+            except Exception as e:
+                # do not let exception bubble up, just warn
+                logger.warning(
+                    "Config deprecation: %s is deprecated and could not be applied to %s "
+                    "(value=%r): %s",
+                    old_key,
+                    override_key,
+                    raw,
+                    e,
+                )
+        return self
+
     #######################
     # Persistence support #
     #######################
     class _SettingsEditCtx:
+        # TODO: will generate this list in future PR
         COMPUTED_FIELDS: frozenset[str] = frozenset(
-            {
+            {
+                "DEEPEVAL_PER_TASK_TIMEOUT_SECONDS",
+                "DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS",
+                "DEEPEVAL_TASK_GATHER_BUFFER_SECONDS",
+            }
         )

         def __init__(
@@ -880,16 +998,39 @@ class Settings(BaseSettings):


 _settings_singleton: Optional[Settings] = None
+_settings_env_fingerprint: "str | None" = None
+_settings_lock = threading.RLock()
+
+
+def _calc_env_fingerprint() -> str:
+    env = os.environ.copy()
+    # must hash in a stable order.
+    keys = sorted(
+        key
+        for key in Settings.model_fields.keys()
+        if key != "_DEPRECATED_TELEMETRY_ENABLED"  # exclude deprecated
+    )
+    # encode as triples: (key, present?, value)
+    items = [(k, k in env, env.get(k)) for k in keys]
+    payload = json.dumps(items, ensure_ascii=False, separators=(",", ":"))
+    return hashlib.sha256(payload.encode("utf-8")).hexdigest()


 def get_settings() -> Settings:
-    global _settings_singleton
-
-
-
+    global _settings_singleton, _settings_env_fingerprint
+    fingerprint = _calc_env_fingerprint()
+
+    with _settings_lock:
+        if (
+            _settings_singleton is None
+            or _settings_env_fingerprint != fingerprint
+        ):
+            _settings_singleton = Settings()
+            _settings_env_fingerprint = fingerprint
+            from deepeval.config.logging import apply_deepeval_log_level

-
-
+            apply_deepeval_log_level()
+    return _settings_singleton


 def reset_settings(*, reload_dotenv: bool = False) -> Settings:
@@ -905,8 +1046,10 @@ def reset_settings(*, reload_dotenv: bool = False) -> Settings:
     Returns:
         The fresh Settings instance.
     """
-    global _settings_singleton
-
-
-
+    global _settings_singleton, _settings_env_fingerprint
+    with _settings_lock:
+        if reload_dotenv:
+            autoload_dotenv()
+        _settings_singleton = None
+        _settings_env_fingerprint = None
     return get_settings()
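
Note on the settings.py changes above: get_settings() now caches the Settings singleton against a SHA-256 fingerprint of the Settings-related environment variables, so changing one of those variables produces a rebuilt instance on the next call, and a new validator maps the legacy DEEPEVAL_TELEMETRY_ENABLED flag onto DEEPEVAL_TELEMETRY_OPT_OUT with any OFF signal winning. A minimal sketch of the resulting behaviour (not part of the diff; it assumes DEEPEVAL_TELEMETRY_OPT_OUT is a regular Settings field and was previously unset):

    import os
    from deepeval.config.settings import get_settings

    first = get_settings()                            # builds Settings and records the env fingerprint
    os.environ["DEEPEVAL_TELEMETRY_OPT_OUT"] = "1"    # truthy opt-out is an OFF signal for telemetry
    second = get_settings()                           # fingerprint changed, so a fresh Settings() is built
    assert first is not second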

deepeval/config/settings_manager.py
CHANGED

@@ -4,6 +4,7 @@ dotenv file. Also syncs os.environ, handles unsets, and warns on unknown fields.
 Primary entrypoint: update_settings_and_persist.
 """

+import json
 import logging
 import os

@@ -33,6 +34,9 @@ def _normalize_for_env(val: Any) -> Optional[str]:
         return val.get_secret_value()
     if isinstance(val, bool):
         return bool_to_env_str(val)
+    # encode sequences as JSON so Settings can parse them back reliably.
+    if isinstance(val, (list, tuple, set)):
+        return json.dumps(list(val))
     return str(val)


deepeval/integrations/crewai/handler.py
CHANGED

@@ -23,6 +23,8 @@ try:
     AgentExecutionCompletedEvent,
     ToolUsageStartedEvent,
     ToolUsageFinishedEvent,
+    KnowledgeRetrievalStartedEvent,
+    KnowledgeRetrievalCompletedEvent,
 )

 crewai_installed = True
@@ -69,6 +71,14 @@ class CrewAIEventsListener(BaseEventListener):

         return execution_id

+    @staticmethod
+    def get_knowledge_execution_id(source, event) -> str:
+        source_id = id(source)
+        agent_id = id(event.agent) if hasattr(event, "agent") else "unknown"
+        execution_id = f"_knowledge_{source_id}_{agent_id}"
+
+        return execution_id
+
     def setup_listeners(self, crewai_event_bus):
         @crewai_event_bus.on(CrewKickoffStartedEvent)
         def on_crew_started(source, event: CrewKickoffStartedEvent):
@@ -161,6 +171,32 @@ class CrewAIEventsListener(BaseEventListener):
                     current_span.output = event.output
                 observer.__exit__(None, None, None)

+        @crewai_event_bus.on(KnowledgeRetrievalStartedEvent)
+        def on_knowledge_started(source, event: KnowledgeRetrievalStartedEvent):
+            observer = Observer(
+                span_type="tool",
+                func_name="knowledge_retrieval",
+                function_kwargs={},
+            )
+            self.span_observers[
+                self.get_knowledge_execution_id(source, event)
+            ] = observer
+            observer.__enter__()
+
+        @crewai_event_bus.on(KnowledgeRetrievalCompletedEvent)
+        def on_knowledge_completed(
+            source, event: KnowledgeRetrievalCompletedEvent
+        ):
+            observer = self.span_observers.pop(
+                self.get_knowledge_execution_id(source, event)
+            )
+            if observer:
+                current_span = current_span_context.get()
+                if current_span:
+                    current_span.input = event.query
+                    current_span.output = event.retrieved_knowledge
+                observer.__exit__(None, None, None)
+

 def instrument_crewai(api_key: Optional[str] = None):
     is_crewai_installed()
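
Note: the knowledge-retrieval listeners above follow the same Observer pattern as the existing tool-usage listeners. A hedged usage sketch (instrument_crewai is defined in this handler module; how it is re-exported elsewhere is not shown in this diff):

    from deepeval.integrations.crewai.handler import instrument_crewai

    # assumption: instrument_crewai() wires CrewAIEventsListener into the crewai
    # event bus so agent, tool, and knowledge-retrieval events become deepeval spans
    instrument_crewai()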

deepeval/integrations/langchain/callback.py
CHANGED

@@ -1,12 +1,15 @@
 from typing import Any, Optional, List, Dict
 from uuid import UUID
 from time import perf_counter
-
+
+from deepeval.tracing.context import current_span_context, current_trace_context
+from deepeval.test_case.llm_test_case import ToolCall
 from deepeval.tracing.types import (
     LlmOutput,
     LlmToolCall,
 )
 from deepeval.metrics import BaseMetric
+from deepeval.tracing.utils import prepare_tool_call_input_parameters

 try:
     from langchain_core.callbacks.base import BaseCallbackHandler
@@ -266,12 +269,34 @@ class CallbackHandler(BaseCallbackHandler):
         parent_run_id: Optional[UUID] = None,
         **kwargs: Any,  # un-logged kwargs
     ) -> Any:
-
         uuid_str = str(run_id)
         tool_span: ToolSpan = trace_manager.get_span_by_uuid(uuid_str)
         tool_span.output = output
         exit_current_context(uuid_str=uuid_str)

+        # set the tools called in the parent span as well as on the trace level
+        tool_call = ToolCall(
+            name=tool_span.name,
+            description=tool_span.description,
+            output=output,
+            input_parameters=prepare_tool_call_input_parameters(
+                tool_span.input
+            ),
+        )
+        parent_span = current_span_context.get()
+        if parent_span:
+            if parent_span.tools_called is None:
+                parent_span.tools_called = []
+
+            parent_span.tools_called.append(tool_call)
+
+        trace = current_trace_context.get()
+        if trace:
+            if trace.tools_called is None:
+                trace.tools_called = []
+
+            trace.tools_called.append(tool_call)
+
     def on_tool_error(
         self,
         error: BaseException,

deepeval/integrations/llama_index/handler.py
CHANGED

@@ -2,13 +2,28 @@ from typing import Any, Dict, Optional
 import inspect
 from time import perf_counter
 import uuid
+
+from llama_index.core.agent.workflow.workflow_events import (
+    AgentWorkflowStartEvent,
+)
+from deepeval.integrations.llama_index.utils import (
+    extract_output_from_llm_chat_end_event,
+)
 from deepeval.telemetry import capture_tracing_integration
 from deepeval.tracing import trace_manager
-from deepeval.tracing.types import
+from deepeval.tracing.types import (
+    ToolSpan,
+    AgentSpan,
+    BaseSpan,
+    LlmSpan,
+    TraceSpanStatus,
+)
 from deepeval.tracing.trace_context import (
     current_llm_context,
     current_agent_context,
 )
+from deepeval.test_case import ToolCall
+from deepeval.tracing.utils import make_json_serializable

 try:
     from llama_index.core.instrumentation.events.base import BaseEvent
@@ -89,6 +104,7 @@ class LLamaIndexHandler(BaseEventHandler, BaseSpanHandler):
                 if llm_span_context
                 else None
             ),
+            prompt=llm_span_context.prompt if llm_span_context else None,
         )
         trace_manager.add_span(llm_span)
         trace_manager.add_span_to_trace(llm_span)
@@ -106,7 +122,9 @@ class LLamaIndexHandler(BaseEventHandler, BaseSpanHandler):
             llm_span.status = TraceSpanStatus.SUCCESS
             llm_span.end_time = perf_counter()
             llm_span.input = llm_span.input
-            llm_span.output =
+            llm_span.output = extract_output_from_llm_chat_end_event(
+                event
+            )
             trace_manager.remove_span(llm_span.uuid)
             del self.open_ai_astream_to_llm_span_map[event.span_id]

@@ -151,6 +169,14 @@ class LLamaIndexHandler(BaseEventHandler, BaseSpanHandler):
         # conditions to qualify as agent start run span
         if method_name == "run":
             agent_span_context = current_agent_context.get()
+            start_event = bound_args.arguments.get("start_event")
+
+            if start_event and isinstance(start_event, AgentWorkflowStartEvent):
+                input = start_event.model_dump()
+
+            else:
+                input = bound_args.arguments
+
             span = AgentSpan(
                 uuid=id_,
                 status=TraceSpanStatus.IN_PROGRESS,
@@ -159,7 +185,7 @@ class LLamaIndexHandler(BaseEventHandler, BaseSpanHandler):
                 parent_uuid=parent_span_id,
                 start_time=perf_counter(),
                 name="Agent",  # TODO: decide the name of the span
-                input=
+                input=input,
                 metrics=(
                     agent_span_context.metrics if agent_span_context else None
                 ),
@@ -169,7 +195,17 @@ class LLamaIndexHandler(BaseEventHandler, BaseSpanHandler):
                     else None
                 ),
             )
-
+        elif method_name == "acall":
+            span = ToolSpan(
+                uuid=id_,
+                status=TraceSpanStatus.IN_PROGRESS,
+                children=[],
+                trace_uuid=trace_uuid,
+                parent_uuid=parent_span_id,
+                start_time=perf_counter(),
+                input=bound_args.arguments,
+                name="Tool",
+            )
         # prepare input test case params for the span
         prepare_input_llm_test_case_params(
             class_name, method_name, span, bound_args.arguments
@@ -192,10 +228,28 @@ class LLamaIndexHandler(BaseEventHandler, BaseSpanHandler):
         if base_span is None:
             return None

+        class_name, method_name = parse_id(id_)
+        if method_name == "call_tool":
+            output_json = make_json_serializable(result)
+            if output_json and isinstance(output_json, dict):
+                if base_span.tools_called is None:
+                    base_span.tools_called = []
+                base_span.tools_called.append(
+                    ToolCall(
+                        name=output_json.get("tool_name", "Tool"),
+                        input_parameters=output_json.get("tool_kwargs", {}),
+                        output=output_json.get("tool_output", {}),
+                    )
+                )
         base_span.end_time = perf_counter()
         base_span.status = TraceSpanStatus.SUCCESS
         base_span.output = result

+        if isinstance(base_span, ToolSpan):
+            result_json = make_json_serializable(result)
+            if result_json and isinstance(result_json, dict):
+                base_span.name = result_json.get("tool_name", "Tool")
+
         if base_span.llm_test_case:
             class_name, method_name = parse_id(id_)
             prepare_output_llm_test_case_params(

deepeval/integrations/llama_index/utils.py
CHANGED

@@ -1,3 +1,4 @@
+from llama_index.core.instrumentation.events.llm import LLMChatEndEvent
 from deepeval.test_case.llm_test_case import LLMTestCase, ToolCall
 from deepeval.tracing.types import BaseSpan
 from typing import Any
@@ -81,3 +82,26 @@ def prepare_output_llm_test_case_params(
         )

     span.llm_test_case.tools_called = tool_calls
+
+
+def extract_output_from_llm_chat_end_event(event: LLMChatEndEvent) -> list:
+    messages = []
+    for msg in event.response.message.blocks:
+        if msg.block_type == "text":
+            messages.append(
+                {
+                    "role": event.response.message.role.value,
+                    "content": msg.text,
+                }
+            )
+        elif msg.block_type == "tool_call":
+            messages.append(
+                {
+                    "name": msg.tool_name,
+                    "input_parameters": msg.tool_kwargs,
+                    "id": msg.tool_call_id,
+                }
+            )
+        else:
+            messages.append(msg.model_dump())
+    return messages
deepeval/metrics/__init__.py
CHANGED
@@ -8,6 +8,8 @@ from .base_metric import (
 from .dag.dag import DAGMetric, DeepAcyclicGraph
 from .conversational_dag.conversational_dag import ConversationalDAGMetric
 from .bias.bias import BiasMetric
+from .exact_match.exact_match import ExactMatchMetric
+from .pattern_match.pattern_match import PatternMatchMetric
 from .toxicity.toxicity import ToxicityMetric
 from .pii_leakage.pii_leakage import PIILeakageMetric
 from .non_advice.non_advice import NonAdviceMetric
@@ -69,6 +71,9 @@ __all__ = [
     "BaseConversationalMetric",
     "BaseMultimodalMetric",
     "BaseArenaMetric",
+    # Non-LLM metrics
+    "ExactMatchMetric",
+    "PatternMatchMetric",
     # Core metrics
     "GEval",
     "ArenaGEval",
deepeval/metrics/exact_match/__init__.py
File without changes (new, empty file)
deepeval/metrics/exact_match/exact_match.py
ADDED

@@ -0,0 +1,94 @@
+from typing import List
+
+from deepeval.metrics.indicator import metric_progress_indicator
+from deepeval.metrics.utils import (
+    check_llm_test_case_params,
+    construct_verbose_logs,
+)
+from deepeval.metrics.api import metric_data_manager
+from deepeval.metrics import BaseMetric
+from deepeval.test_case import LLMTestCase, LLMTestCaseParams
+
+
+class ExactMatchMetric(BaseMetric):
+    _required_params: List[LLMTestCaseParams] = [
+        LLMTestCaseParams.INPUT,
+        LLMTestCaseParams.ACTUAL_OUTPUT,
+        LLMTestCaseParams.EXPECTED_OUTPUT,
+    ]
+
+    def __init__(
+        self,
+        threshold: float = 1,
+        verbose_mode: bool = False,
+    ):
+        self.threshold = threshold
+        self.verbose_mode = verbose_mode
+
+    def measure(
+        self,
+        test_case: LLMTestCase,
+        _show_indicator: bool = True,
+        _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
+    ) -> float:
+        check_llm_test_case_params(test_case, self._required_params, self)
+
+        with metric_progress_indicator(
+            self, _show_indicator=_show_indicator, _in_component=_in_component
+        ):
+            expected = test_case.expected_output.strip()
+            actual = test_case.actual_output.strip()
+
+            if expected == actual:
+                self.score = self.precision = self.recall = self.f1 = 1.0
+                self.reason = (
+                    "The actual and expected outputs are exact matches."
+                )
+            else:
+                self.score = self.precision = self.recall = self.f1 = 0.0
+                self.reason = "The actual and expected outputs are different."
+
+            self.success = self.score >= self.threshold
+
+            if self.verbose_mode:
+                self.verbose_logs = construct_verbose_logs(
+                    self,
+                    steps=[
+                        f"Score: {self.score:.2f}",
+                        f"Reason: {self.reason}",
+                    ],
+                )
+
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
+
+            return self.score
+
+    async def a_measure(
+        self,
+        test_case: LLMTestCase,
+        _show_indicator: bool = True,
+        _in_component: bool = False,
+    ) -> float:
+        return self.measure(
+            test_case,
+            _show_indicator=_show_indicator,
+            _in_component=_in_component,
+        )
+
+    def is_successful(self) -> bool:
+        if self.error is not None:
+            self.success = False
+        else:
+            try:
+                self.success = self.score >= self.threshold
+            except:
+                self.success = False
+        return self.success
+
+    @property
+    def __name__(self):
+        return "Exact Match"
deepeval/metrics/pattern_match/__init__.py
File without changes (new, empty file)
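
Note: ExactMatchMetric above is a deterministic, non-LLM metric; it scores 1.0 when the stripped expected_output equals the stripped actual_output and 0.0 otherwise. A small usage sketch based only on the code in this diff:

    from deepeval.metrics import ExactMatchMetric
    from deepeval.test_case import LLMTestCase

    metric = ExactMatchMetric(threshold=1)
    test_case = LLMTestCase(
        input="What is 2 + 2?",
        actual_output="4",
        expected_output="4",
    )
    score = metric.measure(test_case)   # no LLM call; returns 1.0 here
    print(score, metric.reason)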