deepeval 3.6.9__py3-none-any.whl → 3.7.1__py3-none-any.whl
This diff reflects the changes between the two publicly released package versions as they appear in their public registry, and is provided for informational purposes only.
- deepeval/__init__.py +0 -4
- deepeval/_version.py +1 -1
- deepeval/anthropic/__init__.py +19 -0
- deepeval/anthropic/extractors.py +94 -0
- deepeval/anthropic/patch.py +169 -0
- deepeval/anthropic/utils.py +225 -0
- deepeval/benchmarks/drop/drop.py +40 -14
- deepeval/benchmarks/ifeval/ifeval.py +2 -2
- deepeval/cli/main.py +7 -0
- deepeval/confident/api.py +6 -1
- deepeval/confident/types.py +4 -2
- deepeval/config/settings.py +159 -11
- deepeval/config/settings_manager.py +4 -0
- deepeval/evaluate/compare.py +215 -4
- deepeval/evaluate/types.py +6 -0
- deepeval/evaluate/utils.py +30 -0
- deepeval/integrations/crewai/handler.py +36 -0
- deepeval/integrations/langchain/callback.py +27 -2
- deepeval/integrations/llama_index/handler.py +58 -4
- deepeval/integrations/llama_index/utils.py +24 -0
- deepeval/key_handler.py +1 -0
- deepeval/metrics/__init__.py +5 -0
- deepeval/metrics/arena_g_eval/arena_g_eval.py +5 -1
- deepeval/metrics/arena_g_eval/utils.py +5 -5
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +9 -18
- deepeval/metrics/exact_match/__init__.py +0 -0
- deepeval/metrics/exact_match/exact_match.py +94 -0
- deepeval/metrics/g_eval/g_eval.py +5 -1
- deepeval/metrics/g_eval/utils.py +1 -1
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +5 -1
- deepeval/metrics/pattern_match/__init__.py +0 -0
- deepeval/metrics/pattern_match/pattern_match.py +103 -0
- deepeval/metrics/task_completion/task_completion.py +9 -2
- deepeval/metrics/utils.py +1 -1
- deepeval/model_integrations/__init__.py +0 -0
- deepeval/model_integrations/utils.py +116 -0
- deepeval/models/base_model.py +3 -1
- deepeval/models/llms/gemini_model.py +27 -5
- deepeval/openai/__init__.py +3 -1
- deepeval/openai/extractors.py +2 -2
- deepeval/openai/utils.py +7 -31
- deepeval/openai_agents/callback_handler.py +12 -3
- deepeval/prompt/api.py +11 -10
- deepeval/prompt/prompt.py +27 -15
- deepeval/simulator/template.py +1 -1
- deepeval/telemetry.py +3 -3
- deepeval/test_case/__init__.py +2 -1
- deepeval/test_case/arena_test_case.py +15 -4
- deepeval/test_case/llm_test_case.py +3 -2
- deepeval/test_case/mllm_test_case.py +45 -22
- deepeval/test_run/api.py +3 -2
- deepeval/test_run/cache.py +35 -13
- deepeval/test_run/hyperparameters.py +5 -1
- deepeval/test_run/test_run.py +52 -14
- deepeval/tracing/api.py +11 -10
- deepeval/tracing/otel/exporter.py +11 -0
- deepeval/tracing/patchers.py +102 -1
- deepeval/tracing/trace_context.py +13 -4
- deepeval/tracing/tracing.py +11 -2
- deepeval/tracing/types.py +8 -8
- deepeval/tracing/utils.py +9 -0
- deepeval/utils.py +48 -2
- {deepeval-3.6.9.dist-info → deepeval-3.7.1.dist-info}/METADATA +3 -3
- {deepeval-3.6.9.dist-info → deepeval-3.7.1.dist-info}/RECORD +68 -58
- /deepeval/{openai → model_integrations}/types.py +0 -0
- {deepeval-3.6.9.dist-info → deepeval-3.7.1.dist-info}/LICENSE.md +0 -0
- {deepeval-3.6.9.dist-info → deepeval-3.7.1.dist-info}/WHEEL +0 -0
- {deepeval-3.6.9.dist-info → deepeval-3.7.1.dist-info}/entry_points.txt +0 -0
deepeval/confident/types.py
CHANGED
@@ -1,9 +1,11 @@
-from pydantic import BaseModel
+from pydantic import BaseModel
 from typing import Any, Optional
 
+from deepeval.utils import make_model_config
+
 
 class ApiResponse(BaseModel):
-    model_config =
+    model_config = make_model_config(extra="ignore")
 
     success: bool
     data: Optional[Any] = None
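Note: make_model_config itself lives in deepeval/utils.py (changed +48 -2) and its body is not shown in this diff. As a rough sketch of the pattern, under the assumption that the helper simply centralizes construction of a pydantic v2 ConfigDict, the call above is roughly equivalent to:

from typing import Any, Optional

from pydantic import BaseModel, ConfigDict


def make_model_config(**kwargs: Any) -> ConfigDict:
    # Hypothetical stand-in: one place to build model_config so defaults stay
    # consistent across deepeval's pydantic models. The real helper may differ.
    return ConfigDict(**kwargs)


class ApiResponse(BaseModel):
    # Unknown keys returned by the API are silently dropped.
    model_config = make_model_config(extra="ignore")

    success: bool
    data: Optional[Any] = None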
deepeval/config/settings.py
CHANGED
@@ -9,10 +9,13 @@ Central config for DeepEval.
 type coercion.
 """
 
+import hashlib
+import json
 import logging
 import math
 import os
 import re
+import threading
 
 from dotenv import dotenv_values
 from pathlib import Path
@@ -22,6 +25,7 @@ from pydantic import (
     confloat,
     conint,
     field_validator,
+    model_validator,
     SecretStr,
 )
 from pydantic_settings import BaseSettings, SettingsConfigDict
@@ -39,6 +43,13 @@ from deepeval.constants import SUPPORTED_PROVIDER_SLUGS, slugify
 logger = logging.getLogger(__name__)
 _SAVE_RE = re.compile(r"^(?P<scheme>dotenv)(?::(?P<path>.+))?$")
 
+# settings that were converted to computed fields with override counterparts
+_DEPRECATED_TO_OVERRIDE = {
+    "DEEPEVAL_PER_TASK_TIMEOUT_SECONDS": "DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE",
+    "DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS": "DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS_OVERRIDE",
+    "DEEPEVAL_TASK_GATHER_BUFFER_SECONDS": "DEEPEVAL_TASK_GATHER_BUFFER_SECONDS_OVERRIDE",
+}
+
 
 def _find_legacy_enum(env_key: str):
     from deepeval.key_handler import (
@@ -218,6 +229,11 @@ class Settings(BaseSettings):
     API_KEY: Optional[SecretStr] = None
     CONFIDENT_API_KEY: Optional[SecretStr] = None
 
+    # ======
+    # Base URL for Confident AI API server
+    # ======
+    CONFIDENT_BASE_URL: Optional[str] = None
+
     # General
     TEMPERATURE: Optional[confloat(ge=0, le=2)] = None
 
@@ -690,12 +706,119 @@ class Settings(BaseSettings):
             "CRITICAL, NOTSET, or a numeric logging level."
         )
 
+    @field_validator("DEEPEVAL_TELEMETRY_OPT_OUT", mode="before")
+    @classmethod
+    def _apply_telemetry_enabled_alias(cls, v):
+        """
+        Precedence (most secure):
+        - Any OFF signal wins if both are set:
+            - DEEPEVAL_TELEMETRY_OPT_OUT = truthy -> OFF
+            - DEEPEVAL_TELEMETRY_ENABLED = falsy -> OFF
+        - Else, ON signal:
+            - DEEPEVAL_TELEMETRY_OPT_OUT = falsy -> ON
+            - DEEPEVAL_TELEMETRY_ENABLED = truthy -> ON
+        - Else None (unset) -> ON
+        """
+
+        def normalize(x):
+            if x is None:
+                return None
+            s = str(x).strip()
+            return None if s == "" else parse_bool(s, default=False)
+
+        new_opt_out = normalize(v)  # True means OFF, False means ON
+        legacy_enabled = normalize(
+            os.getenv("DEEPEVAL_TELEMETRY_ENABLED")
+        )  # True means ON, False means OFF
+
+        off_signal = (new_opt_out is True) or (legacy_enabled is False)
+        on_signal = (new_opt_out is False) or (legacy_enabled is True)
+
+        # Conflict: simultaneous OFF and ON signals
+        if off_signal and on_signal:
+            # Only warn if verbose or debug
+            if parse_bool(
+                os.getenv("DEEPEVAL_VERBOSE_MODE"), default=False
+            ) or logger.isEnabledFor(logging.DEBUG):
+                logger.warning(
+                    "Conflicting telemetry flags detected: DEEPEVAL_TELEMETRY_OPT_OUT=%r, "
+                    "DEEPEVAL_TELEMETRY_ENABLED=%r. Defaulting to OFF.",
+                    new_opt_out,
+                    legacy_enabled,
+                )
+            return True  # OFF wins
+
+        # Clear winner
+        if off_signal:
+            return True  # OFF
+        if on_signal:
+            return False  # ON
+
+        # Unset means ON
+        return False
+
+    @model_validator(mode="after")
+    def _apply_deprecated_computed_env_aliases(self):
+        """
+        Backwards compatibility courtesy:
+        - If users still set a deprecated computed field in the environment,
+          emit a deprecation warning and mirror its value into the matching
+          *_OVERRIDE field (unless the override is already set).
+        - Override always wins if both are present.
+        """
+        for old_key, override_key in _DEPRECATED_TO_OVERRIDE.items():
+            raw = os.getenv(old_key)
+            if raw is None or str(raw).strip() == "":
+                continue
+
+            # if override already set, ignore the deprecated one but log a warning
+            if getattr(self, override_key) is not None:
+                logger.warning(
+                    "Config deprecation: %s is deprecated and was ignored because %s "
+                    "is already set. Please remove %s and use %s going forward.",
+                    old_key,
+                    override_key,
+                    old_key,
+                    override_key,
+                )
+                continue
+
+            # apply the deprecated value into the override field.
+            try:
+                # let pydantic coerce the string to the target type on assignment
+                setattr(self, override_key, raw)
+                logger.warning(
+                    "Config deprecation: %s is deprecated. Its value (%r) was applied to %s. "
+                    "Please migrate to %s and remove %s from your environment.",
+                    old_key,
+                    raw,
+                    override_key,
+                    override_key,
+                    old_key,
+                )
+            except Exception as e:
+                # do not let exception bubble up, just warn
+                logger.warning(
+                    "Config deprecation: %s is deprecated and could not be applied to %s "
+                    "(value=%r): %s",
+                    old_key,
+                    override_key,
+                    raw,
+                    e,
+                )
+        return self
+
     #######################
     # Persistence support #
     #######################
     class _SettingsEditCtx:
+        # TODO: will generate this list in future PR
         COMPUTED_FIELDS: frozenset[str] = frozenset(
-            {
+            {
+                "DEEPEVAL_PER_TASK_TIMEOUT_SECONDS",
+                "DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS",
+                "DEEPEVAL_TASK_GATHER_BUFFER_SECONDS",
+            }
         )
 
         def __init__(
@@ -880,16 +1003,39 @@
 
 
 _settings_singleton: Optional[Settings] = None
+_settings_env_fingerprint: "str | None" = None
+_settings_lock = threading.RLock()
+
+
+def _calc_env_fingerprint() -> str:
+    env = os.environ.copy()
+    # must hash in a stable order.
+    keys = sorted(
+        key
+        for key in Settings.model_fields.keys()
+        if key != "_DEPRECATED_TELEMETRY_ENABLED"  # exclude deprecated
+    )
+    # encode as triples: (key, present?, value)
+    items = [(k, k in env, env.get(k)) for k in keys]
+    payload = json.dumps(items, ensure_ascii=False, separators=(",", ":"))
+    return hashlib.sha256(payload.encode("utf-8")).hexdigest()
 
 
 def get_settings() -> Settings:
-    global _settings_singleton
-
-
-
+    global _settings_singleton, _settings_env_fingerprint
+    fingerprint = _calc_env_fingerprint()
+
+    with _settings_lock:
+        if (
+            _settings_singleton is None
+            or _settings_env_fingerprint != fingerprint
+        ):
+            _settings_singleton = Settings()
+            _settings_env_fingerprint = fingerprint
+            from deepeval.config.logging import apply_deepeval_log_level
 
-
-
+            apply_deepeval_log_level()
+    return _settings_singleton
 
 
 def reset_settings(*, reload_dotenv: bool = False) -> Settings:
@@ -905,8 +1051,10 @@ def reset_settings(*, reload_dotenv: bool = False) -> Settings:
     Returns:
         The fresh Settings instance.
     """
-    global _settings_singleton
-
-
-
+    global _settings_singleton, _settings_env_fingerprint
+    with _settings_lock:
+        if reload_dotenv:
+            autoload_dotenv()
+        _settings_singleton = None
+        _settings_env_fingerprint = None
     return get_settings()
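A short usage sketch of the new caching behavior in get_settings(): the singleton is now guarded by a lock and tied to a fingerprint of the Settings-related environment, so mutating one of those environment variables makes the next call rebuild the instance. This only exercises what the hunks above show; which fields feed the fingerprint is an internal detail.

import os

from deepeval.config.settings import get_settings, reset_settings

first = get_settings()
assert get_settings() is first  # unchanged environment -> cached instance is reused

# Changing a Settings-backed variable changes the environment fingerprint,
# so the next get_settings() call constructs a fresh Settings object.
os.environ["CONFIDENT_BASE_URL"] = "https://confident.example.internal"
assert get_settings() is not first

# reset_settings() clears both the singleton and the stored fingerprint
# (and can optionally re-read the dotenv file first).
reset_settings()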
deepeval/config/settings_manager.py
CHANGED
@@ -4,6 +4,7 @@ dotenv file. Also syncs os.environ, handles unsets, and warns on unknown fields.
 Primary entrypoint: update_settings_and_persist.
 """
 
+import json
 import logging
 import os
 
@@ -33,6 +34,9 @@ def _normalize_for_env(val: Any) -> Optional[str]:
         return val.get_secret_value()
     if isinstance(val, bool):
         return bool_to_env_str(val)
+    # encode sequences as JSON so Settings can parse them back reliably.
+    if isinstance(val, (list, tuple, set)):
+        return json.dumps(list(val))
    return str(val)
 
 
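The new branch in _normalize_for_env persists list-like values as JSON instead of Python's repr, which keeps the dotenv round trip lossless. A minimal stdlib-only illustration of that round trip:

import json

# What gets written to the dotenv file for a list-valued setting ...
persisted = json.dumps(list(("alpha", "beta")))
assert persisted == '["alpha", "beta"]'

# ... and what a JSON-aware reader recovers when loading it back.
assert json.loads(persisted) == ["alpha", "beta"]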
deepeval/evaluate/compare.py
CHANGED
@@ -1,5 +1,6 @@
 from typing import Optional, List, Dict, Callable
 import asyncio
+import time
 from rich.progress import (
     Progress,
     TextColumn,
@@ -8,24 +9,74 @@ from rich.progress import (
     TaskProgressColumn,
 )
 from collections import Counter
+import json
 
 from deepeval.errors import MissingTestCaseParamsError
 from deepeval.evaluate.configs import AsyncConfig, DisplayConfig, ErrorConfig
-from deepeval.test_case import ArenaTestCase
+from deepeval.test_case import ArenaTestCase, Contestant
+from deepeval.test_case.api import create_api_test_case
 from deepeval.metrics import ArenaGEval
-from deepeval.utils import
-
+from deepeval.utils import (
+    add_pbar,
+    update_pbar,
+    custom_console,
+    get_or_create_event_loop,
+    open_browser,
+)
+from deepeval.test_run.test_run import (
+    TestRun,
+    MetricData,
+    TestRunEncoder,
+    MetricScores,
+    console,
+)
+from deepeval.test_run.hyperparameters import (
+    process_hyperparameters,
+)
+from deepeval.confident.api import Api, Endpoints, HttpMethods, is_confident
 from deepeval.telemetry import capture_evaluation_run
+from deepeval.test_run.api import LLMApiTestCase
+from deepeval.evaluate.utils import create_arena_metric_data
+from deepeval.evaluate.types import PostExperimentRequest
 
 
 def compare(
     test_cases: List[ArenaTestCase],
     metric: ArenaGEval,
+    name: str = "compare()",
     # Configs
     async_config: Optional[AsyncConfig] = AsyncConfig(),
     display_config: Optional[DisplayConfig] = DisplayConfig(),
     error_config: Optional[ErrorConfig] = ErrorConfig(),
 ) -> Dict[str, int]:
+
+    # Prepare test run map
+    unique_contestant_names = set(
+        [
+            contestant.name
+            for test_case in test_cases
+            for contestant in test_case.contestants
+        ]
+    )
+    test_run_map: Dict[str, TestRun] = {}
+    for contestant_name in unique_contestant_names:
+        test_run = TestRun(
+            identifier=contestant_name,
+            test_passed=0,
+            test_failed=0,
+        )
+        test_run.metrics_scores = [
+            MetricScores(
+                metric=metric.name,
+                scores=[],
+                passes=0,
+                fails=0,
+                errors=0,
+            )
+        ]
+        test_run_map[contestant_name] = test_run
+
+    start_time = time.time()
     with capture_evaluation_run("compare()"):
         if async_config.run_async:
             loop = get_or_create_event_loop()
@@ -39,6 +90,7 @@ def compare(
                     throttle_value=async_config.throttle_value,
                     max_concurrent=async_config.max_concurrent,
                     skip_on_missing_params=error_config.skip_on_missing_params,
+                    test_run_map=test_run_map,
                 )
             )
         else:
@@ -49,7 +101,10 @@
                 verbose_mode=display_config.verbose_mode,
                 show_indicator=display_config.show_indicator,
                 skip_on_missing_params=error_config.skip_on_missing_params,
+                test_run_map=test_run_map,
             )
+    end_time = time.time()
+    run_duration = end_time - start_time
 
     # Aggregate winners
     winner_counts = Counter()
@@ -57,7 +112,13 @@
         if winner:
             winner_counts[winner] += 1
 
-
+    process_test_runs(test_run_map=test_run_map, test_cases=test_cases)
+    wrap_up_experiment(
+        name=name,
+        test_runs=list(test_run_map.values()),
+        winner_counts=winner_counts,
+        run_duration=run_duration,
+    )
     return dict(winner_counts)
 
 
@@ -70,6 +131,7 @@ async def a_execute_arena_test_cases(
     throttle_value: int,
     skip_on_missing_params: bool,
     max_concurrent: int,
+    test_run_map: Dict[str, TestRun],
 ) -> List[str]:
     semaphore = asyncio.Semaphore(max_concurrent)
 
@@ -104,6 +166,8 @@ async def a_execute_arena_test_cases(
                 else metric.verbose_mode
             ),
         )
+
+        start_time = time.perf_counter()
         winner = await _a_handle_metric_measurement(
             metric=metric_copy,
             test_case=test_case,
@@ -112,10 +176,21 @@
             _progress=progress,
            _pbar_id=pbar_test_case_id,
         )
+        end_time = time.perf_counter()
+        run_duration = end_time - start_time
+
         if winner:
             winners.append(winner)
 
         update_pbar(progress, pbar_id)
+        update_test_run_map(
+            test_case=test_case,
+            index=index,
+            test_run_map=test_run_map,
+            metric_copy=metric_copy,
+            winner=winner,
+            run_duration=run_duration,
+        )
 
     # Create tasks for all test cases
     if show_indicator:
@@ -156,6 +231,7 @@ def execute_arena_test_cases(
     skip_on_missing_params: bool,
     show_indicator: bool,
     verbose_mode: Optional[bool] = None,
+    test_run_map: Optional[Dict[str, TestRun]] = None,
 ) -> List[str]:
     """
     Non-async version of comparing arena test cases.
@@ -183,6 +259,8 @@
                 else metric.verbose_mode
             ),
         )
+
+        start_time = time.perf_counter()
         winner = _handle_metric_measurement(
             metric=metric_copy,
             test_case=test_case,
@@ -191,10 +269,21 @@
             _progress=progress,
             _pbar_id=pbar_test_case_id,
         )
+        end_time = time.perf_counter()
+        run_duration = end_time - start_time
+
         if winner:
             winners.append(winner)
 
         update_pbar(progress, pbar_id)
+        update_test_run_map(
+            test_case=test_case,
+            index=i,
+            test_run_map=test_run_map,
+            metric_copy=metric_copy,
+            winner=winner,
+            run_duration=run_duration,
+        )
 
     if show_indicator:
         progress = Progress(
@@ -313,3 +402,125 @@ async def _a_handle_metric_measurement(
             return None
         else:
             raise
+
+
+def update_test_run_map(
+    test_case: ArenaTestCase,
+    index: int,
+    test_run_map: Dict[str, TestRun],
+    metric_copy: ArenaGEval,
+    winner: str,
+    run_duration: float,
+):
+    for contestant in test_case.contestants:
+        test_run = test_run_map.get(contestant.name)
+
+        # update test cases in test run
+        api_test_case: LLMApiTestCase = create_api_test_case(
+            test_case=contestant.test_case, index=index
+        )
+        metric_data: MetricData = create_arena_metric_data(
+            metric_copy, contestant.name
+        )
+        api_test_case.update_metric_data(metric_data)
+        api_test_case.update_run_duration(run_duration)
+        test_run.add_test_case(api_test_case)
+
+        # update other test run attributes
+        if test_run.run_duration is None:
+            test_run.run_duration = 0.0
+        test_run.run_duration += run_duration
+
+        # Ensure test_passed and test_failed are initialized
+        if test_run.test_passed is None:
+            test_run.test_passed = 0
+        if test_run.test_failed is None:
+            test_run.test_failed = 0
+
+        if winner == contestant:
+            test_run.test_passed += 1
+        else:
+            test_run.test_failed += 1
+
+        # update metric scores
+        test_run.metrics_scores[0].metric = metric_copy.name
+        test_run.metrics_scores[0].scores.append(
+            1 if winner == contestant else 0
+        )
+        test_run.metrics_scores[0].passes += 1 if winner == contestant else 0
+        test_run.metrics_scores[0].fails += 1 if winner != contestant else 0
+        test_run.metrics_scores[0].errors += 0
+
+
+def process_test_runs(
+    test_run_map: Dict[str, TestRun],
+    test_cases: List[ArenaTestCase],
+):
+    hyperparameters_map = {
+        contestant_name: {} for contestant_name in test_run_map.keys()
+    }
+
+    for test_case in test_cases:
+        for contestant in test_case.contestants:
+            if contestant.hyperparameters:
+                hyperparameters_map[contestant.name].update(
+                    contestant.hyperparameters
+                )
+
+    for contestant_name, hyperparameters in hyperparameters_map.items():
+        test_run = test_run_map.get(contestant_name)
+        test_run.hyperparameters = process_hyperparameters(hyperparameters)
+
+
+def wrap_up_experiment(
+    name: str,
+    test_runs: List[TestRun],
+    winner_counts: Counter,
+    run_duration: float,
+):
+    winner_breakdown = []
+    for contestant, wins in winner_counts.most_common():
+        winner_breakdown.append(
+            f" » [bold green]{contestant}[/bold green]: {wins} wins"
+        )
+    winner_text = (
+        "\n".join(winner_breakdown) if winner_breakdown else "No winners"
+    )
+    console.print(
+        f"\n🎉 Arena completed! (time taken: {round(run_duration, 2)}s | token cost: {test_runs[0].evaluation_cost if test_runs else 0} USD)\n"
+        f"🏆 Results ({sum(winner_counts.values())} total test cases):\n"
+        f"{winner_text}\n\n"
+    )
+
+    if not is_confident():
+        console.print(
+            f"{'=' * 80}\n"
+            f"\n» Want to share experiments with your team? ❤️ 🏟️\n"
+            f" » Run [bold]'deepeval login'[/bold] to analyze and save arena results on [rgb(106,0,255)]Confident AI[/rgb(106,0,255)].\n\n"
+        )
+        return
+
+    try:
+        api = Api()
+        experiment_request = PostExperimentRequest(testRuns=test_runs, name=name)
+
+        try:
+            body = experiment_request.model_dump(by_alias=True, exclude_none=True)
+        except AttributeError:
+            body = experiment_request.dict(by_alias=True, exclude_none=True)
+        json_str = json.dumps(body, cls=TestRunEncoder)
+        body = json.loads(json_str)
+
+        _, link = api.send_request(
+            method=HttpMethods.POST,
+            endpoint=Endpoints.EXPERIMENT_ENDPOINT,
+            body=body,
+        )
+        console.print(
+            "[rgb(5,245,141)]✓[/rgb(5,245,141)] Done 🎉! View results on "
+            f"[link={link}]{link}[/link]"
+        )
+        open_browser(link)
+
+    except Exception:
+        raise
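Putting the new compare() plumbing together, a rough usage sketch might look like the following. The attribute accesses come from the hunks above (contestant.name, contestant.test_case, contestant.hyperparameters); the exact constructor signatures of ArenaTestCase, Contestant, and ArenaGEval are assumptions rather than something this diff confirms.

from deepeval.evaluate.compare import compare
from deepeval.metrics import ArenaGEval
from deepeval.test_case import (
    ArenaTestCase,
    Contestant,
    LLMTestCase,
    LLMTestCaseParams,
)

metric = ArenaGEval(
    name="Helpfulness",
    criteria="Choose the contestant whose actual output best answers the input.",
    evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT],
)

test_cases = [
    ArenaTestCase(
        contestants=[
            Contestant(
                name="model-a",
                test_case=LLMTestCase(input="What is DeepEval?", actual_output="..."),
                hyperparameters={"model": "model-a"},
            ),
            Contestant(
                name="model-b",
                test_case=LLMTestCase(input="What is DeepEval?", actual_output="..."),
                hyperparameters={"model": "model-b"},
            ),
        ]
    )
]

# `name` labels the experiment that wrap_up_experiment() posts to Confident AI
# when a Confident API key is configured; winner counts are returned either way.
winner_counts = compare(test_cases=test_cases, metric=metric, name="arena-demo")
print(winner_counts)  # e.g. {"model-a": 1}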
deepeval/evaluate/types.py
CHANGED
@@ -4,6 +4,7 @@ from pydantic import BaseModel
 
 from deepeval.test_run.api import MetricData, TurnApi
 from deepeval.test_case import MLLMImage
+from deepeval.test_run import TestRun
 
 
 @dataclass
@@ -29,3 +30,8 @@ class EvaluationResult(BaseModel):
     test_results: List[TestResult]
     confident_link: Optional[str]
     test_run_id: Optional[str]
+
+
+class PostExperimentRequest(BaseModel):
+    testRuns: List[TestRun]
+    name: Optional[str]
deepeval/evaluate/utils.py
CHANGED
@@ -8,6 +8,7 @@ from deepeval.utils import format_turn
 from deepeval.test_run.test_run import TestRunResultDisplay
 from deepeval.dataset import Golden
 from deepeval.metrics import (
+    ArenaGEval,
     BaseMetric,
     BaseConversationalMetric,
     BaseMultimodalMetric,
@@ -84,6 +85,35 @@ def create_metric_data(metric: BaseMetric) -> MetricData:
     )
 
 
+def create_arena_metric_data(metric: ArenaGEval, contestant: str) -> MetricData:
+    if metric.error is not None:
+        return MetricData(
+            name=metric.__name__,
+            threshold=1,
+            score=None,
+            reason=None,
+            success=False,
+            strictMode=True,
+            evaluationModel=metric.evaluation_model,
+            error=metric.error,
+            evaluationCost=metric.evaluation_cost,
+            verboseLogs=metric.verbose_logs,
+        )
+    else:
+        return MetricData(
+            name=metric.__name__,
+            score=1 if contestant == metric.winner else 0,
+            threshold=1,
+            reason=metric.reason,
+            success=metric.is_successful(),
+            strictMode=True,
+            evaluationModel=metric.evaluation_model,
+            error=None,
+            evaluationCost=metric.evaluation_cost,
+            verboseLogs=metric.verbose_logs,
+        )
+
+
 def create_test_result(
     api_test_case: Union[LLMApiTestCase, ConversationalApiTestCase],
 ) -> TestResult:
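The scoring rule behind create_arena_metric_data is binary: every contestant's MetricData is scored against a threshold of 1, so only the winner's entry passes. Restated in plain Python:

def arena_score(contestant: str, winner: str) -> int:
    # Mirrors the non-error branch above: 1 for the winning contestant, 0 otherwise.
    return 1 if contestant == winner else 0


assert arena_score("model-a", winner="model-a") == 1
assert arena_score("model-b", winner="model-a") == 0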
deepeval/integrations/crewai/handler.py
CHANGED
@@ -23,6 +23,8 @@ try:
         AgentExecutionCompletedEvent,
         ToolUsageStartedEvent,
         ToolUsageFinishedEvent,
+        KnowledgeRetrievalStartedEvent,
+        KnowledgeRetrievalCompletedEvent,
     )
 
     crewai_installed = True
@@ -69,6 +71,14 @@ class CrewAIEventsListener(BaseEventListener):
 
         return execution_id
 
+    @staticmethod
+    def get_knowledge_execution_id(source, event) -> str:
+        source_id = id(source)
+        agent_id = id(event.agent) if hasattr(event, "agent") else "unknown"
+        execution_id = f"_knowledge_{source_id}_{agent_id}"
+
+        return execution_id
+
     def setup_listeners(self, crewai_event_bus):
         @crewai_event_bus.on(CrewKickoffStartedEvent)
         def on_crew_started(source, event: CrewKickoffStartedEvent):
@@ -161,6 +171,32 @@ class CrewAIEventsListener(BaseEventListener):
                     current_span.output = event.output
                 observer.__exit__(None, None, None)
 
+        @crewai_event_bus.on(KnowledgeRetrievalStartedEvent)
+        def on_knowledge_started(source, event: KnowledgeRetrievalStartedEvent):
+            observer = Observer(
+                span_type="tool",
+                func_name="knowledge_retrieval",
+                function_kwargs={},
+            )
+            self.span_observers[
+                self.get_knowledge_execution_id(source, event)
+            ] = observer
+            observer.__enter__()
+
+        @crewai_event_bus.on(KnowledgeRetrievalCompletedEvent)
+        def on_knowledge_completed(
+            source, event: KnowledgeRetrievalCompletedEvent
+        ):
+            observer = self.span_observers.pop(
+                self.get_knowledge_execution_id(source, event)
+            )
+            if observer:
+                current_span = current_span_context.get()
+                if current_span:
+                    current_span.input = event.query
+                    current_span.output = event.retrieved_knowledge
+                observer.__exit__(None, None, None)
+
 
 def instrument_crewai(api_key: Optional[str] = None):
     is_crewai_installed()