deepeval 3.6.9__py3-none-any.whl → 3.7.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68)
  1. deepeval/__init__.py +0 -4
  2. deepeval/_version.py +1 -1
  3. deepeval/anthropic/__init__.py +19 -0
  4. deepeval/anthropic/extractors.py +94 -0
  5. deepeval/anthropic/patch.py +169 -0
  6. deepeval/anthropic/utils.py +225 -0
  7. deepeval/benchmarks/drop/drop.py +40 -14
  8. deepeval/benchmarks/ifeval/ifeval.py +2 -2
  9. deepeval/cli/main.py +7 -0
  10. deepeval/confident/api.py +6 -1
  11. deepeval/confident/types.py +4 -2
  12. deepeval/config/settings.py +159 -11
  13. deepeval/config/settings_manager.py +4 -0
  14. deepeval/evaluate/compare.py +215 -4
  15. deepeval/evaluate/types.py +6 -0
  16. deepeval/evaluate/utils.py +30 -0
  17. deepeval/integrations/crewai/handler.py +36 -0
  18. deepeval/integrations/langchain/callback.py +27 -2
  19. deepeval/integrations/llama_index/handler.py +58 -4
  20. deepeval/integrations/llama_index/utils.py +24 -0
  21. deepeval/key_handler.py +1 -0
  22. deepeval/metrics/__init__.py +5 -0
  23. deepeval/metrics/arena_g_eval/arena_g_eval.py +5 -1
  24. deepeval/metrics/arena_g_eval/utils.py +5 -5
  25. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +9 -18
  26. deepeval/metrics/exact_match/__init__.py +0 -0
  27. deepeval/metrics/exact_match/exact_match.py +94 -0
  28. deepeval/metrics/g_eval/g_eval.py +5 -1
  29. deepeval/metrics/g_eval/utils.py +1 -1
  30. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +5 -1
  31. deepeval/metrics/pattern_match/__init__.py +0 -0
  32. deepeval/metrics/pattern_match/pattern_match.py +103 -0
  33. deepeval/metrics/task_completion/task_completion.py +9 -2
  34. deepeval/metrics/utils.py +1 -1
  35. deepeval/model_integrations/__init__.py +0 -0
  36. deepeval/model_integrations/utils.py +116 -0
  37. deepeval/models/base_model.py +3 -1
  38. deepeval/models/llms/gemini_model.py +27 -5
  39. deepeval/openai/__init__.py +3 -1
  40. deepeval/openai/extractors.py +2 -2
  41. deepeval/openai/utils.py +7 -31
  42. deepeval/openai_agents/callback_handler.py +12 -3
  43. deepeval/prompt/api.py +11 -10
  44. deepeval/prompt/prompt.py +27 -15
  45. deepeval/simulator/template.py +1 -1
  46. deepeval/telemetry.py +3 -3
  47. deepeval/test_case/__init__.py +2 -1
  48. deepeval/test_case/arena_test_case.py +15 -4
  49. deepeval/test_case/llm_test_case.py +3 -2
  50. deepeval/test_case/mllm_test_case.py +45 -22
  51. deepeval/test_run/api.py +3 -2
  52. deepeval/test_run/cache.py +35 -13
  53. deepeval/test_run/hyperparameters.py +5 -1
  54. deepeval/test_run/test_run.py +52 -14
  55. deepeval/tracing/api.py +11 -10
  56. deepeval/tracing/otel/exporter.py +11 -0
  57. deepeval/tracing/patchers.py +102 -1
  58. deepeval/tracing/trace_context.py +13 -4
  59. deepeval/tracing/tracing.py +11 -2
  60. deepeval/tracing/types.py +8 -8
  61. deepeval/tracing/utils.py +9 -0
  62. deepeval/utils.py +48 -2
  63. {deepeval-3.6.9.dist-info → deepeval-3.7.1.dist-info}/METADATA +3 -3
  64. {deepeval-3.6.9.dist-info → deepeval-3.7.1.dist-info}/RECORD +68 -58
  65. /deepeval/{openai → model_integrations}/types.py +0 -0
  66. {deepeval-3.6.9.dist-info → deepeval-3.7.1.dist-info}/LICENSE.md +0 -0
  67. {deepeval-3.6.9.dist-info → deepeval-3.7.1.dist-info}/WHEEL +0 -0
  68. {deepeval-3.6.9.dist-info → deepeval-3.7.1.dist-info}/entry_points.txt +0 -0
@@ -1,9 +1,11 @@
-from pydantic import BaseModel, ConfigDict
+from pydantic import BaseModel
 from typing import Any, Optional
 
+from deepeval.utils import make_model_config
+
 
 class ApiResponse(BaseModel):
-    model_config = ConfigDict(extra="ignore")
+    model_config = make_model_config(extra="ignore")
 
     success: bool
     data: Optional[Any] = None
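The hunk above (apparently from deepeval/confident/types.py, per the file list) swaps pydantic's ConfigDict for a new make_model_config helper that now lives in deepeval/utils.py. The helper's implementation is not shown in this diff; the following is only a speculative sketch of what such a wrapper could look like:

from pydantic import ConfigDict

def make_model_config(**overrides) -> ConfigDict:
    # Speculative sketch -- the real helper ships in deepeval/utils.py and is not
    # shown here; the idea is to centralize shared ConfigDict defaults in one place.
    config = ConfigDict(extra="ignore")
    config.update(overrides)  # ConfigDict is a TypedDict, so plain dict.update works
    return config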
@@ -9,10 +9,13 @@ Central config for DeepEval.
 type coercion.
 """
 
+import hashlib
+import json
 import logging
 import math
 import os
 import re
+import threading
 
 from dotenv import dotenv_values
 from pathlib import Path
@@ -22,6 +25,7 @@ from pydantic import (
     confloat,
     conint,
     field_validator,
+    model_validator,
     SecretStr,
 )
 from pydantic_settings import BaseSettings, SettingsConfigDict
@@ -39,6 +43,13 @@ from deepeval.constants import SUPPORTED_PROVIDER_SLUGS, slugify
 logger = logging.getLogger(__name__)
 _SAVE_RE = re.compile(r"^(?P<scheme>dotenv)(?::(?P<path>.+))?$")
 
+# settings that were converted to computed fields with override counterparts
+_DEPRECATED_TO_OVERRIDE = {
+    "DEEPEVAL_PER_TASK_TIMEOUT_SECONDS": "DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE",
+    "DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS": "DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS_OVERRIDE",
+    "DEEPEVAL_TASK_GATHER_BUFFER_SECONDS": "DEEPEVAL_TASK_GATHER_BUFFER_SECONDS_OVERRIDE",
+}
+
 
 def _find_legacy_enum(env_key: str):
     from deepeval.key_handler import (
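These three timeout settings are now computed fields with *_OVERRIDE counterparts (see the model validator further down in this file's diff). An illustrative migration, expressed as environment assignments (the values are made up):

import os

# Deprecated spelling: still read, and mirrored into the override with a deprecation warning.
os.environ["DEEPEVAL_PER_TASK_TIMEOUT_SECONDS"] = "600"

# Preferred spelling going forward; when both are set, the override wins and the
# deprecated value is ignored (again with a warning).
os.environ["DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE"] = "900"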
@@ -218,6 +229,11 @@ class Settings(BaseSettings):
     API_KEY: Optional[SecretStr] = None
     CONFIDENT_API_KEY: Optional[SecretStr] = None
 
+    # ======
+    # Base URL for Confident AI API server
+    # ======
+    CONFIDENT_BASE_URL: Optional[str] = None
+
     # General
     TEMPERATURE: Optional[confloat(ge=0, le=2)] = None
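Illustration of the new CONFIDENT_BASE_URL field: since Settings is a pydantic BaseSettings, the value can be supplied through the environment (the URL below is a placeholder, not a real endpoint):

import os
from deepeval.config.settings import get_settings

os.environ["CONFIDENT_BASE_URL"] = "https://confident.example.internal"  # placeholder
print(get_settings().CONFIDENT_BASE_URL)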
@@ -690,12 +706,119 @@
                 "CRITICAL, NOTSET, or a numeric logging level."
             )
 
+    @field_validator("DEEPEVAL_TELEMETRY_OPT_OUT", mode="before")
+    @classmethod
+    def _apply_telemetry_enabled_alias(cls, v):
+        """
+        Precedence (most secure):
+        - Any OFF signal wins if both are set:
+            - DEEPEVAL_TELEMETRY_OPT_OUT = truthy -> OFF
+            - DEEPEVAL_TELEMETRY_ENABLED = falsy -> OFF
+        - Else, ON signal:
+            - DEEPEVAL_TELEMETRY_OPT_OUT = falsy -> ON
+            - DEEPEVAL_TELEMETRY_ENABLED = truthy -> ON
+        - Else None (unset) -> ON
+        """
+
+        def normalize(x):
+            if x is None:
+                return None
+            s = str(x).strip()
+            return None if s == "" else parse_bool(s, default=False)
+
+        new_opt_out = normalize(v)  # True means OFF, False means ON
+        legacy_enabled = normalize(
+            os.getenv("DEEPEVAL_TELEMETRY_ENABLED")
+        )  # True means ON, False means OFF
+
+        off_signal = (new_opt_out is True) or (legacy_enabled is False)
+        on_signal = (new_opt_out is False) or (legacy_enabled is True)
+
+        # Conflict: simultaneous OFF and ON signals
+        if off_signal and on_signal:
+            # Only warn if verbose or debug
+            if parse_bool(
+                os.getenv("DEEPEVAL_VERBOSE_MODE"), default=False
+            ) or logger.isEnabledFor(logging.DEBUG):
+                logger.warning(
+                    "Conflicting telemetry flags detected: DEEPEVAL_TELEMETRY_OPT_OUT=%r, "
+                    "DEEPEVAL_TELEMETRY_ENABLED=%r. Defaulting to OFF.",
+                    new_opt_out,
+                    legacy_enabled,
+                )
+            return True  # OFF wins
+
+        # Clear winner
+        if off_signal:
+            return True  # OFF
+        if on_signal:
+            return False  # ON
+
+        # Unset means ON
+        return False
+
+    @model_validator(mode="after")
+    def _apply_deprecated_computed_env_aliases(self):
+        """
+        Backwards compatibility courtesy:
+        - If users still set a deprecated computed field in the environment,
+          emit a deprecation warning and mirror its value into the matching
+          *_OVERRIDE field (unless the override is already set).
+        - Override always wins if both are present.
+        """
+        for old_key, override_key in _DEPRECATED_TO_OVERRIDE.items():
+            raw = os.getenv(old_key)
+            if raw is None or str(raw).strip() == "":
+                continue
+
+            # if override already set, ignore the deprecated one but log a warning
+            if getattr(self, override_key) is not None:
+                logger.warning(
+                    "Config deprecation: %s is deprecated and was ignored because %s "
+                    "is already set. Please remove %s and use %s going forward.",
+                    old_key,
+                    override_key,
+                    old_key,
+                    override_key,
+                )
+                continue
+
+            # apply the deprecated value into the override field.
+            try:
+                # let pydantic coerce the string to the target type on assignment
+                setattr(self, override_key, raw)
+                logger.warning(
+                    "Config deprecation: %s is deprecated. Its value (%r) was applied to %s. "
+                    "Please migrate to %s and remove %s from your environment.",
+                    old_key,
+                    raw,
+                    override_key,
+                    override_key,
+                    old_key,
+                )
+            except Exception as e:
+                # do not let exception bubble up, just warn
+                logger.warning(
+                    "Config deprecation: %s is deprecated and could not be applied to %s "
+                    "(value=%r): %s",
+                    old_key,
+                    override_key,
+                    raw,
+                    e,
+                )
+        return self
+
     #######################
     # Persistence support #
     #######################
     class _SettingsEditCtx:
+        # TODO: will generate this list in future PR
         COMPUTED_FIELDS: frozenset[str] = frozenset(
-            {"DEEPEVAL_PER_TASK_TIMEOUT_SECONDS"}
+            {
+                "DEEPEVAL_PER_TASK_TIMEOUT_SECONDS",
+                "DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS",
+                "DEEPEVAL_TASK_GATHER_BUFFER_SECONDS",
+            }
         )
 
         def __init__(
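To make the telemetry precedence above concrete, a small behavioural sketch (assuming DEEPEVAL_TELEMETRY_OPT_OUT coerces to a boolean field and no dotenv file sets either flag; reset_settings forces a rebuild):

import os
from deepeval.config.settings import reset_settings

# Conflicting signals: the new flag says ON, the legacy flag says OFF -> OFF wins.
os.environ["DEEPEVAL_TELEMETRY_OPT_OUT"] = "0"
os.environ["DEEPEVAL_TELEMETRY_ENABLED"] = "0"
assert reset_settings().DEEPEVAL_TELEMETRY_OPT_OUT is True  # opted out, telemetry OFF

# Neither flag set -> telemetry stays ON (opt-out resolves to False).
del os.environ["DEEPEVAL_TELEMETRY_OPT_OUT"]
del os.environ["DEEPEVAL_TELEMETRY_ENABLED"]
assert reset_settings().DEEPEVAL_TELEMETRY_OPT_OUT is False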
@@ -880,16 +1003,39 @@
 
 
 _settings_singleton: Optional[Settings] = None
+_settings_env_fingerprint: "str | None" = None
+_settings_lock = threading.RLock()
+
+
+def _calc_env_fingerprint() -> str:
+    env = os.environ.copy()
+    # must hash in a stable order.
+    keys = sorted(
+        key
+        for key in Settings.model_fields.keys()
+        if key != "_DEPRECATED_TELEMETRY_ENABLED"  # exclude deprecated
+    )
+    # encode as triples: (key, present?, value)
+    items = [(k, k in env, env.get(k)) for k in keys]
+    payload = json.dumps(items, ensure_ascii=False, separators=(",", ":"))
+    return hashlib.sha256(payload.encode("utf-8")).hexdigest()
 
 
 def get_settings() -> Settings:
-    global _settings_singleton
-    if _settings_singleton is None:
-        _settings_singleton = Settings()
-        from deepeval.config.logging import apply_deepeval_log_level
+    global _settings_singleton, _settings_env_fingerprint
+    fingerprint = _calc_env_fingerprint()
+
+    with _settings_lock:
+        if (
+            _settings_singleton is None
+            or _settings_env_fingerprint != fingerprint
+        ):
+            _settings_singleton = Settings()
+            _settings_env_fingerprint = fingerprint
+            from deepeval.config.logging import apply_deepeval_log_level
 
-        apply_deepeval_log_level()
-    return _settings_singleton
+            apply_deepeval_log_level()
+        return _settings_singleton
 
 
 def reset_settings(*, reload_dotenv: bool = False) -> Settings:
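Net effect: get_settings() is no longer a plain memoized singleton. Each call hashes the current values of all Settings fields present in os.environ (SHA-256 over a stable JSON encoding) and, under an RLock, rebuilds the instance whenever that fingerprint changes. A sketch of the observable behaviour, assuming nothing else mutates the environment in between:

import os
from deepeval.config.settings import get_settings

first = get_settings()
assert get_settings() is first           # unchanged environment -> cached instance

os.environ["CONFIDENT_BASE_URL"] = "https://confident.example.internal"  # placeholder
assert get_settings() is not first       # fingerprint changed -> Settings rebuilt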
@@ -905,8 +1051,10 @@ def reset_settings(*, reload_dotenv: bool = False) -> Settings:
     Returns:
         The fresh Settings instance.
     """
-    global _settings_singleton
-    if reload_dotenv:
-        autoload_dotenv()
-    _settings_singleton = None
+    global _settings_singleton, _settings_env_fingerprint
+    with _settings_lock:
+        if reload_dotenv:
+            autoload_dotenv()
+        _settings_singleton = None
+        _settings_env_fingerprint = None
     return get_settings()
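reset_settings() now also clears the cached fingerprint under the same lock, so the next get_settings() call is guaranteed to build a fresh instance:

from deepeval.config.settings import reset_settings

# Re-read the dotenv file (optional) and rebuild Settings; the cached env fingerprint is cleared.
settings = reset_settings(reload_dotenv=True)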
@@ -4,6 +4,7 @@ dotenv file. Also syncs os.environ, handles unsets, and warns on unknown fields.
 Primary entrypoint: update_settings_and_persist.
 """
 
+import json
 import logging
 import os
 
@@ -33,6 +34,9 @@ def _normalize_for_env(val: Any) -> Optional[str]:
         return val.get_secret_value()
     if isinstance(val, bool):
         return bool_to_env_str(val)
+    # encode sequences as JSON so Settings can parse them back reliably.
+    if isinstance(val, (list, tuple, set)):
+        return json.dumps(list(val))
     return str(val)
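With this change, list/tuple/set values survive the round trip through the dotenv file as JSON strings. A minimal illustration of the encoding _normalize_for_env now produces:

import json

value = ["openai", "anthropic"]            # e.g. a list-valued setting
env_str = json.dumps(list(value))          # what gets written to the dotenv file
assert env_str == '["openai", "anthropic"]'
assert json.loads(env_str) == value        # what Settings can parse back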
@@ -1,5 +1,6 @@
 from typing import Optional, List, Dict, Callable
 import asyncio
+import time
 from rich.progress import (
     Progress,
     TextColumn,
@@ -8,24 +9,74 @@ from rich.progress import (
     TaskProgressColumn,
 )
 from collections import Counter
+import json
 
 from deepeval.errors import MissingTestCaseParamsError
 from deepeval.evaluate.configs import AsyncConfig, DisplayConfig, ErrorConfig
-from deepeval.test_case import ArenaTestCase
+from deepeval.test_case import ArenaTestCase, Contestant
+from deepeval.test_case.api import create_api_test_case
 from deepeval.metrics import ArenaGEval
-from deepeval.utils import add_pbar, update_pbar, custom_console
-from deepeval.utils import get_or_create_event_loop
+from deepeval.utils import (
+    add_pbar,
+    update_pbar,
+    custom_console,
+    get_or_create_event_loop,
+    open_browser,
+)
+from deepeval.test_run.test_run import (
+    TestRun,
+    MetricData,
+    TestRunEncoder,
+    MetricScores,
+    console,
+)
+from deepeval.test_run.hyperparameters import (
+    process_hyperparameters,
+)
+from deepeval.confident.api import Api, Endpoints, HttpMethods, is_confident
 from deepeval.telemetry import capture_evaluation_run
+from deepeval.test_run.api import LLMApiTestCase
+from deepeval.evaluate.utils import create_arena_metric_data
+from deepeval.evaluate.types import PostExperimentRequest
 
 
 def compare(
     test_cases: List[ArenaTestCase],
     metric: ArenaGEval,
+    name: str = "compare()",
     # Configs
     async_config: Optional[AsyncConfig] = AsyncConfig(),
     display_config: Optional[DisplayConfig] = DisplayConfig(),
     error_config: Optional[ErrorConfig] = ErrorConfig(),
 ) -> Dict[str, int]:
+
+    # Prepare test run map
+    unique_contestant_names = set(
+        [
+            contestant.name
+            for test_case in test_cases
+            for contestant in test_case.contestants
+        ]
+    )
+    test_run_map: Dict[str, TestRun] = {}
+    for contestant_name in unique_contestant_names:
+        test_run = TestRun(
+            identifier=contestant_name,
+            test_passed=0,
+            test_failed=0,
+        )
+        test_run.metrics_scores = [
+            MetricScores(
+                metric=metric.name,
+                scores=[],
+                passes=0,
+                fails=0,
+                errors=0,
+            )
+        ]
+        test_run_map[contestant_name] = test_run
+
+    start_time = time.time()
     with capture_evaluation_run("compare()"):
         if async_config.run_async:
             loop = get_or_create_event_loop()
@@ -39,6 +90,7 @@ def compare(
                     throttle_value=async_config.throttle_value,
                     max_concurrent=async_config.max_concurrent,
                     skip_on_missing_params=error_config.skip_on_missing_params,
+                    test_run_map=test_run_map,
                 )
             )
         else:
@@ -49,7 +101,10 @@
                 verbose_mode=display_config.verbose_mode,
                 show_indicator=display_config.show_indicator,
                 skip_on_missing_params=error_config.skip_on_missing_params,
+                test_run_map=test_run_map,
             )
+    end_time = time.time()
+    run_duration = end_time - start_time
 
     # Aggregate winners
     winner_counts = Counter()
@@ -57,7 +112,13 @@
         if winner:
             winner_counts[winner] += 1
 
-    print(winner_counts)
+    process_test_runs(test_run_map=test_run_map, test_cases=test_cases)
+    wrap_up_experiment(
+        name=name,
+        test_runs=list(test_run_map.values()),
+        winner_counts=winner_counts,
+        run_duration=run_duration,
+    )
     return dict(winner_counts)
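Putting the compare() changes together: it now accepts an experiment name, builds one TestRun per contestant, records run durations, and either prints a local summary or posts the runs to Confident AI before returning the winner counts. A hedged usage sketch; the constructor fields of ArenaTestCase, Contestant, and ArenaGEval shown below are assumptions inferred from the attributes this diff accesses (name, test_case, hyperparameters):

from deepeval.test_case import ArenaTestCase, Contestant, LLMTestCase
from deepeval.metrics import ArenaGEval
from deepeval.evaluate.compare import compare

arena_case = ArenaTestCase(
    contestants=[
        Contestant(
            name="model_a",
            test_case=LLMTestCase(
                input="What is DeepEval?",
                actual_output="An open-source LLM evaluation framework.",
            ),
            hyperparameters={"model": "model_a"},
        ),
        Contestant(
            name="model_b",
            test_case=LLMTestCase(
                input="What is DeepEval?",
                actual_output="A unit testing library.",
            ),
            hyperparameters={"model": "model_b"},
        ),
    ],
)
metric = ArenaGEval(
    name="Helpfulness",
    criteria="Pick the contestant whose answer is more helpful.",  # required args may differ
)
winner_counts = compare(test_cases=[arena_case], metric=metric, name="my-arena")
# e.g. {"model_a": 1}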
@@ -70,6 +131,7 @@ async def a_execute_arena_test_cases(
     throttle_value: int,
     skip_on_missing_params: bool,
     max_concurrent: int,
+    test_run_map: Dict[str, TestRun],
 ) -> List[str]:
     semaphore = asyncio.Semaphore(max_concurrent)
 
@@ -104,6 +166,8 @@
                     else metric.verbose_mode
                 ),
             )
+
+            start_time = time.perf_counter()
             winner = await _a_handle_metric_measurement(
                 metric=metric_copy,
                 test_case=test_case,
@@ -112,10 +176,21 @@
                 _progress=progress,
                 _pbar_id=pbar_test_case_id,
             )
+            end_time = time.perf_counter()
+            run_duration = end_time - start_time
+
             if winner:
                 winners.append(winner)
 
             update_pbar(progress, pbar_id)
+            update_test_run_map(
+                test_case=test_case,
+                index=index,
+                test_run_map=test_run_map,
+                metric_copy=metric_copy,
+                winner=winner,
+                run_duration=run_duration,
+            )
 
     # Create tasks for all test cases
     if show_indicator:
@@ -156,6 +231,7 @@ def execute_arena_test_cases(
     skip_on_missing_params: bool,
     show_indicator: bool,
     verbose_mode: Optional[bool] = None,
+    test_run_map: Optional[Dict[str, TestRun]] = None,
 ) -> List[str]:
     """
     Non-async version of comparing arena test cases.
@@ -183,6 +259,8 @@
                 else metric.verbose_mode
             ),
         )
+
+        start_time = time.perf_counter()
         winner = _handle_metric_measurement(
             metric=metric_copy,
             test_case=test_case,
@@ -191,10 +269,21 @@
             _progress=progress,
             _pbar_id=pbar_test_case_id,
         )
+        end_time = time.perf_counter()
+        run_duration = end_time - start_time
+
         if winner:
             winners.append(winner)
 
         update_pbar(progress, pbar_id)
+        update_test_run_map(
+            test_case=test_case,
+            index=i,
+            test_run_map=test_run_map,
+            metric_copy=metric_copy,
+            winner=winner,
+            run_duration=run_duration,
+        )
 
     if show_indicator:
         progress = Progress(
@@ -313,3 +402,125 @@
             return None
         else:
             raise
+
+
+def update_test_run_map(
+    test_case: ArenaTestCase,
+    index: int,
+    test_run_map: Dict[str, TestRun],
+    metric_copy: ArenaGEval,
+    winner: str,
+    run_duration: float,
+):
+    for contestant in test_case.contestants:
+        test_run = test_run_map.get(contestant.name)
+
+        # update test cases in test run
+        api_test_case: LLMApiTestCase = create_api_test_case(
+            test_case=contestant.test_case, index=index
+        )
+        metric_data: MetricData = create_arena_metric_data(
+            metric_copy, contestant.name
+        )
+        api_test_case.update_metric_data(metric_data)
+        api_test_case.update_run_duration(run_duration)
+        test_run.add_test_case(api_test_case)
+
+        # update other test run attributes
+        if test_run.run_duration is None:
+            test_run.run_duration = 0.0
+        test_run.run_duration += run_duration
+
+        # Ensure test_passed and test_failed are initialized
+        if test_run.test_passed is None:
+            test_run.test_passed = 0
+        if test_run.test_failed is None:
+            test_run.test_failed = 0
+
+        if winner == contestant:
+            test_run.test_passed += 1
+        else:
+            test_run.test_failed += 1
+
+        # update metric scores
+        test_run.metrics_scores[0].metric = metric_copy.name
+        test_run.metrics_scores[0].scores.append(
+            1 if winner == contestant else 0
+        )
+        test_run.metrics_scores[0].passes += 1 if winner == contestant else 0
+        test_run.metrics_scores[0].fails += 1 if winner != contestant else 0
+        test_run.metrics_scores[0].errors += 0
+
+
+def process_test_runs(
+    test_run_map: Dict[str, TestRun],
+    test_cases: List[ArenaTestCase],
+):
+    hyperparameters_map = {
+        contestant_name: {} for contestant_name in test_run_map.keys()
+    }
+
+    for test_case in test_cases:
+        for contestant in test_case.contestants:
+            if contestant.hyperparameters:
+                hyperparameters_map[contestant.name].update(
+                    contestant.hyperparameters
+                )
+
+    for contestant_name, hyperparameters in hyperparameters_map.items():
+        test_run = test_run_map.get(contestant_name)
+        test_run.hyperparameters = process_hyperparameters(hyperparameters)
+
+
+def wrap_up_experiment(
+    name: str,
+    test_runs: List[TestRun],
+    winner_counts: Counter,
+    run_duration: float,
+):
+    winner_breakdown = []
+    for contestant, wins in winner_counts.most_common():
+        winner_breakdown.append(
+            f" » [bold green]{contestant}[/bold green]: {wins} wins"
+        )
+    winner_text = (
+        "\n".join(winner_breakdown) if winner_breakdown else "No winners"
+    )
+    console.print(
+        f"\n🎉 Arena completed! (time taken: {round(run_duration, 2)}s | token cost: {test_runs[0].evaluation_cost if test_runs else 0} USD)\n"
+        f"🏆 Results ({sum(winner_counts.values())} total test cases):\n"
+        f"{winner_text}\n\n"
+    )
+
+    if not is_confident():
+        console.print(
+            f"{'=' * 80}\n"
+            f"\n» Want to share experiments with your team? ❤️ 🏟️\n"
+            f" » Run [bold]'deepeval login'[/bold] to analyze and save arena results on [rgb(106,0,255)]Confident AI[/rgb(106,0,255)].\n\n"
+        )
+        return
+
+    try:
+        api = Api()
+        experiment_request = PostExperimentRequest(testRuns=test_runs, name=name)
+
+        try:
+            body = experiment_request.model_dump(by_alias=True, exclude_none=True)
+        except AttributeError:
+            body = experiment_request.dict(by_alias=True, exclude_none=True)
+        json_str = json.dumps(body, cls=TestRunEncoder)
+        body = json.loads(json_str)
+
+        _, link = api.send_request(
+            method=HttpMethods.POST,
+            endpoint=Endpoints.EXPERIMENT_ENDPOINT,
+            body=body,
+        )
+        console.print(
+            "[rgb(5,245,141)]✓[/rgb(5,245,141)] Done 🎉! View results on "
+            f"[link={link}]{link}[/link]"
+        )
+        open_browser(link)
+
+    except Exception:
+        raise
@@ -4,6 +4,7 @@ from pydantic import BaseModel
 
 from deepeval.test_run.api import MetricData, TurnApi
 from deepeval.test_case import MLLMImage
+from deepeval.test_run import TestRun
 
 
 @dataclass
@@ -29,3 +30,8 @@ class EvaluationResult(BaseModel):
     test_results: List[TestResult]
     confident_link: Optional[str]
     test_run_id: Optional[str]
+
+
+class PostExperimentRequest(BaseModel):
+    testRuns: List[TestRun]
+    name: Optional[str]
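PostExperimentRequest is the payload that wrap_up_experiment serializes (via TestRunEncoder) and posts to Endpoints.EXPERIMENT_ENDPOINT. A minimal sketch of its shape, reusing the TestRun fields compare() initializes:

from deepeval.evaluate.types import PostExperimentRequest
from deepeval.test_run.test_run import TestRun

request = PostExperimentRequest(
    testRuns=[TestRun(identifier="model_a", test_passed=1, test_failed=0)],
    name="my-arena",
)
body = request.model_dump(by_alias=True, exclude_none=True)  # falls back to .dict() on pydantic v1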
@@ -8,6 +8,7 @@ from deepeval.utils import format_turn
 from deepeval.test_run.test_run import TestRunResultDisplay
 from deepeval.dataset import Golden
 from deepeval.metrics import (
+    ArenaGEval,
     BaseMetric,
     BaseConversationalMetric,
     BaseMultimodalMetric,
@@ -84,6 +85,35 @@ def create_metric_data(metric: BaseMetric) -> MetricData:
     )
 
 
+def create_arena_metric_data(metric: ArenaGEval, contestant: str) -> MetricData:
+    if metric.error is not None:
+        return MetricData(
+            name=metric.__name__,
+            threshold=1,
+            score=None,
+            reason=None,
+            success=False,
+            strictMode=True,
+            evaluationModel=metric.evaluation_model,
+            error=metric.error,
+            evaluationCost=metric.evaluation_cost,
+            verboseLogs=metric.verbose_logs,
+        )
+    else:
+        return MetricData(
+            name=metric.__name__,
+            score=1 if contestant == metric.winner else 0,
+            threshold=1,
+            reason=metric.reason,
+            success=metric.is_successful(),
+            strictMode=True,
+            evaluationModel=metric.evaluation_model,
+            error=None,
+            evaluationCost=metric.evaluation_cost,
+            verboseLogs=metric.verbose_logs,
+        )
+
+
 def create_test_result(
     api_test_case: Union[LLMApiTestCase, ConversationalApiTestCase],
 ) -> TestResult:
@@ -23,6 +23,8 @@ try:
         AgentExecutionCompletedEvent,
         ToolUsageStartedEvent,
         ToolUsageFinishedEvent,
+        KnowledgeRetrievalStartedEvent,
+        KnowledgeRetrievalCompletedEvent,
     )
 
     crewai_installed = True
@@ -69,6 +71,14 @@ class CrewAIEventsListener(BaseEventListener):
 
         return execution_id
 
+    @staticmethod
+    def get_knowledge_execution_id(source, event) -> str:
+        source_id = id(source)
+        agent_id = id(event.agent) if hasattr(event, "agent") else "unknown"
+        execution_id = f"_knowledge_{source_id}_{agent_id}"
+
+        return execution_id
+
     def setup_listeners(self, crewai_event_bus):
         @crewai_event_bus.on(CrewKickoffStartedEvent)
         def on_crew_started(source, event: CrewKickoffStartedEvent):
@@ -161,6 +171,32 @@ class CrewAIEventsListener(BaseEventListener):
                     current_span.output = event.output
                 observer.__exit__(None, None, None)
 
+        @crewai_event_bus.on(KnowledgeRetrievalStartedEvent)
+        def on_knowledge_started(source, event: KnowledgeRetrievalStartedEvent):
+            observer = Observer(
+                span_type="tool",
+                func_name="knowledge_retrieval",
+                function_kwargs={},
+            )
+            self.span_observers[
+                self.get_knowledge_execution_id(source, event)
+            ] = observer
+            observer.__enter__()
+
+        @crewai_event_bus.on(KnowledgeRetrievalCompletedEvent)
+        def on_knowledge_completed(
+            source, event: KnowledgeRetrievalCompletedEvent
+        ):
+            observer = self.span_observers.pop(
+                self.get_knowledge_execution_id(source, event)
+            )
+            if observer:
+                current_span = current_span_context.get()
+                if current_span:
+                    current_span.input = event.query
+                    current_span.output = event.retrieved_knowledge
+                observer.__exit__(None, None, None)
+
 
 def instrument_crewai(api_key: Optional[str] = None):
     is_crewai_installed()
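With these listeners, CrewAI knowledge retrievals are traced as "tool" spans named knowledge_retrieval, carrying the query as the span input and the retrieved knowledge as the output. Minimal setup sketch, importing from the handler module shown in this diff:

from deepeval.integrations.crewai.handler import instrument_crewai

# One-time setup; afterwards, KnowledgeRetrievalStartedEvent / KnowledgeRetrievalCompletedEvent
# emitted on CrewAI's event bus are captured as "knowledge_retrieval" tool spans.
instrument_crewai(api_key="<CONFIDENT_API_KEY>")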