deepeval 3.7.0__py3-none-any.whl → 3.7.2__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (40)
  1. deepeval/__init__.py +0 -4
  2. deepeval/_version.py +1 -1
  3. deepeval/cli/main.py +7 -0
  4. deepeval/confident/api.py +6 -1
  5. deepeval/config/settings.py +5 -0
  6. deepeval/evaluate/compare.py +219 -4
  7. deepeval/evaluate/types.py +6 -0
  8. deepeval/evaluate/utils.py +30 -0
  9. deepeval/key_handler.py +1 -0
  10. deepeval/metrics/arena_g_eval/arena_g_eval.py +5 -1
  11. deepeval/metrics/arena_g_eval/utils.py +5 -5
  12. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +9 -18
  13. deepeval/metrics/g_eval/g_eval.py +5 -1
  14. deepeval/metrics/g_eval/utils.py +1 -1
  15. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +5 -1
  16. deepeval/metrics/utils.py +1 -1
  17. deepeval/models/llms/gemini_model.py +27 -5
  18. deepeval/openai_agents/callback_handler.py +12 -3
  19. deepeval/prompt/prompt.py +25 -14
  20. deepeval/simulator/template.py +1 -1
  21. deepeval/synthesizer/config.py +9 -0
  22. deepeval/synthesizer/schema.py +23 -0
  23. deepeval/synthesizer/synthesizer.py +1137 -2
  24. deepeval/synthesizer/templates/__init__.py +11 -2
  25. deepeval/synthesizer/templates/template.py +554 -1
  26. deepeval/synthesizer/templates/template_extraction.py +32 -0
  27. deepeval/synthesizer/templates/template_prompt.py +262 -0
  28. deepeval/test_case/__init__.py +2 -1
  29. deepeval/test_case/arena_test_case.py +15 -4
  30. deepeval/test_case/mllm_test_case.py +45 -22
  31. deepeval/test_run/cache.py +31 -10
  32. deepeval/test_run/hyperparameters.py +5 -1
  33. deepeval/test_run/test_run.py +28 -9
  34. deepeval/tracing/tracing.py +1 -1
  35. deepeval/utils.py +4 -0
  36. {deepeval-3.7.0.dist-info → deepeval-3.7.2.dist-info}/METADATA +3 -2
  37. {deepeval-3.7.0.dist-info → deepeval-3.7.2.dist-info}/RECORD +40 -40
  38. {deepeval-3.7.0.dist-info → deepeval-3.7.2.dist-info}/LICENSE.md +0 -0
  39. {deepeval-3.7.0.dist-info → deepeval-3.7.2.dist-info}/WHEEL +0 -0
  40. {deepeval-3.7.0.dist-info → deepeval-3.7.2.dist-info}/entry_points.txt +0 -0
deepeval/__init__.py CHANGED
@@ -102,9 +102,5 @@ def update_warning_opt_in():
     return os.getenv("DEEPEVAL_UPDATE_WARNING_OPT_IN") == "1"
 
 
-def is_read_only_env():
-    return os.getenv("DEEPEVAL_FILE_SYSTEM") == "READ_ONLY"
-
-
 if update_warning_opt_in():
     check_for_update()
deepeval/_version.py CHANGED
@@ -1 +1 @@
-__version__: str = "3.7.0"
+__version__: str = "3.7.2"
deepeval/cli/main.py CHANGED
@@ -1484,6 +1484,11 @@ def set_gemini_model_env(
     google_cloud_location: Optional[str] = typer.Option(
         None, "--location", help="Google Cloud location"
     ),
+    google_service_account_key: Optional[str] = typer.Option(
+        None,
+        "--service-account-key",
+        help="Google Service Account Key for Gemini",
+    ),
     save: Optional[str] = typer.Option(
         None,
         "--save",
@@ -1513,6 +1518,8 @@ def set_gemini_model_env(
         settings.GOOGLE_CLOUD_PROJECT = google_cloud_project
     if google_cloud_location:
         settings.GOOGLE_CLOUD_LOCATION = google_cloud_location
+    if google_service_account_key:
+        settings.GOOGLE_SERVICE_ACCOUNT_KEY = google_service_account_key
     if model_name:
         settings.GEMINI_MODEL_NAME = model_name
 
deepeval/confident/api.py CHANGED
@@ -27,6 +27,10 @@ retryable_exceptions = requests.exceptions.SSLError
 
 
 def get_base_api_url():
+    s = get_settings()
+    if s.CONFIDENT_BASE_URL:
+        base_url = s.CONFIDENT_BASE_URL.rstrip("/")
+        return base_url
     region = KEY_FILE_HANDLER.fetch_data(KeyValues.CONFIDENT_REGION)
     if region == "EU":
         return API_BASE_URL_EU
@@ -87,6 +91,7 @@ class Endpoints(Enum):
     DATASET_ALIAS_QUEUE_ENDPOINT = "/v1/datasets/:alias/queue"
 
     TEST_RUN_ENDPOINT = "/v1/test-run"
+    EXPERIMENT_ENDPOINT = "/v1/experiment"
     METRIC_DATA_ENDPOINT = "/v1/metric-data"
     TRACES_ENDPOINT = "/v1/traces"
     ANNOTATIONS_ENDPOINT = "/v1/annotations"
@@ -115,7 +120,7 @@ class Api:
         self.api_key = api_key
         self._headers = {
             "Content-Type": "application/json",
-            "CONFIDENT_API_KEY": api_key,
+            "CONFIDENT-API-KEY": api_key,
             "X-DeepEval-Version": deepeval.__version__,
         }
         self.base_api_url = get_base_api_url()
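Net effect of the api.py changes: an explicit CONFIDENT_BASE_URL setting (see the new Settings field below) now takes precedence over the region-based default, trailing slashes are trimmed, and the API key header is sent as CONFIDENT-API-KEY. A minimal standalone sketch of that resolution order, using placeholder default URLs that are not taken from this diff:

    import os
    from typing import Optional

    API_BASE_URL = "https://api.confident-ai.com"        # placeholder default
    API_BASE_URL_EU = "https://eu.api.confident-ai.com"  # placeholder EU default

    def resolve_base_api_url(region: Optional[str] = None) -> str:
        override = os.getenv("CONFIDENT_BASE_URL")
        if override:
            # an explicit CONFIDENT_BASE_URL wins, with any trailing slash trimmed
            return override.rstrip("/")
        return API_BASE_URL_EU if region == "EU" else API_BASE_URL

    print(resolve_base_api_url(region="EU"))           # region-based default
    os.environ["CONFIDENT_BASE_URL"] = "https://confident.internal/"
    print(resolve_base_api_url())                      # "https://confident.internal"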
deepeval/config/settings.py CHANGED
@@ -229,6 +229,11 @@ class Settings(BaseSettings):
     API_KEY: Optional[SecretStr] = None
     CONFIDENT_API_KEY: Optional[SecretStr] = None
 
+    # ======
+    # Base URL for Confident AI API server
+    # ======
+    CONFIDENT_BASE_URL: Optional[str] = None
+
     # General
     TEMPERATURE: Optional[confloat(ge=0, le=2)] = None
 
deepeval/evaluate/compare.py CHANGED
@@ -1,5 +1,6 @@
 from typing import Optional, List, Dict, Callable
 import asyncio
+import time
 from rich.progress import (
     Progress,
     TextColumn,
@@ -8,24 +9,74 @@ from rich.progress import (
     TaskProgressColumn,
 )
 from collections import Counter
+import json
 
 from deepeval.errors import MissingTestCaseParamsError
 from deepeval.evaluate.configs import AsyncConfig, DisplayConfig, ErrorConfig
-from deepeval.test_case import ArenaTestCase
+from deepeval.test_case import ArenaTestCase, Contestant
+from deepeval.test_case.api import create_api_test_case
 from deepeval.metrics import ArenaGEval
-from deepeval.utils import add_pbar, update_pbar, custom_console
-from deepeval.utils import get_or_create_event_loop
+from deepeval.utils import (
+    add_pbar,
+    update_pbar,
+    custom_console,
+    get_or_create_event_loop,
+    open_browser,
+)
+from deepeval.test_run.test_run import (
+    TestRun,
+    MetricData,
+    TestRunEncoder,
+    MetricScores,
+    console,
+)
+from deepeval.test_run.hyperparameters import (
+    process_hyperparameters,
+)
+from deepeval.confident.api import Api, Endpoints, HttpMethods, is_confident
 from deepeval.telemetry import capture_evaluation_run
+from deepeval.test_run.api import LLMApiTestCase
+from deepeval.evaluate.utils import create_arena_metric_data
+from deepeval.evaluate.types import PostExperimentRequest
 
 
 def compare(
     test_cases: List[ArenaTestCase],
     metric: ArenaGEval,
+    name: str = "compare()",
     # Configs
     async_config: Optional[AsyncConfig] = AsyncConfig(),
     display_config: Optional[DisplayConfig] = DisplayConfig(),
     error_config: Optional[ErrorConfig] = ErrorConfig(),
 ) -> Dict[str, int]:
+
+    # Prepare test run map
+    unique_contestant_names = set(
+        [
+            contestant.name
+            for test_case in test_cases
+            for contestant in test_case.contestants
+        ]
+    )
+    test_run_map: Dict[str, TestRun] = {}
+    for contestant_name in unique_contestant_names:
+        test_run = TestRun(
+            identifier=contestant_name,
+            test_passed=0,
+            test_failed=0,
+        )
+        test_run.metrics_scores = [
+            MetricScores(
+                metric=metric.name,
+                scores=[],
+                passes=0,
+                fails=0,
+                errors=0,
+            )
+        ]
+        test_run_map[contestant_name] = test_run
+
+    start_time = time.time()
     with capture_evaluation_run("compare()"):
         if async_config.run_async:
             loop = get_or_create_event_loop()
@@ -39,6 +90,7 @@ def compare(
                     throttle_value=async_config.throttle_value,
                     max_concurrent=async_config.max_concurrent,
                     skip_on_missing_params=error_config.skip_on_missing_params,
+                    test_run_map=test_run_map,
                 )
             )
         else:
@@ -49,7 +101,10 @@ def compare(
                 verbose_mode=display_config.verbose_mode,
                 show_indicator=display_config.show_indicator,
                 skip_on_missing_params=error_config.skip_on_missing_params,
+                test_run_map=test_run_map,
             )
+    end_time = time.time()
+    run_duration = end_time - start_time
 
     # Aggregate winners
     winner_counts = Counter()
@@ -57,7 +112,13 @@ def compare(
         if winner:
             winner_counts[winner] += 1
 
-    print(winner_counts)
+    process_test_runs(test_run_map=test_run_map, test_cases=test_cases)
+    wrap_up_experiment(
+        name=name,
+        test_runs=list(test_run_map.values()),
+        winner_counts=winner_counts,
+        run_duration=run_duration,
+    )
     return dict(winner_counts)
 
 
@@ -70,6 +131,7 @@ async def a_execute_arena_test_cases(
     throttle_value: int,
     skip_on_missing_params: bool,
     max_concurrent: int,
+    test_run_map: Dict[str, TestRun],
 ) -> List[str]:
     semaphore = asyncio.Semaphore(max_concurrent)
 
@@ -104,6 +166,8 @@ async def a_execute_arena_test_cases(
                     else metric.verbose_mode
                 ),
             )
+
+            start_time = time.perf_counter()
             winner = await _a_handle_metric_measurement(
                 metric=metric_copy,
                 test_case=test_case,
@@ -112,10 +176,21 @@ async def a_execute_arena_test_cases(
                 _progress=progress,
                 _pbar_id=pbar_test_case_id,
             )
+            end_time = time.perf_counter()
+            run_duration = end_time - start_time
+
             if winner:
                 winners.append(winner)
 
             update_pbar(progress, pbar_id)
+            update_test_run_map(
+                test_case=test_case,
+                index=index,
+                test_run_map=test_run_map,
+                metric_copy=metric_copy,
+                winner=winner,
+                run_duration=run_duration,
+            )
 
     # Create tasks for all test cases
     if show_indicator:
@@ -156,6 +231,7 @@ def execute_arena_test_cases(
     skip_on_missing_params: bool,
     show_indicator: bool,
     verbose_mode: Optional[bool] = None,
+    test_run_map: Optional[Dict[str, TestRun]] = None,
 ) -> List[str]:
     """
     Non-async version of comparing arena test cases.
@@ -183,6 +259,8 @@ def execute_arena_test_cases(
                     else metric.verbose_mode
                 ),
             )
+
+            start_time = time.perf_counter()
             winner = _handle_metric_measurement(
                 metric=metric_copy,
                 test_case=test_case,
@@ -191,10 +269,21 @@ def execute_arena_test_cases(
                 _progress=progress,
                 _pbar_id=pbar_test_case_id,
             )
+            end_time = time.perf_counter()
+            run_duration = end_time - start_time
+
             if winner:
                 winners.append(winner)
 
             update_pbar(progress, pbar_id)
+            update_test_run_map(
+                test_case=test_case,
+                index=i,
+                test_run_map=test_run_map,
+                metric_copy=metric_copy,
+                winner=winner,
+                run_duration=run_duration,
+            )
 
     if show_indicator:
         progress = Progress(
@@ -313,3 +402,129 @@ async def _a_handle_metric_measurement(
             return None
         else:
             raise
+
+
+def update_test_run_map(
+    test_case: ArenaTestCase,
+    index: int,
+    test_run_map: Dict[str, TestRun],
+    metric_copy: ArenaGEval,
+    winner: str,
+    run_duration: float,
+):
+    for contestant in test_case.contestants:
+        test_run = test_run_map.get(contestant.name)
+
+        # update test cases in test run
+        api_test_case: LLMApiTestCase = create_api_test_case(
+            test_case=contestant.test_case, index=index
+        )
+        metric_data: MetricData = create_arena_metric_data(
+            metric_copy, contestant.name
+        )
+        api_test_case.update_metric_data(metric_data)
+        api_test_case.update_run_duration(run_duration)
+        test_run.add_test_case(api_test_case)
+
+        # update other test run attributes
+        if test_run.run_duration is None:
+            test_run.run_duration = 0.0
+        test_run.run_duration += run_duration
+
+        # Ensure test_passed and test_failed are initialized
+        if test_run.test_passed is None:
+            test_run.test_passed = 0
+        if test_run.test_failed is None:
+            test_run.test_failed = 0
+
+        if winner == contestant:
+            test_run.test_passed += 1
+        else:
+            test_run.test_failed += 1
+
+        # update metric scores
+        test_run.metrics_scores[0].metric = metric_copy.name
+        test_run.metrics_scores[0].scores.append(
+            1 if winner == contestant else 0
+        )
+        test_run.metrics_scores[0].passes += 1 if winner == contestant else 0
+        test_run.metrics_scores[0].fails += 1 if winner != contestant else 0
+        test_run.metrics_scores[0].errors += 0
+
+
+def process_test_runs(
+    test_run_map: Dict[str, TestRun],
+    test_cases: List[ArenaTestCase],
+):
+    hyperparameters_map = {
+        contestant_name: {} for contestant_name in test_run_map.keys()
+    }
+
+    for test_case in test_cases:
+        for contestant in test_case.contestants:
+            if contestant.hyperparameters:
+                hyperparameters_map[contestant.name].update(
+                    contestant.hyperparameters
+                )
+
+    for contestant_name, hyperparameters in hyperparameters_map.items():
+        test_run = test_run_map.get(contestant_name)
+        test_run.hyperparameters = process_hyperparameters(hyperparameters)
+
+
+def wrap_up_experiment(
+    name: str,
+    test_runs: List[TestRun],
+    winner_counts: Counter,
+    run_duration: float,
+):
+    winner_breakdown = []
+    for contestant, wins in winner_counts.most_common():
+        winner_breakdown.append(
+            f"  » [bold green]{contestant}[/bold green]: {wins} wins"
+        )
+    winner_text = (
+        "\n".join(winner_breakdown) if winner_breakdown else "No winners"
+    )
+    console.print(
+        f"\n🎉 Arena completed! (time taken: {round(run_duration, 2)}s | token cost: {test_runs[0].evaluation_cost if test_runs else 0} USD)\n"
+        f"🏆 Results ({sum(winner_counts.values())} total test cases):\n"
+        f"{winner_text}\n\n"
+    )
+
+    if not is_confident():
+        console.print(
+            f"{'=' * 80}\n"
+            f"\n» Want to share experiments with your team? ❤️ 🏟️\n"
+            f"  » Run [bold]'deepeval login'[/bold] to analyze and save arena results on [rgb(106,0,255)]Confident AI[/rgb(106,0,255)].\n\n"
+        )
+        return
+
+    try:
+        api = Api()
+        experiment_request = PostExperimentRequest(
+            testRuns=test_runs, name=name
+        )
+
+        try:
+            body = experiment_request.model_dump(
+                by_alias=True, exclude_none=True
+            )
+        except AttributeError:
+            body = experiment_request.dict(by_alias=True, exclude_none=True)
+        json_str = json.dumps(body, cls=TestRunEncoder)
+        body = json.loads(json_str)
+
+        _, link = api.send_request(
+            method=HttpMethods.POST,
+            endpoint=Endpoints.EXPERIMENT_ENDPOINT,
+            body=body,
+        )
+        console.print(
+            "[rgb(5,245,141)]✓[/rgb(5,245,141)] Done 🎉! View results on "
+            f"[link={link}]{link}[/link]"
+        )
+        open_browser(link)
+
+    except Exception:
+        raise
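Taken together, compare() now builds one TestRun per contestant, aggregates winners, and, when logged in to Confident AI, posts a PostExperimentRequest to the new /v1/experiment endpoint. A minimal usage sketch based on the signatures visible in this diff; the ArenaGEval arguments and LLMTestCase field values are illustrative and may not match the exact constructor signatures:

    from deepeval.evaluate.compare import compare
    from deepeval.metrics import ArenaGEval
    from deepeval.test_case import (
        ArenaTestCase,
        Contestant,
        LLMTestCase,
        LLMTestCaseParams,
    )

    metric = ArenaGEval(
        name="Friendliness",  # assumed constructor argument
        criteria="Choose the response that is the most friendly and helpful.",
        evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT],
    )

    test_case = ArenaTestCase(
        contestants=[
            Contestant(
                name="gpt-4o",
                test_case=LLMTestCase(input="Hi!", actual_output="Hello! How can I help?"),
                hyperparameters={"model": "gpt-4o"},
            ),
            Contestant(
                name="gpt-4.1",
                test_case=LLMTestCase(input="Hi!", actual_output="Hey."),
                hyperparameters={"model": "gpt-4.1"},
            ),
        ],
    )

    # `name` labels the experiment sent to EXPERIMENT_ENDPOINT when logged in
    winner_counts = compare(test_cases=[test_case], metric=metric, name="friendliness-arena")
    print(winner_counts)  # e.g. {"gpt-4o": 1}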
deepeval/evaluate/types.py CHANGED
@@ -4,6 +4,7 @@ from pydantic import BaseModel
 
 from deepeval.test_run.api import MetricData, TurnApi
 from deepeval.test_case import MLLMImage
+from deepeval.test_run import TestRun
 
 
 @dataclass
@@ -29,3 +30,8 @@ class EvaluationResult(BaseModel):
     test_results: List[TestResult]
     confident_link: Optional[str]
     test_run_id: Optional[str]
+
+
+class PostExperimentRequest(BaseModel):
+    testRuns: List[TestRun]
+    name: Optional[str]
deepeval/evaluate/utils.py CHANGED
@@ -8,6 +8,7 @@ from deepeval.utils import format_turn
 from deepeval.test_run.test_run import TestRunResultDisplay
 from deepeval.dataset import Golden
 from deepeval.metrics import (
+    ArenaGEval,
     BaseMetric,
     BaseConversationalMetric,
     BaseMultimodalMetric,
@@ -84,6 +85,35 @@ def create_metric_data(metric: BaseMetric) -> MetricData:
     )
 
 
+def create_arena_metric_data(metric: ArenaGEval, contestant: str) -> MetricData:
+    if metric.error is not None:
+        return MetricData(
+            name=metric.__name__,
+            threshold=1,
+            score=None,
+            reason=None,
+            success=False,
+            strictMode=True,
+            evaluationModel=metric.evaluation_model,
+            error=metric.error,
+            evaluationCost=metric.evaluation_cost,
+            verboseLogs=metric.verbose_logs,
+        )
+    else:
+        return MetricData(
+            name=metric.__name__,
+            score=1 if contestant == metric.winner else 0,
+            threshold=1,
+            reason=metric.reason,
+            success=metric.is_successful(),
+            strictMode=True,
+            evaluationModel=metric.evaluation_model,
+            error=None,
+            evaluationCost=metric.evaluation_cost,
+            verboseLogs=metric.verbose_logs,
+        )
+
+
 def create_test_result(
     api_test_case: Union[LLMApiTestCase, ConversationalApiTestCase],
 ) -> TestResult:
deepeval/key_handler.py CHANGED
@@ -70,6 +70,7 @@ class ModelKeyValues(Enum):
     GOOGLE_GENAI_USE_VERTEXAI = "GOOGLE_GENAI_USE_VERTEXAI"
     GOOGLE_CLOUD_PROJECT = "GOOGLE_CLOUD_PROJECT"
     GOOGLE_CLOUD_LOCATION = "GOOGLE_CLOUD_LOCATION"
+    GOOGLE_SERVICE_ACCOUNT_KEY = "GOOGLE_SERVICE_ACCOUNT_KEY"
     # LiteLLM
     USE_LITELLM = "USE_LITELLM"
     LITELLM_MODEL_NAME = "LITELLM_MODEL_NAME"
deepeval/metrics/arena_g_eval/arena_g_eval.py CHANGED
@@ -46,7 +46,11 @@ class ArenaGEval(BaseArenaMetric):
         self.criteria = criteria
         self.model, self.using_native_model = initialize_model(model)
         self.evaluation_model = self.model.get_model_name()
-        self.evaluation_steps = evaluation_steps
+        self.evaluation_steps = (
+            evaluation_steps
+            if evaluation_steps and len(evaluation_steps) > 0
+            else None
+        )
         self.async_mode = async_mode
         self.verbose_mode = verbose_mode
         self._include_g_eval_suffix = _include_g_eval_suffix
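The same normalization recurs below in ConversationalGEval, GEval, and MultimodalGEval: an empty evaluation_steps list is now treated like None rather than being kept as [], presumably so the metrics fall back to auto-generating steps from the criteria. A tiny standalone illustration of the pattern:

    from typing import List, Optional

    def normalize_steps(evaluation_steps: Optional[List[str]]) -> Optional[List[str]]:
        # mirrors the conditional added in the diff: [] and None both become None
        return (
            evaluation_steps
            if evaluation_steps and len(evaluation_steps) > 0
            else None
        )

    print(normalize_steps(None))            # None
    print(normalize_steps([]))              # None
    print(normalize_steps(["Check tone"]))  # ['Check tone']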
deepeval/metrics/arena_g_eval/utils.py CHANGED
@@ -89,10 +89,10 @@ class FormattedArenaTestCase:
 def format_arena_test_case(
     evaluation_params: List[LLMTestCaseParams], test_case: ArenaTestCase
 ) -> Tuple[FormattedArenaTestCase, Dict[str, str]]:
-    case = next(iter(test_case.contestants.values()))
+    case = next(iter([case.test_case for case in test_case.contestants]))
 
     # Create dummy name mapping
-    real_names = list(test_case.contestants.keys())
+    real_names = list([case.name for case in test_case.contestants])
     available_fake_names = FAKE_NAMES.copy()
     random.shuffle(available_fake_names)
 
@@ -119,10 +119,10 @@ def format_arena_test_case(
             else None
         ),
         contestants={
-            contestant: construct_formatted_llm_test_case(
-                evaluation_params, test_case
+            contestant.name: construct_formatted_llm_test_case(
+                evaluation_params, contestant.test_case
             )
-            for contestant, test_case in test_case.contestants.items()
+            for contestant in test_case.contestants
         },
         dummy_to_real_names=dummy_to_real_names,
     )
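These utils changes reflect the new ArenaTestCase shape: contestants is now a list of Contestant objects (each carrying a name, a test_case, and optional hyperparameters) instead of a name-to-test-case mapping. A short migration sketch, with illustrative field values:

    from deepeval.test_case import Contestant, LLMTestCase

    # 3.7.0 style: {"gpt-4o": LLMTestCase(...), "gpt-4.1": LLMTestCase(...)}
    # 3.7.2 style: a list of Contestant objects
    contestants = [
        Contestant(name="gpt-4o", test_case=LLMTestCase(input="Hi!", actual_output="Hello!")),
        Contestant(name="gpt-4.1", test_case=LLMTestCase(input="Hi!", actual_output="Hey.")),
    ]

    names = [c.name for c in contestants]        # replaces contestants.keys()
    cases = [c.test_case for c in contestants]   # replaces contestants.values()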
deepeval/metrics/conversational_g_eval/conversational_g_eval.py CHANGED
@@ -9,6 +9,8 @@ from deepeval.metrics.g_eval.utils import (
     construct_conversational_g_eval_turn_params_string,
     construct_non_turns_test_case_string,
     format_rubrics,
+    validate_and_sort_rubrics,
+    validate_criteria_and_evaluation_steps,
 )
 from deepeval.test_case import (
     TurnParams,
@@ -63,27 +65,16 @@ class ConversationalGEval(BaseConversationalMetric):
 
         self.evaluation_params = evaluation_params
 
-        # Check if both criteria and evaluation_steps are not None at the same time
-        if criteria is None and evaluation_steps is None:
-            raise ValueError(
-                "Either 'criteria' or 'evaluation_steps' must be provided."
-            )
-
-        # Check if criteria is provided, it cannot be an empty string
-        if criteria is not None and not criteria.strip():
-            raise ValueError("Criteria provided cannot be an empty string.")
-
-        # Check if evaluation_steps is provided, it cannot be an empty list
-        if evaluation_steps is not None and len(evaluation_steps) == 0:
-            raise ValueError(
-                "'evaluation_steps' must not be an empty list. Either omit evaluation steps or include a non-empty list of steps."
-            )
-
+        validate_criteria_and_evaluation_steps(criteria, evaluation_steps)
         self.criteria = criteria
-        self.rubric = rubric
+        self.rubric = validate_and_sort_rubrics(rubric)
         self.model, self.using_native_model = initialize_model(model)
         self.evaluation_model = self.model.get_model_name()
-        self.evaluation_steps = evaluation_steps
+        self.evaluation_steps = (
+            evaluation_steps
+            if evaluation_steps and len(evaluation_steps) > 0
+            else None
+        )
         self.threshold = 1 if strict_mode else threshold
         self.strict_mode = strict_mode
         self.async_mode = async_mode
deepeval/metrics/g_eval/g_eval.py CHANGED
@@ -61,7 +61,11 @@ class GEval(BaseMetric):
         self.score_range_span = self.score_range[1] - self.score_range[0]
         self.model, self.using_native_model = initialize_model(model)
         self.evaluation_model = self.model.get_model_name()
-        self.evaluation_steps = evaluation_steps
+        self.evaluation_steps = (
+            evaluation_steps
+            if evaluation_steps and len(evaluation_steps) > 0
+            else None
+        )
         self.threshold = 1 if strict_mode else threshold
         self.top_logprobs = top_logprobs
         self.strict_mode = strict_mode
deepeval/metrics/g_eval/utils.py CHANGED
@@ -77,7 +77,7 @@ def validate_criteria_and_evaluation_steps(
 def validate_and_sort_rubrics(
     rubrics: Optional[List[Rubric]] = None,
 ) -> Optional[List[Rubric]]:
-    if rubrics is None:
+    if rubrics is None or len(rubrics) == 0:
         return None
 
     # Sort rubrics by start of range
deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py CHANGED
@@ -64,7 +64,11 @@ class MultimodalGEval(BaseMultimodalMetric):
         self.rubric = validate_and_sort_rubrics(rubric)
         self.model, self.using_native_model = initialize_multimodal_model(model)
         self.evaluation_model = self.model.get_model_name()
-        self.evaluation_steps = evaluation_steps
+        self.evaluation_steps = (
+            evaluation_steps
+            if evaluation_steps and len(evaluation_steps) > 0
+            else None
+        )
         self.threshold = 1 if strict_mode else threshold
         self.top_logprobs = top_logprobs
         self.strict_mode = strict_mode
deepeval/metrics/utils.py CHANGED
@@ -270,7 +270,7 @@ def check_arena_test_case_params(
             f"Expected ArenaTestCase, got {type(arena_test_case).__name__}"
         )
 
-    cases = list(arena_test_case.contestants.values())
+    cases = [contestant.test_case for contestant in arena_test_case.contestants]
     ref_input = cases[0].input
     for case in cases[1:]:
         if case.input != ref_input:
deepeval/models/llms/gemini_model.py CHANGED
@@ -1,7 +1,6 @@
 from pydantic import BaseModel
-from google.genai import types
+from google.genai import types, Client
 from typing import Optional, Dict
-from google import genai
 
 from deepeval.models.retry_policy import (
     create_retry_decorator,
@@ -9,7 +8,8 @@ from deepeval.models.retry_policy import (
 from deepeval.key_handler import ModelKeyValues, KEY_FILE_HANDLER
 from deepeval.models.base_model import DeepEvalBaseLLM
 from deepeval.constants import ProviderSlug as PS
-
+from google.oauth2 import service_account
+import json
 
 default_gemini_model = "gemini-1.5-pro"
 
@@ -52,6 +52,7 @@ class GeminiModel(DeepEvalBaseLLM):
         api_key: Optional[str] = None,
         project: Optional[str] = None,
         location: Optional[str] = None,
+        service_account_key: Optional[Dict[str, str]] = None,
         temperature: float = 0,
         generation_kwargs: Optional[Dict] = None,
         **kwargs,
@@ -75,6 +76,17 @@ class GeminiModel(DeepEvalBaseLLM):
         self.use_vertexai = KEY_FILE_HANDLER.fetch_data(
             ModelKeyValues.GOOGLE_GENAI_USE_VERTEXAI
         )
+        if service_account_key:
+            self.service_account_key = service_account_key
+        else:
+            service_account_key_data = KEY_FILE_HANDLER.fetch_data(
+                ModelKeyValues.GOOGLE_SERVICE_ACCOUNT_KEY
+            )
+            if service_account_key_data is None:
+                self.service_account_key = None
+            elif isinstance(service_account_key_data, str):
+                self.service_account_key = json.loads(service_account_key_data)
+
         if temperature < 0:
             raise ValueError("Temperature must be >= 0.")
         self.temperature = temperature
@@ -117,10 +129,20 @@ class GeminiModel(DeepEvalBaseLLM):
             )
 
             # Create client for Vertex AI
-            self.client = genai.Client(
+            self.client = Client(
                 vertexai=True,
                 project=self.project,
                 location=self.location,
+                credentials=(
+                    service_account.Credentials.from_service_account_info(
+                        self.service_account_key,
+                        scopes=[
+                            "https://www.googleapis.com/auth/cloud-platform"
+                        ],
+                    )
+                    if self.service_account_key
+                    else None
+                ),
                 **self.kwargs,
             )
         else:
@@ -130,7 +152,7 @@ class GeminiModel(DeepEvalBaseLLM):
                 "or set it in your DeepEval configuration."
             )
             # Create client for Gemini API
-            self.client = genai.Client(api_key=self.api_key, **self.kwargs)
+            self.client = Client(api_key=self.api_key, **self.kwargs)
 
         # Configure default model generation settings
         self.model_safety_settings = [
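With the changes above, GeminiModel can authenticate to Vertex AI with an in-memory service-account key instead of relying on ambient credentials; the key can be passed directly or persisted via the new --service-account-key CLI option (stored as GOOGLE_SERVICE_ACCOUNT_KEY). A hedged usage sketch; the model_name argument and the key file path are assumptions, and Vertex AI mode still has to be enabled via GOOGLE_GENAI_USE_VERTEXAI:

    import json
    from deepeval.models.llms.gemini_model import GeminiModel

    # Load a service-account key from an illustrative path; the constructor can
    # also fall back to the GOOGLE_SERVICE_ACCOUNT_KEY value saved by the CLI.
    with open("service-account.json") as f:
        service_account_key = json.load(f)

    model = GeminiModel(
        model_name="gemini-1.5-pro",   # assumed parameter name
        project="my-gcp-project",
        location="us-central1",
        service_account_key=service_account_key,
    )
    # `model` can then be used as the evaluation model for metrics such as GEval.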