deepeval 3.7.0__py3-none-any.whl → 3.7.2__py3-none-any.whl
This diff shows the published contents of two package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
- deepeval/__init__.py +0 -4
- deepeval/_version.py +1 -1
- deepeval/cli/main.py +7 -0
- deepeval/confident/api.py +6 -1
- deepeval/config/settings.py +5 -0
- deepeval/evaluate/compare.py +219 -4
- deepeval/evaluate/types.py +6 -0
- deepeval/evaluate/utils.py +30 -0
- deepeval/key_handler.py +1 -0
- deepeval/metrics/arena_g_eval/arena_g_eval.py +5 -1
- deepeval/metrics/arena_g_eval/utils.py +5 -5
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +9 -18
- deepeval/metrics/g_eval/g_eval.py +5 -1
- deepeval/metrics/g_eval/utils.py +1 -1
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +5 -1
- deepeval/metrics/utils.py +1 -1
- deepeval/models/llms/gemini_model.py +27 -5
- deepeval/openai_agents/callback_handler.py +12 -3
- deepeval/prompt/prompt.py +25 -14
- deepeval/simulator/template.py +1 -1
- deepeval/synthesizer/config.py +9 -0
- deepeval/synthesizer/schema.py +23 -0
- deepeval/synthesizer/synthesizer.py +1137 -2
- deepeval/synthesizer/templates/__init__.py +11 -2
- deepeval/synthesizer/templates/template.py +554 -1
- deepeval/synthesizer/templates/template_extraction.py +32 -0
- deepeval/synthesizer/templates/template_prompt.py +262 -0
- deepeval/test_case/__init__.py +2 -1
- deepeval/test_case/arena_test_case.py +15 -4
- deepeval/test_case/mllm_test_case.py +45 -22
- deepeval/test_run/cache.py +31 -10
- deepeval/test_run/hyperparameters.py +5 -1
- deepeval/test_run/test_run.py +28 -9
- deepeval/tracing/tracing.py +1 -1
- deepeval/utils.py +4 -0
- {deepeval-3.7.0.dist-info → deepeval-3.7.2.dist-info}/METADATA +3 -2
- {deepeval-3.7.0.dist-info → deepeval-3.7.2.dist-info}/RECORD +40 -40
- {deepeval-3.7.0.dist-info → deepeval-3.7.2.dist-info}/LICENSE.md +0 -0
- {deepeval-3.7.0.dist-info → deepeval-3.7.2.dist-info}/WHEEL +0 -0
- {deepeval-3.7.0.dist-info → deepeval-3.7.2.dist-info}/entry_points.txt +0 -0
deepeval/__init__.py
CHANGED
deepeval/_version.py
CHANGED
@@ -1 +1 @@
-__version__: str = "3.7.0"
+__version__: str = "3.7.2"
deepeval/cli/main.py
CHANGED
@@ -1484,6 +1484,11 @@ def set_gemini_model_env(
     google_cloud_location: Optional[str] = typer.Option(
         None, "--location", help="Google Cloud location"
     ),
+    google_service_account_key: Optional[str] = typer.Option(
+        None,
+        "--service-account-key",
+        help="Google Service Account Key for Gemini",
+    ),
     save: Optional[str] = typer.Option(
         None,
         "--save",
@@ -1513,6 +1518,8 @@ def set_gemini_model_env(
         settings.GOOGLE_CLOUD_PROJECT = google_cloud_project
     if google_cloud_location:
         settings.GOOGLE_CLOUD_LOCATION = google_cloud_location
+    if google_service_account_key:
+        settings.GOOGLE_SERVICE_ACCOUNT_KEY = google_service_account_key
     if model_name:
         settings.GEMINI_MODEL_NAME = model_name
 
deepeval/confident/api.py
CHANGED
@@ -27,6 +27,10 @@ retryable_exceptions = requests.exceptions.SSLError
 
 
 def get_base_api_url():
+    s = get_settings()
+    if s.CONFIDENT_BASE_URL:
+        base_url = s.CONFIDENT_BASE_URL.rstrip("/")
+        return base_url
     region = KEY_FILE_HANDLER.fetch_data(KeyValues.CONFIDENT_REGION)
     if region == "EU":
         return API_BASE_URL_EU
@@ -87,6 +91,7 @@ class Endpoints(Enum):
     DATASET_ALIAS_QUEUE_ENDPOINT = "/v1/datasets/:alias/queue"
 
     TEST_RUN_ENDPOINT = "/v1/test-run"
+    EXPERIMENT_ENDPOINT = "/v1/experiment"
     METRIC_DATA_ENDPOINT = "/v1/metric-data"
     TRACES_ENDPOINT = "/v1/traces"
     ANNOTATIONS_ENDPOINT = "/v1/annotations"
@@ -115,7 +120,7 @@ class Api:
         self.api_key = api_key
         self._headers = {
             "Content-Type": "application/json",
-            "
+            "CONFIDENT-API-KEY": api_key,
             "X-DeepEval-Version": deepeval.__version__,
         }
         self.base_api_url = get_base_api_url()
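
Note: the new `get_base_api_url()` honors an explicit `CONFIDENT_BASE_URL` setting first (with any trailing slash stripped) and only then falls back to the stored region. Below is a minimal standalone sketch of that resolution order; `EU_URL`, `DEFAULT_URL`, and the plain function arguments are illustrative stand-ins rather than deepeval's actual settings objects or constants.

```python
from typing import Optional

EU_URL = "https://eu.api.example.com"      # stand-in for API_BASE_URL_EU
DEFAULT_URL = "https://api.example.com"    # stand-in for the default base URL


def resolve_base_url(base_url_override: Optional[str], region: Optional[str]) -> str:
    # 1. An explicit base URL override always wins; a trailing "/" is stripped
    #    the same way the new get_base_api_url() uses rstrip("/").
    if base_url_override:
        return base_url_override.rstrip("/")
    # 2. Otherwise fall back to the region-based endpoint.
    if region == "EU":
        return EU_URL
    return DEFAULT_URL


assert resolve_base_url("http://localhost:8000/", None) == "http://localhost:8000"
assert resolve_base_url(None, "EU") == EU_URL
```

In practice this lets the client be pointed at a self-hosted or staging Confident AI server without touching the region key.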
deepeval/config/settings.py
CHANGED
@@ -229,6 +229,11 @@ class Settings(BaseSettings):
     API_KEY: Optional[SecretStr] = None
     CONFIDENT_API_KEY: Optional[SecretStr] = None
 
+    # ======
+    # Base URL for Confident AI API server
+    # ======
+    CONFIDENT_BASE_URL: Optional[str] = None
+
     # General
     TEMPERATURE: Optional[confloat(ge=0, le=2)] = None
 
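
Since `Settings` is a pydantic `BaseSettings` subclass, the new field should be settable from the environment like its neighboring fields. A hedged sketch; the URL is a placeholder, and the exact moment deepeval reads it depends on how your process imports and initializes deepeval:

```python
import os

# Point deepeval's Confident AI client at a self-hosted or staging server.
# Set this before deepeval loads its settings.
os.environ["CONFIDENT_BASE_URL"] = "http://localhost:8000"
```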
deepeval/evaluate/compare.py
CHANGED
@@ -1,5 +1,6 @@
 from typing import Optional, List, Dict, Callable
 import asyncio
+import time
 from rich.progress import (
     Progress,
     TextColumn,
@@ -8,24 +9,74 @@ from rich.progress import (
     TaskProgressColumn,
 )
 from collections import Counter
+import json
 
 from deepeval.errors import MissingTestCaseParamsError
 from deepeval.evaluate.configs import AsyncConfig, DisplayConfig, ErrorConfig
-from deepeval.test_case import ArenaTestCase
+from deepeval.test_case import ArenaTestCase, Contestant
+from deepeval.test_case.api import create_api_test_case
 from deepeval.metrics import ArenaGEval
-from deepeval.utils import
-
+from deepeval.utils import (
+    add_pbar,
+    update_pbar,
+    custom_console,
+    get_or_create_event_loop,
+    open_browser,
+)
+from deepeval.test_run.test_run import (
+    TestRun,
+    MetricData,
+    TestRunEncoder,
+    MetricScores,
+    console,
+)
+from deepeval.test_run.hyperparameters import (
+    process_hyperparameters,
+)
+from deepeval.confident.api import Api, Endpoints, HttpMethods, is_confident
 from deepeval.telemetry import capture_evaluation_run
+from deepeval.test_run.api import LLMApiTestCase
+from deepeval.evaluate.utils import create_arena_metric_data
+from deepeval.evaluate.types import PostExperimentRequest
 
 
 def compare(
     test_cases: List[ArenaTestCase],
     metric: ArenaGEval,
+    name: str = "compare()",
     # Configs
     async_config: Optional[AsyncConfig] = AsyncConfig(),
     display_config: Optional[DisplayConfig] = DisplayConfig(),
     error_config: Optional[ErrorConfig] = ErrorConfig(),
 ) -> Dict[str, int]:
+
+    # Prepare test run map
+    unique_contestant_names = set(
+        [
+            contestant.name
+            for test_case in test_cases
+            for contestant in test_case.contestants
+        ]
+    )
+    test_run_map: Dict[str, TestRun] = {}
+    for contestant_name in unique_contestant_names:
+        test_run = TestRun(
+            identifier=contestant_name,
+            test_passed=0,
+            test_failed=0,
+        )
+        test_run.metrics_scores = [
+            MetricScores(
+                metric=metric.name,
+                scores=[],
+                passes=0,
+                fails=0,
+                errors=0,
+            )
+        ]
+        test_run_map[contestant_name] = test_run
+
+    start_time = time.time()
     with capture_evaluation_run("compare()"):
         if async_config.run_async:
            loop = get_or_create_event_loop()
@@ -39,6 +90,7 @@ def compare(
                     throttle_value=async_config.throttle_value,
                     max_concurrent=async_config.max_concurrent,
                     skip_on_missing_params=error_config.skip_on_missing_params,
+                    test_run_map=test_run_map,
                 )
             )
         else:
@@ -49,7 +101,10 @@ def compare(
                 verbose_mode=display_config.verbose_mode,
                 show_indicator=display_config.show_indicator,
                 skip_on_missing_params=error_config.skip_on_missing_params,
+                test_run_map=test_run_map,
             )
+    end_time = time.time()
+    run_duration = end_time - start_time
 
     # Aggregate winners
     winner_counts = Counter()
@@ -57,7 +112,13 @@ def compare(
         if winner:
             winner_counts[winner] += 1
 
-
+    process_test_runs(test_run_map=test_run_map, test_cases=test_cases)
+    wrap_up_experiment(
+        name=name,
+        test_runs=list(test_run_map.values()),
+        winner_counts=winner_counts,
+        run_duration=run_duration,
+    )
     return dict(winner_counts)
 
 
@@ -70,6 +131,7 @@ async def a_execute_arena_test_cases(
    throttle_value: int,
    skip_on_missing_params: bool,
    max_concurrent: int,
+    test_run_map: Dict[str, TestRun],
 ) -> List[str]:
     semaphore = asyncio.Semaphore(max_concurrent)
 
@@ -104,6 +166,8 @@ async def a_execute_arena_test_cases(
                 else metric.verbose_mode
             ),
         )
+
+        start_time = time.perf_counter()
         winner = await _a_handle_metric_measurement(
             metric=metric_copy,
             test_case=test_case,
@@ -112,10 +176,21 @@ async def a_execute_arena_test_cases(
             _progress=progress,
             _pbar_id=pbar_test_case_id,
         )
+        end_time = time.perf_counter()
+        run_duration = end_time - start_time
+
        if winner:
            winners.append(winner)
 
        update_pbar(progress, pbar_id)
+        update_test_run_map(
+            test_case=test_case,
+            index=index,
+            test_run_map=test_run_map,
+            metric_copy=metric_copy,
+            winner=winner,
+            run_duration=run_duration,
+        )
 
     # Create tasks for all test cases
     if show_indicator:
@@ -156,6 +231,7 @@ def execute_arena_test_cases(
     skip_on_missing_params: bool,
     show_indicator: bool,
     verbose_mode: Optional[bool] = None,
+    test_run_map: Optional[Dict[str, TestRun]] = None,
 ) -> List[str]:
     """
     Non-async version of comparing arena test cases.
@@ -183,6 +259,8 @@ def execute_arena_test_cases(
                 else metric.verbose_mode
             ),
         )
+
+        start_time = time.perf_counter()
         winner = _handle_metric_measurement(
             metric=metric_copy,
             test_case=test_case,
@@ -191,10 +269,21 @@ def execute_arena_test_cases(
             _progress=progress,
             _pbar_id=pbar_test_case_id,
         )
+        end_time = time.perf_counter()
+        run_duration = end_time - start_time
+
        if winner:
            winners.append(winner)
 
        update_pbar(progress, pbar_id)
+        update_test_run_map(
+            test_case=test_case,
+            index=i,
+            test_run_map=test_run_map,
+            metric_copy=metric_copy,
+            winner=winner,
+            run_duration=run_duration,
+        )
 
     if show_indicator:
         progress = Progress(
@@ -313,3 +402,129 @@ async def _a_handle_metric_measurement(
             return None
         else:
             raise
+
+
+def update_test_run_map(
+    test_case: ArenaTestCase,
+    index: int,
+    test_run_map: Dict[str, TestRun],
+    metric_copy: ArenaGEval,
+    winner: str,
+    run_duration: float,
+):
+    for contestant in test_case.contestants:
+        test_run = test_run_map.get(contestant.name)
+
+        # update test cases in test run
+        api_test_case: LLMApiTestCase = create_api_test_case(
+            test_case=contestant.test_case, index=index
+        )
+        metric_data: MetricData = create_arena_metric_data(
+            metric_copy, contestant.name
+        )
+        api_test_case.update_metric_data(metric_data)
+        api_test_case.update_run_duration(run_duration)
+        test_run.add_test_case(api_test_case)
+
+        # update other test run attributes
+        if test_run.run_duration is None:
+            test_run.run_duration = 0.0
+        test_run.run_duration += run_duration
+
+        # Ensure test_passed and test_failed are initialized
+        if test_run.test_passed is None:
+            test_run.test_passed = 0
+        if test_run.test_failed is None:
+            test_run.test_failed = 0
+
+        if winner == contestant:
+            test_run.test_passed += 1
+        else:
+            test_run.test_failed += 1
+
+        # update metric scores
+        test_run.metrics_scores[0].metric = metric_copy.name
+        test_run.metrics_scores[0].scores.append(
+            1 if winner == contestant else 0
+        )
+        test_run.metrics_scores[0].passes += 1 if winner == contestant else 0
+        test_run.metrics_scores[0].fails += 1 if winner != contestant else 0
+        test_run.metrics_scores[0].errors += 0
+
+
+def process_test_runs(
+    test_run_map: Dict[str, TestRun],
+    test_cases: List[ArenaTestCase],
+):
+    hyperparameters_map = {
+        contestant_name: {} for contestant_name in test_run_map.keys()
+    }
+
+    for test_case in test_cases:
+        for contestant in test_case.contestants:
+            if contestant.hyperparameters:
+                hyperparameters_map[contestant.name].update(
+                    contestant.hyperparameters
+                )
+
+    for contestant_name, hyperparameters in hyperparameters_map.items():
+        test_run = test_run_map.get(contestant_name)
+        test_run.hyperparameters = process_hyperparameters(hyperparameters)
+
+
+def wrap_up_experiment(
+    name: str,
+    test_runs: List[TestRun],
+    winner_counts: Counter,
+    run_duration: float,
+):
+    winner_breakdown = []
+    for contestant, wins in winner_counts.most_common():
+        winner_breakdown.append(
+            f" » [bold green]{contestant}[/bold green]: {wins} wins"
+        )
+    winner_text = (
+        "\n".join(winner_breakdown) if winner_breakdown else "No winners"
+    )
+    console.print(
+        f"\n🎉 Arena completed! (time taken: {round(run_duration, 2)}s | token cost: {test_runs[0].evaluation_cost if test_runs else 0} USD)\n"
+        f"🏆 Results ({sum(winner_counts.values())} total test cases):\n"
+        f"{winner_text}\n\n"
+    )
+
+    if not is_confident():
+        console.print(
+            f"{'=' * 80}\n"
+            f"\n» Want to share experiments with your team? ❤️ 🏟️\n"
+            f" » Run [bold]'deepeval login'[/bold] to analyze and save arena results on [rgb(106,0,255)]Confident AI[/rgb(106,0,255)].\n\n"
+        )
+        return
+
+    try:
+        api = Api()
+        experiment_request = PostExperimentRequest(
+            testRuns=test_runs, name=name
+        )
+
+        try:
+            body = experiment_request.model_dump(
+                by_alias=True, exclude_none=True
+            )
+        except AttributeError:
+            body = experiment_request.dict(by_alias=True, exclude_none=True)
+        json_str = json.dumps(body, cls=TestRunEncoder)
+        body = json.loads(json_str)
+
+        _, link = api.send_request(
+            method=HttpMethods.POST,
+            endpoint=Endpoints.EXPERIMENT_ENDPOINT,
+            body=body,
+        )
+        console.print(
+            "[rgb(5,245,141)]✓[/rgb(5,245,141)] Done 🎉! View results on "
+            f"[link={link}]{link}[/link]"
+        )
+        open_browser(link)
+
+    except Exception:
+        raise
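
For context, a hedged sketch of how the reworked `compare()` might be called with the new `name` argument. The `Contestant` fields (`name`, `test_case`, `hyperparameters`) and the import paths come from this diff; the `ArenaGEval` and `LLMTestCase` constructor arguments are assumptions based on deepeval's public API and may differ slightly.

```python
from deepeval.evaluate.compare import compare
from deepeval.metrics import ArenaGEval
from deepeval.test_case import (
    ArenaTestCase,
    Contestant,
    LLMTestCase,
    LLMTestCaseParams,
)

a_case = ArenaTestCase(
    contestants=[
        Contestant(
            name="model-a",
            test_case=LLMTestCase(
                input="Why is the sky blue?",
                actual_output="Rayleigh scattering of sunlight.",
            ),
            hyperparameters={"model": "model-a"},
        ),
        Contestant(
            name="model-b",
            test_case=LLMTestCase(
                input="Why is the sky blue?",
                actual_output="Because blue light scatters more strongly in the atmosphere.",
            ),
            hyperparameters={"model": "model-b"},
        ),
    ],
)

metric = ArenaGEval(
    name="Helpfulness",
    criteria="Pick the contestant whose answer is more helpful and accurate.",
    evaluation_params=[
        LLMTestCaseParams.INPUT,
        LLMTestCaseParams.ACTUAL_OUTPUT,
    ],
)

# New in 3.7.2: `name` labels the experiment posted to Confident AI.
winner_counts = compare(test_cases=[a_case], metric=metric, name="sky-color arena")
print(winner_counts)  # e.g. {"model-b": 1}
```

When a Confident AI API key is configured, `wrap_up_experiment()` serializes each contestant's `TestRun` into a `PostExperimentRequest` and POSTs it to the new `/v1/experiment` endpoint; otherwise it only prints the local summary.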
deepeval/evaluate/types.py
CHANGED
@@ -4,6 +4,7 @@ from pydantic import BaseModel
 
 from deepeval.test_run.api import MetricData, TurnApi
 from deepeval.test_case import MLLMImage
+from deepeval.test_run import TestRun
 
 
 @dataclass
@@ -29,3 +30,8 @@ class EvaluationResult(BaseModel):
     test_results: List[TestResult]
     confident_link: Optional[str]
     test_run_id: Optional[str]
+
+
+class PostExperimentRequest(BaseModel):
+    testRuns: List[TestRun]
+    name: Optional[str]
deepeval/evaluate/utils.py
CHANGED
@@ -8,6 +8,7 @@ from deepeval.utils import format_turn
 from deepeval.test_run.test_run import TestRunResultDisplay
 from deepeval.dataset import Golden
 from deepeval.metrics import (
+    ArenaGEval,
     BaseMetric,
     BaseConversationalMetric,
     BaseMultimodalMetric,
@@ -84,6 +85,35 @@ def create_metric_data(metric: BaseMetric) -> MetricData:
     )
 
 
+def create_arena_metric_data(metric: ArenaGEval, contestant: str) -> MetricData:
+    if metric.error is not None:
+        return MetricData(
+            name=metric.__name__,
+            threshold=1,
+            score=None,
+            reason=None,
+            success=False,
+            strictMode=True,
+            evaluationModel=metric.evaluation_model,
+            error=metric.error,
+            evaluationCost=metric.evaluation_cost,
+            verboseLogs=metric.verbose_logs,
+        )
+    else:
+        return MetricData(
+            name=metric.__name__,
+            score=1 if contestant == metric.winner else 0,
+            threshold=1,
+            reason=metric.reason,
+            success=metric.is_successful(),
+            strictMode=True,
+            evaluationModel=metric.evaluation_model,
+            error=None,
+            evaluationCost=metric.evaluation_cost,
+            verboseLogs=metric.verbose_logs,
+        )
+
+
 def create_test_result(
     api_test_case: Union[LLMApiTestCase, ConversationalApiTestCase],
 ) -> TestResult:
deepeval/key_handler.py
CHANGED
@@ -70,6 +70,7 @@ class ModelKeyValues(Enum):
     GOOGLE_GENAI_USE_VERTEXAI = "GOOGLE_GENAI_USE_VERTEXAI"
     GOOGLE_CLOUD_PROJECT = "GOOGLE_CLOUD_PROJECT"
     GOOGLE_CLOUD_LOCATION = "GOOGLE_CLOUD_LOCATION"
+    GOOGLE_SERVICE_ACCOUNT_KEY = "GOOGLE_SERVICE_ACCOUNT_KEY"
     # LiteLLM
     USE_LITELLM = "USE_LITELLM"
     LITELLM_MODEL_NAME = "LITELLM_MODEL_NAME"
deepeval/metrics/arena_g_eval/arena_g_eval.py
CHANGED
@@ -46,7 +46,11 @@ class ArenaGEval(BaseArenaMetric):
         self.criteria = criteria
         self.model, self.using_native_model = initialize_model(model)
         self.evaluation_model = self.model.get_model_name()
-        self.evaluation_steps =
+        self.evaluation_steps = (
+            evaluation_steps
+            if evaluation_steps and len(evaluation_steps) > 0
+            else None
+        )
         self.async_mode = async_mode
         self.verbose_mode = verbose_mode
         self._include_g_eval_suffix = _include_g_eval_suffix
deepeval/metrics/arena_g_eval/utils.py
CHANGED
@@ -89,10 +89,10 @@
 def format_arena_test_case(
     evaluation_params: List[LLMTestCaseParams], test_case: ArenaTestCase
 ) -> Tuple[FormattedArenaTestCase, Dict[str, str]]:
-    case = next(iter(test_case.contestants
+    case = next(iter([case.test_case for case in test_case.contestants]))
 
     # Create dummy name mapping
-    real_names = list(test_case.contestants
+    real_names = list([case.name for case in test_case.contestants])
     available_fake_names = FAKE_NAMES.copy()
     random.shuffle(available_fake_names)
 
@@ -119,10 +119,10 @@ def format_arena_test_case(
            else None
        ),
        contestants={
-            contestant: construct_formatted_llm_test_case(
-                evaluation_params, test_case
+            contestant.name: construct_formatted_llm_test_case(
+                evaluation_params, contestant.test_case
            )
-            for contestant
+            for contestant in test_case.contestants
        },
        dummy_to_real_names=dummy_to_real_names,
    )
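
The `format_arena_test_case` changes above keep the existing name-blinding behavior while reading contestants from the new `Contestant` objects: real contestant names are swapped for shuffled fake names before the judge model sees them, and the `dummy_to_real_names` map translates the verdict back. A toy illustration of that idea (the name lists are stand-ins for deepeval's `FAKE_NAMES` and real contestant names):

```python
import random

FAKE_NAMES = ["Alice", "Bob", "Charlie"]   # stand-in list
real_names = ["model-a", "model-b"]        # contestant.name values

available_fake_names = FAKE_NAMES.copy()
random.shuffle(available_fake_names)

# The judge only ever sees fake names; the mapping restores the real winner.
dummy_to_real_names = dict(zip(available_fake_names, real_names))
print(dummy_to_real_names)  # e.g. {'Charlie': 'model-a', 'Alice': 'model-b'}
```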
deepeval/metrics/conversational_g_eval/conversational_g_eval.py
CHANGED
@@ -9,6 +9,8 @@ from deepeval.metrics.g_eval.utils import (
     construct_conversational_g_eval_turn_params_string,
     construct_non_turns_test_case_string,
     format_rubrics,
+    validate_and_sort_rubrics,
+    validate_criteria_and_evaluation_steps,
 )
 from deepeval.test_case import (
     TurnParams,
@@ -63,27 +65,16 @@ class ConversationalGEval(BaseConversationalMetric):
 
         self.evaluation_params = evaluation_params
 
-
-        if criteria is None and evaluation_steps is None:
-            raise ValueError(
-                "Either 'criteria' or 'evaluation_steps' must be provided."
-            )
-
-        # Check if criteria is provided, it cannot be an empty string
-        if criteria is not None and not criteria.strip():
-            raise ValueError("Criteria provided cannot be an empty string.")
-
-        # Check if evaluation_steps is provided, it cannot be an empty list
-        if evaluation_steps is not None and len(evaluation_steps) == 0:
-            raise ValueError(
-                "'evaluation_steps' must not be an empty list. Either omit evaluation steps or include a non-empty list of steps."
-            )
-
+        validate_criteria_and_evaluation_steps(criteria, evaluation_steps)
         self.criteria = criteria
-        self.rubric = rubric
+        self.rubric = validate_and_sort_rubrics(rubric)
         self.model, self.using_native_model = initialize_model(model)
         self.evaluation_model = self.model.get_model_name()
-        self.evaluation_steps =
+        self.evaluation_steps = (
+            evaluation_steps
+            if evaluation_steps and len(evaluation_steps) > 0
+            else None
+        )
         self.threshold = 1 if strict_mode else threshold
         self.strict_mode = strict_mode
         self.async_mode = async_mode
deepeval/metrics/g_eval/g_eval.py
CHANGED
@@ -61,7 +61,11 @@ class GEval(BaseMetric):
         self.score_range_span = self.score_range[1] - self.score_range[0]
         self.model, self.using_native_model = initialize_model(model)
         self.evaluation_model = self.model.get_model_name()
-        self.evaluation_steps =
+        self.evaluation_steps = (
+            evaluation_steps
+            if evaluation_steps and len(evaluation_steps) > 0
+            else None
+        )
         self.threshold = 1 if strict_mode else threshold
         self.top_logprobs = top_logprobs
         self.strict_mode = strict_mode
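
The same guard is applied in `ArenaGEval`, `ConversationalGEval`, `GEval`, and `MultimodalGEval` (below): an empty `evaluation_steps` list is now normalized to `None` rather than stored as `[]`. A small hypothetical helper showing the behavior of that expression:

```python
from typing import List, Optional


def normalize_evaluation_steps(
    evaluation_steps: Optional[List[str]],
) -> Optional[List[str]]:
    # Mirrors the expression added across the G-Eval metrics in 3.7.2:
    # falsy (None or empty) step lists collapse to None.
    return (
        evaluation_steps
        if evaluation_steps and len(evaluation_steps) > 0
        else None
    )


assert normalize_evaluation_steps(None) is None
assert normalize_evaluation_steps([]) is None
assert normalize_evaluation_steps(["Check factual accuracy"]) == [
    "Check factual accuracy"
]
```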
deepeval/metrics/g_eval/utils.py
CHANGED
@@ -77,7 +77,7 @@ def validate_criteria_and_evaluation_steps(
 def validate_and_sort_rubrics(
     rubrics: Optional[List[Rubric]] = None,
 ) -> Optional[List[Rubric]]:
-    if rubrics is None:
+    if rubrics is None or len(rubrics) == 0:
         return None
 
     # Sort rubrics by start of range
deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py
CHANGED
@@ -64,7 +64,11 @@ class MultimodalGEval(BaseMultimodalMetric):
         self.rubric = validate_and_sort_rubrics(rubric)
         self.model, self.using_native_model = initialize_multimodal_model(model)
         self.evaluation_model = self.model.get_model_name()
-        self.evaluation_steps =
+        self.evaluation_steps = (
+            evaluation_steps
+            if evaluation_steps and len(evaluation_steps) > 0
+            else None
+        )
         self.threshold = 1 if strict_mode else threshold
         self.top_logprobs = top_logprobs
         self.strict_mode = strict_mode
deepeval/metrics/utils.py
CHANGED
@@ -270,7 +270,7 @@ def check_arena_test_case_params(
             f"Expected ArenaTestCase, got {type(arena_test_case).__name__}"
         )
 
-    cases =
+    cases = [contestant.test_case for contestant in arena_test_case.contestants]
     ref_input = cases[0].input
     for case in cases[1:]:
         if case.input != ref_input:
deepeval/models/llms/gemini_model.py
CHANGED
@@ -1,7 +1,6 @@
 from pydantic import BaseModel
-from google.genai import types
+from google.genai import types, Client
 from typing import Optional, Dict
-from google import genai
 
 from deepeval.models.retry_policy import (
     create_retry_decorator,
@@ -9,7 +8,8 @@ from deepeval.models.retry_policy import (
 from deepeval.key_handler import ModelKeyValues, KEY_FILE_HANDLER
 from deepeval.models.base_model import DeepEvalBaseLLM
 from deepeval.constants import ProviderSlug as PS
-
+from google.oauth2 import service_account
+import json
 
 default_gemini_model = "gemini-1.5-pro"
 
@@ -52,6 +52,7 @@ class GeminiModel(DeepEvalBaseLLM):
         api_key: Optional[str] = None,
         project: Optional[str] = None,
         location: Optional[str] = None,
+        service_account_key: Optional[Dict[str, str]] = None,
         temperature: float = 0,
         generation_kwargs: Optional[Dict] = None,
         **kwargs,
@@ -75,6 +76,17 @@ class GeminiModel(DeepEvalBaseLLM):
         self.use_vertexai = KEY_FILE_HANDLER.fetch_data(
             ModelKeyValues.GOOGLE_GENAI_USE_VERTEXAI
         )
+        if service_account_key:
+            self.service_account_key = service_account_key
+        else:
+            service_account_key_data = KEY_FILE_HANDLER.fetch_data(
+                ModelKeyValues.GOOGLE_SERVICE_ACCOUNT_KEY
+            )
+            if service_account_key_data is None:
+                self.service_account_key = None
+            elif isinstance(service_account_key_data, str):
+                self.service_account_key = json.loads(service_account_key_data)
+
         if temperature < 0:
             raise ValueError("Temperature must be >= 0.")
         self.temperature = temperature
@@ -117,10 +129,20 @@ class GeminiModel(DeepEvalBaseLLM):
             )
 
             # Create client for Vertex AI
-            self.client =
+            self.client = Client(
                 vertexai=True,
                 project=self.project,
                 location=self.location,
+                credentials=(
+                    service_account.Credentials.from_service_account_info(
+                        self.service_account_key,
+                        scopes=[
+                            "https://www.googleapis.com/auth/cloud-platform"
+                        ],
+                    )
+                    if self.service_account_key
+                    else None
+                ),
                 **self.kwargs,
             )
         else:
@@ -130,7 +152,7 @@ class GeminiModel(DeepEvalBaseLLM):
                 "or set it in your DeepEval configuration."
            )
            # Create client for Gemini API
-            self.client =
+            self.client = Client(api_key=self.api_key, **self.kwargs)
 
            # Configure default model generation settings
            self.model_safety_settings = [
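
A hedged sketch of using the new `service_account_key` parameter end to end. The import path, the `model_name` keyword, and the project/file values are assumptions or placeholders; the key can alternatively be stored through the new `GOOGLE_SERVICE_ACCOUNT_KEY` setting (for example via the CLI's `--service-account-key` option shown earlier in this diff).

```python
import json

from deepeval.models import GeminiModel  # import path assumed

# Load a Google service-account JSON key and hand it to GeminiModel.
# Per this diff, the dict becomes service_account.Credentials and is passed
# to google-genai's Vertex AI Client (so Vertex AI mode, i.e.
# GOOGLE_GENAI_USE_VERTEXAI, is assumed to be enabled).
with open("service-account.json") as f:
    service_account_key = json.load(f)

model = GeminiModel(
    model_name="gemini-1.5-pro",   # default model named earlier in this file
    project="my-gcp-project",      # placeholder
    location="us-central1",        # placeholder
    service_account_key=service_account_key,
)
```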
|