deepeval 3.7.9__py3-none-any.whl → 3.8.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/annotation/annotation.py +2 -2
- deepeval/cli/main.py +168 -0
- deepeval/confident/api.py +2 -0
- deepeval/config/settings.py +13 -0
- deepeval/constants.py +1 -0
- deepeval/dataset/dataset.py +6 -4
- deepeval/integrations/langchain/callback.py +330 -158
- deepeval/integrations/langchain/utils.py +31 -8
- deepeval/key_handler.py +8 -1
- deepeval/metrics/contextual_recall/contextual_recall.py +25 -6
- deepeval/metrics/contextual_recall/schema.py +6 -0
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +35 -0
- deepeval/metrics/g_eval/g_eval.py +35 -1
- deepeval/metrics/g_eval/utils.py +65 -0
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +10 -1
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +10 -1
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +10 -1
- deepeval/metrics/utils.py +1 -1
- deepeval/models/__init__.py +2 -0
- deepeval/models/llms/__init__.py +2 -0
- deepeval/models/llms/amazon_bedrock_model.py +51 -6
- deepeval/models/llms/azure_model.py +33 -7
- deepeval/models/llms/constants.py +23 -0
- deepeval/models/llms/gemini_model.py +6 -1
- deepeval/models/llms/openai_model.py +5 -4
- deepeval/models/llms/openrouter_model.py +398 -0
- deepeval/models/retry_policy.py +3 -0
- deepeval/prompt/api.py +1 -0
- deepeval/prompt/prompt.py +7 -5
- deepeval/test_case/llm_test_case.py +1 -0
- deepeval/tracing/tracing.py +6 -1
- deepeval/tracing/types.py +1 -1
- {deepeval-3.7.9.dist-info → deepeval-3.8.1.dist-info}/METADATA +3 -3
- {deepeval-3.7.9.dist-info → deepeval-3.8.1.dist-info}/RECORD +38 -37
- {deepeval-3.7.9.dist-info → deepeval-3.8.1.dist-info}/LICENSE.md +0 -0
- {deepeval-3.7.9.dist-info → deepeval-3.8.1.dist-info}/WHEEL +0 -0
- {deepeval-3.7.9.dist-info → deepeval-3.8.1.dist-info}/entry_points.txt +0 -0
deepeval/integrations/langchain/utils.py
CHANGED

```diff
@@ -145,6 +145,7 @@ def enter_current_context(
     progress: Optional[Progress] = None,
     pbar_callback_id: Optional[int] = None,
     uuid_str: Optional[str] = None,
+    fallback_trace_uuid: Optional[str] = None,
 ) -> BaseSpan:
     start_time = perf_counter()
     observe_kwargs = observe_kwargs or {}
@@ -159,12 +160,27 @@ def enter_current_context(
     parent_uuid: Optional[str] = None
 
     if parent_span:
-        parent_uuid = parent_span.uuid
-        trace_uuid = parent_span.trace_uuid
-    else:
+        # Validate that the parent span's trace is still active
+        if parent_span.trace_uuid in trace_manager.active_traces:
+            parent_uuid = parent_span.uuid
+            trace_uuid = parent_span.trace_uuid
+        else:
+            # Parent span references a dead trace - treat as if no parent
+            parent_span = None
+
+    if not parent_span:
         current_trace = current_trace_context.get()
-        if current_trace:
+        # IMPORTANT: Verify trace is still active, not just in context
+        # (a previous failed async operation might leave a dead trace in context)
+        if current_trace and current_trace.uuid in trace_manager.active_traces:
             trace_uuid = current_trace.uuid
+        elif (
+            fallback_trace_uuid
+            and fallback_trace_uuid in trace_manager.active_traces
+        ):
+            # In async contexts, ContextVar may not propagate. Use the fallback trace_uuid
+            # provided by the CallbackHandler to avoid creating duplicate traces.
+            trace_uuid = fallback_trace_uuid
         else:
             trace = trace_manager.start_new_trace(
                 metric_collection=metric_collection
@@ -258,11 +274,13 @@ def exit_current_context(
 
     current_span = current_span_context.get()
 
+    # In async contexts (LangChain/LangGraph), context variables don't propagate
+    # reliably across task boundaries. Fall back to direct span lookup.
     if not current_span or current_span.uuid != uuid_str:
+        current_span = trace_manager.get_span_by_uuid(uuid_str)
+        if not current_span:
+            # Span already removed or never existed
+            return
 
     current_span.end_time = end_time
     if exc_type is not None:
@@ -295,7 +313,12 @@ def exit_current_context(
         else:
             current_span_context.set(None)
     else:
+        # Try context first, then fall back to direct trace lookup for async contexts
        current_trace = current_trace_context.get()
+        if not current_trace and current_span.trace_uuid:
+            current_trace = trace_manager.get_trace_by_uuid(
+                current_span.trace_uuid
+            )
        if current_span.status == TraceSpanStatus.ERRORED and current_trace:
            current_trace.status = TraceSpanStatus.ERRORED
        if current_trace and current_trace.uuid == current_span.trace_uuid:
```
deepeval/key_handler.py
CHANGED
```diff
@@ -162,6 +162,13 @@ class ModelKeyValues(Enum):
     VLLM_API_KEY = "VLLM_API_KEY"
     VLLM_MODEL_NAME = "VLLM_MODEL_NAME"
 
+    # OpenRouter
+    USE_OPENROUTER_MODEL = "USE_OPENROUTER_MODEL"
+    OPENROUTER_MODEL_NAME = "OPENROUTER_MODEL_NAME"
+    OPENROUTER_COST_PER_INPUT_TOKEN = "OPENROUTER_COST_PER_INPUT_TOKEN"
+    OPENROUTER_COST_PER_OUTPUT_TOKEN = "OPENROUTER_COST_PER_OUTPUT_TOKEN"
+    OPENROUTER_API_KEY = "OPENROUTER_API_KEY"
+
 
 class EmbeddingKeyValues(Enum):
     # Azure OpenAI
@@ -174,7 +181,7 @@ class EmbeddingKeyValues(Enum):
     USE_LOCAL_EMBEDDINGS = "USE_LOCAL_EMBEDDINGS"
     LOCAL_EMBEDDING_MODEL_NAME = "LOCAL_EMBEDDING_MODEL_NAME"
     LOCAL_EMBEDDING_BASE_URL = "LOCAL_EMBEDDING_BASE_URL"
-    LOCAL_EMBEDDING_API_KEY = "LOCAL_EMBEDDING_API_KEY"
+    LOCAL_EMBEDDING_API_KEY = ("LOCAL_EMBEDDING_API_KEY",)
 
 
 class KeyFileHandler:
```
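The new `ModelKeyValues` entries mirror the other providers: an on/off flag, a model name, per-token prices, and an API key. A hedged sketch of wiring them up through environment variables; the key names come from the diff, while the `OpenRouterModel` constructor arguments are assumptions (openrouter_model.py itself is not shown in this section):

```python
import os

# Key names added above; the values are placeholders.
os.environ["USE_OPENROUTER_MODEL"] = "YES"
os.environ["OPENROUTER_API_KEY"] = "sk-or-..."
os.environ["OPENROUTER_MODEL_NAME"] = "openai/gpt-4o-mini"
os.environ["OPENROUTER_COST_PER_INPUT_TOKEN"] = "0.00000015"
os.environ["OPENROUTER_COST_PER_OUTPUT_TOKEN"] = "0.0000006"

# Exported in 3.8.1 (see the models/__init__.py hunks later in this diff).
from deepeval.models import OpenRouterModel

# Hypothetical constructor call; the actual signature lives in openrouter_model.py.
model = OpenRouterModel(model="openai/gpt-4o-mini")
```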
deepeval/metrics/contextual_recall/contextual_recall.py
CHANGED

```diff
@@ -23,6 +23,7 @@ from deepeval.metrics.contextual_recall.schema import (
     ContextualRecallVerdict,
     Verdicts,
     ContextualRecallScoreReason,
+    VerdictWithExpectedOutput,
 )
 from deepeval.metrics.api import metric_data_manager
 
@@ -93,7 +94,7 @@ class ContextualRecallMetric(BaseMetric):
         expected_output = test_case.expected_output
         retrieval_context = test_case.retrieval_context
 
-        self.verdicts: List[ContextualRecallVerdict] = (
+        self.verdicts: List[VerdictWithExpectedOutput] = (
             self._generate_verdicts(
                 expected_output, retrieval_context, multimodal
            )
@@ -144,7 +145,7 @@ class ContextualRecallMetric(BaseMetric):
         expected_output = test_case.expected_output
         retrieval_context = test_case.retrieval_context
 
-        self.verdicts: List[ContextualRecallVerdict] = (
+        self.verdicts: List[VerdictWithExpectedOutput] = (
             await self._a_generate_verdicts(
                 expected_output, retrieval_context, multimodal
            )
@@ -241,13 +242,13 @@ class ContextualRecallMetric(BaseMetric):
         expected_output: str,
         retrieval_context: List[str],
         multimodal: bool,
-    ) -> List[ContextualRecallVerdict]:
+    ) -> List[VerdictWithExpectedOutput]:
         prompt = self.evaluation_template.generate_verdicts(
             expected_output=expected_output,
             retrieval_context=retrieval_context,
             multimodal=multimodal,
         )
-        return await a_generate_with_schema_and_extract(
+        verdicts = await a_generate_with_schema_and_extract(
             metric=self,
             prompt=prompt,
             schema_cls=Verdicts,
@@ -256,19 +257,28 @@ class ContextualRecallMetric(BaseMetric):
                 ContextualRecallVerdict(**item) for item in data["verdicts"]
             ],
         )
+        final_verdicts = []
+        for verdict in verdicts:
+            new_verdict = VerdictWithExpectedOutput(
+                verdict=verdict.verdict,
+                reason=verdict.reason,
+                expected_output=expected_output,
+            )
+            final_verdicts.append(new_verdict)
+        return final_verdicts
 
     def _generate_verdicts(
         self,
         expected_output: str,
         retrieval_context: List[str],
         multimodal: bool,
-    ) -> List[ContextualRecallVerdict]:
+    ) -> List[VerdictWithExpectedOutput]:
         prompt = self.evaluation_template.generate_verdicts(
             expected_output=expected_output,
             retrieval_context=retrieval_context,
             multimodal=multimodal,
         )
-        return generate_with_schema_and_extract(
+        verdicts = generate_with_schema_and_extract(
             metric=self,
             prompt=prompt,
             schema_cls=Verdicts,
@@ -277,6 +287,15 @@ class ContextualRecallMetric(BaseMetric):
                 ContextualRecallVerdict(**item) for item in data["verdicts"]
             ],
         )
+        final_verdicts = []
+        for verdict in verdicts:
+            new_verdict = VerdictWithExpectedOutput(
+                verdict=verdict.verdict,
+                reason=verdict.reason,
+                expected_output=expected_output,
+            )
+            final_verdicts.append(new_verdict)
+        return final_verdicts
 
     def is_successful(self) -> bool:
         if self.error is not None:
```
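`VerdictWithExpectedOutput` comes from `deepeval/metrics/contextual_recall/schema.py` (+6 lines, not shown here). A hypothetical reconstruction based only on the keyword arguments used above (`verdict`, `reason`, `expected_output`); the real field types may differ:

```python
from typing import Optional

from pydantic import BaseModel


class VerdictWithExpectedOutput(BaseModel):
    # Field names inferred from the constructor call in the hunks above.
    verdict: str
    reason: Optional[str] = None
    expected_output: str
```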
deepeval/metrics/conversational_g_eval/conversational_g_eval.py
CHANGED

```diff
@@ -2,6 +2,7 @@
 
 from openai.types.chat.chat_completion import ChatCompletion
 from typing import Optional, List, Tuple, Union, Dict, Type
+from rich.console import Console
 import math
 from deepeval.metrics import BaseConversationalMetric
 from deepeval.metrics.g_eval.utils import (
@@ -11,6 +12,8 @@ from deepeval.metrics.g_eval.utils import (
     format_rubrics,
     validate_and_sort_rubrics,
     validate_criteria_and_evaluation_steps,
+    CONVERSATIONAL_G_EVAL_API_PARAMS,
+    construct_geval_upload_payload,
 )
 from deepeval.test_case import (
     TurnParams,
@@ -33,6 +36,7 @@ from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.indicator import metric_progress_indicator
 import deepeval.metrics.conversational_g_eval.schema as cgschema
 from deepeval.metrics.api import metric_data_manager
+from deepeval.confident.api import Api, Endpoints, HttpMethods
 
 
 class ConversationalGEval(BaseConversationalMetric):
@@ -412,6 +416,37 @@ class ConversationalGEval(BaseConversationalMetric):
         self.success = False
         return self.success
 
+    def upload(self):
+        api = Api()
+
+        payload = construct_geval_upload_payload(
+            name=self.name,
+            evaluation_params=self.evaluation_params,
+            g_eval_api_params=CONVERSATIONAL_G_EVAL_API_PARAMS,
+            criteria=self.criteria,
+            evaluation_steps=self.evaluation_steps,
+            multi_turn=True,
+            rubric=self.rubric,
+        )
+
+        data, _ = api.send_request(
+            method=HttpMethods.POST,
+            endpoint=Endpoints.METRICS_ENDPOINT,
+            body=payload,
+        )
+
+        metric_id = data.get("id")
+        self.metric_id = metric_id
+        console = Console()
+
+        if metric_id:
+            console.print(
+                "[rgb(5,245,141)]✓[/rgb(5,245,141)] Metric uploaded successfully "
+                f"(id: [bold]{metric_id}[/bold])"
+            )
+
+        return data
+
     @property
     def __name__(self):
         if self._include_g_eval_suffix:
```
deepeval/metrics/g_eval/g_eval.py
CHANGED

```diff
@@ -1,7 +1,7 @@
 """LLM evaluated metric based on the GEval framework: https://arxiv.org/pdf/2303.16634.pdf"""
 
 import asyncio
-
+from rich.console import Console
 from typing import Optional, List, Tuple, Union, Type
 from deepeval.metrics import BaseMetric
 from deepeval.test_case import (
@@ -32,9 +32,12 @@ from deepeval.metrics.g_eval.utils import (
     validate_criteria_and_evaluation_steps,
     number_evaluation_steps,
     get_score_range,
+    construct_geval_upload_payload,
+    G_EVAL_API_PARAMS,
 )
 from deepeval.metrics.api import metric_data_manager
 from deepeval.config.settings import get_settings
+from deepeval.confident.api import Api, Endpoints, HttpMethods
 
 
 class GEval(BaseMetric):
@@ -408,6 +411,37 @@ class GEval(BaseMetric):
         self.success = False
         return self.success
 
+    def upload(self):
+        api = Api()
+
+        payload = construct_geval_upload_payload(
+            name=self.name,
+            evaluation_params=self.evaluation_params,
+            g_eval_api_params=G_EVAL_API_PARAMS,
+            criteria=self.criteria,
+            evaluation_steps=self.evaluation_steps,
+            multi_turn=False,
+            rubric=self.rubric,
+        )
+
+        data, _ = api.send_request(
+            method=HttpMethods.POST,
+            endpoint=Endpoints.METRICS_ENDPOINT,
+            body=payload,
+        )
+
+        metric_id = data.get("id")
+        self.metric_id = metric_id
+        console = Console()
+
+        if metric_id:
+            console.print(
+                "[rgb(5,245,141)]✓[/rgb(5,245,141)] Metric uploaded successfully "
+                f"(id: [bold]{metric_id}[/bold])"
+            )
+
+        return data
+
     @property
     def __name__(self):
         if self._include_g_eval_suffix:
```
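Both `GEval` and `ConversationalGEval` gain the same `upload()` method: build a payload, POST it to `Endpoints.METRICS_ENDPOINT`, store the returned id on `self.metric_id`, and print a confirmation. A hedged usage sketch (assumes a Confident AI API key is already configured; the criteria and params below are placeholders):

```python
from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCaseParams

correctness = GEval(
    name="Correctness",
    criteria="Is the actual output factually consistent with the expected output?",
    evaluation_params=[
        LLMTestCaseParams.ACTUAL_OUTPUT,
        LLMTestCaseParams.EXPECTED_OUTPUT,
    ],
)

data = correctness.upload()     # POSTs the constructed payload to the metrics endpoint
print(correctness.metric_id)    # id returned by the API, now stored on the metric
```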
deepeval/metrics/g_eval/utils.py
CHANGED
```diff
@@ -52,6 +52,71 @@ CONVERSATIONAL_G_EVAL_PARAMS = {
     TurnParams.SCENARIO: "Scenario",
 }
 
+G_EVAL_API_PARAMS = {
+    LLMTestCaseParams.INPUT: "input",
+    LLMTestCaseParams.ACTUAL_OUTPUT: "actualOutput",
+    LLMTestCaseParams.EXPECTED_OUTPUT: "expectedOutput",
+    LLMTestCaseParams.CONTEXT: "context",
+    LLMTestCaseParams.RETRIEVAL_CONTEXT: "retrievalContext",
+    LLMTestCaseParams.EXPECTED_TOOLS: "expectedTools",
+    LLMTestCaseParams.TOOLS_CALLED: "toolsCalled",
+}
+
+CONVERSATIONAL_G_EVAL_API_PARAMS = {
+    TurnParams.ROLE: "role",
+    TurnParams.CONTENT: "content",
+    TurnParams.SCENARIO: "scenario",
+    TurnParams.EXPECTED_OUTCOME: "expectedOutcome",
+    TurnParams.RETRIEVAL_CONTEXT: "retrievalContext",
+    TurnParams.TOOLS_CALLED: "toolsCalled",
+}
+
+
+def construct_geval_upload_payload(
+    name: str,
+    evaluation_params: List[LLMTestCaseParams],
+    g_eval_api_params: Dict,
+    criteria: Optional[str] = None,
+    evaluation_steps: Optional[List[str]] = None,
+    multi_turn: bool = False,
+    rubric: Optional[List[Rubric]] = None,
+) -> Dict:
+    if not evaluation_params:
+        raise ValueError("GEval requires at least one evaluation parameter.")
+
+    unsupported_params = [
+        param for param in evaluation_params if param not in g_eval_api_params
+    ]
+    if unsupported_params:
+        raise ValueError(
+            "Unsupported evaluation params for GEval upload: "
+            + ", ".join(param.name for param in unsupported_params)
+        )
+
+    payload = {
+        "name": name,
+        "evaluationParams": [
+            g_eval_api_params[param] for param in evaluation_params
+        ],
+        "multiTurn": multi_turn,
+    }
+
+    if criteria is not None:
+        payload["criteria"] = criteria
+    else:
+        payload["evaluationSteps"] = evaluation_steps
+
+    if rubric is not None:
+        payload["rubric"] = [
+            {
+                "scoreRange": list(r.score_range),
+                "expectedOutcome": r.expected_outcome,
+            }
+            for r in rubric
+        ]
+
+    return payload
+
 
 def validate_criteria_and_evaluation_steps(
     criteria: Optional[str] = None,
```
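For reference, this is roughly what the new helper returns for a simple single-turn metric; the example values are illustrative, but the shape follows directly from the function above:

```python
from deepeval.metrics.g_eval.utils import (
    G_EVAL_API_PARAMS,
    construct_geval_upload_payload,
)
from deepeval.test_case import LLMTestCaseParams

payload = construct_geval_upload_payload(
    name="Correctness",
    evaluation_params=[
        LLMTestCaseParams.INPUT,
        LLMTestCaseParams.ACTUAL_OUTPUT,
    ],
    g_eval_api_params=G_EVAL_API_PARAMS,
    criteria="Is the actual output correct for the given input?",
)
assert payload == {
    "name": "Correctness",
    "evaluationParams": ["input", "actualOutput"],
    "multiTurn": False,
    "criteria": "Is the actual output correct for the given input?",
}
```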
deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py
CHANGED

```diff
@@ -85,7 +85,12 @@ class ImageCoherenceMetric(BaseMetric):
         self.contexts_below = []
         self.scores = []
         self.reasons = []
-        for image_index in self.get_image_indices(actual_output):
+        image_indices = self.get_image_indices(actual_output)
+        if not image_indices:
+            raise ValueError(
+                f"The test case must have atleast one image in the `actual_output` to calculate {self.__name__} score"
+            )
+        for image_index in image_indices:
             context_above, context_below = self.get_image_context(
                 image_index, actual_output
             )
@@ -188,6 +193,10 @@ class ImageCoherenceMetric(BaseMetric):
 
         tasks = []
         image_indices = self.get_image_indices(actual_output)
+        if not image_indices:
+            raise ValueError(
+                f"The test case must have atleast one image in the `actual_output` to calculate {self.__name__} score"
+            )
         for image_index in image_indices:
             context_above, context_below = self.get_image_context(
                 image_index, actual_output
```
deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py
CHANGED

```diff
@@ -86,7 +86,12 @@ class ImageHelpfulnessMetric(BaseMetric):
         self.contexts_below = []
         self.scores = []
         self.reasons = []
-        for image_index in self.get_image_indices(actual_output):
+        image_indices = self.get_image_indices(actual_output)
+        if not image_indices:
+            raise ValueError(
+                f"The test case must have atleast one image in the `actual_output` to calculate {self.__name__} score"
+            )
+        for image_index in image_indices:
             context_above, context_below = self.get_image_context(
                 image_index, actual_output
             )
@@ -189,6 +194,10 @@ class ImageHelpfulnessMetric(BaseMetric):
 
         tasks = []
         image_indices = self.get_image_indices(actual_output)
+        if not image_indices:
+            raise ValueError(
+                f"The test case must have atleast one image in the `actual_output` to calculate {self.__name__} score"
+            )
         for image_index in image_indices:
             context_above, context_below = self.get_image_context(
                 image_index, actual_output
```
deepeval/metrics/multimodal_metrics/image_reference/image_reference.py
CHANGED

```diff
@@ -86,7 +86,12 @@ class ImageReferenceMetric(BaseMetric):
         self.contexts_below = []
         self.scores = []
         self.reasons = []
-        for image_index in self.get_image_indices(actual_output):
+        image_indices = self.get_image_indices(actual_output)
+        if not image_indices:
+            raise ValueError(
+                f"The test case must have atleast one image in the `actual_output` to calculate {self.__name__} score"
+            )
+        for image_index in image_indices:
             context_above, context_below = self.get_image_context(
                 image_index, actual_output
             )
@@ -189,6 +194,10 @@ class ImageReferenceMetric(BaseMetric):
 
         tasks = []
         image_indices = self.get_image_indices(actual_output)
+        if not image_indices:
+            raise ValueError(
+                f"The test case must have atleast one image in the `actual_output` to calculate {self.__name__} score"
+            )
         for image_index in image_indices:
             context_above, context_below = self.get_image_context(
                 image_index, actual_output
```
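All three image metrics (coherence, helpfulness, reference) now fail fast with the same `ValueError` when `actual_output` contains no images, in both the sync and async paths, instead of silently iterating over an empty list. A self-contained sketch of the guard pattern (the helper names below are illustrative, not the metric classes' real internals):

```python
from typing import List


def get_image_indices(actual_output: List[object]) -> List[int]:
    # Stand-in for Metric.get_image_indices: positions of non-text elements.
    return [i for i, ele in enumerate(actual_output) if not isinstance(ele, str)]


def score_images(actual_output: List[object]) -> None:
    image_indices = get_image_indices(actual_output)
    if not image_indices:
        raise ValueError(
            "The test case must have at least one image in the `actual_output` "
            "to calculate the metric score"
        )
    for image_index in image_indices:
        ...  # evaluate the text context above/below each image, as in the hunks above


score_images(["caption text", object()])  # ok: one image-like element
# score_images(["text only"])             # raises ValueError
```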
deepeval/metrics/utils.py
CHANGED
```diff
@@ -312,7 +312,7 @@ def check_llm_test_case_params(
             if isinstance(ele, MLLMImage):
                 count += 1
         if count != actual_output_image_count:
-            error_str = f"
+            error_str = f"Can only evaluate test cases with '{actual_output_image_count}' output images using the '{metric.__name__}' metric. `{count}` found."
             raise ValueError(error_str)
 
     if isinstance(test_case, LLMTestCase) is False:
```
deepeval/models/__init__.py
CHANGED
```diff
@@ -16,6 +16,7 @@ from deepeval.models.llms import (
     GrokModel,
     DeepSeekModel,
     PortkeyModel,
+    OpenRouterModel,
 )
 from deepeval.models.embedding_models import (
     OpenAIEmbeddingModel,
@@ -44,4 +45,5 @@ __all__ = [
     "LocalEmbeddingModel",
     "OllamaEmbeddingModel",
     "PortkeyModel",
+    "OpenRouterModel",
 ]
```
deepeval/models/llms/__init__.py
CHANGED
```diff
@@ -10,6 +10,7 @@ from .kimi_model import KimiModel
 from .grok_model import GrokModel
 from .deepseek_model import DeepSeekModel
 from .portkey_model import PortkeyModel
+from .openrouter_model import OpenRouterModel
 
 __all__ = [
     "AzureOpenAIModel",
@@ -24,4 +25,5 @@ __all__ = [
     "GrokModel",
     "DeepSeekModel",
     "PortkeyModel",
+    "OpenRouterModel",
 ]
```
deepeval/models/llms/amazon_bedrock_model.py
CHANGED

```diff
@@ -14,6 +14,7 @@ from deepeval.models.retry_policy import (
     sdk_retries_for,
 )
 from deepeval.test_case import MLLMImage
+from deepeval.errors import DeepEvalError
 from deepeval.utils import check_if_multimodal, convert_to_multi_modal_array
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.models.llms.constants import BEDROCK_MODELS_DATA
@@ -155,27 +156,28 @@ class AmazonBedrockModel(DeepEvalBaseLLM):
 
     def generate(
         self, prompt: str, schema: Optional[BaseModel] = None
-    ) -> Tuple[Union[str, BaseModel], float]:
+    ) -> Tuple[Union[str, BaseModel], Optional[float]]:
         return safe_asyncio_run(self.a_generate(prompt, schema))
 
     @retry_bedrock
     async def a_generate(
         self, prompt: str, schema: Optional[BaseModel] = None
-    ) -> Tuple[Union[str, BaseModel], float]:
+    ) -> Tuple[Union[str, BaseModel], Optional[float]]:
         if check_if_multimodal(prompt):
             prompt = convert_to_multi_modal_array(input=prompt)
             payload = self.generate_payload(prompt)
         else:
             payload = self.get_converse_request_body(prompt)
 
-        payload = self.get_converse_request_body(prompt)
         client = await self._ensure_client()
         response = await client.converse(
             modelId=self.get_model_name(),
             messages=payload["messages"],
             inferenceConfig=payload["inferenceConfig"],
         )
+
+        message = self._extract_text_from_converse_response(response)
+
         cost = self.calculate_cost(
             response["usage"]["inputTokens"],
             response["usage"]["outputTokens"],
@@ -206,7 +208,7 @@ class AmazonBedrockModel(DeepEvalBaseLLM):
             try:
                 image_raw_bytes = base64.b64decode(element.dataBase64)
             except Exception:
-                raise
+                raise DeepEvalError(
                     f"Invalid base64 data in MLLMImage: {element._id}"
                 )
 
@@ -294,6 +296,46 @@ class AmazonBedrockModel(DeepEvalBaseLLM):
     # Helpers
     ###############################################
 
+    @staticmethod
+    def _extract_text_from_converse_response(response: dict) -> str:
+        try:
+            content = response["output"]["message"]["content"]
+        except Exception as e:
+            raise DeepEvalError(
+                "Missing output.message.content in Bedrock response"
+            ) from e
+
+        # Collect any text blocks (ignore reasoning/tool blocks)
+        text_parts = []
+        for block in content:
+            if isinstance(block, dict) and "text" in block:
+                v = block.get("text")
+                if isinstance(v, str) and v.strip():
+                    text_parts.append(v)
+
+        if text_parts:
+            # join in case there are multiple text blocks
+            return "\n".join(text_parts)
+
+        # No text blocks present; raise an actionable error
+        keys = []
+        for b in content:
+            if isinstance(b, dict):
+                keys.append(list(b.keys()))
+            else:
+                keys.append(type(b).__name__)
+
+        stop_reason = (
+            response.get("stopReason")
+            or response.get("output", {}).get("stopReason")
+            or response.get("output", {}).get("message", {}).get("stopReason")
+        )
+
+        raise DeepEvalError(
+            f"Bedrock response contained no text content blocks. "
+            f"content keys={keys}, stopReason={stop_reason}"
+        )
+
     def get_converse_request_body(self, prompt: str) -> dict:
 
         return {
```
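The new `_extract_text_from_converse_response` helper centralizes pulling assistant text out of a Bedrock Converse response. An illustrative response dict shaped like the fields the code above reads (`output.message.content`, `usage`, `stopReason`):

```python
sample_response = {
    "output": {
        "message": {
            "content": [
                {"reasoningContent": {}},       # no top-level "text" key -> skipped
                {"text": "The answer is 42."},  # kept
            ]
        }
    },
    "stopReason": "end_turn",
    "usage": {"inputTokens": 12, "outputTokens": 7},
}

# AmazonBedrockModel._extract_text_from_converse_response(sample_response)
# returns "The answer is 42.". Multiple text blocks would be joined with "\n";
# if no text block exists at all, it raises DeepEvalError listing the block keys
# and the stopReason, instead of failing later with an opaque KeyError.
```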
```diff
@@ -303,11 +345,14 @@ class AmazonBedrockModel(DeepEvalBaseLLM):
             },
         }
 
-    def calculate_cost(
+    def calculate_cost(
+        self, input_tokens: int, output_tokens: int
+    ) -> Optional[float]:
         if self.model_data.input_price and self.model_data.output_price:
             input_cost = input_tokens * self.model_data.input_price
             output_cost = output_tokens * self.model_data.output_price
             return input_cost + output_cost
+        return None
 
     def load_model(self):
         pass
```