deepeval 3.7.9__py3-none-any.whl → 3.8.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. deepeval/_version.py +1 -1
  2. deepeval/annotation/annotation.py +2 -2
  3. deepeval/cli/main.py +168 -0
  4. deepeval/confident/api.py +2 -0
  5. deepeval/config/settings.py +13 -0
  6. deepeval/constants.py +1 -0
  7. deepeval/dataset/dataset.py +6 -4
  8. deepeval/integrations/langchain/callback.py +330 -158
  9. deepeval/integrations/langchain/utils.py +31 -8
  10. deepeval/key_handler.py +8 -1
  11. deepeval/metrics/contextual_recall/contextual_recall.py +25 -6
  12. deepeval/metrics/contextual_recall/schema.py +6 -0
  13. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +35 -0
  14. deepeval/metrics/g_eval/g_eval.py +35 -1
  15. deepeval/metrics/g_eval/utils.py +65 -0
  16. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +10 -1
  17. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +10 -1
  18. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +10 -1
  19. deepeval/metrics/utils.py +1 -1
  20. deepeval/models/__init__.py +2 -0
  21. deepeval/models/llms/__init__.py +2 -0
  22. deepeval/models/llms/amazon_bedrock_model.py +51 -6
  23. deepeval/models/llms/azure_model.py +33 -7
  24. deepeval/models/llms/constants.py +23 -0
  25. deepeval/models/llms/gemini_model.py +6 -1
  26. deepeval/models/llms/openai_model.py +5 -4
  27. deepeval/models/llms/openrouter_model.py +398 -0
  28. deepeval/models/retry_policy.py +3 -0
  29. deepeval/prompt/api.py +1 -0
  30. deepeval/prompt/prompt.py +7 -5
  31. deepeval/test_case/llm_test_case.py +1 -0
  32. deepeval/tracing/tracing.py +6 -1
  33. deepeval/tracing/types.py +1 -1
  34. {deepeval-3.7.9.dist-info → deepeval-3.8.1.dist-info}/METADATA +3 -3
  35. {deepeval-3.7.9.dist-info → deepeval-3.8.1.dist-info}/RECORD +38 -37
  36. {deepeval-3.7.9.dist-info → deepeval-3.8.1.dist-info}/LICENSE.md +0 -0
  37. {deepeval-3.7.9.dist-info → deepeval-3.8.1.dist-info}/WHEEL +0 -0
  38. {deepeval-3.7.9.dist-info → deepeval-3.8.1.dist-info}/entry_points.txt +0 -0
deepeval/integrations/langchain/utils.py CHANGED
@@ -145,6 +145,7 @@ def enter_current_context(
     progress: Optional[Progress] = None,
     pbar_callback_id: Optional[int] = None,
     uuid_str: Optional[str] = None,
+    fallback_trace_uuid: Optional[str] = None,
 ) -> BaseSpan:
     start_time = perf_counter()
     observe_kwargs = observe_kwargs or {}
@@ -159,12 +160,27 @@ def enter_current_context(
     parent_uuid: Optional[str] = None

     if parent_span:
-        parent_uuid = parent_span.uuid
-        trace_uuid = parent_span.trace_uuid
-    else:
+        # Validate that the parent span's trace is still active
+        if parent_span.trace_uuid in trace_manager.active_traces:
+            parent_uuid = parent_span.uuid
+            trace_uuid = parent_span.trace_uuid
+        else:
+            # Parent span references a dead trace - treat as if no parent
+            parent_span = None
+
+    if not parent_span:
         current_trace = current_trace_context.get()
-        if current_trace:
+        # IMPORTANT: Verify trace is still active, not just in context
+        # (a previous failed async operation might leave a dead trace in context)
+        if current_trace and current_trace.uuid in trace_manager.active_traces:
             trace_uuid = current_trace.uuid
+        elif (
+            fallback_trace_uuid
+            and fallback_trace_uuid in trace_manager.active_traces
+        ):
+            # In async contexts, ContextVar may not propagate. Use the fallback trace_uuid
+            # provided by the CallbackHandler to avoid creating duplicate traces.
+            trace_uuid = fallback_trace_uuid
         else:
             trace = trace_manager.start_new_trace(
                 metric_collection=metric_collection
@@ -258,11 +274,13 @@ def exit_current_context(

     current_span = current_span_context.get()

+    # In async contexts (LangChain/LangGraph), context variables don't propagate
+    # reliably across task boundaries. Fall back to direct span lookup.
     if not current_span or current_span.uuid != uuid_str:
-        print(
-            f"Error: Current span in context does not match the span being exited. Expected UUID: {uuid_str}, Got: {current_span.uuid if current_span else 'None'}"
-        )
-        return
+        current_span = trace_manager.get_span_by_uuid(uuid_str)
+        if not current_span:
+            # Span already removed or never existed
+            return

     current_span.end_time = end_time
     if exc_type is not None:
@@ -295,7 +313,12 @@ def exit_current_context(
         else:
             current_span_context.set(None)
     else:
+        # Try context first, then fall back to direct trace lookup for async contexts
         current_trace = current_trace_context.get()
+        if not current_trace and current_span.trace_uuid:
+            current_trace = trace_manager.get_trace_by_uuid(
+                current_span.trace_uuid
+            )
         if current_span.status == TraceSpanStatus.ERRORED and current_trace:
             current_trace.status = TraceSpanStatus.ERRORED
         if current_trace and current_trace.uuid == current_span.trace_uuid:
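The fallback lookups above exist because a ContextVar set inside one asyncio task is invisible to sibling tasks, which is how LangChain/LangGraph callbacks are often scheduled. A minimal, standalone sketch of that failure mode and the registry-based fallback (plain Python with hypothetical names; this is not deepeval code):

import asyncio
import contextvars

current_span = contextvars.ContextVar("current_span", default=None)
span_registry = {}  # stands in for trace_manager's uuid -> span lookup


async def open_span(uuid: str):
    current_span.set(uuid)  # only visible in this task's copy of the context
    span_registry[uuid] = {"uuid": uuid, "status": "open"}


async def close_span(uuid: str):
    span_uuid = current_span.get()  # None here: the set() happened in a sibling task
    if span_uuid != uuid:
        # Context lookup failed; fall back to a direct lookup by uuid,
        # mirroring the get_span_by_uuid fallback in the diff above.
        span = span_registry.get(uuid)
        if span is None:
            return  # span already removed or never existed
    else:
        span = span_registry[span_uuid]
    span["status"] = "closed"


async def main():
    await asyncio.create_task(open_span("abc"))   # set inside task A
    await asyncio.create_task(close_span("abc"))  # read inside task B
    print(span_registry["abc"])  # {'uuid': 'abc', 'status': 'closed'}


asyncio.run(main())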
deepeval/key_handler.py CHANGED
@@ -162,6 +162,13 @@ class ModelKeyValues(Enum):
     VLLM_API_KEY = "VLLM_API_KEY"
     VLLM_MODEL_NAME = "VLLM_MODEL_NAME"

+    # OpenRouter
+    USE_OPENROUTER_MODEL = "USE_OPENROUTER_MODEL"
+    OPENROUTER_MODEL_NAME = "OPENROUTER_MODEL_NAME"
+    OPENROUTER_COST_PER_INPUT_TOKEN = "OPENROUTER_COST_PER_INPUT_TOKEN"
+    OPENROUTER_COST_PER_OUTPUT_TOKEN = "OPENROUTER_COST_PER_OUTPUT_TOKEN"
+    OPENROUTER_API_KEY = "OPENROUTER_API_KEY"
+

 class EmbeddingKeyValues(Enum):
     # Azure OpenAI
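These new ModelKeyValues entries suggest OpenRouter is configured like the other providers. A hedged sketch only: the key names below come from the enum above, but the assumption that plain environment variables are honored (rather than, or in addition to, deepeval's key store and CLI) is not shown in this diff, and all values are illustrative.

import os

# Key names from ModelKeyValues; values are placeholders.
os.environ["USE_OPENROUTER_MODEL"] = "YES"
os.environ["OPENROUTER_MODEL_NAME"] = "openai/gpt-4o-mini"
os.environ["OPENROUTER_API_KEY"] = "sk-or-..."
os.environ["OPENROUTER_COST_PER_INPUT_TOKEN"] = "0.00000015"
os.environ["OPENROUTER_COST_PER_OUTPUT_TOKEN"] = "0.0000006"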
@@ -174,7 +181,7 @@ class EmbeddingKeyValues(Enum):
     USE_LOCAL_EMBEDDINGS = "USE_LOCAL_EMBEDDINGS"
     LOCAL_EMBEDDING_MODEL_NAME = "LOCAL_EMBEDDING_MODEL_NAME"
     LOCAL_EMBEDDING_BASE_URL = "LOCAL_EMBEDDING_BASE_URL"
-    LOCAL_EMBEDDING_API_KEY = "LOCAL_EMBEDDING_API_KEY"
+    LOCAL_EMBEDDING_API_KEY = ("LOCAL_EMBEDDING_API_KEY",)


 class KeyFileHandler:
deepeval/metrics/contextual_recall/contextual_recall.py CHANGED
@@ -23,6 +23,7 @@ from deepeval.metrics.contextual_recall.schema import (
     ContextualRecallVerdict,
     Verdicts,
     ContextualRecallScoreReason,
+    VerdictWithExpectedOutput,
 )
 from deepeval.metrics.api import metric_data_manager

@@ -93,7 +94,7 @@ class ContextualRecallMetric(BaseMetric):
         expected_output = test_case.expected_output
         retrieval_context = test_case.retrieval_context

-        self.verdicts: List[ContextualRecallVerdict] = (
+        self.verdicts: List[VerdictWithExpectedOutput] = (
             self._generate_verdicts(
                 expected_output, retrieval_context, multimodal
             )
@@ -144,7 +145,7 @@ class ContextualRecallMetric(BaseMetric):
         expected_output = test_case.expected_output
         retrieval_context = test_case.retrieval_context

-        self.verdicts: List[ContextualRecallVerdict] = (
+        self.verdicts: List[VerdictWithExpectedOutput] = (
             await self._a_generate_verdicts(
                 expected_output, retrieval_context, multimodal
             )
@@ -241,13 +242,13 @@ class ContextualRecallMetric(BaseMetric):
         expected_output: str,
         retrieval_context: List[str],
         multimodal: bool,
-    ) -> List[ContextualRecallVerdict]:
+    ) -> List[VerdictWithExpectedOutput]:
         prompt = self.evaluation_template.generate_verdicts(
             expected_output=expected_output,
             retrieval_context=retrieval_context,
             multimodal=multimodal,
         )
-        return await a_generate_with_schema_and_extract(
+        verdicts = await a_generate_with_schema_and_extract(
             metric=self,
             prompt=prompt,
             schema_cls=Verdicts,
@@ -256,19 +257,28 @@ class ContextualRecallMetric(BaseMetric):
                 ContextualRecallVerdict(**item) for item in data["verdicts"]
             ],
         )
+        final_verdicts = []
+        for verdict in verdicts:
+            new_verdict = VerdictWithExpectedOutput(
+                verdict=verdict.verdict,
+                reason=verdict.reason,
+                expected_output=expected_output,
+            )
+            final_verdicts.append(new_verdict)
+        return final_verdicts

     def _generate_verdicts(
         self,
         expected_output: str,
         retrieval_context: List[str],
         multimodal: bool,
-    ) -> List[ContextualRecallVerdict]:
+    ) -> List[VerdictWithExpectedOutput]:
         prompt = self.evaluation_template.generate_verdicts(
             expected_output=expected_output,
             retrieval_context=retrieval_context,
             multimodal=multimodal,
         )
-        return generate_with_schema_and_extract(
+        verdicts = generate_with_schema_and_extract(
             metric=self,
             prompt=prompt,
             schema_cls=Verdicts,
@@ -277,6 +287,15 @@ class ContextualRecallMetric(BaseMetric):
                 ContextualRecallVerdict(**item) for item in data["verdicts"]
             ],
         )
+        final_verdicts = []
+        for verdict in verdicts:
+            new_verdict = VerdictWithExpectedOutput(
+                verdict=verdict.verdict,
+                reason=verdict.reason,
+                expected_output=expected_output,
+            )
+            final_verdicts.append(new_verdict)
+        return final_verdicts

     def is_successful(self) -> bool:
         if self.error is not None:
deepeval/metrics/contextual_recall/schema.py CHANGED
@@ -7,6 +7,12 @@ class ContextualRecallVerdict(BaseModel):
     reason: str


+class VerdictWithExpectedOutput(BaseModel):
+    verdict: str
+    reason: str
+    expected_output: str
+
+
 class Verdicts(BaseModel):
     verdicts: List[ContextualRecallVerdict]

deepeval/metrics/conversational_g_eval/conversational_g_eval.py CHANGED
@@ -2,6 +2,7 @@

 from openai.types.chat.chat_completion import ChatCompletion
 from typing import Optional, List, Tuple, Union, Dict, Type
+from rich.console import Console
 import math
 from deepeval.metrics import BaseConversationalMetric
 from deepeval.metrics.g_eval.utils import (
@@ -11,6 +12,8 @@ from deepeval.metrics.g_eval.utils import (
     format_rubrics,
     validate_and_sort_rubrics,
     validate_criteria_and_evaluation_steps,
+    CONVERSATIONAL_G_EVAL_API_PARAMS,
+    construct_geval_upload_payload,
 )
 from deepeval.test_case import (
     TurnParams,
@@ -33,6 +36,7 @@ from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.indicator import metric_progress_indicator
 import deepeval.metrics.conversational_g_eval.schema as cgschema
 from deepeval.metrics.api import metric_data_manager
+from deepeval.confident.api import Api, Endpoints, HttpMethods


 class ConversationalGEval(BaseConversationalMetric):
@@ -412,6 +416,37 @@ class ConversationalGEval(BaseConversationalMetric):
             self.success = False
             return self.success

+    def upload(self):
+        api = Api()
+
+        payload = construct_geval_upload_payload(
+            name=self.name,
+            evaluation_params=self.evaluation_params,
+            g_eval_api_params=CONVERSATIONAL_G_EVAL_API_PARAMS,
+            criteria=self.criteria,
+            evaluation_steps=self.evaluation_steps,
+            multi_turn=True,
+            rubric=self.rubric,
+        )
+
+        data, _ = api.send_request(
+            method=HttpMethods.POST,
+            endpoint=Endpoints.METRICS_ENDPOINT,
+            body=payload,
+        )
+
+        metric_id = data.get("id")
+        self.metric_id = metric_id
+        console = Console()
+
+        if metric_id:
+            console.print(
+                "[rgb(5,245,141)]✓[/rgb(5,245,141)] Metric uploaded successfully "
+                f"(id: [bold]{metric_id}[/bold])"
+            )
+
+        return data
+
     @property
     def __name__(self):
         if self._include_g_eval_suffix:
deepeval/metrics/g_eval/g_eval.py CHANGED
@@ -1,7 +1,7 @@
 """LLM evaluated metric based on the GEval framework: https://arxiv.org/pdf/2303.16634.pdf"""

 import asyncio
-
+from rich.console import Console
 from typing import Optional, List, Tuple, Union, Type
 from deepeval.metrics import BaseMetric
 from deepeval.test_case import (
@@ -32,9 +32,12 @@ from deepeval.metrics.g_eval.utils import (
     validate_criteria_and_evaluation_steps,
     number_evaluation_steps,
     get_score_range,
+    construct_geval_upload_payload,
+    G_EVAL_API_PARAMS,
 )
 from deepeval.metrics.api import metric_data_manager
 from deepeval.config.settings import get_settings
+from deepeval.confident.api import Api, Endpoints, HttpMethods


 class GEval(BaseMetric):
@@ -408,6 +411,37 @@ class GEval(BaseMetric):
             self.success = False
             return self.success

+    def upload(self):
+        api = Api()
+
+        payload = construct_geval_upload_payload(
+            name=self.name,
+            evaluation_params=self.evaluation_params,
+            g_eval_api_params=G_EVAL_API_PARAMS,
+            criteria=self.criteria,
+            evaluation_steps=self.evaluation_steps,
+            multi_turn=False,
+            rubric=self.rubric,
+        )
+
+        data, _ = api.send_request(
+            method=HttpMethods.POST,
+            endpoint=Endpoints.METRICS_ENDPOINT,
+            body=payload,
+        )
+
+        metric_id = data.get("id")
+        self.metric_id = metric_id
+        console = Console()
+
+        if metric_id:
+            console.print(
+                "[rgb(5,245,141)]✓[/rgb(5,245,141)] Metric uploaded successfully "
+                f"(id: [bold]{metric_id}[/bold])"
+            )
+
+        return data
+
     @property
     def __name__(self):
         if self._include_g_eval_suffix:
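Both GEval and ConversationalGEval gain an upload() method in 3.8.1 that posts the metric definition to Confident AI's metrics endpoint and stores the returned id on self.metric_id. A hedged usage sketch (assumes a logged-in Confident AI API key; the constructor arguments follow GEval's public API, and upload() is the new method shown above):

from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCaseParams

correctness = GEval(
    name="Correctness",
    criteria="Determine whether the actual output is factually correct "
    "based on the expected output.",
    evaluation_params=[
        LLMTestCaseParams.INPUT,
        LLMTestCaseParams.ACTUAL_OUTPUT,
        LLMTestCaseParams.EXPECTED_OUTPUT,
    ],
)

# New in 3.8.1: sends the metric definition to METRICS_ENDPOINT, prints the
# returned id, and sets correctness.metric_id as a side effect.
data = correctness.upload()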
deepeval/metrics/g_eval/utils.py CHANGED
@@ -52,6 +52,71 @@ CONVERSATIONAL_G_EVAL_PARAMS = {
     TurnParams.SCENARIO: "Scenario",
 }

+G_EVAL_API_PARAMS = {
+    LLMTestCaseParams.INPUT: "input",
+    LLMTestCaseParams.ACTUAL_OUTPUT: "actualOutput",
+    LLMTestCaseParams.EXPECTED_OUTPUT: "expectedOutput",
+    LLMTestCaseParams.CONTEXT: "context",
+    LLMTestCaseParams.RETRIEVAL_CONTEXT: "retrievalContext",
+    LLMTestCaseParams.EXPECTED_TOOLS: "expectedTools",
+    LLMTestCaseParams.TOOLS_CALLED: "toolsCalled",
+}
+
+CONVERSATIONAL_G_EVAL_API_PARAMS = {
+    TurnParams.ROLE: "role",
+    TurnParams.CONTENT: "content",
+    TurnParams.SCENARIO: "scenario",
+    TurnParams.EXPECTED_OUTCOME: "expectedOutcome",
+    TurnParams.RETRIEVAL_CONTEXT: "retrievalContext",
+    TurnParams.TOOLS_CALLED: "toolsCalled",
+}
+
+
+def construct_geval_upload_payload(
+    name: str,
+    evaluation_params: List[LLMTestCaseParams],
+    g_eval_api_params: Dict,
+    criteria: Optional[str] = None,
+    evaluation_steps: Optional[List[str]] = None,
+    multi_turn: bool = False,
+    rubric: Optional[List[Rubric]] = None,
+) -> Dict:
+    if not evaluation_params:
+        raise ValueError("GEval requires at least one evaluation parameter.")
+
+    unsupported_params = [
+        param for param in evaluation_params if param not in g_eval_api_params
+    ]
+    if unsupported_params:
+        raise ValueError(
+            "Unsupported evaluation params for GEval upload: "
+            + ", ".join(param.name for param in unsupported_params)
+        )
+
+    payload = {
+        "name": name,
+        "evaluationParams": [
+            g_eval_api_params[param] for param in evaluation_params
+        ],
+        "multiTurn": multi_turn,
+    }
+
+    if criteria is not None:
+        payload["criteria"] = criteria
+    else:
+        payload["evaluationSteps"] = evaluation_steps
+
+    if rubric is not None:
+        payload["rubric"] = [
+            {
+                "scoreRange": list(r.score_range),
+                "expectedOutcome": r.expected_outcome,
+            }
+            for r in rubric
+        ]
+
+    return payload
+

 def validate_criteria_and_evaluation_steps(
     criteria: Optional[str] = None,
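For reference, here is the shape of the payload construct_geval_upload_payload produces for a single-turn metric with criteria and a rubric. This is a worked example derived from the function above; the Rubric attribute names (score_range, expected_outcome) are taken from how the function reads them, and all values are illustrative.

# Illustrative output of construct_geval_upload_payload (values made up).
payload = {
    "name": "Correctness",
    "evaluationParams": ["input", "actualOutput", "expectedOutput"],
    "multiTurn": False,
    "criteria": "Is the actual output factually consistent with the expected output?",
    "rubric": [
        {"scoreRange": [0, 5], "expectedOutcome": "Mostly incorrect or contradictory."},
        {"scoreRange": [6, 10], "expectedOutcome": "Factually consistent with minor omissions."},
    ],
}
# Note: because criteria is set, "evaluationSteps" is omitted, per the else branch above.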
deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py CHANGED
@@ -85,7 +85,12 @@ class ImageCoherenceMetric(BaseMetric):
         self.contexts_below = []
         self.scores = []
         self.reasons = []
-        for image_index in self.get_image_indices(actual_output):
+        image_indices = self.get_image_indices(actual_output)
+        if not image_indices:
+            raise ValueError(
+                f"The test case must have atleast one image in the `actual_output` to calculate {self.__name__} score"
+            )
+        for image_index in image_indices:
             context_above, context_below = self.get_image_context(
                 image_index, actual_output
             )
@@ -188,6 +193,10 @@ class ImageCoherenceMetric(BaseMetric):

         tasks = []
         image_indices = self.get_image_indices(actual_output)
+        if not image_indices:
+            raise ValueError(
+                f"The test case must have atleast one image in the `actual_output` to calculate {self.__name__} score"
+            )
         for image_index in image_indices:
             context_above, context_below = self.get_image_context(
                 image_index, actual_output
deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py CHANGED
@@ -86,7 +86,12 @@ class ImageHelpfulnessMetric(BaseMetric):
         self.contexts_below = []
         self.scores = []
         self.reasons = []
-        for image_index in self.get_image_indices(actual_output):
+        image_indices = self.get_image_indices(actual_output)
+        if not image_indices:
+            raise ValueError(
+                f"The test case must have atleast one image in the `actual_output` to calculate {self.__name__} score"
+            )
+        for image_index in image_indices:
             context_above, context_below = self.get_image_context(
                 image_index, actual_output
             )
@@ -189,6 +194,10 @@ class ImageHelpfulnessMetric(BaseMetric):

         tasks = []
         image_indices = self.get_image_indices(actual_output)
+        if not image_indices:
+            raise ValueError(
+                f"The test case must have atleast one image in the `actual_output` to calculate {self.__name__} score"
+            )
         for image_index in image_indices:
             context_above, context_below = self.get_image_context(
                 image_index, actual_output
deepeval/metrics/multimodal_metrics/image_reference/image_reference.py CHANGED
@@ -86,7 +86,12 @@ class ImageReferenceMetric(BaseMetric):
         self.contexts_below = []
         self.scores = []
         self.reasons = []
-        for image_index in self.get_image_indices(actual_output):
+        image_indices = self.get_image_indices(actual_output)
+        if not image_indices:
+            raise ValueError(
+                f"The test case must have atleast one image in the `actual_output` to calculate {self.__name__} score"
+            )
+        for image_index in image_indices:
             context_above, context_below = self.get_image_context(
                 image_index, actual_output
             )
@@ -189,6 +194,10 @@ class ImageReferenceMetric(BaseMetric):

         tasks = []
         image_indices = self.get_image_indices(actual_output)
+        if not image_indices:
+            raise ValueError(
+                f"The test case must have atleast one image in the `actual_output` to calculate {self.__name__} score"
+            )
         for image_index in image_indices:
             context_above, context_below = self.get_image_context(
                 image_index, actual_output
deepeval/metrics/utils.py CHANGED
@@ -312,7 +312,7 @@ def check_llm_test_case_params(
            if isinstance(ele, MLLMImage):
                count += 1
        if count != actual_output_image_count:
-            error_str = f"Unable to evaluate test cases with '{actual_output_image_count}' output images using the '{metric.__name__}' metric. `{count}` found."
+            error_str = f"Can only evaluate test cases with '{actual_output_image_count}' output images using the '{metric.__name__}' metric. `{count}` found."
            raise ValueError(error_str)

    if isinstance(test_case, LLMTestCase) is False:
deepeval/models/__init__.py CHANGED
@@ -16,6 +16,7 @@ from deepeval.models.llms import (
     GrokModel,
     DeepSeekModel,
     PortkeyModel,
+    OpenRouterModel,
 )
 from deepeval.models.embedding_models import (
     OpenAIEmbeddingModel,
@@ -44,4 +45,5 @@ __all__ = [
     "LocalEmbeddingModel",
     "OllamaEmbeddingModel",
     "PortkeyModel",
+    "OpenRouterModel",
 ]
deepeval/models/llms/__init__.py CHANGED
@@ -10,6 +10,7 @@ from .kimi_model import KimiModel
 from .grok_model import GrokModel
 from .deepseek_model import DeepSeekModel
 from .portkey_model import PortkeyModel
+from .openrouter_model import OpenRouterModel

 __all__ = [
     "AzureOpenAIModel",
@@ -24,4 +25,5 @@ __all__ = [
     "GrokModel",
     "DeepSeekModel",
     "PortkeyModel",
+    "OpenRouterModel",
 ]
deepeval/models/llms/amazon_bedrock_model.py CHANGED
@@ -14,6 +14,7 @@ from deepeval.models.retry_policy import (
     sdk_retries_for,
 )
 from deepeval.test_case import MLLMImage
+from deepeval.errors import DeepEvalError
 from deepeval.utils import check_if_multimodal, convert_to_multi_modal_array
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.models.llms.constants import BEDROCK_MODELS_DATA
@@ -155,27 +156,28 @@ class AmazonBedrockModel(DeepEvalBaseLLM):

     def generate(
         self, prompt: str, schema: Optional[BaseModel] = None
-    ) -> Tuple[Union[str, BaseModel], float]:
+    ) -> Tuple[Union[str, BaseModel], Optional[float]]:
         return safe_asyncio_run(self.a_generate(prompt, schema))

     @retry_bedrock
     async def a_generate(
         self, prompt: str, schema: Optional[BaseModel] = None
-    ) -> Tuple[Union[str, BaseModel], float]:
+    ) -> Tuple[Union[str, BaseModel], Optional[float]]:
         if check_if_multimodal(prompt):
             prompt = convert_to_multi_modal_array(input=prompt)
             payload = self.generate_payload(prompt)
         else:
             payload = self.get_converse_request_body(prompt)

-        payload = self.get_converse_request_body(prompt)
         client = await self._ensure_client()
         response = await client.converse(
             modelId=self.get_model_name(),
             messages=payload["messages"],
             inferenceConfig=payload["inferenceConfig"],
         )
-        message = response["output"]["message"]["content"][0]["text"]
+
+        message = self._extract_text_from_converse_response(response)
+
         cost = self.calculate_cost(
             response["usage"]["inputTokens"],
             response["usage"]["outputTokens"],
@@ -206,7 +208,7 @@ class AmazonBedrockModel(DeepEvalBaseLLM):
                 try:
                     image_raw_bytes = base64.b64decode(element.dataBase64)
                 except Exception:
-                    raise ValueError(
+                    raise DeepEvalError(
                         f"Invalid base64 data in MLLMImage: {element._id}"
                     )

@@ -294,6 +296,46 @@ class AmazonBedrockModel(DeepEvalBaseLLM):
     # Helpers
     ###############################################

+    @staticmethod
+    def _extract_text_from_converse_response(response: dict) -> str:
+        try:
+            content = response["output"]["message"]["content"]
+        except Exception as e:
+            raise DeepEvalError(
+                "Missing output.message.content in Bedrock response"
+            ) from e
+
+        # Collect any text blocks (ignore reasoning/tool blocks)
+        text_parts = []
+        for block in content:
+            if isinstance(block, dict) and "text" in block:
+                v = block.get("text")
+                if isinstance(v, str) and v.strip():
+                    text_parts.append(v)
+
+        if text_parts:
+            # join in case there are multiple text blocks
+            return "\n".join(text_parts)
+
+        # No text blocks present; raise an actionable error
+        keys = []
+        for b in content:
+            if isinstance(b, dict):
+                keys.append(list(b.keys()))
+            else:
+                keys.append(type(b).__name__)
+
+        stop_reason = (
+            response.get("stopReason")
+            or response.get("output", {}).get("stopReason")
+            or response.get("output", {}).get("message", {}).get("stopReason")
+        )
+
+        raise DeepEvalError(
+            f"Bedrock response contained no text content blocks. "
+            f"content keys={keys}, stopReason={stop_reason}"
+        )
+
     def get_converse_request_body(self, prompt: str) -> dict:

         return {
@@ -303,11 +345,14 @@ class AmazonBedrockModel(DeepEvalBaseLLM):
             },
         }

-    def calculate_cost(self, input_tokens: int, output_tokens: int) -> float:
+    def calculate_cost(
+        self, input_tokens: int, output_tokens: int
+    ) -> Optional[float]:
         if self.model_data.input_price and self.model_data.output_price:
             input_cost = input_tokens * self.model_data.input_price
             output_cost = output_tokens * self.model_data.output_price
             return input_cost + output_cost
+        return None

     def load_model(self):
         pass
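The new _extract_text_from_converse_response helper makes the Bedrock path tolerant of Converse responses whose content list is not a single text block. Illustrative response shapes follow: the first mirrors the fields read by the code above, while the reasoningContent block in the second is an assumed example of a non-text block, and all values are made up.

# Happy path: one or more {"text": ...} blocks are joined and returned.
response_with_text = {
    "output": {"message": {"content": [{"text": "Score: 0.8"}]}},
    "usage": {"inputTokens": 120, "outputTokens": 12},
}

# Failure path: no text blocks at all, so the helper raises DeepEvalError
# that reports the content block keys and the stopReason.
response_without_text = {
    "output": {
        "message": {
            "content": [{"reasoningContent": {"reasoningText": {"text": "..."}}}]
        }
    },
    "stopReason": "max_tokens",
}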