deepeval 3.7.4-py3-none-any.whl → 3.7.5-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (155)
  1. deepeval/_version.py +1 -1
  2. deepeval/dataset/golden.py +54 -2
  3. deepeval/evaluate/evaluate.py +16 -8
  4. deepeval/evaluate/execute.py +70 -26
  5. deepeval/evaluate/utils.py +26 -22
  6. deepeval/integrations/pydantic_ai/agent.py +19 -2
  7. deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
  8. deepeval/metrics/__init__.py +14 -12
  9. deepeval/metrics/answer_relevancy/answer_relevancy.py +74 -29
  10. deepeval/metrics/answer_relevancy/template.py +188 -92
  11. deepeval/metrics/base_metric.py +2 -5
  12. deepeval/metrics/contextual_precision/contextual_precision.py +53 -15
  13. deepeval/metrics/contextual_precision/template.py +115 -66
  14. deepeval/metrics/contextual_recall/contextual_recall.py +50 -13
  15. deepeval/metrics/contextual_recall/template.py +106 -55
  16. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +47 -15
  17. deepeval/metrics/contextual_relevancy/template.py +87 -58
  18. deepeval/metrics/dag/templates.py +2 -2
  19. deepeval/metrics/faithfulness/faithfulness.py +70 -27
  20. deepeval/metrics/faithfulness/schema.py +1 -1
  21. deepeval/metrics/faithfulness/template.py +200 -115
  22. deepeval/metrics/g_eval/utils.py +2 -2
  23. deepeval/metrics/indicator.py +4 -4
  24. deepeval/metrics/multimodal_metrics/__init__.py +0 -18
  25. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +24 -17
  26. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +26 -21
  27. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +24 -17
  28. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +24 -17
  29. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +19 -19
  30. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +63 -78
  31. deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +20 -20
  32. deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +71 -50
  33. deepeval/metrics/ragas.py +3 -3
  34. deepeval/metrics/tool_correctness/tool_correctness.py +2 -2
  35. deepeval/metrics/turn_contextual_precision/schema.py +21 -0
  36. deepeval/metrics/turn_contextual_precision/template.py +187 -0
  37. deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +550 -0
  38. deepeval/metrics/turn_contextual_recall/schema.py +21 -0
  39. deepeval/metrics/turn_contextual_recall/template.py +178 -0
  40. deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +520 -0
  41. deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
  42. deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
  43. deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +535 -0
  44. deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
  45. deepeval/metrics/turn_faithfulness/template.py +218 -0
  46. deepeval/metrics/turn_faithfulness/turn_faithfulness.py +596 -0
  47. deepeval/metrics/utils.py +39 -58
  48. deepeval/models/__init__.py +0 -12
  49. deepeval/models/base_model.py +16 -38
  50. deepeval/models/embedding_models/__init__.py +7 -0
  51. deepeval/models/embedding_models/azure_embedding_model.py +52 -28
  52. deepeval/models/embedding_models/local_embedding_model.py +18 -14
  53. deepeval/models/embedding_models/ollama_embedding_model.py +38 -16
  54. deepeval/models/embedding_models/openai_embedding_model.py +40 -21
  55. deepeval/models/llms/amazon_bedrock_model.py +1 -2
  56. deepeval/models/llms/anthropic_model.py +44 -23
  57. deepeval/models/llms/azure_model.py +121 -36
  58. deepeval/models/llms/deepseek_model.py +18 -13
  59. deepeval/models/llms/gemini_model.py +129 -43
  60. deepeval/models/llms/grok_model.py +18 -13
  61. deepeval/models/llms/kimi_model.py +18 -13
  62. deepeval/models/llms/litellm_model.py +42 -22
  63. deepeval/models/llms/local_model.py +12 -7
  64. deepeval/models/llms/ollama_model.py +114 -12
  65. deepeval/models/llms/openai_model.py +137 -41
  66. deepeval/models/llms/portkey_model.py +24 -7
  67. deepeval/models/llms/utils.py +5 -3
  68. deepeval/models/retry_policy.py +17 -14
  69. deepeval/models/utils.py +46 -1
  70. deepeval/optimizer/__init__.py +5 -0
  71. deepeval/optimizer/algorithms/__init__.py +6 -0
  72. deepeval/optimizer/algorithms/base.py +29 -0
  73. deepeval/optimizer/algorithms/configs.py +18 -0
  74. deepeval/optimizer/algorithms/copro/__init__.py +5 -0
  75. deepeval/{optimization/copro/loop.py → optimizer/algorithms/copro/copro.py} +112 -113
  76. deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
  77. deepeval/{optimization/gepa/loop.py → optimizer/algorithms/gepa/gepa.py} +175 -115
  78. deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
  79. deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
  80. deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
  81. deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
  82. deepeval/optimizer/algorithms/simba/__init__.py +5 -0
  83. deepeval/{optimization/simba/loop.py → optimizer/algorithms/simba/simba.py} +128 -112
  84. deepeval/{optimization → optimizer}/configs.py +5 -8
  85. deepeval/{optimization/policies/selection.py → optimizer/policies.py} +63 -2
  86. deepeval/optimizer/prompt_optimizer.py +263 -0
  87. deepeval/optimizer/rewriter/__init__.py +5 -0
  88. deepeval/optimizer/rewriter/rewriter.py +124 -0
  89. deepeval/optimizer/rewriter/utils.py +214 -0
  90. deepeval/optimizer/scorer/__init__.py +5 -0
  91. deepeval/optimizer/scorer/base.py +86 -0
  92. deepeval/optimizer/scorer/scorer.py +316 -0
  93. deepeval/optimizer/scorer/utils.py +30 -0
  94. deepeval/optimizer/types.py +148 -0
  95. deepeval/{optimization → optimizer}/utils.py +47 -165
  96. deepeval/prompt/prompt.py +5 -9
  97. deepeval/test_case/__init__.py +1 -3
  98. deepeval/test_case/api.py +12 -10
  99. deepeval/test_case/conversational_test_case.py +19 -1
  100. deepeval/test_case/llm_test_case.py +152 -1
  101. deepeval/test_case/utils.py +4 -8
  102. deepeval/test_run/api.py +15 -14
  103. deepeval/test_run/test_run.py +3 -3
  104. deepeval/tracing/patchers.py +9 -4
  105. deepeval/tracing/tracing.py +2 -2
  106. deepeval/utils.py +65 -0
  107. {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/METADATA +1 -4
  108. {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/RECORD +116 -125
  109. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
  110. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
  111. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
  112. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
  113. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
  114. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
  115. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
  116. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
  117. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
  118. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
  119. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
  120. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
  121. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
  122. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
  123. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
  124. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
  125. deepeval/models/mlllms/__init__.py +0 -4
  126. deepeval/models/mlllms/azure_model.py +0 -343
  127. deepeval/models/mlllms/gemini_model.py +0 -313
  128. deepeval/models/mlllms/ollama_model.py +0 -175
  129. deepeval/models/mlllms/openai_model.py +0 -309
  130. deepeval/optimization/__init__.py +0 -13
  131. deepeval/optimization/adapters/__init__.py +0 -2
  132. deepeval/optimization/adapters/deepeval_scoring_adapter.py +0 -588
  133. deepeval/optimization/aggregates.py +0 -14
  134. deepeval/optimization/copro/configs.py +0 -31
  135. deepeval/optimization/gepa/__init__.py +0 -7
  136. deepeval/optimization/gepa/configs.py +0 -115
  137. deepeval/optimization/miprov2/configs.py +0 -134
  138. deepeval/optimization/miprov2/loop.py +0 -785
  139. deepeval/optimization/mutations/__init__.py +0 -0
  140. deepeval/optimization/mutations/prompt_rewriter.py +0 -458
  141. deepeval/optimization/policies/__init__.py +0 -16
  142. deepeval/optimization/policies/tie_breaker.py +0 -67
  143. deepeval/optimization/prompt_optimizer.py +0 -462
  144. deepeval/optimization/simba/__init__.py +0 -0
  145. deepeval/optimization/simba/configs.py +0 -33
  146. deepeval/optimization/types.py +0 -361
  147. deepeval/test_case/mllm_test_case.py +0 -170
  148. /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
  149. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
  150. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
  151. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
  152. /deepeval/{optimization → optimizer/algorithms}/simba/types.py +0 -0
  153. {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/LICENSE.md +0 -0
  154. {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/WHEEL +0 -0
  155. {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/entry_points.txt +0 -0
deepeval/optimization/types.py (deleted)
@@ -1,361 +0,0 @@
- from __future__ import annotations
- import uuid
-
- from dataclasses import dataclass
- from typing import (
-     Any,
-     Callable,
-     Dict,
-     List,
-     Literal,
-     Optional,
-     Protocol,
-     TYPE_CHECKING,
-     TypedDict,
-     Tuple,
-     Union,
- )
- from enum import Enum
- from pydantic import BaseModel as PydanticBaseModel, Field, AliasChoices
-
- from deepeval.prompt.prompt import Prompt
- from deepeval.models.base_model import DeepEvalBaseLLM
-
-
- if TYPE_CHECKING:
-     from deepeval.dataset.golden import Golden, ConversationalGolden
-
- PromptConfigurationId = str
- ModuleId = str
- ScoreVector = List[float]  # scores per instance on D_pareto, aligned order
- ScoreTable = Dict[PromptConfigurationId, ScoreVector]
-
-
- @dataclass
- class PromptConfiguration:
-     id: PromptConfigurationId
-     parent: Optional[PromptConfigurationId]
-     prompts: Dict[ModuleId, Prompt]
-
-     @staticmethod
-     def new(
-         prompts: Dict[ModuleId, Prompt],
-         parent: Optional[PromptConfigurationId] = None,
-     ) -> "PromptConfiguration":
-         return PromptConfiguration(
-             id=str(uuid.uuid4()), parent=parent, prompts=dict(prompts)
-         )
-
-
- class ScoringAdapter(Protocol):
-     """
-     Scoring adapter contract used by optimization runners.
-
-     Runners call into this adapter to:
-     - compute scores per-instance on some subset (score_on_pareto),
-     - compute minibatch means for selection and acceptance,
-     - generate feedback text used by the PromptRewriter.
-     """
-
-     # Sync
-     def score_on_pareto(
-         self,
-         prompt_configuration: PromptConfiguration,
-         d_pareto: Union[List[Golden], List[ConversationalGolden]],
-     ) -> ScoreVector:
-         """Return per-instance scores on D_pareto."""
-         ...
-
-     def minibatch_score(
-         self,
-         prompt_configuration: PromptConfiguration,
-         minibatch: Union[List[Golden], List[ConversationalGolden]],
-     ) -> float:
-         """Return average score μ on a minibatch from D_feedback."""
-         ...
-
-     def minibatch_feedback(
-         self,
-         prompt_configuration: PromptConfiguration,
-         module: ModuleId,
-         minibatch: Union[List[Golden], List[ConversationalGolden]],
-     ) -> str:
-         """Return μ_f text for the module (metric.reason + traces, etc.)."""
-         ...
-
-     def select_module(
-         self, prompt_configuration: PromptConfiguration
-     ) -> ModuleId:
-         """Pick a module to mutate."""
-         ...
-
-     # Async
-     async def a_score_on_pareto(
-         self,
-         prompt_configuration: PromptConfiguration,
-         d_pareto: Union[List[Golden], List[ConversationalGolden]],
-     ) -> ScoreVector: ...
-     async def a_minibatch_score(
-         self,
-         prompt_configuration: PromptConfiguration,
-         minibatch: Union[List[Golden], List[ConversationalGolden]],
-     ) -> float: ...
-     async def a_minibatch_feedback(
-         self,
-         prompt_configuration: PromptConfiguration,
-         module: ModuleId,
-         minibatch: Union[List[Golden], List[ConversationalGolden]],
-     ) -> str: ...
-     async def a_select_module(
-         self, prompt_configuration: PromptConfiguration
-     ) -> ModuleId: ...
-
-
- class PromptRewriterProtocol(Protocol):
-     def rewrite(
-         self,
-         *,
-         module_id: ModuleId,
-         model: Optional[DeepEvalBaseLLM] = None,
-         model_schema: Optional[PydanticBaseModel] = None,
-         model_callback: Optional[
-             Callable[
-                 ...,
-                 Union[
-                     str,
-                     Dict,
-                     Tuple[Union[str, Dict], float],
-                 ],
-             ]
-         ] = None,
-         old_prompt: Prompt,
-         feedback_text: str,
-     ) -> Prompt: ...
-
-     async def a_rewrite(
-         self,
-         *,
-         module_id: ModuleId,
-         model: Optional[DeepEvalBaseLLM] = None,
-         model_schema: Optional[PydanticBaseModel] = None,
-         model_callback: Optional[
-             Callable[
-                 ...,
-                 Union[
-                     str,
-                     Dict,
-                     Tuple[Union[str, Dict], float],
-                 ],
-             ]
-         ] = None,
-         old_prompt: Prompt,
-         feedback_text: str,
-     ) -> Prompt: ...
-
-
- class RunnerStatusType(str, Enum):
-     """Status events emitted by optimization runners."""
-
-     PROGRESS = "progress"
-     TIE = "tie"
-     ERROR = "error"
-
-
- class RunnerStatusCallbackProtocol(Protocol):
-     def __call__(
-         self,
-         kind: RunnerStatusType,
-         *,
-         detail: str,
-         step_index: Optional[int] = None,
-         total_steps: Optional[int] = None,
-     ) -> None: ...
-
-
- class RunnerProtocol(Protocol):
-     """
-     Contract for prompt optimization runners used by PromptOptimizer.
-
-     Runners are responsible for executing the optimization algorithm
-     and returning an optimized Prompt plus a report dict.
-     """
-
-     # status_callback is injected by PromptOptimizer
-     # A runner may call this to report:
-     # progress, ties, or errors during execution.
-     status_callback: Optional[RunnerStatusCallbackProtocol]
-     model_callback: Optional[
-         Callable[
-             ...,
-             Union[
-                 str,
-                 Dict,
-                 Tuple[Union[str, Dict], float],
-             ],
-         ]
-     ]
-
-     scoring_adapter: Optional[ScoringAdapter]
-
-     def execute(
-         self,
-         *,
-         prompt: Prompt,
-         goldens: Union[List["Golden"], List["ConversationalGolden"]],
-     ) -> Tuple[Prompt, Dict]: ...
-
-     async def a_execute(
-         self,
-         *,
-         prompt: Prompt,
-         goldens: Union[List["Golden"], List["ConversationalGolden"]],
-     ) -> Tuple[Prompt, Dict]: ...
-
-
- class Objective(Protocol):
-     """Strategy for reducing scores per-metric to a single scalar value.
-
-     Implementations receive a mapping from metric name to score
-     (for example, {"AnswerRelevancyMetric": 0.82}) and return a
-     single float used for comparisons inside the optimizer.
-     """
-
-     def scalarize(self, scores_by_metric: Dict[str, float]) -> float: ...
-
-
- class MeanObjective(Objective):
-     """Default scalarizer: unweighted arithmetic mean.
-
-     - If `scores_by_metric` is non-empty, returns the arithmetic
-       mean of all metric scores.
-     - If `scores_by_metric` is empty, returns 0.0.
-     """
-
-     def scalarize(self, scores_by_metric: Dict[str, float]) -> float:
-         if not scores_by_metric:
-             return 0.0
-         return sum(scores_by_metric.values()) / len(scores_by_metric)
-
-
- class WeightedObjective(Objective):
-     """
-     Objective that scales each metric's score by a user-provided weight and sums them.
-
-     - `weights_by_metric` keys should match the class names of the
-       metrics passed to the PromptOptimizer.
-     - Metrics not present in `weights_by_metric` receive `default_weight`.
-       This makes it easy to emphasize a subset of metrics while keeping
-       everything else at a baseline weight of 1.0, e.g.:
-
-           WeightedObjective({"AnswerRelevancyMetric": 2.0})
-
-       which treats AnswerRelevancy as 2x as important as the other metrics.
-     """
-
-     def __init__(
-         self,
-         weights_by_metric: Optional[Dict[str, float]] = None,
-         default_weight: float = 1.0,
-     ):
-         self.weights_by_metric: Dict[str, float] = dict(weights_by_metric or {})
-         self.default_weight: float = float(default_weight)
-
-     def scalarize(self, scores_by_metric: Dict[str, float]) -> float:
-         return sum(
-             self.weights_by_metric.get(name, self.default_weight) * score
-             for name, score in scores_by_metric.items()
-         )
-
-
- @dataclass
- class MetricInfo:
-     name: str
-     rubric: Optional[str] = None
-
-
- class AcceptedIterationDict(TypedDict):
-     parent: PromptConfigurationId
-     child: PromptConfigurationId
-     module: ModuleId
-     before: float
-     after: float
-
-
- class AcceptedIteration(PydanticBaseModel):
-     parent: str
-     child: str
-     module: str
-     before: float
-     after: float
-
-
- class PromptMessageSnapshot(PydanticBaseModel):
-     role: str
-     content: str
-
-
- class PromptModuleSnapshot(PydanticBaseModel):
-     type: Literal["TEXT", "LIST"]
-     # Only used when type == "TEXT"
-     text_template: Optional[str] = None
-     # Only used when type == "LIST"
-     messages: Optional[List[PromptMessageSnapshot]] = None
-
-
- class PromptConfigSnapshot(PydanticBaseModel):
-     parent: Optional[str]
-     prompts: Dict[str, PromptModuleSnapshot]
-
-
- @dataclass
- class OptimizationResult:
-     optimization_id: str
-     best_id: PromptConfigurationId
-     accepted_iterations: List[Dict]
-     pareto_scores: Dict[PromptConfigurationId, List[float]]
-     parents: Dict[PromptConfigurationId, Optional[PromptConfigurationId]]
-     prompt_configurations: Dict[PromptConfigurationId, Dict[str, Any]]
-
-     def as_dict(self) -> Dict:
-         return dict(
-             optimization_id=self.optimization_id,
-             best_id=self.best_id,
-             accepted_iterations=self.accepted_iterations,
-             pareto_scores=self.pareto_scores,
-             parents=self.parents,
-             prompt_configurations=self.prompt_configurations,
-         )
-
-
- class OptimizationReport(PydanticBaseModel):
-     optimization_id: str = Field(
-         alias="optimizationId",
-         validation_alias=AliasChoices("optimizationId", "optimization_id"),
-     )
-     best_id: str = Field(
-         alias="bestId",
-         validation_alias=AliasChoices("bestId", "best_id"),
-     )
-     accepted_iterations: list[AcceptedIteration] = Field(
-         default_factory=list,
-         alias="acceptedIterations",
-         validation_alias=AliasChoices(
-             "acceptedIterations", "accepted_iterations"
-         ),
-     )
-     pareto_scores: dict[str, list[float]] = Field(
-         alias="paretoScores",
-         validation_alias=AliasChoices("paretoScores", "pareto_scores"),
-     )
-     parents: dict[str, str | None]
-     prompt_configurations: dict[str, PromptConfigSnapshot] = Field(
-         alias="promptConfigurations",
-         validation_alias=AliasChoices(
-             "promptConfigurations", "prompt_configurations"
-         ),
-     )
-
-     @classmethod
-     def from_runtime(cls, result: dict) -> "OptimizationReport":
-         # accepts the dict from OptimizationResult.as_dict()
-         return cls(**result)
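The deleted `Objective` classes above reduced per-metric scores to a single scalar for comparisons inside the optimizer. Below is a minimal standalone sketch of that scalarization logic, written as plain functions mirroring `MeanObjective` and `WeightedObjective`; the function names are illustrative and are not part of the new `deepeval.optimizer` API, which may differ.

```python
from typing import Dict, Optional


def mean_objective(scores_by_metric: Dict[str, float]) -> float:
    # Unweighted arithmetic mean; an empty mapping scalarizes to 0.0,
    # matching the deleted MeanObjective.scalarize.
    if not scores_by_metric:
        return 0.0
    return sum(scores_by_metric.values()) / len(scores_by_metric)


def weighted_objective(
    scores_by_metric: Dict[str, float],
    weights_by_metric: Optional[Dict[str, float]] = None,
    default_weight: float = 1.0,
) -> float:
    # Each metric's score is scaled by its weight (default 1.0) and summed,
    # matching the deleted WeightedObjective.scalarize.
    weights = weights_by_metric or {}
    return sum(
        weights.get(name, default_weight) * score
        for name, score in scores_by_metric.items()
    )


scores = {"AnswerRelevancyMetric": 0.82, "FaithfulnessMetric": 0.70}
print(mean_objective(scores))  # (0.82 + 0.70) / 2 = 0.76
print(weighted_objective(scores, {"AnswerRelevancyMetric": 2.0}))  # 2*0.82 + 1*0.70 = 2.34
```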
deepeval/test_case/mllm_test_case.py (deleted)
@@ -1,170 +0,0 @@
- from typing import List, Optional, Dict, Union
- from urllib.parse import urlparse, unquote
- from dataclasses import dataclass, field
- from enum import Enum
- import mimetypes
- import base64
- import os
-
- from deepeval.test_case import ToolCall
-
-
- @dataclass
- class MLLMImage:
-     dataBase64: Optional[str] = None
-     mimeType: Optional[str] = None
-     url: Optional[str] = None
-     local: Optional[bool] = None
-     filename: Optional[str] = None
-
-     def __post_init__(self):
-
-         if self.url and self.dataBase64:
-             raise ValueError(
-                 "You cannot provide both 'url' and 'dataBase64' at the same time when creating an MLLMImage."
-             )
-
-         if not self.url and not self.dataBase64:
-             raise ValueError(
-                 "You must provide either a 'url' or both 'dataBase64' and 'mimeType' to create an MLLMImage."
-             )
-
-         if self.dataBase64 is not None:
-             if self.mimeType is None:
-                 raise ValueError(
-                     "mimeType must be provided when initializing from Base64 data."
-                 )
-         else:
-             is_local = self.is_local_path(self.url)
-             if self.local is not None:
-                 assert self.local == is_local, "Local path mismatch"
-             else:
-                 self.local = is_local
-
-             # compute filename, mime_type, and Base64 data
-             if self.local:
-                 path = self.process_url(self.url)
-                 self.filename = os.path.basename(path)
-                 self.mimeType = (
-                     mimetypes.guess_type(path)[0] or "application/octet-stream"
-                 )
-                 with open(path, "rb") as f:
-                     raw = f.read()
-                 self.dataBase64 = base64.b64encode(raw).decode("ascii")
-             else:
-                 self.filename = None
-                 self.mimeType = None
-                 self.dataBase64 = None
-
-     @staticmethod
-     def process_url(url: str) -> str:
-         if os.path.exists(url):
-             return url
-         parsed = urlparse(url)
-         if parsed.scheme == "file":
-             raw_path = (
-                 f"//{parsed.netloc}{parsed.path}"
-                 if parsed.netloc
-                 else parsed.path
-             )
-             path = unquote(raw_path)
-             return path
-         return url
-
-     @staticmethod
-     def is_local_path(url: str) -> bool:
-         if os.path.exists(url):
-             return True
-         parsed = urlparse(url)
-         if parsed.scheme == "file":
-             raw_path = (
-                 f"//{parsed.netloc}{parsed.path}"
-                 if parsed.netloc
-                 else parsed.path
-             )
-             path = unquote(raw_path)
-             return os.path.exists(path)
-         return False
-
-     def as_data_uri(self) -> Optional[str]:
-         """Return the image as a data URI string, if Base64 data is available."""
-         if not self.dataBase64 or not self.mimeType:
-             return None
-         return f"data:{self.mimeType};base64,{self.dataBase64}"
-
-
- class MLLMTestCaseParams(Enum):
-     INPUT = "input"
-     ACTUAL_OUTPUT = "actual_output"
-     EXPECTED_OUTPUT = "expected_output"
-     CONTEXT = "context"
-     RETRIEVAL_CONTEXT = "retrieval_context"
-     TOOLS_CALLED = "tools_called"
-     EXPECTED_TOOLS = "expected_tools"
-
-
- @dataclass
- class MLLMTestCase:
-     input: List[Union[str, MLLMImage]]
-     actual_output: List[Union[str, MLLMImage]]
-     expected_output: Optional[List[Union[str, MLLMImage]]] = None
-     context: Optional[List[Union[str, MLLMImage]]] = None
-     retrieval_context: Optional[List[Union[str, MLLMImage]]] = None
-     additional_metadata: Optional[Dict] = None
-     comments: Optional[str] = None
-     tools_called: Optional[List[ToolCall]] = None
-     expected_tools: Optional[List[ToolCall]] = None
-     token_cost: Optional[float] = None
-     completion_time: Optional[float] = None
-     name: Optional[str] = field(default=None)
-     _dataset_rank: Optional[int] = field(default=None, repr=False)
-     _dataset_alias: Optional[str] = field(default=None, repr=False)
-     _dataset_id: Optional[str] = field(default=None, repr=False)
-
-     def __post_init__(self):
-         # Ensure `expected_output` is None or a list of strings or MLLMImage instances
-         if self.expected_output is not None:
-             if not isinstance(self.expected_output, list) or not all(
-                 isinstance(item, (str, MLLMImage))
-                 for item in self.expected_output
-             ):
-                 raise TypeError(
-                     "'expected_output' must be None or a list of strings or MLLMImage instances"
-                 )
-
-         # Ensure `context` is None or a list of strings or MLLMImage instances
-         if self.context is not None:
-             if not isinstance(self.context, list) or not all(
-                 isinstance(item, (str, MLLMImage)) for item in self.context
-             ):
-                 raise TypeError(
-                     "'context' must be None or a list of strings or MLLMImage instances"
-                 )
-
-         # Ensure `retrieval_context` is None or a list of strings or MLLMImage instances
-         if self.retrieval_context is not None:
-             if not isinstance(self.retrieval_context, list) or not all(
-                 isinstance(item, (str, MLLMImage))
-                 for item in self.retrieval_context
-             ):
-                 raise TypeError(
-                     "'retrieval_context' must be None or a list of strings or MLLMImage instances"
-                 )
-
-         # Ensure `tools_called` is None or a list of ToolCall instances
-         if self.tools_called is not None:
-             if not isinstance(self.tools_called, list) or not all(
-                 isinstance(item, ToolCall) for item in self.tools_called
-             ):
-                 raise TypeError(
-                     "'tools_called' must be None or a list of `ToolCall`"
-                 )
-
-         # Ensure `expected_tools` is None or a list of ToolCall instances
-         if self.expected_tools is not None:
-             if not isinstance(self.expected_tools, list) or not all(
-                 isinstance(item, ToolCall) for item in self.expected_tools
-             ):
-                 raise TypeError(
-                     "'expected_tools' must be None or a list of `ToolCall`"
-                 )
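For reference, the deleted `MLLMImage` class converted local images to Base64 data URIs at construction time (`__post_init__` plus `as_data_uri()`). Below is a standalone sketch of that conversion using only the standard library; the function name `image_as_data_uri` is illustrative, not a deepeval API.

```python
import base64
import mimetypes


def image_as_data_uri(path: str) -> str:
    """Read a local image file and return it as a data URI, mirroring the
    local-path branch of the deleted MLLMImage: guess the MIME type
    (falling back to application/octet-stream), Base64-encode the bytes,
    and assemble a data URI."""
    mime_type = mimetypes.guess_type(path)[0] or "application/octet-stream"
    with open(path, "rb") as f:
        data_b64 = base64.b64encode(f.read()).decode("ascii")
    return f"data:{mime_type};base64,{data_b64}"


# Usage (hypothetical file path):
# image_as_data_uri("tests/data/cat.png")
# -> "data:image/png;base64,iVBORw0KGgo..."
```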