deepeval-3.7.3-py3-none-any.whl → deepeval-3.7.5-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (156)
  1. deepeval/_version.py +1 -1
  2. deepeval/cli/test.py +1 -1
  3. deepeval/config/settings.py +102 -13
  4. deepeval/dataset/golden.py +54 -2
  5. deepeval/evaluate/configs.py +1 -1
  6. deepeval/evaluate/evaluate.py +16 -8
  7. deepeval/evaluate/execute.py +74 -27
  8. deepeval/evaluate/utils.py +26 -22
  9. deepeval/integrations/pydantic_ai/agent.py +19 -2
  10. deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
  11. deepeval/metrics/__init__.py +14 -12
  12. deepeval/metrics/answer_relevancy/answer_relevancy.py +74 -29
  13. deepeval/metrics/answer_relevancy/template.py +188 -92
  14. deepeval/metrics/argument_correctness/template.py +2 -2
  15. deepeval/metrics/base_metric.py +2 -5
  16. deepeval/metrics/bias/template.py +3 -3
  17. deepeval/metrics/contextual_precision/contextual_precision.py +53 -15
  18. deepeval/metrics/contextual_precision/template.py +115 -66
  19. deepeval/metrics/contextual_recall/contextual_recall.py +50 -13
  20. deepeval/metrics/contextual_recall/template.py +106 -55
  21. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +47 -15
  22. deepeval/metrics/contextual_relevancy/template.py +87 -58
  23. deepeval/metrics/conversation_completeness/template.py +2 -2
  24. deepeval/metrics/conversational_dag/templates.py +4 -4
  25. deepeval/metrics/conversational_g_eval/template.py +4 -3
  26. deepeval/metrics/dag/templates.py +5 -5
  27. deepeval/metrics/faithfulness/faithfulness.py +70 -27
  28. deepeval/metrics/faithfulness/schema.py +1 -1
  29. deepeval/metrics/faithfulness/template.py +200 -115
  30. deepeval/metrics/g_eval/utils.py +2 -2
  31. deepeval/metrics/hallucination/template.py +4 -4
  32. deepeval/metrics/indicator.py +4 -4
  33. deepeval/metrics/misuse/template.py +2 -2
  34. deepeval/metrics/multimodal_metrics/__init__.py +0 -18
  35. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +24 -17
  36. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +26 -21
  37. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +24 -17
  38. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +24 -17
  39. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +19 -19
  40. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +63 -78
  41. deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +20 -20
  42. deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +71 -50
  43. deepeval/metrics/non_advice/template.py +2 -2
  44. deepeval/metrics/pii_leakage/template.py +2 -2
  45. deepeval/metrics/prompt_alignment/template.py +4 -4
  46. deepeval/metrics/ragas.py +3 -3
  47. deepeval/metrics/role_violation/template.py +2 -2
  48. deepeval/metrics/step_efficiency/step_efficiency.py +1 -1
  49. deepeval/metrics/tool_correctness/tool_correctness.py +2 -2
  50. deepeval/metrics/toxicity/template.py +4 -4
  51. deepeval/metrics/turn_contextual_precision/schema.py +21 -0
  52. deepeval/metrics/turn_contextual_precision/template.py +187 -0
  53. deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +550 -0
  54. deepeval/metrics/turn_contextual_recall/schema.py +21 -0
  55. deepeval/metrics/turn_contextual_recall/template.py +178 -0
  56. deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +520 -0
  57. deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
  58. deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
  59. deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +535 -0
  60. deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
  61. deepeval/metrics/turn_faithfulness/template.py +218 -0
  62. deepeval/metrics/turn_faithfulness/turn_faithfulness.py +596 -0
  63. deepeval/metrics/turn_relevancy/template.py +2 -2
  64. deepeval/metrics/utils.py +39 -58
  65. deepeval/models/__init__.py +0 -12
  66. deepeval/models/base_model.py +16 -38
  67. deepeval/models/embedding_models/__init__.py +7 -0
  68. deepeval/models/embedding_models/azure_embedding_model.py +69 -32
  69. deepeval/models/embedding_models/local_embedding_model.py +39 -22
  70. deepeval/models/embedding_models/ollama_embedding_model.py +42 -18
  71. deepeval/models/embedding_models/openai_embedding_model.py +50 -15
  72. deepeval/models/llms/amazon_bedrock_model.py +1 -2
  73. deepeval/models/llms/anthropic_model.py +53 -20
  74. deepeval/models/llms/azure_model.py +140 -43
  75. deepeval/models/llms/deepseek_model.py +38 -23
  76. deepeval/models/llms/gemini_model.py +222 -103
  77. deepeval/models/llms/grok_model.py +39 -27
  78. deepeval/models/llms/kimi_model.py +39 -23
  79. deepeval/models/llms/litellm_model.py +103 -45
  80. deepeval/models/llms/local_model.py +35 -22
  81. deepeval/models/llms/ollama_model.py +129 -17
  82. deepeval/models/llms/openai_model.py +151 -50
  83. deepeval/models/llms/portkey_model.py +149 -0
  84. deepeval/models/llms/utils.py +5 -3
  85. deepeval/models/retry_policy.py +17 -14
  86. deepeval/models/utils.py +94 -4
  87. deepeval/optimizer/__init__.py +5 -0
  88. deepeval/optimizer/algorithms/__init__.py +6 -0
  89. deepeval/optimizer/algorithms/base.py +29 -0
  90. deepeval/optimizer/algorithms/configs.py +18 -0
  91. deepeval/optimizer/algorithms/copro/__init__.py +5 -0
  92. deepeval/optimizer/algorithms/copro/copro.py +836 -0
  93. deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
  94. deepeval/optimizer/algorithms/gepa/gepa.py +737 -0
  95. deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
  96. deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
  97. deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
  98. deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
  99. deepeval/optimizer/algorithms/simba/__init__.py +5 -0
  100. deepeval/optimizer/algorithms/simba/simba.py +999 -0
  101. deepeval/optimizer/algorithms/simba/types.py +15 -0
  102. deepeval/optimizer/configs.py +31 -0
  103. deepeval/optimizer/policies.py +227 -0
  104. deepeval/optimizer/prompt_optimizer.py +263 -0
  105. deepeval/optimizer/rewriter/__init__.py +5 -0
  106. deepeval/optimizer/rewriter/rewriter.py +124 -0
  107. deepeval/optimizer/rewriter/utils.py +214 -0
  108. deepeval/optimizer/scorer/__init__.py +5 -0
  109. deepeval/optimizer/scorer/base.py +86 -0
  110. deepeval/optimizer/scorer/scorer.py +316 -0
  111. deepeval/optimizer/scorer/utils.py +30 -0
  112. deepeval/optimizer/types.py +148 -0
  113. deepeval/optimizer/utils.py +480 -0
  114. deepeval/prompt/prompt.py +7 -6
  115. deepeval/test_case/__init__.py +1 -3
  116. deepeval/test_case/api.py +12 -10
  117. deepeval/test_case/conversational_test_case.py +19 -1
  118. deepeval/test_case/llm_test_case.py +152 -1
  119. deepeval/test_case/utils.py +4 -8
  120. deepeval/test_run/api.py +15 -14
  121. deepeval/test_run/cache.py +2 -0
  122. deepeval/test_run/test_run.py +9 -4
  123. deepeval/tracing/patchers.py +9 -4
  124. deepeval/tracing/tracing.py +2 -2
  125. deepeval/utils.py +89 -0
  126. {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/METADATA +1 -4
  127. {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/RECORD +134 -118
  128. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
  129. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
  130. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
  131. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
  132. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
  133. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
  134. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
  135. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
  136. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
  137. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
  138. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
  139. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
  140. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
  141. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
  142. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
  143. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
  144. deepeval/models/mlllms/__init__.py +0 -4
  145. deepeval/models/mlllms/azure_model.py +0 -334
  146. deepeval/models/mlllms/gemini_model.py +0 -284
  147. deepeval/models/mlllms/ollama_model.py +0 -144
  148. deepeval/models/mlllms/openai_model.py +0 -258
  149. deepeval/test_case/mllm_test_case.py +0 -170
  150. /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
  151. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
  152. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
  153. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
  154. {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/LICENSE.md +0 -0
  155. {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/WHEEL +0 -0
  156. {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/entry_points.txt +0 -0
deepeval/optimizer/types.py ADDED
@@ -0,0 +1,148 @@
+ from __future__ import annotations
+ import uuid
+ from abc import ABC, abstractmethod
+
+ from dataclasses import dataclass
+ from typing import (
+     Callable,
+     Dict,
+     List,
+     Optional,
+     TypedDict,
+     TYPE_CHECKING,
+     Union,
+ )
+ from enum import Enum
+ from pydantic import BaseModel, ConfigDict
+
+ from deepeval.prompt.prompt import Prompt
+
+ if TYPE_CHECKING:
+     from deepeval.dataset.golden import Golden, ConversationalGolden
+
+ PromptConfigurationId = str
+ ModuleId = str
+ ScoreVector = List[float]  # scores per instance on D_pareto, aligned order
+ ScoreTable = Dict[PromptConfigurationId, ScoreVector]
+
+ # Type alias for model callback function
+ ModelCallback = Callable[[Prompt, Union["Golden", "ConversationalGolden"]], str]
+
+
+ @dataclass
+ class PromptConfiguration:
+     id: PromptConfigurationId
+     parent: Optional[PromptConfigurationId]
+     prompts: Dict[ModuleId, Prompt]
+
+     @staticmethod
+     def new(
+         prompts: Dict[ModuleId, Prompt],
+         parent: Optional[PromptConfigurationId] = None,
+     ) -> "PromptConfiguration":
+         return PromptConfiguration(
+             id=str(uuid.uuid4()), parent=parent, prompts=dict(prompts)
+         )
+
+
+ class RunnerStatusType(str, Enum):
+     """Status events emitted by optimization runners."""
+
+     PROGRESS = "progress"
+     TIE = "tie"
+     ERROR = "error"
+
+
+ # Type alias for status callback function
+ RunnerStatusCallback = Callable[..., None]
+
+
+ class Objective(ABC):
+     """Strategy for reducing scores per-metric to a single scalar value.
+
+     Implementations receive a mapping from metric name to score
+     (for example, {"AnswerRelevancyMetric": 0.82}) and return a
+     single float used for comparisons inside the optimizer.
+     """
+
+     @abstractmethod
+     def scalarize(self, scores_by_metric: Dict[str, float]) -> float:
+         raise NotImplementedError
+
+
+ class MeanObjective(Objective):
+     """Default scalarizer: unweighted arithmetic mean.
+
+     - If `scores_by_metric` is non-empty, returns the arithmetic
+       mean of all metric scores.
+     - If `scores_by_metric` is empty, returns 0.0.
+     """
+
+     def scalarize(self, scores_by_metric: Dict[str, float]) -> float:
+         if not scores_by_metric:
+             return 0.0
+         return sum(scores_by_metric.values()) / len(scores_by_metric)
+
+
+ class WeightedObjective(Objective):
+     """
+     Objective that scales each metric's score by a user-provided weight and sums them.
+
+     - `weights_by_metric` keys should match the names of the metrics passed to the
+       metric class names passed to the PromptOptimizer.
+     - Metrics not present in `weights_by_metric` receive `default_weight`.
+       This makes it easy to emphasize a subset of metrics while keeping
+       everything else at a baseline weight of 1.0, e.g.:
+
+         WeightedObjective({"AnswerRelevancyMetric": 2.0})
+
+     which treats AnswerRelevancy as 2x as important as the other metrics.
+     """
+
+     def __init__(
+         self,
+         weights_by_metric: Optional[Dict[str, float]] = None,
+         default_weight: float = 1.0,
+     ):
+         self.weights_by_metric: Dict[str, float] = dict(weights_by_metric or {})
+         self.default_weight: float = float(default_weight)
+
+     def scalarize(self, scores_by_metric: Dict[str, float]) -> float:
+         return sum(
+             self.weights_by_metric.get(name, self.default_weight) * score
+             for name, score in scores_by_metric.items()
+         )
+
+
+ class AcceptedIterationDict(TypedDict):
+     parent: PromptConfigurationId
+     child: PromptConfigurationId
+     module: ModuleId
+     before: float
+     after: float
+
+
+ class AcceptedIteration(BaseModel):
+     parent: str
+     child: str
+     module: str
+     before: float
+     after: float
+
+
+ class PromptConfigSnapshot(BaseModel):
+     model_config = ConfigDict(arbitrary_types_allowed=True)
+
+     parent: Optional[str]
+     prompts: Dict[str, Prompt]
+
+
+ class OptimizationReport(BaseModel):
+     model_config = ConfigDict(arbitrary_types_allowed=True)
+
+     optimization_id: str
+     best_id: str
+     accepted_iterations: List[AcceptedIteration]
+     pareto_scores: Dict[str, List[float]]
+     parents: Dict[str, Optional[str]]
+     prompt_configurations: Dict[str, PromptConfigSnapshot]
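To make the new scalarizer contract concrete, here is a minimal sketch that exercises `MeanObjective` and `WeightedObjective` exactly as defined in the hunk above. The import path follows `deepeval/optimizer/types.py` as listed in this diff; the metric names and scores are illustrative placeholders only.

```python
from deepeval.optimizer.types import MeanObjective, WeightedObjective

# Illustrative per-metric scores, keyed by metric class name.
scores = {"AnswerRelevancyMetric": 0.82, "FaithfulnessMetric": 0.60}

# Unweighted arithmetic mean: (0.82 + 0.60) / 2 = 0.71
print(MeanObjective().scalarize(scores))

# Weighted sum: 2.0 * 0.82 + 1.0 * 0.60 = 2.24
# (metrics missing from weights_by_metric fall back to default_weight=1.0)
print(WeightedObjective({"AnswerRelevancyMetric": 2.0}).scalarize(scores))
```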
deepeval/optimizer/utils.py ADDED
@@ -0,0 +1,480 @@
+ from __future__ import annotations
+ import inspect
+ import random
+ import re
+ import statistics
+ from typing import (
+     Any,
+     Callable,
+     List,
+     Optional,
+     Protocol,
+     Sequence,
+     Tuple,
+     TYPE_CHECKING,
+     Union,
+     Dict,
+     Set,
+ )
+
+ from deepeval.errors import DeepEvalError
+ from deepeval.metrics.base_metric import BaseMetric, BaseConversationalMetric
+ from deepeval.prompt.prompt import Prompt
+ from deepeval.prompt.api import PromptMessage
+ from deepeval.optimizer.types import (
+     ModelCallback,
+     ModuleId,
+     PromptConfigurationId,
+     PromptConfiguration,
+     PromptConfigSnapshot,
+     OptimizationReport,
+ )
+
+
+ if TYPE_CHECKING:
+     from deepeval.dataset.golden import Golden, ConversationalGolden
+     from deepeval.prompt.api import PromptMessage
+
+
+ def split_goldens(
+     goldens: Union[List[Golden], List[ConversationalGolden]],
+     pareto_size: int,
+     *,
+     random_state: random.Random,
+ ) -> Tuple[
+     Union[List[Golden], List[ConversationalGolden]],
+     Union[List[Golden], List[ConversationalGolden]],
+ ]:
+     """
+     Split `goldens` into two disjoint parts:
+
+     - d_feedback: items not selected for the Pareto validation set
+     - d_pareto: `pareto_size` items for instance-wise Pareto scoring
+
+     The selection is deterministic given `seed`. Within each split, the
+     original order from `goldens` is preserved.
+
+     Args:
+         goldens: Full list/sequence of examples.
+         pareto_size: Number of items to allocate to the Pareto set bound between [0, len(goldens)].
+         random_state: A shared `random.Random` instance that provides the source
+             of randomness. For reproducible runs, pass the same object used by
+             the GEPA loop constructed from `GEPA.random_seed`
+
+     Returns:
+         (d_feedback, d_pareto)
+     """
+     if pareto_size < 0:
+         raise ValueError("pareto_size must be >= 0")
+
+     total = len(goldens)
+
+     if total == 0:
+         # nothing to split
+         return [], []
+
+     # With a single example, we cannot form a meaningful feedback set.
+     # callers like GEPARunner should enforce a minimum of 2 goldens for
+     # optimization.
+     if total == 1:
+         return [], list(goldens)
+
+     # For total >= 2, ensure that we always leave at least one example
+     # for d_feedback. This keeps the splits disjoint while still honoring
+     # pareto_size as a target up to (total - 1).
+     chosen_size = min(pareto_size, total - 1)
+
+     indices = list(range(total))
+     random_state.shuffle(indices)
+
+     pareto_indices = set(indices[:chosen_size])
+
+     d_pareto = [goldens[i] for i in range(total) if i in pareto_indices]
+     d_feedback = [goldens[i] for i in range(total) if i not in pareto_indices]
+
+     return d_feedback, d_pareto
+
+
+ ################################
+ # Prompt normalization helpers #
+ ################################
+
+
+ def _slug(text: str) -> str:
+     slug = text.lower()
+     slug = re.sub(r"[^a-z0-9]+", "-", slug)
+     return slug.strip("-")
+
+
+ def generate_module_id(prompt: Prompt, index: int, existing: Set[str]) -> str:
+     """
+     Build a human readable module id stable within a single optimization run.
+     Prefers alias/label; enrich with model settings provider and name; dedupe; cap to 64 chars.
+     """
+     parts: List[str] = []
+     if prompt.alias:
+         parts.append(str(prompt.alias))
+     if prompt.label:
+         parts.append(str(prompt.label))
+
+     ms = prompt.model_settings
+     if ms is not None:
+         if ms.provider is not None:
+             parts.append(ms.provider.value)
+         if ms.name:
+             parts.append(ms.name)
+
+     base = "-".join(_slug(p) for p in parts if p) or f"module-{index+1}"
+     base = base[:64] or f"module-{index+1}"
+
+     candidate = base
+     suffix = 2
+     while candidate in existing:
+         candidate = f"{base}-{suffix}"
+         candidate = candidate[:64]
+         suffix += 1
+
+     existing.add(candidate)
+     return candidate
+
+
+ def normalize_seed_prompts(
+     seed_prompts: Union[Dict[ModuleId, Prompt], List[Prompt]],
+ ) -> Dict[ModuleId, Prompt]:
+     """
+     Accept either {module_id: Prompt} or List[Prompt].
+     If a list is given, generate human readable module ids.
+     """
+     if isinstance(seed_prompts, dict):
+         return dict(seed_prompts)  # shallow copy
+
+     mapping: Dict[ModuleId, Prompt] = {}
+     used: Set[str] = set()
+     for i, prompt in enumerate(seed_prompts):
+         module_id = generate_module_id(prompt, i, used)
+         mapping[module_id] = prompt
+     return mapping
+
+
+ def invoke_model_callback(
+     *,
+     model_callback: ModelCallback,
+     prompt: Prompt,
+     golden: Union["Golden", "ConversationalGolden"],
+ ) -> str:
+     """
+     Call a user provided model_callback in a synchronous context.
+
+     Raises if the callback returns an awaitable.
+     """
+     result = model_callback(prompt, golden)
+     if inspect.isawaitable(result):
+         raise DeepEvalError(
+             "model_callback returned an awaitable from a synchronous context. "
+             "Either declare the callback as `async def` and use async optimization, or call "
+             "`model.generate(...)` instead of `model.a_generate(...)` inside a sync callback."
+         )
+     return result
+
+
+ async def a_invoke_model_callback(
+     *,
+     model_callback: ModelCallback,
+     prompt: Prompt,
+     golden: Union["Golden", "ConversationalGolden"],
+ ) -> str:
+     """
+     Call a user provided model_callback in an async context.
+
+     Supports both sync and async callbacks.
+     """
+     result = model_callback(prompt, golden)
+     if inspect.isawaitable(result):
+         return await result
+     return result
+
+
+ ###########
+ # Reports #
+ ###########
+
+
+ def build_prompt_config_snapshots(
+     prompt_configurations_by_id: Dict[
+         PromptConfigurationId, "PromptConfiguration"
+     ],
+ ) -> Dict[PromptConfigurationId, PromptConfigSnapshot]:
+     """
+     Build snapshots of all prompt configurations.
+     """
+     snapshots: Dict[PromptConfigurationId, PromptConfigSnapshot] = {}
+
+     for cfg_id, cfg in prompt_configurations_by_id.items():
+         snapshots[cfg_id] = PromptConfigSnapshot(
+             parent=cfg.parent,
+             prompts=dict(cfg.prompts),
+         )
+
+     return snapshots
+
+
+ def inflate_prompts_from_report(
+     report: OptimizationReport,
+ ) -> Dict[str, Dict[str, Prompt]]:
+     """
+     Build a mapping from configuration id -> { module_id -> Prompt }.
+
+     This is a convenience for users who want to work with real Prompt
+     instances instead of raw snapshots.
+
+     Returns:
+         {
+             "<config_id>": {
+                 "<module_id>": Prompt(...),
+                 ...
+             },
+             ...
+         }
+     """
+     inflated: Dict[str, Dict[str, Prompt]] = {}
+
+     for cfg_id, cfg_snapshot in report.prompt_configurations.items():
+         module_prompts: Dict[str, Prompt] = {}
+
+         for module_id, module_snapshot in cfg_snapshot.prompts.items():
+             if module_snapshot.type == "TEXT":
+                 module_prompts[module_id] = Prompt(
+                     text_template=module_snapshot.text_template or ""
+                 )
+             else:  # "LIST"
+                 messages = [
+                     PromptMessage(role=m.role, content=m.content)
+                     for m in module_snapshot.messages or []
+                 ]
+                 module_prompts[module_id] = Prompt(messages_template=messages)
+
+         inflated[cfg_id] = module_prompts
+
+     return inflated
+
+
+ def get_best_prompts_from_report(
+     report: OptimizationReport,
+ ) -> Dict[str, Prompt]:
+     """
+     Convenience wrapper returning the best configuration's module prompts.
+     """
+     all_prompts = inflate_prompts_from_report(report)
+     return all_prompts.get(report.best_id, {})
+
+
+ ##############
+ # Validation #
+ ##############
+ def _format_type_names(types: Tuple[type, ...]) -> str:
+     names = [t.__name__ for t in types]
+     if len(names) == 1:
+         return names[0]
+     if len(names) == 2:
+         return f"{names[0]} or {names[1]}"
+     return ", ".join(names[:-1]) + f", or {names[-1]}"
+
+
+ def validate_instance(
+     *,
+     component: str,
+     param_name: str,
+     value: Any,
+     expected_types: Union[type, Tuple[type, ...]],
+     allow_none: bool = False,
+ ) -> Any:
+     """
+     Generic type validator.
+
+     - component: Intended to help identify what is being validated.
+       e.g. "PromptOptimizer.__init__", "PromptOptimizer.optimize", etc.
+     - param_name: the name of the parameter being validated
+     - value: the actual value passed.
+     - expected_types: a type or tuple of types to accept.
+     - allow_none: if True, None is allowed and returned as-is.
+     """
+     if value is None and allow_none:
+         return value
+
+     if not isinstance(expected_types, tuple):
+         expected_types = (expected_types,)
+
+     if not isinstance(value, expected_types):
+         expected_desc = _format_type_names(expected_types)
+         raise DeepEvalError(
+             f"{component} expected `{param_name}` to be an instance of "
+             f"{expected_desc}, but received {type(value).__name__!r} instead."
+         )
+     return value
+
+
+ def validate_sequence_of(
+     *,
+     component: str,
+     param_name: str,
+     value: Any,
+     expected_item_types: Union[type, Tuple[type, ...]],
+     sequence_types: Tuple[type, ...] = (list, tuple),
+     allow_none: bool = False,
+ ) -> Any:
+     """
+     Generic container validator.
+
+     - Ensures `value` is one of `sequence_types` (list by default).
+     - Ensures each item is an instance of `expected_item_types`.
+
+     Returns the original `value` on success.
+     """
+     if value is None:
+         if allow_none:
+             return value
+         raise DeepEvalError(
+             f"{component} expected `{param_name}` to be a "
+             f"{_format_type_names(sequence_types)} of "
+             f"{_format_type_names(expected_item_types if isinstance(expected_item_types, tuple) else (expected_item_types,))}, "
+             "but received None instead."
+         )
+
+     if not isinstance(sequence_types, tuple):
+         sequence_types = (sequence_types,)
+
+     if not isinstance(value, sequence_types):
+         expected_seq = _format_type_names(sequence_types)
+         raise DeepEvalError(
+             f"{component} expected `{param_name}` to be a {expected_seq}, "
+             f"but received {type(value).__name__!r} instead."
+         )
+
+     if not isinstance(expected_item_types, tuple):
+         expected_item_types = (expected_item_types,)
+
+     for index, item in enumerate(value):
+         if not isinstance(item, expected_item_types):
+             expected_items = _format_type_names(expected_item_types)
+             raise DeepEvalError(
+                 f"{component} expected all elements of `{param_name}` to be "
+                 f"instances of {expected_items}, but element at index {index} "
+                 f"has type {type(item).__name__!r}."
+             )
+
+     return value
+
+
+ def validate_callback(
+     *,
+     component: str,
+     model_callback: Optional[ModelCallback],
+ ) -> ModelCallback:
+     """
+     Ensure that `model_callback` is provided.
+
+     - `model_callback` should be a callable that performs generation and
+       returns the model output.
+
+     Returns `model_callback` unchanged on success.
+     """
+     if model_callback is None:
+         raise DeepEvalError(
+             f"{component} requires a `model_callback`.\n\n"
+             "supply a custom callable via `model_callback=` that performs "
+             "generation and returns the model output."
+         )
+     return model_callback
+
+
+ def validate_metrics(
+     *,
+     component: str,
+     metrics: Union[List[BaseMetric], List[BaseConversationalMetric]],
+ ) -> Union[List[BaseMetric], List[BaseConversationalMetric]]:
+
+     if metrics is None or not len(metrics):
+         raise DeepEvalError(
+             f"{component} requires a `metrics`.\n\n"
+             "supply one or more DeepEval metrics via `metrics=`"
+         )
+
+     validate_sequence_of(
+         component=component,
+         param_name="metrics",
+         value=metrics,
+         expected_item_types=(BaseMetric, BaseConversationalMetric),
+         sequence_types=(list, tuple),
+     )
+     return list(metrics)
+
+
+ def validate_int_in_range(
+     *,
+     component: str,
+     param_name: str,
+     value: int,
+     min_inclusive: Optional[int] = None,
+     max_exclusive: Optional[int] = None,
+ ) -> int:
+     """
+     Validate that an int is within range [min_inclusive, max_exclusive).
+
+     - If `min_inclusive` is not None, value must be >= min_inclusive.
+     - If `max_exclusive` is not None, value must be < max_exclusive.
+
+     Returns the validated int on success.
+     """
+     value = validate_instance(
+         component=component,
+         param_name=param_name,
+         value=value,
+         expected_types=int,
+     )
+
+     # Lower bound check
+     if min_inclusive is not None and value < min_inclusive:
+         if max_exclusive is None:
+             raise DeepEvalError(
+                 f"{component} expected `{param_name}` to be >= {min_inclusive}, "
+                 f"but received {value!r} instead."
+             )
+         max_inclusive = max_exclusive - 1
+         raise DeepEvalError(
+             f"{component} expected `{param_name}` to be between "
+             f"{min_inclusive} and {max_inclusive} (inclusive), "
+             f"but received {value!r} instead."
+         )
+
+     # Upper bound check (half-open, < max_exclusive)
+     if max_exclusive is not None and value >= max_exclusive:
+         if min_inclusive is None:
+             raise DeepEvalError(
+                 f"{component} expected `{param_name}` to be < {max_exclusive}, "
+                 f"but received {value!r} instead."
+             )
+         max_inclusive = max_exclusive - 1
+         raise DeepEvalError(
+             f"{component} expected `{param_name}` to be between "
+             f"{min_inclusive} and {max_inclusive} (inclusive), "
+             f"but received {value!r} instead."
+         )
+
+     return value
+
+
+ ##############
+ # Aggregates #
+ ##############
+
+
+ class Aggregator(Protocol):
+     def __call__(self, scores: Sequence[float]) -> float: ...
+
+
+ def mean_of_all(scores: Sequence[float]) -> float:
+     return statistics.fmean(scores) if scores else 0.0
+
+
+ def median_of_all(scores: Sequence[float]) -> float:
+     return statistics.median(scores) if scores else 0.0
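A minimal sketch of how `split_goldens` behaves, assuming deepeval 3.7.5 is installed. The module paths are taken from this diff; the `Golden(input=...)` construction is an assumption about the existing `Golden` model rather than something introduced in this release.

```python
import random

from deepeval.dataset.golden import Golden
from deepeval.optimizer.utils import split_goldens

# Ten toy goldens; `input` is assumed to be a valid Golden field.
goldens = [Golden(input=f"question {i}") for i in range(10)]

# A shared random.Random instance makes the split deterministic and reproducible.
rng = random.Random(42)
d_feedback, d_pareto = split_goldens(goldens, pareto_size=3, random_state=rng)

print(len(d_feedback), len(d_pareto))  # 7 3 — disjoint, original order preserved
```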
deepeval/prompt/prompt.py CHANGED
@@ -7,9 +7,6 @@ from enum import Enum
  from typing import Optional, List, Dict, Type, Literal
  from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn
  from rich.console import Console
- import time
- import json
- import os
  from pydantic import BaseModel, ValidationError
  import asyncio
  import threading
@@ -37,7 +34,6 @@ from deepeval.prompt.utils import (
  from deepeval.confident.api import Api, Endpoints, HttpMethods
  from deepeval.constants import HIDDEN_DIR
 
-
  logger = logging.getLogger(__name__)
 
  portalocker = None
@@ -117,6 +113,7 @@ class Prompt:
          model_settings: Optional[ModelSettings] = None,
          output_type: Optional[OutputType] = None,
          output_schema: Optional[Type[BaseModel]] = None,
+         interpolation_type: Optional[PromptInterpolationType] = None,
      ):
          if text_template and messages_template:
              raise TypeError(
@@ -129,7 +126,9 @@ class Prompt:
          self.output_type: Optional[OutputType] = output_type
          self.output_schema: Optional[Type[BaseModel]] = output_schema
          self.label: Optional[str] = None
-         self.interpolation_type: Optional[PromptInterpolationType] = None
+         self.interpolation_type: PromptInterpolationType = (
+             interpolation_type or PromptInterpolationType.FSTRING
+         )
 
          self._version = None
          self._prompt_version_id: Optional[str] = None
@@ -178,7 +177,7 @@ class Prompt:
          content = f.read()
          try:
              data = json.loads(content)
-         except (json.JSONDecodeError, TypeError):
+         except (TypeError, json.JSONDecodeError):
              self.text_template = content
              return content
 
@@ -364,6 +363,8 @@ class Prompt:
              f.seek(0)
              f.truncate()
              json.dump(cache_data, f, cls=CustomEncoder)
+             f.flush()
+             os.fsync(f.fileno())
          except portalocker.exceptions.LockException:
              # If we can't acquire the lock, silently skip caching
              pass
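A small sketch of what the constructor change means for callers, assuming 3.7.5 is installed; only the behavior visible in the hunks above is exercised, and the enum's import path is not asserted because it does not appear in this diff.

```python
from deepeval.prompt.prompt import Prompt

# In 3.7.5 a Prompt built without an explicit interpolation_type defaults to
# PromptInterpolationType.FSTRING instead of being left as None.
prompt = Prompt(text_template="Answer the question: {question}")
print(prompt.interpolation_type)

# The new keyword can also be passed explicitly, e.g.
#   Prompt(text_template="...", interpolation_type=PromptInterpolationType.FSTRING)
# where PromptInterpolationType is the enum referenced in the hunk above.
```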
deepeval/test_case/__init__.py CHANGED
@@ -3,13 +3,13 @@ from .llm_test_case import (
      LLMTestCaseParams,
      ToolCall,
      ToolCallParams,
+     MLLMImage,
  )
  from .conversational_test_case import (
      ConversationalTestCase,
      Turn,
      TurnParams,
  )
- from .mllm_test_case import MLLMTestCase, MLLMTestCaseParams, MLLMImage
  from .arena_test_case import ArenaTestCase, Contestant
  from .mcp import (
      MCPServer,
@@ -31,8 +31,6 @@ __all__ = [
      "MCPPromptCall",
      "MCPResourceCall",
      "MCPToolCall",
-     "MLLMTestCase",
-     "MLLMTestCaseParams",
      "MLLMImage",
      "ArenaTestCase",
      "Contestant",