deepeval 3.7.3__py3-none-any.whl → 3.7.4__py3-none-any.whl
- deepeval/_version.py +1 -1
- deepeval/cli/test.py +1 -1
- deepeval/config/settings.py +102 -13
- deepeval/evaluate/configs.py +1 -1
- deepeval/evaluate/execute.py +4 -1
- deepeval/metrics/answer_relevancy/template.py +4 -4
- deepeval/metrics/argument_correctness/template.py +2 -2
- deepeval/metrics/bias/template.py +3 -3
- deepeval/metrics/contextual_precision/template.py +6 -6
- deepeval/metrics/contextual_recall/template.py +2 -2
- deepeval/metrics/contextual_relevancy/template.py +3 -3
- deepeval/metrics/conversation_completeness/template.py +2 -2
- deepeval/metrics/conversational_dag/templates.py +4 -4
- deepeval/metrics/conversational_g_eval/template.py +4 -3
- deepeval/metrics/dag/templates.py +4 -4
- deepeval/metrics/faithfulness/template.py +4 -4
- deepeval/metrics/hallucination/template.py +4 -4
- deepeval/metrics/misuse/template.py +2 -2
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +7 -7
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +6 -6
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +2 -2
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +3 -3
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +9 -9
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +4 -4
- deepeval/metrics/non_advice/template.py +2 -2
- deepeval/metrics/pii_leakage/template.py +2 -2
- deepeval/metrics/prompt_alignment/template.py +4 -4
- deepeval/metrics/role_violation/template.py +2 -2
- deepeval/metrics/step_efficiency/step_efficiency.py +1 -1
- deepeval/metrics/toxicity/template.py +4 -4
- deepeval/metrics/turn_relevancy/template.py +2 -2
- deepeval/models/embedding_models/azure_embedding_model.py +28 -15
- deepeval/models/embedding_models/local_embedding_model.py +23 -10
- deepeval/models/embedding_models/ollama_embedding_model.py +8 -6
- deepeval/models/embedding_models/openai_embedding_model.py +18 -2
- deepeval/models/llms/anthropic_model.py +17 -5
- deepeval/models/llms/azure_model.py +30 -18
- deepeval/models/llms/deepseek_model.py +22 -12
- deepeval/models/llms/gemini_model.py +120 -87
- deepeval/models/llms/grok_model.py +23 -16
- deepeval/models/llms/kimi_model.py +23 -12
- deepeval/models/llms/litellm_model.py +63 -25
- deepeval/models/llms/local_model.py +26 -18
- deepeval/models/llms/ollama_model.py +17 -7
- deepeval/models/llms/openai_model.py +22 -17
- deepeval/models/llms/portkey_model.py +132 -0
- deepeval/models/mlllms/azure_model.py +28 -19
- deepeval/models/mlllms/gemini_model.py +102 -73
- deepeval/models/mlllms/ollama_model.py +40 -9
- deepeval/models/mlllms/openai_model.py +65 -14
- deepeval/models/utils.py +48 -3
- deepeval/optimization/__init__.py +13 -0
- deepeval/optimization/adapters/__init__.py +2 -0
- deepeval/optimization/adapters/deepeval_scoring_adapter.py +588 -0
- deepeval/optimization/aggregates.py +14 -0
- deepeval/optimization/configs.py +34 -0
- deepeval/optimization/copro/configs.py +31 -0
- deepeval/optimization/copro/loop.py +837 -0
- deepeval/optimization/gepa/__init__.py +7 -0
- deepeval/optimization/gepa/configs.py +115 -0
- deepeval/optimization/gepa/loop.py +677 -0
- deepeval/optimization/miprov2/configs.py +134 -0
- deepeval/optimization/miprov2/loop.py +785 -0
- deepeval/optimization/mutations/__init__.py +0 -0
- deepeval/optimization/mutations/prompt_rewriter.py +458 -0
- deepeval/optimization/policies/__init__.py +16 -0
- deepeval/optimization/policies/selection.py +166 -0
- deepeval/optimization/policies/tie_breaker.py +67 -0
- deepeval/optimization/prompt_optimizer.py +462 -0
- deepeval/optimization/simba/__init__.py +0 -0
- deepeval/optimization/simba/configs.py +33 -0
- deepeval/optimization/simba/loop.py +983 -0
- deepeval/optimization/simba/types.py +15 -0
- deepeval/optimization/types.py +361 -0
- deepeval/optimization/utils.py +598 -0
- deepeval/prompt/prompt.py +10 -5
- deepeval/test_run/cache.py +2 -0
- deepeval/test_run/test_run.py +6 -1
- deepeval/utils.py +24 -0
- {deepeval-3.7.3.dist-info → deepeval-3.7.4.dist-info}/METADATA +1 -1
- {deepeval-3.7.3.dist-info → deepeval-3.7.4.dist-info}/RECORD +84 -59
- {deepeval-3.7.3.dist-info → deepeval-3.7.4.dist-info}/LICENSE.md +0 -0
- {deepeval-3.7.3.dist-info → deepeval-3.7.4.dist-info}/WHEEL +0 -0
- {deepeval-3.7.3.dist-info → deepeval-3.7.4.dist-info}/entry_points.txt +0 -0
deepeval/optimization/simba/types.py
@@ -0,0 +1,15 @@
+from enum import Enum
+
+
+class SIMBAStrategy(str, Enum):
+    """
+    Edit strategies used by SIMBA-style optimization.
+
+    - APPEND_DEMO: append one or more input/output demos distilled from the
+      current minibatch, similar in spirit to DSPy's `append_a_demo`.
+    - APPEND_RULE: append a concise natural-language rule distilled from
+      feedback, similar in spirit to DSPy's `append_a_rule`.
+    """
+
+    APPEND_DEMO = "append_demo"
+    APPEND_RULE = "append_rule"
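Since `SIMBAStrategy` subclasses `str`, its members compare equal to their raw values and round-trip through plain strings such as config entries. A minimal usage sketch, assuming only the enum above (the `describe_edit` helper is hypothetical, not part of the package):

    from deepeval.optimization.simba.types import SIMBAStrategy

    def describe_edit(strategy: SIMBAStrategy) -> str:
        # str-backed enum members compare equal to their raw string values.
        if strategy is SIMBAStrategy.APPEND_DEMO:
            return "append a distilled input/output demo"
        if strategy == "append_rule":
            return "append a distilled natural-language rule"
        raise ValueError(f"unknown strategy: {strategy}")

    # Raw config strings round-trip to the singleton members.
    assert SIMBAStrategy("append_demo") is SIMBAStrategy.APPEND_DEMO
    print(describe_edit(SIMBAStrategy.APPEND_RULE))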
deepeval/optimization/types.py
@@ -0,0 +1,361 @@
+from __future__ import annotations
+import uuid
+
+from dataclasses import dataclass
+from typing import (
+    Any,
+    Callable,
+    Dict,
+    List,
+    Literal,
+    Optional,
+    Protocol,
+    TYPE_CHECKING,
+    TypedDict,
+    Tuple,
+    Union,
+)
+from enum import Enum
+from pydantic import BaseModel as PydanticBaseModel, Field, AliasChoices
+
+from deepeval.prompt.prompt import Prompt
+from deepeval.models.base_model import DeepEvalBaseLLM
+
+
+if TYPE_CHECKING:
+    from deepeval.dataset.golden import Golden, ConversationalGolden
+
+PromptConfigurationId = str
+ModuleId = str
+ScoreVector = List[float]  # scores per instance on D_pareto, aligned order
+ScoreTable = Dict[PromptConfigurationId, ScoreVector]
+
+
+@dataclass
+class PromptConfiguration:
+    id: PromptConfigurationId
+    parent: Optional[PromptConfigurationId]
+    prompts: Dict[ModuleId, Prompt]
+
+    @staticmethod
+    def new(
+        prompts: Dict[ModuleId, Prompt],
+        parent: Optional[PromptConfigurationId] = None,
+    ) -> "PromptConfiguration":
+        return PromptConfiguration(
+            id=str(uuid.uuid4()), parent=parent, prompts=dict(prompts)
+        )
+
+
+class ScoringAdapter(Protocol):
+    """
+    Scoring adapter contract used by optimization runners.
+
+    Runners call into this adapter to:
+    - compute scores per-instance on some subset (score_on_pareto),
+    - compute minibatch means for selection and acceptance,
+    - generate feedback text used by the PromptRewriter.
+    """
+
+    # Sync
+    def score_on_pareto(
+        self,
+        prompt_configuration: PromptConfiguration,
+        d_pareto: Union[List[Golden], List[ConversationalGolden]],
+    ) -> ScoreVector:
+        """Return per-instance scores on D_pareto."""
+        ...
+
+    def minibatch_score(
+        self,
+        prompt_configuration: PromptConfiguration,
+        minibatch: Union[List[Golden], List[ConversationalGolden]],
+    ) -> float:
+        """Return average score μ on a minibatch from D_feedback."""
+        ...
+
+    def minibatch_feedback(
+        self,
+        prompt_configuration: PromptConfiguration,
+        module: ModuleId,
+        minibatch: Union[List[Golden], List[ConversationalGolden]],
+    ) -> str:
+        """Return μ_f text for the module (metric.reason + traces, etc.)."""
+        ...
+
+    def select_module(
+        self, prompt_configuration: PromptConfiguration
+    ) -> ModuleId:
+        """Pick a module to mutate."""
+        ...
+
+    # Async
+    async def a_score_on_pareto(
+        self,
+        prompt_configuration: PromptConfiguration,
+        d_pareto: Union[List[Golden], List[ConversationalGolden]],
+    ) -> ScoreVector: ...
+    async def a_minibatch_score(
+        self,
+        prompt_configuration: PromptConfiguration,
+        minibatch: Union[List[Golden], List[ConversationalGolden]],
+    ) -> float: ...
+    async def a_minibatch_feedback(
+        self,
+        prompt_configuration: PromptConfiguration,
+        module: ModuleId,
+        minibatch: Union[List[Golden], List[ConversationalGolden]],
+    ) -> str: ...
+    async def a_select_module(
+        self, prompt_configuration: PromptConfiguration
+    ) -> ModuleId: ...
+
+
+class PromptRewriterProtocol(Protocol):
+    def rewrite(
+        self,
+        *,
+        module_id: ModuleId,
+        model: Optional[DeepEvalBaseLLM] = None,
+        model_schema: Optional[PydanticBaseModel] = None,
+        model_callback: Optional[
+            Callable[
+                ...,
+                Union[
+                    str,
+                    Dict,
+                    Tuple[Union[str, Dict], float],
+                ],
+            ]
+        ] = None,
+        old_prompt: Prompt,
+        feedback_text: str,
+    ) -> Prompt: ...
+
+    async def a_rewrite(
+        self,
+        *,
+        module_id: ModuleId,
+        model: Optional[DeepEvalBaseLLM] = None,
+        model_schema: Optional[PydanticBaseModel] = None,
+        model_callback: Optional[
+            Callable[
+                ...,
+                Union[
+                    str,
+                    Dict,
+                    Tuple[Union[str, Dict], float],
+                ],
+            ]
+        ] = None,
+        old_prompt: Prompt,
+        feedback_text: str,
+    ) -> Prompt: ...
+
+
+class RunnerStatusType(str, Enum):
+    """Status events emitted by optimization runners."""
+
+    PROGRESS = "progress"
+    TIE = "tie"
+    ERROR = "error"
+
+
+class RunnerStatusCallbackProtocol(Protocol):
+    def __call__(
+        self,
+        kind: RunnerStatusType,
+        *,
+        detail: str,
+        step_index: Optional[int] = None,
+        total_steps: Optional[int] = None,
+    ) -> None: ...
+
+
+class RunnerProtocol(Protocol):
+    """
+    Contract for prompt optimization runners used by PromptOptimizer.
+
+    Runners are responsible for executing the optimization algorithm
+    and returning an optimized Prompt plus a report dict.
+    """
+
+    # status_callback is injected by PromptOptimizer
+    # A runner may call this to report:
+    # progress, ties, or errors during execution.
+    status_callback: Optional[RunnerStatusCallbackProtocol]
+    model_callback: Optional[
+        Callable[
+            ...,
+            Union[
+                str,
+                Dict,
+                Tuple[Union[str, Dict], float],
+            ],
+        ]
+    ]
+
+    scoring_adapter: Optional[ScoringAdapter]
+
+    def execute(
+        self,
+        *,
+        prompt: Prompt,
+        goldens: Union[List["Golden"], List["ConversationalGolden"]],
+    ) -> Tuple[Prompt, Dict]: ...
+
+    async def a_execute(
+        self,
+        *,
+        prompt: Prompt,
+        goldens: Union[List["Golden"], List["ConversationalGolden"]],
+    ) -> Tuple[Prompt, Dict]: ...
+
+
+class Objective(Protocol):
+    """Strategy for reducing scores per-metric to a single scalar value.
+
+    Implementations receive a mapping from metric name to score
+    (for example, {"AnswerRelevancyMetric": 0.82}) and return a
+    single float used for comparisons inside the optimizer.
+    """
+
+    def scalarize(self, scores_by_metric: Dict[str, float]) -> float: ...
+
+
+class MeanObjective(Objective):
+    """Default scalarizer: unweighted arithmetic mean.
+
+    - If `scores_by_metric` is non-empty, returns the arithmetic
+      mean of all metric scores.
+    - If `scores_by_metric` is empty, returns 0.0.
+    """
+
+    def scalarize(self, scores_by_metric: Dict[str, float]) -> float:
+        if not scores_by_metric:
+            return 0.0
+        return sum(scores_by_metric.values()) / len(scores_by_metric)
+
+
+class WeightedObjective(Objective):
+    """
+    Objective that scales each metric's score by a user-provided weight and sums them.
+
+    - `weights_by_metric` keys should match the names of the metrics passed to the
+      metric class names passed to the PromptOptimizer.
+    - Metrics not present in `weights_by_metric` receive `default_weight`.
+      This makes it easy to emphasize a subset of metrics while keeping
+      everything else at a baseline weight of 1.0, e.g.:
+
+          WeightedObjective({"AnswerRelevancyMetric": 2.0})
+
+      which treats AnswerRelevancy as 2x as important as the other metrics.
+    """
+
+    def __init__(
+        self,
+        weights_by_metric: Optional[Dict[str, float]] = None,
+        default_weight: float = 1.0,
+    ):
+        self.weights_by_metric: Dict[str, float] = dict(weights_by_metric or {})
+        self.default_weight: float = float(default_weight)
+
+    def scalarize(self, scores_by_metric: Dict[str, float]) -> float:
+        return sum(
+            self.weights_by_metric.get(name, self.default_weight) * score
+            for name, score in scores_by_metric.items()
+        )
+
+
+@dataclass
+class MetricInfo:
+    name: str
+    rubric: Optional[str] = None
+
+
+class AcceptedIterationDict(TypedDict):
+    parent: PromptConfigurationId
+    child: PromptConfigurationId
+    module: ModuleId
+    before: float
+    after: float
+
+
+class AcceptedIteration(PydanticBaseModel):
+    parent: str
+    child: str
+    module: str
+    before: float
+    after: float
+
+
+class PromptMessageSnapshot(PydanticBaseModel):
+    role: str
+    content: str
+
+
+class PromptModuleSnapshot(PydanticBaseModel):
+    type: Literal["TEXT", "LIST"]
+    # Only used when type == "TEXT"
+    text_template: Optional[str] = None
+    # Only used when type == "LIST"
+    messages: Optional[List[PromptMessageSnapshot]] = None
+
+
+class PromptConfigSnapshot(PydanticBaseModel):
+    parent: Optional[str]
+    prompts: Dict[str, PromptModuleSnapshot]
+
+
+@dataclass
+class OptimizationResult:
+    optimization_id: str
+    best_id: PromptConfigurationId
+    accepted_iterations: List[Dict]
+    pareto_scores: Dict[PromptConfigurationId, List[float]]
+    parents: Dict[PromptConfigurationId, Optional[PromptConfigurationId]]
+    prompt_configurations: Dict[PromptConfigurationId, Dict[str, Any]]
+
+    def as_dict(self) -> Dict:
+        return dict(
+            optimization_id=self.optimization_id,
+            best_id=self.best_id,
+            accepted_iterations=self.accepted_iterations,
+            pareto_scores=self.pareto_scores,
+            parents=self.parents,
+            prompt_configurations=self.prompt_configurations,
+        )
+
+
+class OptimizationReport(PydanticBaseModel):
+    optimization_id: str = Field(
+        alias="optimizationId",
+        validation_alias=AliasChoices("optimizationId", "optimization_id"),
+    )
+    best_id: str = Field(
+        alias="bestId",
+        validation_alias=AliasChoices("bestId", "best_id"),
+    )
+    accepted_iterations: list[AcceptedIteration] = Field(
+        default_factory=list,
+        alias="acceptedIterations",
+        validation_alias=AliasChoices(
+            "acceptedIterations", "accepted_iterations"
+        ),
+    )
+    pareto_scores: dict[str, list[float]] = Field(
+        alias="paretoScores",
+        validation_alias=AliasChoices("paretoScores", "pareto_scores"),
+    )
+    parents: dict[str, str | None]
+    prompt_configurations: dict[str, PromptConfigSnapshot] = Field(
+        alias="promptConfigurations",
+        validation_alias=AliasChoices(
+            "promptConfigurations", "prompt_configurations"
+        ),
+    )
+
+    @classmethod
+    def from_runtime(cls, result: dict) -> "OptimizationReport":
+        # accepts the dict from OptimizationResult.as_dict()
+        return cls(**result)
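For orientation, a minimal sketch of how the scalarizers and report model above fit together. All IDs, module names, and scores are made-up illustrative values, and the imports assume the names are used directly from deepeval/optimization/types.py as shown in the hunk:

    from deepeval.optimization.types import (
        MeanObjective,
        OptimizationReport,
        OptimizationResult,
        WeightedObjective,
    )

    scores = {"AnswerRelevancyMetric": 0.82, "FaithfulnessMetric": 0.60}

    # Unweighted mean: (0.82 + 0.60) / 2 = 0.71
    print(MeanObjective().scalarize(scores))

    # Double-weight answer relevancy, default weight 1.0 elsewhere:
    # 2.0 * 0.82 + 1.0 * 0.60 = 2.24
    print(WeightedObjective({"AnswerRelevancyMetric": 2.0}).scalarize(scores))

    # A runner's OptimizationResult serializes to snake_case keys, which
    # OptimizationReport accepts through its AliasChoices validation aliases.
    result = OptimizationResult(
        optimization_id="run-1",
        best_id="cfg-2",
        accepted_iterations=[
            {
                "parent": "cfg-1",
                "child": "cfg-2",
                "module": "system_prompt",
                "before": 0.58,
                "after": 0.71,
            }
        ],
        pareto_scores={"cfg-1": [0.50, 0.60], "cfg-2": [0.70, 0.72]},
        parents={"cfg-1": None, "cfg-2": "cfg-1"},
        prompt_configurations={},
    )
    report = OptimizationReport.from_runtime(result.as_dict())
    print(report.best_id)  # "cfg-2"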