deepeval 3.7.3__py3-none-any.whl → 3.7.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84)
  1. deepeval/_version.py +1 -1
  2. deepeval/cli/test.py +1 -1
  3. deepeval/config/settings.py +102 -13
  4. deepeval/evaluate/configs.py +1 -1
  5. deepeval/evaluate/execute.py +4 -1
  6. deepeval/metrics/answer_relevancy/template.py +4 -4
  7. deepeval/metrics/argument_correctness/template.py +2 -2
  8. deepeval/metrics/bias/template.py +3 -3
  9. deepeval/metrics/contextual_precision/template.py +6 -6
  10. deepeval/metrics/contextual_recall/template.py +2 -2
  11. deepeval/metrics/contextual_relevancy/template.py +3 -3
  12. deepeval/metrics/conversation_completeness/template.py +2 -2
  13. deepeval/metrics/conversational_dag/templates.py +4 -4
  14. deepeval/metrics/conversational_g_eval/template.py +4 -3
  15. deepeval/metrics/dag/templates.py +4 -4
  16. deepeval/metrics/faithfulness/template.py +4 -4
  17. deepeval/metrics/hallucination/template.py +4 -4
  18. deepeval/metrics/misuse/template.py +2 -2
  19. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +7 -7
  20. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +6 -6
  21. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +2 -2
  22. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +3 -3
  23. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +9 -9
  24. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +4 -4
  25. deepeval/metrics/non_advice/template.py +2 -2
  26. deepeval/metrics/pii_leakage/template.py +2 -2
  27. deepeval/metrics/prompt_alignment/template.py +4 -4
  28. deepeval/metrics/role_violation/template.py +2 -2
  29. deepeval/metrics/step_efficiency/step_efficiency.py +1 -1
  30. deepeval/metrics/toxicity/template.py +4 -4
  31. deepeval/metrics/turn_relevancy/template.py +2 -2
  32. deepeval/models/embedding_models/azure_embedding_model.py +28 -15
  33. deepeval/models/embedding_models/local_embedding_model.py +23 -10
  34. deepeval/models/embedding_models/ollama_embedding_model.py +8 -6
  35. deepeval/models/embedding_models/openai_embedding_model.py +18 -2
  36. deepeval/models/llms/anthropic_model.py +17 -5
  37. deepeval/models/llms/azure_model.py +30 -18
  38. deepeval/models/llms/deepseek_model.py +22 -12
  39. deepeval/models/llms/gemini_model.py +120 -87
  40. deepeval/models/llms/grok_model.py +23 -16
  41. deepeval/models/llms/kimi_model.py +23 -12
  42. deepeval/models/llms/litellm_model.py +63 -25
  43. deepeval/models/llms/local_model.py +26 -18
  44. deepeval/models/llms/ollama_model.py +17 -7
  45. deepeval/models/llms/openai_model.py +22 -17
  46. deepeval/models/llms/portkey_model.py +132 -0
  47. deepeval/models/mlllms/azure_model.py +28 -19
  48. deepeval/models/mlllms/gemini_model.py +102 -73
  49. deepeval/models/mlllms/ollama_model.py +40 -9
  50. deepeval/models/mlllms/openai_model.py +65 -14
  51. deepeval/models/utils.py +48 -3
  52. deepeval/optimization/__init__.py +13 -0
  53. deepeval/optimization/adapters/__init__.py +2 -0
  54. deepeval/optimization/adapters/deepeval_scoring_adapter.py +588 -0
  55. deepeval/optimization/aggregates.py +14 -0
  56. deepeval/optimization/configs.py +34 -0
  57. deepeval/optimization/copro/configs.py +31 -0
  58. deepeval/optimization/copro/loop.py +837 -0
  59. deepeval/optimization/gepa/__init__.py +7 -0
  60. deepeval/optimization/gepa/configs.py +115 -0
  61. deepeval/optimization/gepa/loop.py +677 -0
  62. deepeval/optimization/miprov2/configs.py +134 -0
  63. deepeval/optimization/miprov2/loop.py +785 -0
  64. deepeval/optimization/mutations/__init__.py +0 -0
  65. deepeval/optimization/mutations/prompt_rewriter.py +458 -0
  66. deepeval/optimization/policies/__init__.py +16 -0
  67. deepeval/optimization/policies/selection.py +166 -0
  68. deepeval/optimization/policies/tie_breaker.py +67 -0
  69. deepeval/optimization/prompt_optimizer.py +462 -0
  70. deepeval/optimization/simba/__init__.py +0 -0
  71. deepeval/optimization/simba/configs.py +33 -0
  72. deepeval/optimization/simba/loop.py +983 -0
  73. deepeval/optimization/simba/types.py +15 -0
  74. deepeval/optimization/types.py +361 -0
  75. deepeval/optimization/utils.py +598 -0
  76. deepeval/prompt/prompt.py +10 -5
  77. deepeval/test_run/cache.py +2 -0
  78. deepeval/test_run/test_run.py +6 -1
  79. deepeval/utils.py +24 -0
  80. {deepeval-3.7.3.dist-info → deepeval-3.7.4.dist-info}/METADATA +1 -1
  81. {deepeval-3.7.3.dist-info → deepeval-3.7.4.dist-info}/RECORD +84 -59
  82. {deepeval-3.7.3.dist-info → deepeval-3.7.4.dist-info}/LICENSE.md +0 -0
  83. {deepeval-3.7.3.dist-info → deepeval-3.7.4.dist-info}/WHEEL +0 -0
  84. {deepeval-3.7.3.dist-info → deepeval-3.7.4.dist-info}/entry_points.txt +0 -0
deepeval/optimization/simba/types.py
@@ -0,0 +1,15 @@
+from enum import Enum
+
+
+class SIMBAStrategy(str, Enum):
+    """
+    Edit strategies used by SIMBA-style optimization.
+
+    - APPEND_DEMO: append one or more input/output demos distilled from the
+      current minibatch, similar in spirit to DSPy's `append_a_demo`.
+    - APPEND_RULE: append a concise natural-language rule distilled from
+      feedback, similar in spirit to DSPy's `append_a_rule`.
+    """
+
+    APPEND_DEMO = "append_demo"
+    APPEND_RULE = "append_rule"
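Because SIMBAStrategy subclasses both str and Enum, its members compare equal to their string values, which makes them easy to parse from plain configuration. Below is a minimal sketch of that behavior; the dispatch branches are illustrative placeholders, not the actual edit logic in deepeval/optimization/simba/loop.py.

from deepeval.optimization.simba.types import SIMBAStrategy

strategy = SIMBAStrategy("append_demo")       # parse from a plain string
assert strategy is SIMBAStrategy.APPEND_DEMO
assert strategy == "append_demo"              # str-valued members compare equal to strings

# Hypothetical dispatch; the real SIMBA loop is not shown in this diff.
if strategy is SIMBAStrategy.APPEND_DEMO:
    edit = "append input/output demos distilled from the minibatch"
else:
    edit = "append a natural-language rule distilled from feedback"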
deepeval/optimization/types.py
@@ -0,0 +1,361 @@
+from __future__ import annotations
+import uuid
+
+from dataclasses import dataclass
+from typing import (
+    Any,
+    Callable,
+    Dict,
+    List,
+    Literal,
+    Optional,
+    Protocol,
+    TYPE_CHECKING,
+    TypedDict,
+    Tuple,
+    Union,
+)
+from enum import Enum
+from pydantic import BaseModel as PydanticBaseModel, Field, AliasChoices
+
+from deepeval.prompt.prompt import Prompt
+from deepeval.models.base_model import DeepEvalBaseLLM
+
+
+if TYPE_CHECKING:
+    from deepeval.dataset.golden import Golden, ConversationalGolden
+
+PromptConfigurationId = str
+ModuleId = str
+ScoreVector = List[float]  # scores per instance on D_pareto, aligned order
+ScoreTable = Dict[PromptConfigurationId, ScoreVector]
+
+
+@dataclass
+class PromptConfiguration:
+    id: PromptConfigurationId
+    parent: Optional[PromptConfigurationId]
+    prompts: Dict[ModuleId, Prompt]
+
+    @staticmethod
+    def new(
+        prompts: Dict[ModuleId, Prompt],
+        parent: Optional[PromptConfigurationId] = None,
+    ) -> "PromptConfiguration":
+        return PromptConfiguration(
+            id=str(uuid.uuid4()), parent=parent, prompts=dict(prompts)
+        )
+
+
+class ScoringAdapter(Protocol):
+    """
+    Scoring adapter contract used by optimization runners.
+
+    Runners call into this adapter to:
+    - compute scores per-instance on some subset (score_on_pareto),
+    - compute minibatch means for selection and acceptance,
+    - generate feedback text used by the PromptRewriter.
+    """
+
+    # Sync
+    def score_on_pareto(
+        self,
+        prompt_configuration: PromptConfiguration,
+        d_pareto: Union[List[Golden], List[ConversationalGolden]],
+    ) -> ScoreVector:
+        """Return per-instance scores on D_pareto."""
+        ...
+
+    def minibatch_score(
+        self,
+        prompt_configuration: PromptConfiguration,
+        minibatch: Union[List[Golden], List[ConversationalGolden]],
+    ) -> float:
+        """Return average score μ on a minibatch from D_feedback."""
+        ...
+
+    def minibatch_feedback(
+        self,
+        prompt_configuration: PromptConfiguration,
+        module: ModuleId,
+        minibatch: Union[List[Golden], List[ConversationalGolden]],
+    ) -> str:
+        """Return μ_f text for the module (metric.reason + traces, etc.)."""
+        ...
+
+    def select_module(
+        self, prompt_configuration: PromptConfiguration
+    ) -> ModuleId:
+        """Pick a module to mutate."""
+        ...
+
+    # Async
+    async def a_score_on_pareto(
+        self,
+        prompt_configuration: PromptConfiguration,
+        d_pareto: Union[List[Golden], List[ConversationalGolden]],
+    ) -> ScoreVector: ...
+    async def a_minibatch_score(
+        self,
+        prompt_configuration: PromptConfiguration,
+        minibatch: Union[List[Golden], List[ConversationalGolden]],
+    ) -> float: ...
+    async def a_minibatch_feedback(
+        self,
+        prompt_configuration: PromptConfiguration,
+        module: ModuleId,
+        minibatch: Union[List[Golden], List[ConversationalGolden]],
+    ) -> str: ...
+    async def a_select_module(
+        self, prompt_configuration: PromptConfiguration
+    ) -> ModuleId: ...
+
+
+class PromptRewriterProtocol(Protocol):
+    def rewrite(
+        self,
+        *,
+        module_id: ModuleId,
+        model: Optional[DeepEvalBaseLLM] = None,
+        model_schema: Optional[PydanticBaseModel] = None,
+        model_callback: Optional[
+            Callable[
+                ...,
+                Union[
+                    str,
+                    Dict,
+                    Tuple[Union[str, Dict], float],
+                ],
+            ]
+        ] = None,
+        old_prompt: Prompt,
+        feedback_text: str,
+    ) -> Prompt: ...
+
+    async def a_rewrite(
+        self,
+        *,
+        module_id: ModuleId,
+        model: Optional[DeepEvalBaseLLM] = None,
+        model_schema: Optional[PydanticBaseModel] = None,
+        model_callback: Optional[
+            Callable[
+                ...,
+                Union[
+                    str,
+                    Dict,
+                    Tuple[Union[str, Dict], float],
+                ],
+            ]
+        ] = None,
+        old_prompt: Prompt,
+        feedback_text: str,
+    ) -> Prompt: ...
+
+
+class RunnerStatusType(str, Enum):
+    """Status events emitted by optimization runners."""
+
+    PROGRESS = "progress"
+    TIE = "tie"
+    ERROR = "error"
+
+
+class RunnerStatusCallbackProtocol(Protocol):
+    def __call__(
+        self,
+        kind: RunnerStatusType,
+        *,
+        detail: str,
+        step_index: Optional[int] = None,
+        total_steps: Optional[int] = None,
+    ) -> None: ...
+
+
+class RunnerProtocol(Protocol):
+    """
+    Contract for prompt optimization runners used by PromptOptimizer.
+
+    Runners are responsible for executing the optimization algorithm
+    and returning an optimized Prompt plus a report dict.
+    """
+
+    # status_callback is injected by PromptOptimizer.
+    # A runner may call this to report
+    # progress, ties, or errors during execution.
+    status_callback: Optional[RunnerStatusCallbackProtocol]
+    model_callback: Optional[
+        Callable[
+            ...,
+            Union[
+                str,
+                Dict,
+                Tuple[Union[str, Dict], float],
+            ],
+        ]
+    ]
+
+    scoring_adapter: Optional[ScoringAdapter]
+
+    def execute(
+        self,
+        *,
+        prompt: Prompt,
+        goldens: Union[List["Golden"], List["ConversationalGolden"]],
+    ) -> Tuple[Prompt, Dict]: ...
+
+    async def a_execute(
+        self,
+        *,
+        prompt: Prompt,
+        goldens: Union[List["Golden"], List["ConversationalGolden"]],
+    ) -> Tuple[Prompt, Dict]: ...
+
+
+class Objective(Protocol):
+    """Strategy for reducing per-metric scores to a single scalar value.
+
+    Implementations receive a mapping from metric name to score
+    (for example, {"AnswerRelevancyMetric": 0.82}) and return a
+    single float used for comparisons inside the optimizer.
+    """
+
+    def scalarize(self, scores_by_metric: Dict[str, float]) -> float: ...
+
+
+class MeanObjective(Objective):
+    """Default scalarizer: unweighted arithmetic mean.
+
+    - If `scores_by_metric` is non-empty, returns the arithmetic
+      mean of all metric scores.
+    - If `scores_by_metric` is empty, returns 0.0.
+    """
+
+    def scalarize(self, scores_by_metric: Dict[str, float]) -> float:
+        if not scores_by_metric:
+            return 0.0
+        return sum(scores_by_metric.values()) / len(scores_by_metric)
+
+
+class WeightedObjective(Objective):
+    """
+    Objective that scales each metric's score by a user-provided weight and sums them.
+
+    - `weights_by_metric` keys should match the class names of the
+      metrics passed to the PromptOptimizer.
+    - Metrics not present in `weights_by_metric` receive `default_weight`.
+      This makes it easy to emphasize a subset of metrics while keeping
+      everything else at a baseline weight of 1.0, e.g.:
+
+          WeightedObjective({"AnswerRelevancyMetric": 2.0})
+
+      which treats AnswerRelevancy as 2x as important as the other metrics.
+    """
+
+    def __init__(
+        self,
+        weights_by_metric: Optional[Dict[str, float]] = None,
+        default_weight: float = 1.0,
+    ):
+        self.weights_by_metric: Dict[str, float] = dict(weights_by_metric or {})
+        self.default_weight: float = float(default_weight)
+
+    def scalarize(self, scores_by_metric: Dict[str, float]) -> float:
+        return sum(
+            self.weights_by_metric.get(name, self.default_weight) * score
+            for name, score in scores_by_metric.items()
+        )
+
+
+@dataclass
+class MetricInfo:
+    name: str
+    rubric: Optional[str] = None
+
+
+class AcceptedIterationDict(TypedDict):
+    parent: PromptConfigurationId
+    child: PromptConfigurationId
+    module: ModuleId
+    before: float
+    after: float
+
+
+class AcceptedIteration(PydanticBaseModel):
+    parent: str
+    child: str
+    module: str
+    before: float
+    after: float
+
+
+class PromptMessageSnapshot(PydanticBaseModel):
+    role: str
+    content: str
+
+
+class PromptModuleSnapshot(PydanticBaseModel):
+    type: Literal["TEXT", "LIST"]
+    # Only used when type == "TEXT"
+    text_template: Optional[str] = None
+    # Only used when type == "LIST"
+    messages: Optional[List[PromptMessageSnapshot]] = None
+
+
+class PromptConfigSnapshot(PydanticBaseModel):
+    parent: Optional[str]
+    prompts: Dict[str, PromptModuleSnapshot]
+
+
+@dataclass
+class OptimizationResult:
+    optimization_id: str
+    best_id: PromptConfigurationId
+    accepted_iterations: List[Dict]
+    pareto_scores: Dict[PromptConfigurationId, List[float]]
+    parents: Dict[PromptConfigurationId, Optional[PromptConfigurationId]]
+    prompt_configurations: Dict[PromptConfigurationId, Dict[str, Any]]
+
+    def as_dict(self) -> Dict:
+        return dict(
+            optimization_id=self.optimization_id,
+            best_id=self.best_id,
+            accepted_iterations=self.accepted_iterations,
+            pareto_scores=self.pareto_scores,
+            parents=self.parents,
+            prompt_configurations=self.prompt_configurations,
+        )
+
+
+class OptimizationReport(PydanticBaseModel):
+    optimization_id: str = Field(
+        alias="optimizationId",
+        validation_alias=AliasChoices("optimizationId", "optimization_id"),
+    )
+    best_id: str = Field(
+        alias="bestId",
+        validation_alias=AliasChoices("bestId", "best_id"),
+    )
+    accepted_iterations: list[AcceptedIteration] = Field(
+        default_factory=list,
+        alias="acceptedIterations",
+        validation_alias=AliasChoices(
+            "acceptedIterations", "accepted_iterations"
+        ),
+    )
+    pareto_scores: dict[str, list[float]] = Field(
+        alias="paretoScores",
+        validation_alias=AliasChoices("paretoScores", "pareto_scores"),
+    )
+    parents: dict[str, str | None]
+    prompt_configurations: dict[str, PromptConfigSnapshot] = Field(
+        alias="promptConfigurations",
+        validation_alias=AliasChoices(
+            "promptConfigurations", "prompt_configurations"
+        ),
+    )
+
+    @classmethod
+    def from_runtime(cls, result: dict) -> "OptimizationReport":
+        # accepts the dict from OptimizationResult.as_dict()
+        return cls(**result)
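For reference, a short usage sketch of the objectives defined above. The metric names and scores are made up for illustration; the computed values follow directly from the scalarize implementations shown in the diff.

from deepeval.optimization.types import MeanObjective, WeightedObjective

scores = {"AnswerRelevancyMetric": 0.82, "FaithfulnessMetric": 0.64}

# Unweighted arithmetic mean: (0.82 + 0.64) / 2 = 0.73
print(MeanObjective().scalarize(scores))

# AnswerRelevancyMetric weighted 2x, everything else at default_weight=1.0:
# 2.0 * 0.82 + 1.0 * 0.64 = 2.28
print(WeightedObjective({"AnswerRelevancyMetric": 2.0}).scalarize(scores))

Note that WeightedObjective returns a weighted sum rather than a weighted mean, so its absolute scale depends on the chosen weights; per the Objective docstring, the scalar is only used for comparisons inside the optimizer.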