deepeval 3.7.3__py3-none-any.whl → 3.7.4__py3-none-any.whl

This diff shows the contents of publicly available package versions as they were released to their respective public registries, and is provided for informational purposes only.
Files changed (84)
  1. deepeval/_version.py +1 -1
  2. deepeval/cli/test.py +1 -1
  3. deepeval/config/settings.py +102 -13
  4. deepeval/evaluate/configs.py +1 -1
  5. deepeval/evaluate/execute.py +4 -1
  6. deepeval/metrics/answer_relevancy/template.py +4 -4
  7. deepeval/metrics/argument_correctness/template.py +2 -2
  8. deepeval/metrics/bias/template.py +3 -3
  9. deepeval/metrics/contextual_precision/template.py +6 -6
  10. deepeval/metrics/contextual_recall/template.py +2 -2
  11. deepeval/metrics/contextual_relevancy/template.py +3 -3
  12. deepeval/metrics/conversation_completeness/template.py +2 -2
  13. deepeval/metrics/conversational_dag/templates.py +4 -4
  14. deepeval/metrics/conversational_g_eval/template.py +4 -3
  15. deepeval/metrics/dag/templates.py +4 -4
  16. deepeval/metrics/faithfulness/template.py +4 -4
  17. deepeval/metrics/hallucination/template.py +4 -4
  18. deepeval/metrics/misuse/template.py +2 -2
  19. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +7 -7
  20. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +6 -6
  21. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +2 -2
  22. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +3 -3
  23. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +9 -9
  24. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +4 -4
  25. deepeval/metrics/non_advice/template.py +2 -2
  26. deepeval/metrics/pii_leakage/template.py +2 -2
  27. deepeval/metrics/prompt_alignment/template.py +4 -4
  28. deepeval/metrics/role_violation/template.py +2 -2
  29. deepeval/metrics/step_efficiency/step_efficiency.py +1 -1
  30. deepeval/metrics/toxicity/template.py +4 -4
  31. deepeval/metrics/turn_relevancy/template.py +2 -2
  32. deepeval/models/embedding_models/azure_embedding_model.py +28 -15
  33. deepeval/models/embedding_models/local_embedding_model.py +23 -10
  34. deepeval/models/embedding_models/ollama_embedding_model.py +8 -6
  35. deepeval/models/embedding_models/openai_embedding_model.py +18 -2
  36. deepeval/models/llms/anthropic_model.py +17 -5
  37. deepeval/models/llms/azure_model.py +30 -18
  38. deepeval/models/llms/deepseek_model.py +22 -12
  39. deepeval/models/llms/gemini_model.py +120 -87
  40. deepeval/models/llms/grok_model.py +23 -16
  41. deepeval/models/llms/kimi_model.py +23 -12
  42. deepeval/models/llms/litellm_model.py +63 -25
  43. deepeval/models/llms/local_model.py +26 -18
  44. deepeval/models/llms/ollama_model.py +17 -7
  45. deepeval/models/llms/openai_model.py +22 -17
  46. deepeval/models/llms/portkey_model.py +132 -0
  47. deepeval/models/mlllms/azure_model.py +28 -19
  48. deepeval/models/mlllms/gemini_model.py +102 -73
  49. deepeval/models/mlllms/ollama_model.py +40 -9
  50. deepeval/models/mlllms/openai_model.py +65 -14
  51. deepeval/models/utils.py +48 -3
  52. deepeval/optimization/__init__.py +13 -0
  53. deepeval/optimization/adapters/__init__.py +2 -0
  54. deepeval/optimization/adapters/deepeval_scoring_adapter.py +588 -0
  55. deepeval/optimization/aggregates.py +14 -0
  56. deepeval/optimization/configs.py +34 -0
  57. deepeval/optimization/copro/configs.py +31 -0
  58. deepeval/optimization/copro/loop.py +837 -0
  59. deepeval/optimization/gepa/__init__.py +7 -0
  60. deepeval/optimization/gepa/configs.py +115 -0
  61. deepeval/optimization/gepa/loop.py +677 -0
  62. deepeval/optimization/miprov2/configs.py +134 -0
  63. deepeval/optimization/miprov2/loop.py +785 -0
  64. deepeval/optimization/mutations/__init__.py +0 -0
  65. deepeval/optimization/mutations/prompt_rewriter.py +458 -0
  66. deepeval/optimization/policies/__init__.py +16 -0
  67. deepeval/optimization/policies/selection.py +166 -0
  68. deepeval/optimization/policies/tie_breaker.py +67 -0
  69. deepeval/optimization/prompt_optimizer.py +462 -0
  70. deepeval/optimization/simba/__init__.py +0 -0
  71. deepeval/optimization/simba/configs.py +33 -0
  72. deepeval/optimization/simba/loop.py +983 -0
  73. deepeval/optimization/simba/types.py +15 -0
  74. deepeval/optimization/types.py +361 -0
  75. deepeval/optimization/utils.py +598 -0
  76. deepeval/prompt/prompt.py +10 -5
  77. deepeval/test_run/cache.py +2 -0
  78. deepeval/test_run/test_run.py +6 -1
  79. deepeval/utils.py +24 -0
  80. {deepeval-3.7.3.dist-info → deepeval-3.7.4.dist-info}/METADATA +1 -1
  81. {deepeval-3.7.3.dist-info → deepeval-3.7.4.dist-info}/RECORD +84 -59
  82. {deepeval-3.7.3.dist-info → deepeval-3.7.4.dist-info}/LICENSE.md +0 -0
  83. {deepeval-3.7.3.dist-info → deepeval-3.7.4.dist-info}/WHEEL +0 -0
  84. {deepeval-3.7.3.dist-info → deepeval-3.7.4.dist-info}/entry_points.txt +0 -0
deepeval/optimization/prompt_optimizer.py (new file)
@@ -0,0 +1,462 @@
+ from typing import (
+     Callable,
+     Dict,
+     List,
+     Optional,
+     Tuple,
+     Union,
+ )
+
+ from rich.progress import Progress, SpinnerColumn, BarColumn, TextColumn
+
+ from deepeval.dataset.golden import Golden, ConversationalGolden
+ from deepeval.errors import DeepEvalError
+ from deepeval.metrics import BaseConversationalMetric, BaseMetric
+ from deepeval.evaluate.configs import AsyncConfig
+ from deepeval.optimization.adapters.deepeval_scoring_adapter import (
+     DeepEvalScoringAdapter,
+ )
+ from deepeval.optimization.mutations.prompt_rewriter import (
+     PromptRewriter,
+ )
+ from deepeval.optimization.types import (
+     OptimizationReport,
+     RunnerProtocol,
+     RunnerStatusType,
+ )
+ from deepeval.optimization.utils import (
+     validate_callback,
+     validate_metrics,
+     validate_instance,
+     validate_sequence_of,
+ )
+ from deepeval.optimization.configs import (
+     OptimizerDisplayConfig,
+     PromptListMutationConfig,
+ )
+ from deepeval.prompt.prompt import Prompt
+ from deepeval.utils import get_or_create_event_loop
+ from deepeval.optimization.gepa.configs import GEPAConfig
+ from deepeval.optimization.gepa.loop import GEPARunner
+ from deepeval.optimization.miprov2.configs import MIPROConfig
+ from deepeval.optimization.miprov2.loop import MIPRORunner
+ from deepeval.optimization.copro.configs import COPROConfig
+ from deepeval.optimization.copro.loop import COPRORunner
+ from deepeval.optimization.simba.configs import SIMBAConfig
+ from deepeval.optimization.simba.loop import SIMBARunner
+
+
+ class PromptOptimizer:
+     """
+     High-level entrypoint for prompt optimization.
+
+     Typical usage:
+
+         optimizer = PromptOptimizer(
+             metrics=[AnswerRelevancyMetric()],
+             model_callback=model_callback,
+         )
+
+         optimized_prompt = optimizer.optimize(
+             prompt=Prompt(text_template="Respond to the query."),
+             goldens=goldens,
+         )
+
+     By default, this constructs and uses a GEPA based runner internally.
+     Advanced users can construct their own runner with a custom config
+     (GEPAConfig) and attach it via `set_runner(...)`.
+     """
+
+     def __init__(
+         self,
+         *,
+         model_callback: Callable[
+             ...,
+             Union[
+                 str,
+                 Dict,
+                 Tuple[Union[str, Dict], float],
+             ],
+         ],
+         metrics: Union[List[BaseMetric], List[BaseConversationalMetric]],
+         async_config: Optional[AsyncConfig] = None,
+         display_config: Optional[OptimizerDisplayConfig] = None,
+         prompt_list_mutation_config: Optional[PromptListMutationConfig] = None,
+         list_input_role: str = "user",
+         algorithm: str = "gepa",
+     ):
+         # Validate and store the callback
+         self.model_callback = validate_callback(
+             component="PromptOptimizer",
+             model_callback=model_callback,
+         )
+         self.metrics = validate_metrics(
+             component="PromptOptimizer", metrics=metrics
+         )
+         # Validate async_config
+         async_config = async_config or AsyncConfig()
+         validate_instance(
+             component="PromptOptimizer.__init__",
+             param_name="async_config",
+             value=async_config,
+             expected_types=AsyncConfig,
+         )
+         self.async_config = async_config
+
+         # validate display_config
+         display_config = display_config or OptimizerDisplayConfig()
+         validate_instance(
+             component="PromptOptimizer.__init__",
+             param_name="display_config",
+             value=display_config,
+             expected_types=OptimizerDisplayConfig,
+         )
+         self.display_config = display_config
+
+         # validate prompt_list_mutation_config
+         prompt_list_mutation_config = (
+             prompt_list_mutation_config or PromptListMutationConfig()
+         )
+         validate_instance(
+             component="PromptOptimizer.__init__",
+             param_name="prompt_list_mutation_config",
+             value=prompt_list_mutation_config,
+             expected_types=PromptListMutationConfig,
+         )
+         self.prompt_list_mutation_config = prompt_list_mutation_config
+
+         # validate list_input_role
+         validate_instance(
+             component="PromptOptimizer.__init__",
+             param_name="list_input_role",
+             value=list_input_role,
+             expected_types=str,
+         )
+         self.list_input_role = list_input_role
+
+         # Validate algorithm
+         algo_raw = algorithm or "gepa"
+         if not isinstance(algo_raw, str):
+             raise DeepEvalError(
+                 "PromptOptimizer.__init__ expected `algorithm` to be a string "
+                 f"(e.g. 'gepa'), but received {type(algorithm).__name__!r} instead."
+             )
+
+         algo_normalized = (algo_raw.strip() or "gepa").lower()
+         if algo_normalized in {"mipro", "miprov2"}:
+             algo_normalized = "miprov2"
+
+         self._allowed_algorithms = {"gepa", "miprov2", "copro", "simba"}
+
+         if algo_normalized not in self._allowed_algorithms:
+             raise DeepEvalError(
+                 "PromptOptimizer.__init__ received unsupported `algorithm` "
+                 f"value {algorithm!r}. Supported algorithms are: "
+                 + ", ".join(sorted(self._allowed_algorithms))
+             )
+
+         self.algorithm = algo_normalized
+
+         # Internal state used only when a progress indicator is active.
+         # Tuple is (Progress instance, task_id).
+         self._progress_state: Optional[Tuple[Progress, int]] = None
+
+         self.runner: Optional[RunnerProtocol] = None
+
+     ##############
+     # Public API #
+     ##############
+
+     def optimize(
+         self,
+         *,
+         prompt: Prompt,
+         goldens: Union[List["Golden"], List["ConversationalGolden"]],
+     ) -> Prompt:
+         """
+         Run the configured optimization algorithm and return an optimized Prompt.
+
+         The returned Prompt will have an OptimizationReport attached as
+         `prompt.optimization_report`.
+         """
+         # Validate prompt
+         validate_instance(
+             component="PromptOptimizer.optimize",
+             param_name="prompt",
+             value=prompt,
+             expected_types=Prompt,
+         )
+
+         # Validate goldens: must be a list of Golden or ConversationalGolden
+         validate_sequence_of(
+             component="PromptOptimizer.optimize",
+             param_name="goldens",
+             value=goldens,
+             expected_item_types=(Golden, ConversationalGolden),
+             sequence_types=(list,),
+         )
+
+         if self.runner is None:
+             self.set_runner(self._build_default_runner())
+
+         if not self.display_config.show_indicator:
+             best_prompt, report_dict = (
+                 self._run_optimization_with_error_handling(
+                     prompt=prompt,
+                     goldens=goldens,
+                 )
+             )
+         else:
+             with Progress(
+                 SpinnerColumn(style="rgb(106,0,255)"),
+                 BarColumn(bar_width=60),
+                 TextColumn("[progress.description]{task.description}"),
+                 transient=True,
+             ) as progress:
+                 # Total will be provided by the runner via the
+                 # progress status_callback. Start at 0 and update later.
+                 task = progress.add_task(
+                     f"Optimizing prompt with {self.algorithm.upper()}..."
+                 )
+                 self._progress_state = (progress, task)
+
+                 try:
+                     best_prompt, report_dict = (
+                         self._run_optimization_with_error_handling(
+                             prompt=prompt,
+                             goldens=goldens,
+                         )
+                     )
+                 finally:
+                     # Clear progress state even if an error occurs
+                     self._progress_state = None
+
+         best_prompt.optimization_report = OptimizationReport.from_runtime(
+             report_dict
+         )
+         return best_prompt
+
+     def set_runner(self, runner: RunnerProtocol):
+         self._set_runner_callbacks(runner)
+         scoring_adapter = getattr(runner, "scoring_adapter", None)
+         if scoring_adapter is None:
+             runner.scoring_adapter = self._build_default_scoring_adapter()
+         else:
+             if not len(runner.scoring_adapter.metrics):
+                 runner.scoring_adapter.set_metrics(self.metrics)
+             if runner.scoring_adapter.model_callback is None:
+                 runner.scoring_adapter.model_callback = self.model_callback
+         self.runner = runner
+
+     ####################
+     # Internal helpers #
+     ####################
+
+     def _run_optimization(
+         self,
+         *,
+         prompt: Prompt,
+         goldens: Union[List["Golden"], List["ConversationalGolden"]],
+     ) -> Tuple[Prompt, Dict]:
+         if self.async_config.run_async:
+             loop = get_or_create_event_loop()
+             return loop.run_until_complete(
+                 self.runner.a_execute(prompt=prompt, goldens=goldens)
+             )
+         return self.runner.execute(prompt=prompt, goldens=goldens)
+
+     def _run_optimization_with_error_handling(
+         self,
+         *,
+         prompt: Prompt,
+         goldens: Union[List["Golden"], List["ConversationalGolden"]],
+     ) -> Tuple[Prompt, Dict]:
+         """
+         Run optimization and convert uncaught exceptions into a concise
+         user facing error message.
+
+         This is a fallback for errors that occur before the runner
+         enters its main iteration loop, which would otherwise surface
+         as a full traceback.
+         """
+         try:
+             return self._run_optimization(prompt=prompt, goldens=goldens)
+         except Exception as exc:
+             # Try to recover iteration count from the runner config
+             total_steps: Optional[int] = None
+             iterations: Optional[int] = None
+             runner_config = getattr(self.runner, "config", None)
+             if runner_config is not None:
+                 iterations = getattr(runner_config, "iterations", None)
+                 if iterations is not None:
+                     total_steps = int(iterations)
+
+             prefix = (
+                 f"(iterations={iterations}) " if iterations is not None else ""
+             )
+             detail = (
+                 f"{prefix}• error {exc.__class__.__name__}: {exc} "
+                 "• halted before first iteration"
+             )
+
+             self._on_status(
+                 RunnerStatusType.ERROR,
+                 detail=detail,
+                 step_index=None,
+                 total_steps=total_steps,
+             )
+
+             algo = self.algorithm.upper()
+
+             # using `from None` avoids a long chained stack trace while keeping
+             # the error message readable.
+             raise DeepEvalError(f"[{algo}] {detail}") from None
+
+     def _on_status(
+         self,
+         kind: RunnerStatusType,
+         *,
+         detail: str,
+         step_index: Optional[int] = None,
+         total_steps: Optional[int] = None,
+     ) -> None:
+         """
+         Unified status callback used by the configured runner.
+
+         - PROGRESS: update the progress bar description and position
+         - TIE: optionally print a tie message
+         - ERROR: print a concise error message and allow the run to halt
+         """
+         algo = self.algorithm.upper()
+
+         # ERROR: always print, optionally update progress bar
+         if kind is RunnerStatusType.ERROR:
+             if (
+                 self.display_config.show_indicator
+                 and self._progress_state is not None
+             ):
+                 progress, task = self._progress_state
+
+                 if total_steps is not None:
+                     progress.update(task, total=total_steps)
+
+                 description = self._format_progress_description(detail)
+                 progress.update(task, description=description)
+
+             # Print a concise, error line regardless of indicator state
+             print(f"[{algo}] {detail}")
+             return
+
+         # TIE: optional one line message, no progress bar changes
+         if kind is RunnerStatusType.TIE:
+             if not self.display_config.announce_ties:
+                 return
+             print(f"[{algo}] {detail}")
+             return
+
+         if kind is not RunnerStatusType.PROGRESS:
+             return
+
+         if not self.display_config.show_indicator:
+             return
+
+         if self._progress_state is None:
+             return
+
+         progress, task = self._progress_state
+
+         # Allow the runner to set or update the total steps.
+         if total_steps is not None:
+             progress.update(task, total=total_steps)
+
+         # iteration 0 shouldn't advance the bar
+         if step_index is not None and step_index > 0:
+             progress.advance(task, 1)
+
+         description = self._format_progress_description(detail)
+         progress.update(task, description=description)
+
+     def _format_progress_description(self, detail: str) -> str:
+         """
+         Compose a human readable progress line using an algorithm agnostic
+         prefix and an algorithm specific detail string provided by the runner.
+         """
+         algo = self.algorithm.upper()
+         base = f"Optimizing prompt with {algo}"
+         if detail:
+             return f"{base} [rgb(25,227,160)]{detail}[/]"
+         return base
+
+     def _build_default_scoring_adapter(self) -> DeepEvalScoringAdapter:
+         scoring_adapter = DeepEvalScoringAdapter(
+             list_input_role=self.list_input_role
+         )
+         scoring_adapter.set_model_callback(self.model_callback)
+         scoring_adapter.set_metrics(self.metrics)
+         return scoring_adapter
+
+     def _set_runner_callbacks(self, runner: RunnerProtocol):
+         runner.model_callback = (
+             self.model_callback
+             if runner.model_callback is None
+             else runner.model_callback
+         )
+         runner.status_callback = (
+             self._on_status
+             if runner.status_callback is None
+             else runner.status_callback
+         )
+
+     def _build_default_runner(self) -> RunnerProtocol:
+         if self.algorithm not in self._allowed_algorithms:
+             raise DeepEvalError(
+                 f"Unsupported optimization algorithm: {self.algorithm!r}. "
+                 "Supported algorithms are: 'gepa', 'miprov2' (alias 'mipro'), "
+                 "'copro', 'simba'."
+             )
+
+         scoring_adapter = self._build_default_scoring_adapter()
+
+         if hasattr(scoring_adapter, "configure_async"):
+             scoring_adapter.configure_async(
+                 max_concurrent=self.async_config.max_concurrent,
+                 throttle_seconds=float(self.async_config.throttle_value),
+             )
+
+         if self.algorithm == "gepa":
+             config = GEPAConfig()
+             runner: RunnerProtocol = GEPARunner(
+                 config=config,
+                 scoring_adapter=scoring_adapter,
+             )
+         elif self.algorithm == "miprov2":
+             # MIPROv2 0-shot, instruction-only
+             config = MIPROConfig()
+             runner = MIPRORunner(
+                 config=config,
+                 scoring_adapter=scoring_adapter,
+             )
+         elif self.algorithm == "copro":
+             # COPRO cooperative multi-proposal variant
+             config = COPROConfig()
+             runner = COPRORunner(
+                 config=config,
+                 scoring_adapter=scoring_adapter,
+             )
+         else:
+             config = SIMBAConfig()
+             runner = SIMBARunner(
+                 config=config,
+                 scoring_adapter=scoring_adapter,
+             )
+
+         # Attach a PromptRewriter to the runner so that it has mutation behavior
+         runner._rewriter = PromptRewriter(
+             max_chars=config.rewrite_instruction_max_chars,
+             list_mutation_config=self.prompt_list_mutation_config,
+             random_state=runner.random_state,
+         )
+
+         self._set_runner_callbacks(runner)
+
+         return runner
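
The class docstring above already sketches the intended call pattern for the new optimizer. Spelled out as a minimal, hedged usage sketch: the metric class, callback body, and goldens list are caller-supplied placeholders, and the exact arguments passed to the callback are determined by the scoring adapter rather than shown in this file.

    # Minimal usage sketch of the new PromptOptimizer (placeholder names noted below).
    from deepeval.metrics import AnswerRelevancyMetric          # any deepeval metric; illustrative
    from deepeval.prompt.prompt import Prompt
    from deepeval.optimization.prompt_optimizer import PromptOptimizer

    def model_callback(*args, **kwargs) -> str:
        # Placeholder: call your LLM application with the candidate prompt and
        # return its output; the scoring adapter decides what gets passed in here.
        ...

    goldens = [...]  # placeholder: list of Golden or ConversationalGolden instances

    optimizer = PromptOptimizer(
        metrics=[AnswerRelevancyMetric()],
        model_callback=model_callback,
        algorithm="gepa",  # default; "miprov2"/"mipro", "copro", "simba" are also accepted
    )

    optimized_prompt = optimizer.optimize(
        prompt=Prompt(text_template="Respond to the query."),
        goldens=goldens,
    )
    report = optimized_prompt.optimization_report  # OptimizationReport attached by optimize()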
deepeval/optimization/simba/__init__.py — file without changes
deepeval/optimization/simba/configs.py (new file)
@@ -0,0 +1,33 @@
+ from __future__ import annotations
+ from pydantic import Field, PositiveInt, conint
+
+ from deepeval.optimization.copro.configs import COPROConfig
+
+
+ class SIMBAConfig(COPROConfig):
+     """
+     Configuration for SIMBA style cooperative prompt optimization.
+
+     Extends `COPROConfig` with strategy specific controls:
+
+     - How many minibatch examples are surfaced as demos and how long
+       those snippets can be (`max_demos_per_proposal`,
+       `demo_input_max_chars`).
+     """
+
+     max_demos_per_proposal: conint(ge=0) = Field(
+         default=3,
+         description=(
+             "Maximum number of goldens from the current minibatch that are "
+             "converted into concrete input/output demos when using the "
+             "APPEND_DEMO strategy."
+         ),
+     )
+
+     demo_input_max_chars: PositiveInt = Field(
+         default=256,
+         description=(
+             "Maximum number of characters taken from the golden input and "
+             "expected output when constructing demo snippets for APPEND_DEMO."
+         ),
+     )
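
Because SIMBAConfig subclasses COPROConfig, COPRO-level settings carry over and only the two demo-related fields above are SIMBA specific. A hedged sketch of overriding them, continuing from the usage sketch earlier in this diff and assuming SIMBARunner accepts the same config/scoring_adapter keywords used in _build_default_runner above (per set_runner, a None scoring adapter is replaced with the default one built by PromptOptimizer):

    # Sketch: attach a SIMBA runner with custom demo settings (values illustrative).
    from deepeval.optimization.simba.configs import SIMBAConfig
    from deepeval.optimization.simba.loop import SIMBARunner

    config = SIMBAConfig(
        max_demos_per_proposal=2,   # demos surfaced per minibatch for APPEND_DEMO
        demo_input_max_chars=128,   # truncation limit for demo input/expected-output snippets
    )
    runner = SIMBARunner(config=config, scoring_adapter=None)  # set_runner() installs the default adapter
    optimizer.set_runner(runner)  # reuses the optimizer's metrics and model callback
    optimized_prompt = optimizer.optimize(prompt=prompt, goldens=goldens)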