deepeval 3.7.4__py3-none-any.whl → 3.7.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (224)
  1. deepeval/_version.py +1 -1
  2. deepeval/config/settings.py +35 -1
  3. deepeval/dataset/api.py +23 -1
  4. deepeval/dataset/golden.py +139 -2
  5. deepeval/evaluate/evaluate.py +16 -11
  6. deepeval/evaluate/execute.py +13 -181
  7. deepeval/evaluate/utils.py +6 -26
  8. deepeval/integrations/pydantic_ai/agent.py +19 -2
  9. deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
  10. deepeval/key_handler.py +3 -0
  11. deepeval/metrics/__init__.py +14 -16
  12. deepeval/metrics/answer_relevancy/answer_relevancy.py +118 -116
  13. deepeval/metrics/answer_relevancy/template.py +22 -3
  14. deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
  15. deepeval/metrics/arena_g_eval/template.py +17 -1
  16. deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
  17. deepeval/metrics/argument_correctness/template.py +19 -2
  18. deepeval/metrics/base_metric.py +13 -44
  19. deepeval/metrics/bias/bias.py +102 -108
  20. deepeval/metrics/bias/template.py +14 -2
  21. deepeval/metrics/contextual_precision/contextual_precision.py +96 -94
  22. deepeval/metrics/contextual_precision/template.py +115 -66
  23. deepeval/metrics/contextual_recall/contextual_recall.py +94 -84
  24. deepeval/metrics/contextual_recall/template.py +106 -55
  25. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +86 -84
  26. deepeval/metrics/contextual_relevancy/template.py +87 -58
  27. deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
  28. deepeval/metrics/conversation_completeness/template.py +23 -3
  29. deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
  30. deepeval/metrics/conversational_dag/nodes.py +66 -123
  31. deepeval/metrics/conversational_dag/templates.py +16 -0
  32. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
  33. deepeval/metrics/dag/dag.py +10 -0
  34. deepeval/metrics/dag/nodes.py +63 -126
  35. deepeval/metrics/dag/templates.py +16 -2
  36. deepeval/metrics/exact_match/exact_match.py +9 -1
  37. deepeval/metrics/faithfulness/faithfulness.py +138 -149
  38. deepeval/metrics/faithfulness/schema.py +1 -1
  39. deepeval/metrics/faithfulness/template.py +200 -115
  40. deepeval/metrics/g_eval/g_eval.py +87 -78
  41. deepeval/metrics/g_eval/template.py +18 -1
  42. deepeval/metrics/g_eval/utils.py +7 -6
  43. deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
  44. deepeval/metrics/goal_accuracy/template.py +21 -3
  45. deepeval/metrics/hallucination/hallucination.py +60 -75
  46. deepeval/metrics/hallucination/template.py +13 -0
  47. deepeval/metrics/indicator.py +7 -10
  48. deepeval/metrics/json_correctness/json_correctness.py +40 -38
  49. deepeval/metrics/json_correctness/template.py +10 -0
  50. deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
  51. deepeval/metrics/knowledge_retention/schema.py +9 -3
  52. deepeval/metrics/knowledge_retention/template.py +12 -0
  53. deepeval/metrics/mcp/mcp_task_completion.py +68 -38
  54. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +92 -74
  55. deepeval/metrics/mcp/template.py +52 -0
  56. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
  57. deepeval/metrics/mcp_use_metric/template.py +12 -0
  58. deepeval/metrics/misuse/misuse.py +77 -97
  59. deepeval/metrics/misuse/template.py +15 -0
  60. deepeval/metrics/multimodal_metrics/__init__.py +0 -19
  61. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +59 -53
  62. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +79 -95
  63. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +59 -53
  64. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +59 -53
  65. deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +111 -109
  66. deepeval/metrics/non_advice/non_advice.py +79 -105
  67. deepeval/metrics/non_advice/template.py +12 -0
  68. deepeval/metrics/pattern_match/pattern_match.py +12 -4
  69. deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
  70. deepeval/metrics/pii_leakage/template.py +14 -0
  71. deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
  72. deepeval/metrics/plan_adherence/template.py +11 -0
  73. deepeval/metrics/plan_quality/plan_quality.py +63 -87
  74. deepeval/metrics/plan_quality/template.py +9 -0
  75. deepeval/metrics/prompt_alignment/prompt_alignment.py +72 -83
  76. deepeval/metrics/prompt_alignment/template.py +12 -0
  77. deepeval/metrics/ragas.py +3 -3
  78. deepeval/metrics/role_adherence/role_adherence.py +48 -71
  79. deepeval/metrics/role_adherence/template.py +14 -0
  80. deepeval/metrics/role_violation/role_violation.py +75 -108
  81. deepeval/metrics/role_violation/template.py +12 -0
  82. deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
  83. deepeval/metrics/step_efficiency/template.py +11 -0
  84. deepeval/metrics/summarization/summarization.py +115 -183
  85. deepeval/metrics/summarization/template.py +19 -0
  86. deepeval/metrics/task_completion/task_completion.py +67 -73
  87. deepeval/metrics/tool_correctness/tool_correctness.py +45 -44
  88. deepeval/metrics/tool_use/tool_use.py +42 -66
  89. deepeval/metrics/topic_adherence/template.py +13 -0
  90. deepeval/metrics/topic_adherence/topic_adherence.py +53 -67
  91. deepeval/metrics/toxicity/template.py +13 -0
  92. deepeval/metrics/toxicity/toxicity.py +80 -99
  93. deepeval/metrics/turn_contextual_precision/schema.py +21 -0
  94. deepeval/metrics/turn_contextual_precision/template.py +187 -0
  95. deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +592 -0
  96. deepeval/metrics/turn_contextual_recall/schema.py +21 -0
  97. deepeval/metrics/turn_contextual_recall/template.py +178 -0
  98. deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +563 -0
  99. deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
  100. deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
  101. deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +576 -0
  102. deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
  103. deepeval/metrics/turn_faithfulness/template.py +218 -0
  104. deepeval/metrics/turn_faithfulness/turn_faithfulness.py +627 -0
  105. deepeval/metrics/turn_relevancy/template.py +14 -0
  106. deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
  107. deepeval/metrics/utils.py +158 -122
  108. deepeval/models/__init__.py +0 -12
  109. deepeval/models/base_model.py +49 -33
  110. deepeval/models/embedding_models/__init__.py +7 -0
  111. deepeval/models/embedding_models/azure_embedding_model.py +79 -33
  112. deepeval/models/embedding_models/local_embedding_model.py +39 -20
  113. deepeval/models/embedding_models/ollama_embedding_model.py +52 -19
  114. deepeval/models/embedding_models/openai_embedding_model.py +42 -22
  115. deepeval/models/llms/amazon_bedrock_model.py +226 -72
  116. deepeval/models/llms/anthropic_model.py +178 -63
  117. deepeval/models/llms/azure_model.py +218 -60
  118. deepeval/models/llms/constants.py +2032 -0
  119. deepeval/models/llms/deepseek_model.py +95 -40
  120. deepeval/models/llms/gemini_model.py +209 -64
  121. deepeval/models/llms/grok_model.py +139 -68
  122. deepeval/models/llms/kimi_model.py +140 -90
  123. deepeval/models/llms/litellm_model.py +131 -37
  124. deepeval/models/llms/local_model.py +125 -21
  125. deepeval/models/llms/ollama_model.py +147 -24
  126. deepeval/models/llms/openai_model.py +222 -269
  127. deepeval/models/llms/portkey_model.py +81 -22
  128. deepeval/models/llms/utils.py +8 -3
  129. deepeval/models/retry_policy.py +17 -14
  130. deepeval/models/utils.py +106 -5
  131. deepeval/optimizer/__init__.py +5 -0
  132. deepeval/optimizer/algorithms/__init__.py +6 -0
  133. deepeval/optimizer/algorithms/base.py +29 -0
  134. deepeval/optimizer/algorithms/configs.py +18 -0
  135. deepeval/optimizer/algorithms/copro/__init__.py +5 -0
  136. deepeval/{optimization/copro/loop.py → optimizer/algorithms/copro/copro.py} +112 -113
  137. deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
  138. deepeval/{optimization/gepa/loop.py → optimizer/algorithms/gepa/gepa.py} +175 -115
  139. deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
  140. deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
  141. deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
  142. deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
  143. deepeval/optimizer/algorithms/simba/__init__.py +5 -0
  144. deepeval/{optimization/simba/loop.py → optimizer/algorithms/simba/simba.py} +128 -112
  145. deepeval/{optimization → optimizer}/configs.py +5 -8
  146. deepeval/{optimization/policies/selection.py → optimizer/policies.py} +63 -2
  147. deepeval/optimizer/prompt_optimizer.py +263 -0
  148. deepeval/optimizer/rewriter/__init__.py +5 -0
  149. deepeval/optimizer/rewriter/rewriter.py +124 -0
  150. deepeval/optimizer/rewriter/utils.py +214 -0
  151. deepeval/optimizer/scorer/__init__.py +5 -0
  152. deepeval/optimizer/scorer/base.py +86 -0
  153. deepeval/optimizer/scorer/scorer.py +316 -0
  154. deepeval/optimizer/scorer/utils.py +30 -0
  155. deepeval/optimizer/types.py +148 -0
  156. deepeval/{optimization → optimizer}/utils.py +47 -165
  157. deepeval/prompt/prompt.py +5 -9
  158. deepeval/simulator/conversation_simulator.py +43 -0
  159. deepeval/simulator/template.py +13 -0
  160. deepeval/test_case/__init__.py +1 -3
  161. deepeval/test_case/api.py +26 -45
  162. deepeval/test_case/arena_test_case.py +7 -2
  163. deepeval/test_case/conversational_test_case.py +68 -1
  164. deepeval/test_case/llm_test_case.py +206 -1
  165. deepeval/test_case/utils.py +4 -8
  166. deepeval/test_run/api.py +18 -14
  167. deepeval/test_run/test_run.py +3 -3
  168. deepeval/tracing/patchers.py +9 -4
  169. deepeval/tracing/tracing.py +2 -2
  170. deepeval/utils.py +65 -0
  171. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/METADATA +1 -4
  172. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/RECORD +180 -193
  173. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
  174. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
  175. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
  176. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
  177. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
  178. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
  179. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
  180. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
  181. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
  182. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
  183. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
  184. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
  185. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
  186. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
  187. deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
  188. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
  189. deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
  190. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -148
  191. deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
  192. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
  193. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
  194. deepeval/models/mlllms/__init__.py +0 -4
  195. deepeval/models/mlllms/azure_model.py +0 -343
  196. deepeval/models/mlllms/gemini_model.py +0 -313
  197. deepeval/models/mlllms/ollama_model.py +0 -175
  198. deepeval/models/mlllms/openai_model.py +0 -309
  199. deepeval/optimization/__init__.py +0 -13
  200. deepeval/optimization/adapters/__init__.py +0 -2
  201. deepeval/optimization/adapters/deepeval_scoring_adapter.py +0 -588
  202. deepeval/optimization/aggregates.py +0 -14
  203. deepeval/optimization/copro/configs.py +0 -31
  204. deepeval/optimization/gepa/__init__.py +0 -7
  205. deepeval/optimization/gepa/configs.py +0 -115
  206. deepeval/optimization/miprov2/configs.py +0 -134
  207. deepeval/optimization/miprov2/loop.py +0 -785
  208. deepeval/optimization/mutations/__init__.py +0 -0
  209. deepeval/optimization/mutations/prompt_rewriter.py +0 -458
  210. deepeval/optimization/policies/__init__.py +0 -16
  211. deepeval/optimization/policies/tie_breaker.py +0 -67
  212. deepeval/optimization/prompt_optimizer.py +0 -462
  213. deepeval/optimization/simba/__init__.py +0 -0
  214. deepeval/optimization/simba/configs.py +0 -33
  215. deepeval/optimization/types.py +0 -361
  216. deepeval/test_case/mllm_test_case.py +0 -170
  217. /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
  218. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
  219. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
  220. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
  221. /deepeval/{optimization → optimizer/algorithms}/simba/types.py +0 -0
  222. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/LICENSE.md +0 -0
  223. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/WHEEL +0 -0
  224. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/entry_points.txt +0 -0
deepeval/{optimization/copro/loop.py → optimizer/algorithms/copro/copro.py}

@@ -30,92 +30,119 @@ from typing import (
     Union,
 )
 
+from deepeval.models.base_model import DeepEvalBaseLLM
+
 from deepeval.errors import DeepEvalError
-from deepeval.optimization.aggregates import Aggregator, mean_of_all
-from deepeval.optimization.types import (
+from deepeval.optimizer.utils import Aggregator, mean_of_all
+from deepeval.optimizer.types import (
     AcceptedIterationDict,
     ModuleId,
-    OptimizationResult,
+    OptimizationReport,
     PromptConfiguration,
     PromptConfigurationId,
-    RunnerStatusCallbackProtocol,
+    RunnerStatusCallback,
     RunnerStatusType,
     ScoreTable,
-    ScoringAdapter,
 )
-from deepeval.optimization.utils import (
+from deepeval.optimizer.scorer.base import BaseScorer
+from deepeval.optimizer.utils import (
     build_prompt_config_snapshots,
 )
 from deepeval.prompt.api import PromptType
 from deepeval.prompt.prompt import Prompt
-from deepeval.optimization.mutations.prompt_rewriter import PromptRewriter
-
-from .configs import COPROConfig
+from deepeval.optimizer.rewriter import Rewriter
+from deepeval.optimizer.algorithms.configs import MIPROV2_MIN_DELTA
+from deepeval.optimizer.algorithms.base import BaseAlgorithm
 
 if TYPE_CHECKING:  # pragma: no cover - type-checking only
     from deepeval.dataset.golden import ConversationalGolden, Golden
 
 
-class COPRORunner:
+class COPRO(BaseAlgorithm):
     """
     COPRO style cooperative prompt optimization loop with sync/async execution.
 
     This runner is intentionally low level and does not know about metrics,
-    models, or async configs. It relies on a preconfigured ScoringAdapter and
-    PromptRewriter, which are typically constructed by PromptOptimizer.
-
-    - Optimizes a single Prompt (instruction) against a list of Goldens.
-    - Uses mini-batches of goldens for trial scoring and epsilon-greedy
-      selection over prompt candidates based on mean minibatch scores,
-      extended with cooperative proposals:
-      - At each iteration, a parent candidate is selected.
-      - A shared feedback string is computed on a minibatch.
-      - Multiple child prompts are proposed from that parent using the
-        same feedback but different LLM samples.
-      - Any child whose minibatch score improves over the parent by at
-        least ``min_delta`` is added to the candidate pool.
+    models, or async configs. It relies on a preconfigured Scorer and
+    Rewriter, which are typically constructed by PromptOptimizer.
+
+    Parameters
+    ----------
+    iterations : int
+        Total number of optimization trials. Default is 5.
+    minibatch_size : int
+        Number of examples drawn per iteration. Default is 8.
+    random_seed : int, optional
+        RNG seed for reproducibility. If None, derived from time.time_ns().
+    exploration_probability : float
+        Epsilon greedy exploration rate. Default is 0.2.
+    full_eval_every : int, optional
+        Fully evaluate best candidate every N trials. Default is 5.
+    population_size : int
+        Maximum number of candidates in the pool. Default is 4.
+    proposals_per_step : int
+        Number of child prompts proposed per iteration. Default is 4.
     """
 
+    name = "COPRO"
     SINGLE_MODULE_ID: ModuleId = "__module__"
 
     def __init__(
         self,
-        *,
-        config: COPROConfig,
+        iterations: int = 5,
+        minibatch_size: int = 8,
+        random_seed: Optional[int] = None,
+        exploration_probability: float = 0.2,
+        full_eval_every: Optional[int] = 5,
+        population_size: int = 4,
+        proposals_per_step: int = 4,
         aggregate_instances: Aggregator = mean_of_all,
-        scoring_adapter: Optional[ScoringAdapter] = None,
+        scorer: Optional[BaseScorer] = None,
     ) -> None:
-        self.config = config
+        # Validate parameters
+        if iterations < 1:
+            raise ValueError("iterations must be >= 1")
+        if minibatch_size < 1:
+            raise ValueError("minibatch_size must be >= 1")
+        if exploration_probability < 0.0 or exploration_probability > 1.0:
+            raise ValueError(
+                "exploration_probability must be >= 0.0 and <= 1.0"
+            )
+        if full_eval_every is not None and full_eval_every < 1:
+            raise ValueError("full_eval_every must be >= 1")
+        if population_size < 1:
+            raise ValueError("population_size must be >= 1")
+        if proposals_per_step < 1:
+            raise ValueError("proposals_per_step must be >= 1")
+
+        self.iterations = iterations
+        self.minibatch_size = minibatch_size
+        self.exploration_probability = exploration_probability
+        self.full_eval_every = full_eval_every
+        self.population_size = population_size
+        self.proposals_per_step = proposals_per_step
         self.aggregate_instances = aggregate_instances
-        self.scoring_adapter = scoring_adapter
-
-        # Random seeded from config is used for minibatch sampling and
-        # epsilon-greedy candidate selection.
-        self.random_state = random.Random(config.random_seed)
+        self.scorer = scorer
 
-        self.random_state = random.Random(config.random_seed)
+        # If no seed provided, use time-based seed
+        if random_seed is None:
+            random_seed = time.time_ns()
+        self.random_seed = random_seed
+        self.random_state = random.Random(random_seed)
 
         # Runtime state to be reset between runs
         self.reset_state()
 
         # Status callback set by PromptOptimizer:
         # (kind, step_index, total_steps, detail) -> None
-        self.status_callback: Optional[RunnerStatusCallbackProtocol] = None
-
-        # Model callback used by the rewriter set by PromptOptimizer.
-        self.model_callback: Optional[
-            Callable[
-                ...,
-                Union[
-                    str,
-                    Dict,
-                    Tuple[Union[str, Dict], float],
-                ],
-            ]
-        ] = None
-
-        # Lazy-loaded PromptRewriter set by PromptOptimizer
-        self._rewriter: Optional[PromptRewriter] = None
+        self.status_callback: Optional[RunnerStatusCallback] = None
+
+        # Optimizer model used by the rewriter for prompt mutation.
+        # Set by PromptOptimizer.
+        self.optimizer_model: Optional["DeepEvalBaseLLM"] = None
+
+        # Lazy-loaded Rewriter set by PromptOptimizer
+        self._rewriter: Optional[Rewriter] = None
 
     ##############
    # Public API #
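
The hunk above replaces the keyword-only `config: COPROConfig` argument with plain constructor parameters plus eager validation. A minimal sketch of the new construction path, assuming the import path implied by the renamed file (optimizer/algorithms/copro/copro.py); the non-default values are illustrative:

    # Sketch only: constructing the refactored COPRO algorithm directly.
    from deepeval.optimizer.algorithms.copro.copro import COPRO

    algorithm = COPRO(
        iterations=10,                # was COPROConfig.iterations
        minibatch_size=8,             # fixed size; the ratio/min/max knobs are gone
        random_seed=42,               # omit to derive a seed from time.time_ns()
        exploration_probability=0.2,  # epsilon-greedy exploration rate
        full_eval_every=5,            # or None to skip periodic full evaluation
        population_size=4,
        proposals_per_step=4,
    )
    # Out-of-range values now fail fast in __init__, e.g.
    # COPRO(iterations=0) raises ValueError("iterations must be >= 1").
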
@@ -123,10 +150,9 @@ class COPRORunner:
 
     def execute(
         self,
-        *,
         prompt: Prompt,
         goldens: Union[List["Golden"], List["ConversationalGolden"]],
-    ) -> Tuple[Prompt, Dict]:
+    ) -> Tuple[Prompt, OptimizationReport]:
         """
         Synchronous COPRO run from a full list of goldens.
 
@@ -141,8 +167,7 @@
                 "the optimizer."
             )
 
-        self._ensure_scoring_adapter()
-        self._ensure_rewriter()
+        self._ensure_scorer()
         self.reset_state()
 
         # Seed candidate pool with the root prompt configuration.
@@ -168,7 +193,7 @@
             # candidate on the first iteration.
             if not self._minibatch_score_counts:
                 seed_minibatch = self._draw_minibatch(goldens)
-                root_score = self.scoring_adapter.minibatch_score(
+                root_score = self.scorer.score_minibatch(
                     root_prompt_configuration, seed_minibatch
                 )
                 self._record_minibatch_score(
@@ -183,7 +208,7 @@
 
             # Compute shared feedback for this parent/minibatch that will be
             # used by all cooperative child proposals.
-            feedback_text = self.scoring_adapter.minibatch_feedback(
+            feedback_text = self.scorer.get_minibatch_feedback(
                 parent_prompt_configuration, selected_module_id, minibatch
             )
 
@@ -191,10 +216,10 @@
                 parent_prompt_configuration.id
             )
             jitter = 1e-6
-            min_delta = max(self.config.min_delta, jitter)
+            min_delta = max(MIPROV2_MIN_DELTA, jitter)
 
             # 2. Generate multiple cooperative child prompts and evaluate them.
-            num_proposals = int(self.config.proposals_per_step)
+            num_proposals = int(self.proposals_per_step)
             for _ in range(num_proposals):
                 child_prompt = self._generate_child_prompt(
                     selected_module_id,
@@ -211,7 +236,7 @@
                     child_prompt,
                 )
 
-                child_score = self.scoring_adapter.minibatch_score(
+                child_score = self.scorer.score_minibatch(
                     child_prompt_configuration, minibatch
                 )
 
@@ -236,8 +261,8 @@
 
             self.trial_index += 1
             if (
-                self.config.full_eval_every is not None
-                and self.trial_index % self.config.full_eval_every == 0
+                self.full_eval_every is not None
+                and self.trial_index % self.full_eval_every == 0
             ):
                 self._full_evaluate_best(goldens)
 
@@ -253,7 +278,7 @@
         prompt_config_snapshots = build_prompt_config_snapshots(
             self.prompt_configurations_by_id
         )
-        report = OptimizationResult(
+        report = OptimizationReport(
             optimization_id=self.optimization_id,
             best_id=best.id,
             accepted_iterations=accepted_iterations,
@@ -261,14 +286,13 @@
             parents=self.parents_by_id,
             prompt_configurations=prompt_config_snapshots,
         )
-        return best.prompts[self.SINGLE_MODULE_ID], report.as_dict()
+        return best.prompts[self.SINGLE_MODULE_ID], report
 
     async def a_execute(
         self,
-        *,
         prompt: Prompt,
         goldens: Union[List["Golden"], List["ConversationalGolden"]],
-    ) -> Tuple[Prompt, Dict]:
+    ) -> Tuple[Prompt, OptimizationReport]:
         """
         Asynchronous twin of execute().
         """
@@ -280,8 +304,7 @@
                 "the optimizer."
             )
 
-        self._ensure_scoring_adapter()
-        self._ensure_rewriter()
+        self._ensure_scorer()
         self.reset_state()
 
         seed_prompts_by_module = {self.SINGLE_MODULE_ID: prompt}
@@ -306,7 +329,7 @@
             # candidate on the first iteration.
             if not self._minibatch_score_counts:
                 seed_minibatch = self._draw_minibatch(goldens)
-                root_score = await self.scoring_adapter.a_minibatch_score(
+                root_score = await self.scorer.a_score_minibatch(
                     root_prompt_configuration, seed_minibatch
                 )
                 self._record_minibatch_score(
@@ -318,7 +341,7 @@
 
             minibatch = self._draw_minibatch(goldens)
 
-            feedback_text = await self.scoring_adapter.a_minibatch_feedback(
+            feedback_text = await self.scorer.a_get_minibatch_feedback(
                 parent_prompt_configuration, selected_module_id, minibatch
             )
 
@@ -326,9 +349,9 @@
                 parent_prompt_configuration.id
             )
             jitter = 1e-6
-            min_delta = max(self.config.min_delta, jitter)
+            min_delta = max(MIPROV2_MIN_DELTA, jitter)
 
-            num_proposals = int(self.config.proposals_per_step)
+            num_proposals = int(self.proposals_per_step)
             for _ in range(num_proposals):
                 child_prompt = await self._a_generate_child_prompt(
                     selected_module_id,
@@ -344,7 +367,7 @@
                     child_prompt,
                 )
 
-                child_score = await self.scoring_adapter.a_minibatch_score(
+                child_score = await self.scorer.a_score_minibatch(
                     child_prompt_configuration, minibatch
                 )
 
@@ -366,8 +389,8 @@
 
             self.trial_index += 1
             if (
-                self.config.full_eval_every is not None
-                and self.trial_index % self.config.full_eval_every == 0
+                self.full_eval_every is not None
+                and self.trial_index % self.full_eval_every == 0
             ):
                 await self._a_full_evaluate_best(goldens)
 
@@ -382,7 +405,7 @@
         prompt_config_snapshots = build_prompt_config_snapshots(
             self.prompt_configurations_by_id
         )
-        report = OptimizationResult(
+        report = OptimizationReport(
             optimization_id=self.optimization_id,
             best_id=best.id,
             accepted_iterations=accepted_iterations,
@@ -390,7 +413,7 @@
             parents=self.parents_by_id,
             prompt_configurations=prompt_config_snapshots,
         )
-        return best.prompts[self.SINGLE_MODULE_ID], report.as_dict()
+        return best.prompts[self.SINGLE_MODULE_ID], report
 
     ###################
     # State & helpers #
@@ -414,25 +437,14 @@
         # Trial counter (used for full_eval_every).
         self.trial_index: int = 0
 
-    def _ensure_scoring_adapter(self) -> None:
-        if self.scoring_adapter is None:
+    def _ensure_scorer(self) -> None:
+        if self.scorer is None:
             raise DeepEvalError(
-                "COPRORunner requires a `scoring_adapter`. "
-                "Construct one (for example, DeepEvalScoringAdapter) in "
-                "PromptOptimizer and assign it to `runner.scoring_adapter`."
+                "COPRORunner requires a `scorer`. "
+                "Construct one (for example, Scorer) in "
+                "PromptOptimizer and assign it to `runner.scorer`."
             )
 
-    def _ensure_rewriter(self) -> None:
-        if self._rewriter is not None:
-            return
-
-        # Default basic PromptRewriter; PromptOptimizer can override this and
-        # pass a configured instance (e.g. with list-mutation config).
-        self._rewriter = PromptRewriter(
-            max_chars=self.config.rewrite_instruction_max_chars,
-            random_state=self.random_state,
-        )
-
     def _prompts_equivalent(
         self,
         old_prompt: Prompt,
@@ -484,9 +496,7 @@
 
         # If we exceed the population size, iteratively prune the worst
         # (by mean minibatch score), never removing the current best.
-        while (
-            len(self.prompt_configurations_by_id) > self.config.population_size
-        ):
+        while len(self.prompt_configurations_by_id) > self.population_size:
             best_id: Optional[PromptConfigurationId] = None
             best_score = float("-inf")
             for cand_id in self.prompt_configurations_by_id.keys():
@@ -611,7 +621,7 @@
                 "COPRORunner has an empty candidate pool; this should not happen."
             )
 
-        eps = float(self.config.exploration_probability)
+        eps = float(self.exploration_probability)
         if eps > 0.0 and self.random_state.random() < eps:
             chosen_id = self.random_state.choice(candidate_ids)
         else:
@@ -624,23 +634,14 @@
         goldens: Union[List["Golden"], List["ConversationalGolden"]],
     ) -> Union[List["Golden"], List["ConversationalGolden"]]:
         """
-        Determine effective minibatch size from COPROConfig, bounded by the
-        available goldens, and sample with replacement.
+        Determine effective minibatch size, bounded by the available goldens,
+        and sample with replacement.
         """
         n = len(goldens)
         if n <= 0:
             return []
 
-        if self.config.minibatch_size is not None:
-            size = self.config.minibatch_size
-        else:
-            dynamic = max(1, int(round(n * self.config.minibatch_ratio)))
-            size = max(
-                self.config.minibatch_min_size,
-                min(dynamic, self.config.minibatch_max_size),
-            )
-
-        size = max(1, min(size, n))
+        size = min(self.minibatch_size, n)
 
         return [goldens[self.random_state.randrange(0, n)] for _ in range(size)]
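
The ratio/min/max sizing knobs from COPROConfig are removed; the effective minibatch is now simply `minibatch_size` capped by the number of available goldens, sampled with replacement. A self-contained sketch of that behavior, using plain integers in place of Golden objects:

    import random

    def draw_minibatch(goldens, minibatch_size, rng):
        # New behavior: fixed size, capped by the pool, sampled with replacement.
        n = len(goldens)
        if n <= 0:
            return []
        size = min(minibatch_size, n)
        return [goldens[rng.randrange(0, n)] for _ in range(size)]

    rng = random.Random(42)
    # At most 5 items even though minibatch_size is 8; duplicates are possible.
    print(draw_minibatch(list(range(5)), minibatch_size=8, rng=rng))
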
 
@@ -655,7 +656,7 @@
         if best.id in self.pareto_score_table:
             return
 
-        scores = await self.scoring_adapter.a_score_on_pareto(best, goldens)
+        scores = await self.scorer.a_score_pareto(best, goldens)
         self.pareto_score_table[best.id] = scores
 
     def _full_evaluate_best(
@@ -669,7 +670,7 @@
         if best.id in self.pareto_score_table:
             return
 
-        scores = self.scoring_adapter.score_on_pareto(best, goldens)
+        scores = self.scorer.score_pareto(best, goldens)
         self.pareto_score_table[best.id] = scores
 
     async def _a_generate_child_prompt(
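
Across these hunks the scoring interface is renamed wholesale from the old `ScoringAdapter` to the new `Scorer`/`BaseScorer`. The mapping below lists only the methods visible in this diff; the full `BaseScorer` surface in deepeval/optimizer/scorer/base.py may include more:

    # Old ScoringAdapter method -> new Scorer method, as seen in this diff.
    SCORER_METHOD_RENAMES = {
        "minibatch_score": "score_minibatch",
        "a_minibatch_score": "a_score_minibatch",
        "minibatch_feedback": "get_minibatch_feedback",
        "a_minibatch_feedback": "a_get_minibatch_feedback",
        "score_on_pareto": "score_pareto",
        "a_score_on_pareto": "a_score_pareto",
    }
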
@@ -688,7 +689,6 @@
             ) from exc
 
         new_prompt = await self._rewriter.a_rewrite(
-            model_callback=self.model_callback,
             module_id=selected_module_id,
             old_prompt=old_prompt,
             feedback_text=feedback_text,
@@ -718,7 +718,6 @@
             ) from exc
 
         new_prompt = self._rewriter.rewrite(
-            model_callback=self.model_callback,
             module_id=selected_module_id,
             old_prompt=old_prompt,
             feedback_text=feedback_text,
@@ -788,7 +787,7 @@
         self,
         copro_iteration: Callable[[], bool],
     ) -> None:
-        total_iterations = self.config.iterations
+        total_iterations = self.iterations
         remaining_iterations = total_iterations
         iteration = 0
         self._update_progress(
@@ -814,7 +813,7 @@
         self,
         a_copro_iteration: Callable[[], Awaitable[bool]],
     ) -> None:
-        total_iterations = self.config.iterations
+        total_iterations = self.iterations
         remaining_iterations = total_iterations
         iteration = 0
         self._update_progress(

deepeval/optimizer/algorithms/gepa/__init__.py (new file)

@@ -0,0 +1,5 @@
+from .gepa import GEPA
+
+__all__ = [
+    "GEPA",
+]
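
The final hunk is the initializer for the new gepa subpackage; the copro, miprov2, and simba subpackages in the file list each gain a similar `__init__.py`. A sketch of imports under the new layout; the COPRO re-export is assumed by analogy with the GEPA hunk above:

    # GEPA's re-export is shown verbatim above; the COPRO one is assumed analogous.
    from deepeval.optimizer.algorithms.gepa import GEPA
    from deepeval.optimizer.algorithms.copro import COPRO  # assumed re-export

    print(GEPA.__name__, COPRO.__name__)
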