deepeval-3.7.4-py3-none-any.whl → deepeval-3.7.6-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/config/settings.py +35 -1
- deepeval/dataset/api.py +23 -1
- deepeval/dataset/golden.py +139 -2
- deepeval/evaluate/evaluate.py +16 -11
- deepeval/evaluate/execute.py +13 -181
- deepeval/evaluate/utils.py +6 -26
- deepeval/integrations/pydantic_ai/agent.py +19 -2
- deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
- deepeval/key_handler.py +3 -0
- deepeval/metrics/__init__.py +14 -16
- deepeval/metrics/answer_relevancy/answer_relevancy.py +118 -116
- deepeval/metrics/answer_relevancy/template.py +22 -3
- deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
- deepeval/metrics/arena_g_eval/template.py +17 -1
- deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
- deepeval/metrics/argument_correctness/template.py +19 -2
- deepeval/metrics/base_metric.py +13 -44
- deepeval/metrics/bias/bias.py +102 -108
- deepeval/metrics/bias/template.py +14 -2
- deepeval/metrics/contextual_precision/contextual_precision.py +96 -94
- deepeval/metrics/contextual_precision/template.py +115 -66
- deepeval/metrics/contextual_recall/contextual_recall.py +94 -84
- deepeval/metrics/contextual_recall/template.py +106 -55
- deepeval/metrics/contextual_relevancy/contextual_relevancy.py +86 -84
- deepeval/metrics/contextual_relevancy/template.py +87 -58
- deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
- deepeval/metrics/conversation_completeness/template.py +23 -3
- deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
- deepeval/metrics/conversational_dag/nodes.py +66 -123
- deepeval/metrics/conversational_dag/templates.py +16 -0
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
- deepeval/metrics/dag/dag.py +10 -0
- deepeval/metrics/dag/nodes.py +63 -126
- deepeval/metrics/dag/templates.py +16 -2
- deepeval/metrics/exact_match/exact_match.py +9 -1
- deepeval/metrics/faithfulness/faithfulness.py +138 -149
- deepeval/metrics/faithfulness/schema.py +1 -1
- deepeval/metrics/faithfulness/template.py +200 -115
- deepeval/metrics/g_eval/g_eval.py +87 -78
- deepeval/metrics/g_eval/template.py +18 -1
- deepeval/metrics/g_eval/utils.py +7 -6
- deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
- deepeval/metrics/goal_accuracy/template.py +21 -3
- deepeval/metrics/hallucination/hallucination.py +60 -75
- deepeval/metrics/hallucination/template.py +13 -0
- deepeval/metrics/indicator.py +7 -10
- deepeval/metrics/json_correctness/json_correctness.py +40 -38
- deepeval/metrics/json_correctness/template.py +10 -0
- deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
- deepeval/metrics/knowledge_retention/schema.py +9 -3
- deepeval/metrics/knowledge_retention/template.py +12 -0
- deepeval/metrics/mcp/mcp_task_completion.py +68 -38
- deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +92 -74
- deepeval/metrics/mcp/template.py +52 -0
- deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
- deepeval/metrics/mcp_use_metric/template.py +12 -0
- deepeval/metrics/misuse/misuse.py +77 -97
- deepeval/metrics/misuse/template.py +15 -0
- deepeval/metrics/multimodal_metrics/__init__.py +0 -19
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +59 -53
- deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +79 -95
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +59 -53
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +59 -53
- deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +111 -109
- deepeval/metrics/non_advice/non_advice.py +79 -105
- deepeval/metrics/non_advice/template.py +12 -0
- deepeval/metrics/pattern_match/pattern_match.py +12 -4
- deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
- deepeval/metrics/pii_leakage/template.py +14 -0
- deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
- deepeval/metrics/plan_adherence/template.py +11 -0
- deepeval/metrics/plan_quality/plan_quality.py +63 -87
- deepeval/metrics/plan_quality/template.py +9 -0
- deepeval/metrics/prompt_alignment/prompt_alignment.py +72 -83
- deepeval/metrics/prompt_alignment/template.py +12 -0
- deepeval/metrics/ragas.py +3 -3
- deepeval/metrics/role_adherence/role_adherence.py +48 -71
- deepeval/metrics/role_adherence/template.py +14 -0
- deepeval/metrics/role_violation/role_violation.py +75 -108
- deepeval/metrics/role_violation/template.py +12 -0
- deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
- deepeval/metrics/step_efficiency/template.py +11 -0
- deepeval/metrics/summarization/summarization.py +115 -183
- deepeval/metrics/summarization/template.py +19 -0
- deepeval/metrics/task_completion/task_completion.py +67 -73
- deepeval/metrics/tool_correctness/tool_correctness.py +45 -44
- deepeval/metrics/tool_use/tool_use.py +42 -66
- deepeval/metrics/topic_adherence/template.py +13 -0
- deepeval/metrics/topic_adherence/topic_adherence.py +53 -67
- deepeval/metrics/toxicity/template.py +13 -0
- deepeval/metrics/toxicity/toxicity.py +80 -99
- deepeval/metrics/turn_contextual_precision/schema.py +21 -0
- deepeval/metrics/turn_contextual_precision/template.py +187 -0
- deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +592 -0
- deepeval/metrics/turn_contextual_recall/schema.py +21 -0
- deepeval/metrics/turn_contextual_recall/template.py +178 -0
- deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +563 -0
- deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
- deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
- deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +576 -0
- deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
- deepeval/metrics/turn_faithfulness/template.py +218 -0
- deepeval/metrics/turn_faithfulness/turn_faithfulness.py +627 -0
- deepeval/metrics/turn_relevancy/template.py +14 -0
- deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
- deepeval/metrics/utils.py +158 -122
- deepeval/models/__init__.py +0 -12
- deepeval/models/base_model.py +49 -33
- deepeval/models/embedding_models/__init__.py +7 -0
- deepeval/models/embedding_models/azure_embedding_model.py +79 -33
- deepeval/models/embedding_models/local_embedding_model.py +39 -20
- deepeval/models/embedding_models/ollama_embedding_model.py +52 -19
- deepeval/models/embedding_models/openai_embedding_model.py +42 -22
- deepeval/models/llms/amazon_bedrock_model.py +226 -72
- deepeval/models/llms/anthropic_model.py +178 -63
- deepeval/models/llms/azure_model.py +218 -60
- deepeval/models/llms/constants.py +2032 -0
- deepeval/models/llms/deepseek_model.py +95 -40
- deepeval/models/llms/gemini_model.py +209 -64
- deepeval/models/llms/grok_model.py +139 -68
- deepeval/models/llms/kimi_model.py +140 -90
- deepeval/models/llms/litellm_model.py +131 -37
- deepeval/models/llms/local_model.py +125 -21
- deepeval/models/llms/ollama_model.py +147 -24
- deepeval/models/llms/openai_model.py +222 -269
- deepeval/models/llms/portkey_model.py +81 -22
- deepeval/models/llms/utils.py +8 -3
- deepeval/models/retry_policy.py +17 -14
- deepeval/models/utils.py +106 -5
- deepeval/optimizer/__init__.py +5 -0
- deepeval/optimizer/algorithms/__init__.py +6 -0
- deepeval/optimizer/algorithms/base.py +29 -0
- deepeval/optimizer/algorithms/configs.py +18 -0
- deepeval/optimizer/algorithms/copro/__init__.py +5 -0
- deepeval/{optimization/copro/loop.py → optimizer/algorithms/copro/copro.py} +112 -113
- deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
- deepeval/{optimization/gepa/loop.py → optimizer/algorithms/gepa/gepa.py} +175 -115
- deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
- deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
- deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
- deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
- deepeval/optimizer/algorithms/simba/__init__.py +5 -0
- deepeval/{optimization/simba/loop.py → optimizer/algorithms/simba/simba.py} +128 -112
- deepeval/{optimization → optimizer}/configs.py +5 -8
- deepeval/{optimization/policies/selection.py → optimizer/policies.py} +63 -2
- deepeval/optimizer/prompt_optimizer.py +263 -0
- deepeval/optimizer/rewriter/__init__.py +5 -0
- deepeval/optimizer/rewriter/rewriter.py +124 -0
- deepeval/optimizer/rewriter/utils.py +214 -0
- deepeval/optimizer/scorer/__init__.py +5 -0
- deepeval/optimizer/scorer/base.py +86 -0
- deepeval/optimizer/scorer/scorer.py +316 -0
- deepeval/optimizer/scorer/utils.py +30 -0
- deepeval/optimizer/types.py +148 -0
- deepeval/{optimization → optimizer}/utils.py +47 -165
- deepeval/prompt/prompt.py +5 -9
- deepeval/simulator/conversation_simulator.py +43 -0
- deepeval/simulator/template.py +13 -0
- deepeval/test_case/__init__.py +1 -3
- deepeval/test_case/api.py +26 -45
- deepeval/test_case/arena_test_case.py +7 -2
- deepeval/test_case/conversational_test_case.py +68 -1
- deepeval/test_case/llm_test_case.py +206 -1
- deepeval/test_case/utils.py +4 -8
- deepeval/test_run/api.py +18 -14
- deepeval/test_run/test_run.py +3 -3
- deepeval/tracing/patchers.py +9 -4
- deepeval/tracing/tracing.py +2 -2
- deepeval/utils.py +65 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/METADATA +1 -4
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/RECORD +180 -193
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -148
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
- deepeval/models/mlllms/__init__.py +0 -4
- deepeval/models/mlllms/azure_model.py +0 -343
- deepeval/models/mlllms/gemini_model.py +0 -313
- deepeval/models/mlllms/ollama_model.py +0 -175
- deepeval/models/mlllms/openai_model.py +0 -309
- deepeval/optimization/__init__.py +0 -13
- deepeval/optimization/adapters/__init__.py +0 -2
- deepeval/optimization/adapters/deepeval_scoring_adapter.py +0 -588
- deepeval/optimization/aggregates.py +0 -14
- deepeval/optimization/copro/configs.py +0 -31
- deepeval/optimization/gepa/__init__.py +0 -7
- deepeval/optimization/gepa/configs.py +0 -115
- deepeval/optimization/miprov2/configs.py +0 -134
- deepeval/optimization/miprov2/loop.py +0 -785
- deepeval/optimization/mutations/__init__.py +0 -0
- deepeval/optimization/mutations/prompt_rewriter.py +0 -458
- deepeval/optimization/policies/__init__.py +0 -16
- deepeval/optimization/policies/tie_breaker.py +0 -67
- deepeval/optimization/prompt_optimizer.py +0 -462
- deepeval/optimization/simba/__init__.py +0 -0
- deepeval/optimization/simba/configs.py +0 -33
- deepeval/optimization/types.py +0 -361
- deepeval/test_case/mllm_test_case.py +0 -170
- /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
- /deepeval/{optimization → optimizer/algorithms}/simba/types.py +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/LICENSE.md +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/WHEEL +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/entry_points.txt +0 -0
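In 3.7.6 the `deepeval/optimization` package moves to `deepeval/optimizer`, and each algorithm loop is relocated under `deepeval/optimizer/algorithms/` (for example, `optimization/copro/loop.py` becomes `optimizer/algorithms/copro/copro.py`, with the class renamed from `COPRORunner` to `COPRO`). Below is a minimal import sketch of the new layout based on the renamed paths listed above; fully qualified module paths are used because the re-exports in the new `__init__.py` files are not shown in this diff:

```python
# Sketch of the 3.7.6 layout implied by the renames above (3.7.4 used
# deepeval.optimization.*, e.g. deepeval.optimization.copro.loop.COPRORunner).
from deepeval.optimizer.prompt_optimizer import PromptOptimizer  # class name referenced in the COPRO docstring below
from deepeval.optimizer.algorithms.copro.copro import COPRO      # loop.py renamed to copro.py, COPRORunner renamed to COPRO
from deepeval.optimizer.rewriter import Rewriter                 # import path taken from the copro.py diff below
from deepeval.optimizer.scorer.base import BaseScorer            # import path taken from the copro.py diff below
```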

deepeval/{optimization/copro/loop.py → optimizer/algorithms/copro/copro.py} +112 -113

Several removed lines in the rendered diff were truncated at extraction time; they are reproduced here only as far as they were captured.

```diff
--- a/deepeval/optimization/copro/loop.py
+++ b/deepeval/optimizer/algorithms/copro/copro.py
@@ -30,92 +30,119 @@ from typing import (
     Union,
 )
 
+from deepeval.models.base_model import DeepEvalBaseLLM
+
 from deepeval.errors import DeepEvalError
-from deepeval.
-from deepeval.
+from deepeval.optimizer.utils import Aggregator, mean_of_all
+from deepeval.optimizer.types import (
     AcceptedIterationDict,
     ModuleId,
-
+    OptimizationReport,
     PromptConfiguration,
     PromptConfigurationId,
-
+    RunnerStatusCallback,
     RunnerStatusType,
     ScoreTable,
-    ScoringAdapter,
 )
-from deepeval.
+from deepeval.optimizer.scorer.base import BaseScorer
+from deepeval.optimizer.utils import (
     build_prompt_config_snapshots,
 )
 from deepeval.prompt.api import PromptType
 from deepeval.prompt.prompt import Prompt
-from deepeval.
-
-from .
+from deepeval.optimizer.rewriter import Rewriter
+from deepeval.optimizer.algorithms.configs import MIPROV2_MIN_DELTA
+from deepeval.optimizer.algorithms.base import BaseAlgorithm
 
 if TYPE_CHECKING:  # pragma: no cover - type-checking only
     from deepeval.dataset.golden import ConversationalGolden, Golden
 
 
-class
+class COPRO(BaseAlgorithm):
     """
     COPRO style cooperative prompt optimization loop with sync/async execution.
 
     This runner is intentionally low level and does not know about metrics,
-    models, or async configs. It relies on a preconfigured
-
-
-
-
-
-
-
-
-
-
-
-
+    models, or async configs. It relies on a preconfigured Scorer and
+    Rewriter, which are typically constructed by PromptOptimizer.
+
+    Parameters
+    ----------
+    iterations : int
+        Total number of optimization trials. Default is 5.
+    minibatch_size : int
+        Number of examples drawn per iteration. Default is 8.
+    random_seed : int, optional
+        RNG seed for reproducibility. If None, derived from time.time_ns().
+    exploration_probability : float
+        Epsilon greedy exploration rate. Default is 0.2.
+    full_eval_every : int, optional
+        Fully evaluate best candidate every N trials. Default is 5.
+    population_size : int
+        Maximum number of candidates in the pool. Default is 4.
+    proposals_per_step : int
+        Number of child prompts proposed per iteration. Default is 4.
     """
 
+    name = "COPRO"
     SINGLE_MODULE_ID: ModuleId = "__module__"
 
     def __init__(
         self,
-
-
+        iterations: int = 5,
+        minibatch_size: int = 8,
+        random_seed: Optional[int] = None,
+        exploration_probability: float = 0.2,
+        full_eval_every: Optional[int] = 5,
+        population_size: int = 4,
+        proposals_per_step: int = 4,
         aggregate_instances: Aggregator = mean_of_all,
-
+        scorer: Optional[BaseScorer] = None,
     ) -> None:
-
+        # Validate parameters
+        if iterations < 1:
+            raise ValueError("iterations must be >= 1")
+        if minibatch_size < 1:
+            raise ValueError("minibatch_size must be >= 1")
+        if exploration_probability < 0.0 or exploration_probability > 1.0:
+            raise ValueError(
+                "exploration_probability must be >= 0.0 and <= 1.0"
+            )
+        if full_eval_every is not None and full_eval_every < 1:
+            raise ValueError("full_eval_every must be >= 1")
+        if population_size < 1:
+            raise ValueError("population_size must be >= 1")
+        if proposals_per_step < 1:
+            raise ValueError("proposals_per_step must be >= 1")
+
+        self.iterations = iterations
+        self.minibatch_size = minibatch_size
+        self.exploration_probability = exploration_probability
+        self.full_eval_every = full_eval_every
+        self.population_size = population_size
+        self.proposals_per_step = proposals_per_step
         self.aggregate_instances = aggregate_instances
-        self.
-
-        # Random seeded from config is used for minibatch sampling and
-        # epsilon-greedy candidate selection.
-        self.random_state = random.Random(config.random_seed)
+        self.scorer = scorer
 
-
+        # If no seed provided, use time-based seed
+        if random_seed is None:
+            random_seed = time.time_ns()
+        self.random_seed = random_seed
+        self.random_state = random.Random(random_seed)
 
         # Runtime state to be reset between runs
         self.reset_state()
 
         # Status callback set by PromptOptimizer:
         # (kind, step_index, total_steps, detail) -> None
-        self.status_callback: Optional[
-
-            #
-
-
-
-
-
-                    Dict,
-                    Tuple[Union[str, Dict], float],
-                ],
-            ]
-        ] = None
-
-        # Lazy-loaded PromptRewriter set by PromptOptimizer
-        self._rewriter: Optional[PromptRewriter] = None
+        self.status_callback: Optional[RunnerStatusCallback] = None
+
+        # Optimizer model used by the rewriter for prompt mutation.
+        # Set by PromptOptimizer.
+        self.optimizer_model: Optional["DeepEvalBaseLLM"] = None
+
+        # Lazy-loaded Rewriter set by PromptOptimizer
+        self._rewriter: Optional[Rewriter] = None
 
     ##############
     # Public API #
@@ -123,10 +150,9 @@ class COPRORunner:
 
     def execute(
         self,
-        *,
         prompt: Prompt,
         goldens: Union[List["Golden"], List["ConversationalGolden"]],
-    ) -> Tuple[Prompt,
+    ) -> Tuple[Prompt, OptimizationReport]:
         """
         Synchronous COPRO run from a full list of goldens.
 
@@ -141,8 +167,7 @@ class COPRORunner:
                 "the optimizer."
             )
 
-        self.
-        self._ensure_rewriter()
+        self._ensure_scorer()
         self.reset_state()
 
         # Seed candidate pool with the root prompt configuration.
@@ -168,7 +193,7 @@ class COPRORunner:
             # candidate on the first iteration.
             if not self._minibatch_score_counts:
                 seed_minibatch = self._draw_minibatch(goldens)
-                root_score = self.
+                root_score = self.scorer.score_minibatch(
                     root_prompt_configuration, seed_minibatch
                 )
                 self._record_minibatch_score(
@@ -183,7 +208,7 @@ class COPRORunner:
 
             # Compute shared feedback for this parent/minibatch that will be
             # used by all cooperative child proposals.
-            feedback_text = self.
+            feedback_text = self.scorer.get_minibatch_feedback(
                 parent_prompt_configuration, selected_module_id, minibatch
             )
 
@@ -191,10 +216,10 @@ class COPRORunner:
                 parent_prompt_configuration.id
             )
             jitter = 1e-6
-            min_delta = max(
+            min_delta = max(MIPROV2_MIN_DELTA, jitter)
 
             # 2. Generate multiple cooperative child prompts and evaluate them.
-            num_proposals = int(self.
+            num_proposals = int(self.proposals_per_step)
             for _ in range(num_proposals):
                 child_prompt = self._generate_child_prompt(
                     selected_module_id,
@@ -211,7 +236,7 @@ class COPRORunner:
                     child_prompt,
                 )
 
-                child_score = self.
+                child_score = self.scorer.score_minibatch(
                     child_prompt_configuration, minibatch
                 )
 
@@ -236,8 +261,8 @@ class COPRORunner:
 
             self.trial_index += 1
             if (
-                self.
-                and self.trial_index % self.
+                self.full_eval_every is not None
+                and self.trial_index % self.full_eval_every == 0
             ):
                 self._full_evaluate_best(goldens)
 
@@ -253,7 +278,7 @@ class COPRORunner:
         prompt_config_snapshots = build_prompt_config_snapshots(
             self.prompt_configurations_by_id
         )
-        report =
+        report = OptimizationReport(
             optimization_id=self.optimization_id,
             best_id=best.id,
             accepted_iterations=accepted_iterations,
@@ -261,14 +286,13 @@ class COPRORunner:
             parents=self.parents_by_id,
             prompt_configurations=prompt_config_snapshots,
         )
-        return best.prompts[self.SINGLE_MODULE_ID], report
+        return best.prompts[self.SINGLE_MODULE_ID], report
 
     async def a_execute(
         self,
-        *,
         prompt: Prompt,
         goldens: Union[List["Golden"], List["ConversationalGolden"]],
-    ) -> Tuple[Prompt,
+    ) -> Tuple[Prompt, OptimizationReport]:
         """
         Asynchronous twin of execute().
         """
@@ -280,8 +304,7 @@ class COPRORunner:
                 "the optimizer."
             )
 
-        self.
-        self._ensure_rewriter()
+        self._ensure_scorer()
         self.reset_state()
 
         seed_prompts_by_module = {self.SINGLE_MODULE_ID: prompt}
@@ -306,7 +329,7 @@ class COPRORunner:
             # candidate on the first iteration.
             if not self._minibatch_score_counts:
                 seed_minibatch = self._draw_minibatch(goldens)
-                root_score = await self.
+                root_score = await self.scorer.a_score_minibatch(
                     root_prompt_configuration, seed_minibatch
                 )
                 self._record_minibatch_score(
@@ -318,7 +341,7 @@ class COPRORunner:
 
             minibatch = self._draw_minibatch(goldens)
 
-            feedback_text = await self.
+            feedback_text = await self.scorer.a_get_minibatch_feedback(
                 parent_prompt_configuration, selected_module_id, minibatch
             )
 
@@ -326,9 +349,9 @@ class COPRORunner:
                 parent_prompt_configuration.id
             )
             jitter = 1e-6
-            min_delta = max(
+            min_delta = max(MIPROV2_MIN_DELTA, jitter)
 
-            num_proposals = int(self.
+            num_proposals = int(self.proposals_per_step)
             for _ in range(num_proposals):
                 child_prompt = await self._a_generate_child_prompt(
                     selected_module_id,
@@ -344,7 +367,7 @@ class COPRORunner:
                     child_prompt,
                 )
 
-                child_score = await self.
+                child_score = await self.scorer.a_score_minibatch(
                     child_prompt_configuration, minibatch
                 )
 
@@ -366,8 +389,8 @@ class COPRORunner:
 
             self.trial_index += 1
             if (
-                self.
-                and self.trial_index % self.
+                self.full_eval_every is not None
+                and self.trial_index % self.full_eval_every == 0
             ):
                 await self._a_full_evaluate_best(goldens)
 
@@ -382,7 +405,7 @@ class COPRORunner:
         prompt_config_snapshots = build_prompt_config_snapshots(
             self.prompt_configurations_by_id
         )
-        report =
+        report = OptimizationReport(
             optimization_id=self.optimization_id,
             best_id=best.id,
             accepted_iterations=accepted_iterations,
@@ -390,7 +413,7 @@ class COPRORunner:
             parents=self.parents_by_id,
             prompt_configurations=prompt_config_snapshots,
         )
-        return best.prompts[self.SINGLE_MODULE_ID], report
+        return best.prompts[self.SINGLE_MODULE_ID], report
 
     ###################
     # State & helpers #
@@ -414,25 +437,14 @@ class COPRORunner:
         # Trial counter (used for full_eval_every).
        self.trial_index: int = 0
 
-    def
-        if self.
+    def _ensure_scorer(self) -> None:
+        if self.scorer is None:
             raise DeepEvalError(
-                "COPRORunner requires a `
-                "Construct one (for example,
-                "PromptOptimizer and assign it to `runner.
+                "COPRORunner requires a `scorer`. "
+                "Construct one (for example, Scorer) in "
+                "PromptOptimizer and assign it to `runner.scorer`."
             )
 
-    def _ensure_rewriter(self) -> None:
-        if self._rewriter is not None:
-            return
-
-        # Default basic PromptRewriter; PromptOptimizer can override this and
-        # pass a configured instance (e.g. with list-mutation config).
-        self._rewriter = PromptRewriter(
-            max_chars=self.config.rewrite_instruction_max_chars,
-            random_state=self.random_state,
-        )
-
     def _prompts_equivalent(
         self,
         old_prompt: Prompt,
@@ -484,9 +496,7 @@ class COPRORunner:
 
         # If we exceed the population size, iteratively prune the worst
         # (by mean minibatch score), never removing the current best.
-        while (
-            len(self.prompt_configurations_by_id) > self.config.population_size
-        ):
+        while len(self.prompt_configurations_by_id) > self.population_size:
             best_id: Optional[PromptConfigurationId] = None
             best_score = float("-inf")
             for cand_id in self.prompt_configurations_by_id.keys():
@@ -611,7 +621,7 @@ class COPRORunner:
                 "COPRORunner has an empty candidate pool; this should not happen."
             )
 
-        eps = float(self.
+        eps = float(self.exploration_probability)
         if eps > 0.0 and self.random_state.random() < eps:
             chosen_id = self.random_state.choice(candidate_ids)
         else:
@@ -624,23 +634,14 @@ class COPRORunner:
         goldens: Union[List["Golden"], List["ConversationalGolden"]],
     ) -> Union[List["Golden"], List["ConversationalGolden"]]:
         """
-        Determine effective minibatch size
-
+        Determine effective minibatch size, bounded by the available goldens,
+        and sample with replacement.
         """
         n = len(goldens)
         if n <= 0:
             return []
 
-
-            size = self.config.minibatch_size
-        else:
-            dynamic = max(1, int(round(n * self.config.minibatch_ratio)))
-            size = max(
-                self.config.minibatch_min_size,
-                min(dynamic, self.config.minibatch_max_size),
-            )
-
-        size = max(1, min(size, n))
+        size = min(self.minibatch_size, n)
 
         return [goldens[self.random_state.randrange(0, n)] for _ in range(size)]
 
@@ -655,7 +656,7 @@ class COPRORunner:
         if best.id in self.pareto_score_table:
             return
 
-        scores = await self.
+        scores = await self.scorer.a_score_pareto(best, goldens)
         self.pareto_score_table[best.id] = scores
 
     def _full_evaluate_best(
@@ -669,7 +670,7 @@ class COPRORunner:
         if best.id in self.pareto_score_table:
             return
 
-        scores = self.
+        scores = self.scorer.score_pareto(best, goldens)
         self.pareto_score_table[best.id] = scores
 
     async def _a_generate_child_prompt(
@@ -688,7 +689,6 @@ class COPRORunner:
             ) from exc
 
         new_prompt = await self._rewriter.a_rewrite(
-            model_callback=self.model_callback,
             module_id=selected_module_id,
             old_prompt=old_prompt,
             feedback_text=feedback_text,
@@ -718,7 +718,6 @@ class COPRORunner:
             ) from exc
 
         new_prompt = self._rewriter.rewrite(
-            model_callback=self.model_callback,
             module_id=selected_module_id,
             old_prompt=old_prompt,
             feedback_text=feedback_text,
@@ -788,7 +787,7 @@ class COPRORunner:
         self,
         copro_iteration: Callable[[], bool],
     ) -> None:
-        total_iterations = self.
+        total_iterations = self.iterations
         remaining_iterations = total_iterations
         iteration = 0
         self._update_progress(
@@ -814,7 +813,7 @@ class COPRORunner:
         self,
         a_copro_iteration: Callable[[], Awaitable[bool]],
     ) -> None:
-        total_iterations = self.
+        total_iterations = self.iterations
         remaining_iterations = total_iterations
         iteration = 0
         self._update_progress(
```