deepeval 3.7.4__py3-none-any.whl → 3.7.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/config/settings.py +35 -1
- deepeval/dataset/api.py +23 -1
- deepeval/dataset/golden.py +139 -2
- deepeval/evaluate/evaluate.py +16 -11
- deepeval/evaluate/execute.py +13 -181
- deepeval/evaluate/utils.py +6 -26
- deepeval/integrations/pydantic_ai/agent.py +19 -2
- deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
- deepeval/key_handler.py +3 -0
- deepeval/metrics/__init__.py +14 -16
- deepeval/metrics/answer_relevancy/answer_relevancy.py +118 -116
- deepeval/metrics/answer_relevancy/template.py +22 -3
- deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
- deepeval/metrics/arena_g_eval/template.py +17 -1
- deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
- deepeval/metrics/argument_correctness/template.py +19 -2
- deepeval/metrics/base_metric.py +13 -44
- deepeval/metrics/bias/bias.py +102 -108
- deepeval/metrics/bias/template.py +14 -2
- deepeval/metrics/contextual_precision/contextual_precision.py +96 -94
- deepeval/metrics/contextual_precision/template.py +115 -66
- deepeval/metrics/contextual_recall/contextual_recall.py +94 -84
- deepeval/metrics/contextual_recall/template.py +106 -55
- deepeval/metrics/contextual_relevancy/contextual_relevancy.py +86 -84
- deepeval/metrics/contextual_relevancy/template.py +87 -58
- deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
- deepeval/metrics/conversation_completeness/template.py +23 -3
- deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
- deepeval/metrics/conversational_dag/nodes.py +66 -123
- deepeval/metrics/conversational_dag/templates.py +16 -0
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
- deepeval/metrics/dag/dag.py +10 -0
- deepeval/metrics/dag/nodes.py +63 -126
- deepeval/metrics/dag/templates.py +16 -2
- deepeval/metrics/exact_match/exact_match.py +9 -1
- deepeval/metrics/faithfulness/faithfulness.py +138 -149
- deepeval/metrics/faithfulness/schema.py +1 -1
- deepeval/metrics/faithfulness/template.py +200 -115
- deepeval/metrics/g_eval/g_eval.py +87 -78
- deepeval/metrics/g_eval/template.py +18 -1
- deepeval/metrics/g_eval/utils.py +7 -6
- deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
- deepeval/metrics/goal_accuracy/template.py +21 -3
- deepeval/metrics/hallucination/hallucination.py +60 -75
- deepeval/metrics/hallucination/template.py +13 -0
- deepeval/metrics/indicator.py +7 -10
- deepeval/metrics/json_correctness/json_correctness.py +40 -38
- deepeval/metrics/json_correctness/template.py +10 -0
- deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
- deepeval/metrics/knowledge_retention/schema.py +9 -3
- deepeval/metrics/knowledge_retention/template.py +12 -0
- deepeval/metrics/mcp/mcp_task_completion.py +68 -38
- deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +92 -74
- deepeval/metrics/mcp/template.py +52 -0
- deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
- deepeval/metrics/mcp_use_metric/template.py +12 -0
- deepeval/metrics/misuse/misuse.py +77 -97
- deepeval/metrics/misuse/template.py +15 -0
- deepeval/metrics/multimodal_metrics/__init__.py +0 -19
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +59 -53
- deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +79 -95
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +59 -53
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +59 -53
- deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +111 -109
- deepeval/metrics/non_advice/non_advice.py +79 -105
- deepeval/metrics/non_advice/template.py +12 -0
- deepeval/metrics/pattern_match/pattern_match.py +12 -4
- deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
- deepeval/metrics/pii_leakage/template.py +14 -0
- deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
- deepeval/metrics/plan_adherence/template.py +11 -0
- deepeval/metrics/plan_quality/plan_quality.py +63 -87
- deepeval/metrics/plan_quality/template.py +9 -0
- deepeval/metrics/prompt_alignment/prompt_alignment.py +72 -83
- deepeval/metrics/prompt_alignment/template.py +12 -0
- deepeval/metrics/ragas.py +3 -3
- deepeval/metrics/role_adherence/role_adherence.py +48 -71
- deepeval/metrics/role_adherence/template.py +14 -0
- deepeval/metrics/role_violation/role_violation.py +75 -108
- deepeval/metrics/role_violation/template.py +12 -0
- deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
- deepeval/metrics/step_efficiency/template.py +11 -0
- deepeval/metrics/summarization/summarization.py +115 -183
- deepeval/metrics/summarization/template.py +19 -0
- deepeval/metrics/task_completion/task_completion.py +67 -73
- deepeval/metrics/tool_correctness/tool_correctness.py +45 -44
- deepeval/metrics/tool_use/tool_use.py +42 -66
- deepeval/metrics/topic_adherence/template.py +13 -0
- deepeval/metrics/topic_adherence/topic_adherence.py +53 -67
- deepeval/metrics/toxicity/template.py +13 -0
- deepeval/metrics/toxicity/toxicity.py +80 -99
- deepeval/metrics/turn_contextual_precision/schema.py +21 -0
- deepeval/metrics/turn_contextual_precision/template.py +187 -0
- deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +592 -0
- deepeval/metrics/turn_contextual_recall/schema.py +21 -0
- deepeval/metrics/turn_contextual_recall/template.py +178 -0
- deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +563 -0
- deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
- deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
- deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +576 -0
- deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
- deepeval/metrics/turn_faithfulness/template.py +218 -0
- deepeval/metrics/turn_faithfulness/turn_faithfulness.py +627 -0
- deepeval/metrics/turn_relevancy/template.py +14 -0
- deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
- deepeval/metrics/utils.py +158 -122
- deepeval/models/__init__.py +0 -12
- deepeval/models/base_model.py +49 -33
- deepeval/models/embedding_models/__init__.py +7 -0
- deepeval/models/embedding_models/azure_embedding_model.py +79 -33
- deepeval/models/embedding_models/local_embedding_model.py +39 -20
- deepeval/models/embedding_models/ollama_embedding_model.py +52 -19
- deepeval/models/embedding_models/openai_embedding_model.py +42 -22
- deepeval/models/llms/amazon_bedrock_model.py +226 -72
- deepeval/models/llms/anthropic_model.py +178 -63
- deepeval/models/llms/azure_model.py +218 -60
- deepeval/models/llms/constants.py +2032 -0
- deepeval/models/llms/deepseek_model.py +95 -40
- deepeval/models/llms/gemini_model.py +209 -64
- deepeval/models/llms/grok_model.py +139 -68
- deepeval/models/llms/kimi_model.py +140 -90
- deepeval/models/llms/litellm_model.py +131 -37
- deepeval/models/llms/local_model.py +125 -21
- deepeval/models/llms/ollama_model.py +147 -24
- deepeval/models/llms/openai_model.py +222 -269
- deepeval/models/llms/portkey_model.py +81 -22
- deepeval/models/llms/utils.py +8 -3
- deepeval/models/retry_policy.py +17 -14
- deepeval/models/utils.py +106 -5
- deepeval/optimizer/__init__.py +5 -0
- deepeval/optimizer/algorithms/__init__.py +6 -0
- deepeval/optimizer/algorithms/base.py +29 -0
- deepeval/optimizer/algorithms/configs.py +18 -0
- deepeval/optimizer/algorithms/copro/__init__.py +5 -0
- deepeval/{optimization/copro/loop.py → optimizer/algorithms/copro/copro.py} +112 -113
- deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
- deepeval/{optimization/gepa/loop.py → optimizer/algorithms/gepa/gepa.py} +175 -115
- deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
- deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
- deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
- deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
- deepeval/optimizer/algorithms/simba/__init__.py +5 -0
- deepeval/{optimization/simba/loop.py → optimizer/algorithms/simba/simba.py} +128 -112
- deepeval/{optimization → optimizer}/configs.py +5 -8
- deepeval/{optimization/policies/selection.py → optimizer/policies.py} +63 -2
- deepeval/optimizer/prompt_optimizer.py +263 -0
- deepeval/optimizer/rewriter/__init__.py +5 -0
- deepeval/optimizer/rewriter/rewriter.py +124 -0
- deepeval/optimizer/rewriter/utils.py +214 -0
- deepeval/optimizer/scorer/__init__.py +5 -0
- deepeval/optimizer/scorer/base.py +86 -0
- deepeval/optimizer/scorer/scorer.py +316 -0
- deepeval/optimizer/scorer/utils.py +30 -0
- deepeval/optimizer/types.py +148 -0
- deepeval/{optimization → optimizer}/utils.py +47 -165
- deepeval/prompt/prompt.py +5 -9
- deepeval/simulator/conversation_simulator.py +43 -0
- deepeval/simulator/template.py +13 -0
- deepeval/test_case/__init__.py +1 -3
- deepeval/test_case/api.py +26 -45
- deepeval/test_case/arena_test_case.py +7 -2
- deepeval/test_case/conversational_test_case.py +68 -1
- deepeval/test_case/llm_test_case.py +206 -1
- deepeval/test_case/utils.py +4 -8
- deepeval/test_run/api.py +18 -14
- deepeval/test_run/test_run.py +3 -3
- deepeval/tracing/patchers.py +9 -4
- deepeval/tracing/tracing.py +2 -2
- deepeval/utils.py +65 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/METADATA +1 -4
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/RECORD +180 -193
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -148
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
- deepeval/models/mlllms/__init__.py +0 -4
- deepeval/models/mlllms/azure_model.py +0 -343
- deepeval/models/mlllms/gemini_model.py +0 -313
- deepeval/models/mlllms/ollama_model.py +0 -175
- deepeval/models/mlllms/openai_model.py +0 -309
- deepeval/optimization/__init__.py +0 -13
- deepeval/optimization/adapters/__init__.py +0 -2
- deepeval/optimization/adapters/deepeval_scoring_adapter.py +0 -588
- deepeval/optimization/aggregates.py +0 -14
- deepeval/optimization/copro/configs.py +0 -31
- deepeval/optimization/gepa/__init__.py +0 -7
- deepeval/optimization/gepa/configs.py +0 -115
- deepeval/optimization/miprov2/configs.py +0 -134
- deepeval/optimization/miprov2/loop.py +0 -785
- deepeval/optimization/mutations/__init__.py +0 -0
- deepeval/optimization/mutations/prompt_rewriter.py +0 -458
- deepeval/optimization/policies/__init__.py +0 -16
- deepeval/optimization/policies/tie_breaker.py +0 -67
- deepeval/optimization/prompt_optimizer.py +0 -462
- deepeval/optimization/simba/__init__.py +0 -0
- deepeval/optimization/simba/configs.py +0 -33
- deepeval/optimization/types.py +0 -361
- deepeval/test_case/mllm_test_case.py +0 -170
- /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
- /deepeval/{optimization → optimizer/algorithms}/simba/types.py +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/LICENSE.md +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/WHEEL +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/entry_points.txt +0 -0
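The headline change in this release is a package rename: everything under `deepeval.optimization` moves to `deepeval.optimizer`, with the individual algorithms (COPRO, GEPA, SIMBA, and the new MIPROv2) nested under `deepeval.optimizer.algorithms`. A hedged sketch of how imports shift — the file paths are confirmed by the renames above, but whether `deepeval/optimizer/__init__.py` re-exports these names is an assumption:

```python
# 3.7.4 layout (removed in this release):
#   from deepeval.optimization.prompt_optimizer import PromptOptimizer
#   from deepeval.optimization.gepa.loop import ...

# 3.7.6 layout, following the renamed file paths above.
# Subpackage-level re-exports are assumed, not confirmed by this diff.
from deepeval.optimizer.prompt_optimizer import PromptOptimizer
from deepeval.optimizer.algorithms.gepa import GEPA
from deepeval.optimizer.algorithms.miprov2 import MIPROV2
```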
deepeval/{optimization/gepa/loop.py → optimizer/algorithms/gepa/gepa.py} +175 -115

(Removed `-` lines below are reproduced as they appear in the source side-by-side view, where several of them are cut off mid-line.)

```diff
@@ -14,86 +14,118 @@ from typing import (
     Optional,
 )
 
+from deepeval.models.base_model import DeepEvalBaseLLM
+
 from deepeval.errors import DeepEvalError
-from deepeval.
-from deepeval.
+from deepeval.optimizer.utils import Aggregator, mean_of_all
+from deepeval.optimizer.types import (
     AcceptedIterationDict,
     PromptConfiguration,
     PromptConfigurationId,
     ModuleId,
     ScoreTable,
-
-    OptimizationResult,
+    OptimizationReport,
     RunnerStatusType,
-
+    RunnerStatusCallback,
 )
-from deepeval.
+from deepeval.optimizer.scorer.base import BaseScorer
+from deepeval.optimizer.algorithms.base import BaseAlgorithm
+from deepeval.optimizer.utils import (
     split_goldens,
     build_prompt_config_snapshots,
 )
-from deepeval.
+from deepeval.optimizer.policies import (
     pick_best_with_ties,
     select_prompt_configuration_pareto,
+    frequency_weights,
+    pareto_frontier,
 )
 from deepeval.prompt.api import PromptType
 from deepeval.prompt.prompt import Prompt
-from deepeval.
-
+from deepeval.optimizer.rewriter import Rewriter
+from deepeval.optimizer.policies import TieBreaker
+from deepeval.optimizer.algorithms.configs import (
+    GEPA_MIN_DELTA,
+    GEPA_TIE_TOLERANCE,
+    GEPA_REWRITE_INSTRUCTION_MAX_CHARS,
 )
-from .configs import GEPAConfig
 
 
 if TYPE_CHECKING:
     from deepeval.dataset.golden import Golden, ConversationalGolden
 
 
-class
+class GEPA(BaseAlgorithm):
     """
     GEPA loop with sync/async execution.
 
     This runner is intentionally low level and does not know about metrics,
     models, or async configs. It relies on a preconfigured
-
+    Scorer and Rewriter, which are typically constructed by
     the higher-level PromptOptimizer.
+
+    Parameters
+    ----------
+    iterations : int
+        Total number of GEPA loop iterations (mutation attempts). Default is 5.
+    minibatch_size : int
+        Number of examples drawn from D_feedback per iteration. Default is 8.
+    pareto_size : int
+        Size of the Pareto validation subset D_pareto. Default is 3.
+    random_seed : int, optional
+        RNG seed for reproducibility. If None, derived from time.time_ns().
+    tie_breaker : TieBreaker
+        Policy for breaking ties. Default is TieBreaker.PREFER_CHILD.
     """
 
+    name = "GEPA"
     SINGLE_MODULE_ID: ModuleId = "__module__"
+    TieBreaker = TieBreaker
 
     def __init__(
         self,
-
-
+        iterations: int = 5,
+        minibatch_size: int = 8,
+        pareto_size: int = 3,
+        random_seed: Optional[int] = None,
+        tie_breaker: TieBreaker = TieBreaker.PREFER_CHILD,
         aggregate_instances: Aggregator = mean_of_all,
-
+        scorer: Optional[BaseScorer] = None,
     ) -> None:
-
+        # Validate parameters
+        if iterations < 1:
+            raise ValueError("iterations must be >= 1")
+        if minibatch_size < 1:
+            raise ValueError("minibatch_size must be >= 1")
+        if pareto_size < 1:
+            raise ValueError("pareto_size must be >= 1")
+
+        self.iterations = iterations
+        self.minibatch_size = minibatch_size
+        self.pareto_size = pareto_size
+        self.tie_breaker = tie_breaker
         self.aggregate_instances = aggregate_instances
-        self.
+        self.scorer = scorer
 
-        #
-
+        # If no seed provided, use time-based seed
+        if random_seed is None:
+            random_seed = time.time_ns()
+        self.random_seed = random_seed
+        self.random_state = random.Random(random_seed)
 
         # runtime state to be reset between runs
         self.reset_state()
 
         # Status callback set by PromptOptimizer:
         # (kind, step_index, total_steps, detail) -> None
-        self.status_callback: Optional[
-
-        #
-
-
-            ...,
-            Union[
-                str,
-                Dict,
-                Tuple[Union[str, Dict], float],
-            ],
-        ]
-        ] = None
+        self.status_callback: Optional[RunnerStatusCallback] = None
+
+        # Optimizer model used by the rewriter for prompt mutation.
+        # Set by PromptOptimizer.
+        self.optimizer_model: Optional["DeepEvalBaseLLM"] = None
 
         # lazy loaded
-        self._rewriter: Optional[
+        self._rewriter: Optional[Rewriter] = None
 
     ##############
     # Public API #
```
|
|
|
101
133
|
|
|
102
134
|
def execute(
|
|
103
135
|
self,
|
|
104
|
-
*,
|
|
105
136
|
prompt: Prompt,
|
|
106
137
|
goldens: Union[List["Golden"], List["ConversationalGolden"]],
|
|
107
|
-
) -> Tuple[Prompt,
|
|
138
|
+
) -> Tuple[Prompt, OptimizationReport]:
|
|
108
139
|
"""Synchronous GEPA run from a full list of goldens (splits internally)."""
|
|
109
140
|
total_goldens = len(goldens)
|
|
110
141
|
if total_goldens < 2:
|
|
@@ -114,12 +145,11 @@ class GEPARunner:
|
|
|
114
145
|
"run the optimizer."
|
|
115
146
|
)
|
|
116
147
|
|
|
117
|
-
self.
|
|
118
|
-
self._ensure_rewriter()
|
|
148
|
+
self._ensure_scorer()
|
|
119
149
|
self.reset_state()
|
|
120
150
|
|
|
121
151
|
d_feedback, d_pareto = split_goldens(
|
|
122
|
-
goldens, self.
|
|
152
|
+
goldens, self.pareto_size, random_state=self.random_state
|
|
123
153
|
)
|
|
124
154
|
|
|
125
155
|
seed_prompts_by_module = {self.SINGLE_MODULE_ID: prompt}
|
|
@@ -139,7 +169,7 @@ class GEPARunner:
|
|
|
139
169
|
# Seed Pareto scores lazily on first iteration
|
|
140
170
|
if not self.pareto_score_table:
|
|
141
171
|
self.pareto_score_table[root_prompt_configuration.id] = (
|
|
142
|
-
self.
|
|
172
|
+
self.scorer.score_pareto(
|
|
143
173
|
root_prompt_configuration, d_pareto
|
|
144
174
|
)
|
|
145
175
|
)
|
|
@@ -154,7 +184,7 @@ class GEPARunner:
|
|
|
154
184
|
minibatch = self._draw_minibatch(d_feedback)
|
|
155
185
|
|
|
156
186
|
# 4. Feedback
|
|
157
|
-
feedback_text = self.
|
|
187
|
+
feedback_text = self.scorer.get_minibatch_feedback(
|
|
158
188
|
parent_prompt_configuration, selected_module_id, minibatch
|
|
159
189
|
)
|
|
160
190
|
|
|
@@ -172,15 +202,16 @@ class GEPARunner:
|
|
|
172
202
|
)
|
|
173
203
|
|
|
174
204
|
# 7. Evaluate parent/child on minibatch
|
|
175
|
-
parent_score = self.
|
|
205
|
+
parent_score = self.scorer.score_minibatch(
|
|
176
206
|
parent_prompt_configuration, minibatch
|
|
177
207
|
)
|
|
178
|
-
child_score = self.
|
|
208
|
+
child_score = self.scorer.score_minibatch(
|
|
179
209
|
child_prompt_configuration, minibatch
|
|
180
210
|
)
|
|
181
211
|
|
|
182
212
|
# 8. Acceptance test
|
|
183
|
-
|
|
213
|
+
accepted = self._should_accept_child(parent_score, child_score)
|
|
214
|
+
if accepted:
|
|
184
215
|
accepted_iterations.append(
|
|
185
216
|
self._accept_child(
|
|
186
217
|
selected_module_id,
|
|
@@ -199,7 +230,7 @@ class GEPARunner:
|
|
|
199
230
|
prompt_config_snapshots = build_prompt_config_snapshots(
|
|
200
231
|
self.prompt_configurations_by_id
|
|
201
232
|
)
|
|
202
|
-
report =
|
|
233
|
+
report = OptimizationReport(
|
|
203
234
|
optimization_id=self.optimization_id,
|
|
204
235
|
best_id=best.id,
|
|
205
236
|
accepted_iterations=accepted_iterations,
|
|
@@ -207,14 +238,13 @@ class GEPARunner:
|
|
|
207
238
|
parents=self.parents_by_id,
|
|
208
239
|
prompt_configurations=prompt_config_snapshots,
|
|
209
240
|
)
|
|
210
|
-
return best.prompts[self.SINGLE_MODULE_ID], report
|
|
241
|
+
return best.prompts[self.SINGLE_MODULE_ID], report
|
|
211
242
|
|
|
212
243
|
async def a_execute(
|
|
213
244
|
self,
|
|
214
|
-
*,
|
|
215
245
|
prompt: Prompt,
|
|
216
246
|
goldens: Union[List["Golden"], List["ConversationalGolden"]],
|
|
217
|
-
) -> Tuple[Prompt,
|
|
247
|
+
) -> Tuple[Prompt, OptimizationReport]:
|
|
218
248
|
"""Asynchronous twin of execute_gepa()."""
|
|
219
249
|
total_goldens = len(goldens)
|
|
220
250
|
if total_goldens < 2:
|
|
@@ -224,12 +254,11 @@ class GEPARunner:
|
|
|
224
254
|
"run the optimizer."
|
|
225
255
|
)
|
|
226
256
|
|
|
227
|
-
self.
|
|
228
|
-
self._ensure_rewriter()
|
|
257
|
+
self._ensure_scorer()
|
|
229
258
|
self.reset_state()
|
|
230
259
|
|
|
231
260
|
d_feedback, d_pareto = split_goldens(
|
|
232
|
-
goldens, self.
|
|
261
|
+
goldens, self.pareto_size, random_state=self.random_state
|
|
233
262
|
)
|
|
234
263
|
|
|
235
264
|
seed_prompts_by_module = {self.SINGLE_MODULE_ID: prompt}
|
|
@@ -246,13 +275,19 @@ class GEPARunner:
|
|
|
246
275
|
if not d_feedback:
|
|
247
276
|
return False
|
|
248
277
|
|
|
278
|
+
iter_start = time.perf_counter()
|
|
279
|
+
|
|
249
280
|
# Seed Pareto scores lazily on first iteration
|
|
250
281
|
if not self.pareto_score_table:
|
|
282
|
+
t0 = time.perf_counter()
|
|
251
283
|
self.pareto_score_table[root_prompt_configuration.id] = (
|
|
252
|
-
await self.
|
|
284
|
+
await self.scorer.a_score_pareto(
|
|
253
285
|
root_prompt_configuration, d_pareto
|
|
254
286
|
)
|
|
255
287
|
)
|
|
288
|
+
print(
|
|
289
|
+
f"[DEBUG] Initial pareto scoring ({len(d_pareto)} goldens): {time.perf_counter() - t0:.2f}s"
|
|
290
|
+
)
|
|
256
291
|
|
|
257
292
|
# 1. Pick prompt_configuration via Pareto
|
|
258
293
|
parent_prompt_configuration = self._pick_prompt_configuration()
|
|
@@ -262,18 +297,23 @@ class GEPARunner:
|
|
|
262
297
|
|
|
263
298
|
# 3. Draw minibatch
|
|
264
299
|
minibatch = self._draw_minibatch(d_feedback)
|
|
300
|
+
print(f"[DEBUG] Minibatch size: {len(minibatch)}")
|
|
265
301
|
|
|
266
302
|
# 4. Feedback
|
|
267
|
-
|
|
303
|
+
t0 = time.perf_counter()
|
|
304
|
+
feedback_text = await self.scorer.a_get_minibatch_feedback(
|
|
268
305
|
parent_prompt_configuration, selected_module_id, minibatch
|
|
269
306
|
)
|
|
307
|
+
print(f"[DEBUG] Get feedback: {time.perf_counter() - t0:.2f}s")
|
|
270
308
|
|
|
271
309
|
# 5. Rewrite
|
|
310
|
+
t0 = time.perf_counter()
|
|
272
311
|
child_prompt = await self._a_generate_child_prompt(
|
|
273
312
|
selected_module_id, parent_prompt_configuration, feedback_text
|
|
274
313
|
)
|
|
314
|
+
print(f"[DEBUG] Rewrite prompt: {time.perf_counter() - t0:.2f}s")
|
|
275
315
|
if child_prompt is None:
|
|
276
|
-
|
|
316
|
+
print(f"[DEBUG] Child prompt same as parent, skipping")
|
|
277
317
|
return True
|
|
278
318
|
|
|
279
319
|
# 6. Child prompt_configuration
|
|
@@ -282,15 +322,29 @@ class GEPARunner:
|
|
|
282
322
|
)
|
|
283
323
|
|
|
284
324
|
# 7. Evaluate parent/child on minibatch
|
|
285
|
-
|
|
325
|
+
t0 = time.perf_counter()
|
|
326
|
+
parent_score = await self.scorer.a_score_minibatch(
|
|
286
327
|
parent_prompt_configuration, minibatch
|
|
287
328
|
)
|
|
288
|
-
|
|
329
|
+
print(
|
|
330
|
+
f"[DEBUG] Score parent on minibatch: {time.perf_counter() - t0:.2f}s (score={parent_score:.4f})"
|
|
331
|
+
)
|
|
332
|
+
|
|
333
|
+
t0 = time.perf_counter()
|
|
334
|
+
child_score = await self.scorer.a_score_minibatch(
|
|
289
335
|
child_prompt_configuration, minibatch
|
|
290
336
|
)
|
|
337
|
+
print(
|
|
338
|
+
f"[DEBUG] Score child on minibatch: {time.perf_counter() - t0:.2f}s (score={child_score:.4f})"
|
|
339
|
+
)
|
|
291
340
|
|
|
292
341
|
# 8. Acceptance test
|
|
293
|
-
|
|
342
|
+
accepted = self._should_accept_child(parent_score, child_score)
|
|
343
|
+
print(
|
|
344
|
+
f"[DEBUG] Acceptance: {'ACCEPTED' if accepted else 'REJECTED'}"
|
|
345
|
+
)
|
|
346
|
+
if accepted:
|
|
347
|
+
t0 = time.perf_counter()
|
|
294
348
|
accepted_iterations.append(
|
|
295
349
|
await self._a_accept_child(
|
|
296
350
|
selected_module_id,
|
|
@@ -301,6 +355,13 @@ class GEPARunner:
|
|
|
301
355
|
child_score,
|
|
302
356
|
)
|
|
303
357
|
)
|
|
358
|
+
print(
|
|
359
|
+
f"[DEBUG] Accept child (pareto scoring): {time.perf_counter() - t0:.2f}s"
|
|
360
|
+
)
|
|
361
|
+
|
|
362
|
+
print(
|
|
363
|
+
f"[DEBUG] Total iteration time: {time.perf_counter() - iter_start:.2f}s\n"
|
|
364
|
+
)
|
|
304
365
|
return True
|
|
305
366
|
|
|
306
367
|
await self._a_run_loop_iteration(_one_iteration)
|
|
@@ -308,7 +369,7 @@ class GEPARunner:
|
|
|
308
369
|
prompt_config_snapshots = build_prompt_config_snapshots(
|
|
309
370
|
self.prompt_configurations_by_id
|
|
310
371
|
)
|
|
311
|
-
report =
|
|
372
|
+
report = OptimizationReport(
|
|
312
373
|
optimization_id=self.optimization_id,
|
|
313
374
|
best_id=best.id,
|
|
314
375
|
accepted_iterations=accepted_iterations,
|
|
@@ -316,7 +377,7 @@ class GEPARunner:
|
|
|
316
377
|
parents=self.parents_by_id,
|
|
317
378
|
prompt_configurations=prompt_config_snapshots,
|
|
318
379
|
)
|
|
319
|
-
return best.prompts[self.SINGLE_MODULE_ID], report
|
|
380
|
+
return best.prompts[self.SINGLE_MODULE_ID], report
|
|
320
381
|
|
|
321
382
|
###################
|
|
322
383
|
# State & helpers #
|
|
@@ -332,23 +393,14 @@ class GEPARunner:
|
|
|
332
393
|
] = {}
|
|
333
394
|
self.pareto_score_table: ScoreTable = {}
|
|
334
395
|
|
|
335
|
-
def
|
|
336
|
-
if self.
|
|
396
|
+
def _ensure_scorer(self) -> None:
|
|
397
|
+
if self.scorer is None:
|
|
337
398
|
raise DeepEvalError(
|
|
338
|
-
"GEPARunner requires a `
|
|
339
|
-
"Construct one (for example,
|
|
340
|
-
"PromptOptimizer and assign it to `runner.
|
|
399
|
+
"GEPARunner requires a `scorer`. "
|
|
400
|
+
"Construct one (for example, Scorer) in "
|
|
401
|
+
"PromptOptimizer and assign it to `runner.scorer`."
|
|
341
402
|
)
|
|
342
403
|
|
|
343
|
-
def _ensure_rewriter(self) -> None:
|
|
344
|
-
if self._rewriter is not None:
|
|
345
|
-
return
|
|
346
|
-
|
|
347
|
-
# For now, always use the basic PromptRewriter. Additional
|
|
348
|
-
# variants (e.g. for GEPA Alg. 4 crossover) can be introduced
|
|
349
|
-
# later
|
|
350
|
-
self._rewriter = PromptRewriter()
|
|
351
|
-
|
|
352
404
|
def _prompts_equivalent(
|
|
353
405
|
self, old_prompt: Prompt, new_prompt: Prompt
|
|
354
406
|
) -> bool:
|
|
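With `_ensure_rewriter` gone and `_ensure_scorer` in its place, all evaluation is delegated to an injected `BaseScorer`. From the call sites in these hunks, the contract looks roughly like the sketch below — the method names come straight from the diff, while parameter and return types are inferred, not confirmed:

```python
from abc import ABC, abstractmethod
from typing import List

class BaseScorerSketch(ABC):
    """Hypothetical reconstruction of the scorer contract GEPA relies on."""

    @abstractmethod
    def score_minibatch(self, prompt_configuration, minibatch) -> float:
        """Aggregate score of a prompt configuration on a feedback minibatch."""

    @abstractmethod
    def score_pareto(self, prompt_configuration, d_pareto) -> List[float]:
        """Per-instance scores on the D_pareto validation subset."""

    @abstractmethod
    def get_minibatch_feedback(
        self, prompt_configuration, module_id, minibatch
    ) -> str:
        """Natural-language feedback that drives the rewrite step."""

    # The async twins used by a_execute (a_score_minibatch, a_score_pareto,
    # a_get_minibatch_feedback) mirror the sync methods above.
```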
```diff
@@ -413,17 +465,16 @@ class GEPARunner:
             totals,
             self.parents_by_id,
             random_state=self.random_state,
-            tie_tolerance=
-            policy=self.
+            tie_tolerance=GEPA_TIE_TOLERANCE,
+            policy=self.tie_breaker,
         )
         if self.status_callback is not None and len(tied) > 1:
             msg = (
                 f"tie on aggregate={max_val:.4f} among {len(tied)} "
                 f"prompt_configurations; using tie_breaker="
-                f"{self.
-                f"To change, set
-                f"{[t.value for t in self.
-                f"(tie_tolerance={float(self.config.tie_tolerance):g})."
+                f"{self.tie_breaker.value!r} selected {chosen}. "
+                f"To change, set GEPA tie_breaker to one of: "
+                f"{[t.value for t in self.TieBreaker]}."
             )
             self.status_callback(
                 RunnerStatusType.TIE,
@@ -433,9 +484,43 @@ class GEPARunner:
         return self.prompt_configurations_by_id[chosen]
 
     def _pick_prompt_configuration(self) -> PromptConfiguration:
+        # Log Pareto selection details
+        all_candidates = list(self.pareto_score_table.keys())
+        print(f"[DEBUG] Pareto Selection:")
+        print(f" - Total candidates in pool: {len(all_candidates)}")
+
+        # Show score table
+        print(f" - Score table (per-instance scores):")
+        for cid, scores in self.pareto_score_table.items():
+            is_root = self.parents_by_id.get(cid) is None
+            label = (
+                "(root)"
+                if is_root
+                else f"(child of {self.parents_by_id.get(cid)[:8]}...)"
+            )
+            mean_score = sum(scores) / len(scores) if scores else 0
+            print(
+                f" {cid[:8]}... {label}: {[round(s, 3) for s in scores]} (mean={mean_score:.3f})"
+            )
+
+        # Show Pareto frontier
+        frontier = pareto_frontier(all_candidates, self.pareto_score_table)
+        print(f" - Pareto frontier ({len(frontier)} non-dominated):")
+        for cid in frontier:
+            print(f" {cid[:8]}...")
+
+        # Show frequency weights
+        freq = frequency_weights(self.pareto_score_table)
+        print(f" - Frequency weights (how often each wins an instance):")
+        for cid, weight in freq.items():
+            print(f" {cid[:8]}...: {weight}")
+
+        # Do the selection
         selected_prompt_configuration_id = select_prompt_configuration_pareto(
             self.pareto_score_table, random_state=self.random_state
         )
+        print(f" - Selected: {selected_prompt_configuration_id[:8]}...\n")
+
         return self.prompt_configurations_by_id[
             selected_prompt_configuration_id
         ]
```
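The new debug output in `_pick_prompt_configuration` leans on two helpers now exported from `deepeval.optimizer.policies`: `pareto_frontier` (non-dominated candidates) and `frequency_weights` (per-candidate instance wins). An illustrative reconstruction of what such helpers typically compute over the per-instance score table — not the library's actual implementation:

```python
from typing import Dict, List

ScoreTable = Dict[str, List[float]]  # candidate id -> per-instance scores

def pareto_frontier_sketch(candidates: List[str], table: ScoreTable) -> List[str]:
    """Keep candidates no other candidate dominates: b dominates a when b is
    >= a on every instance and strictly greater on at least one."""
    def dominates(a: str, b: str) -> bool:
        sa, sb = table[a], table[b]
        return all(y >= x for x, y in zip(sa, sb)) and any(
            y > x for x, y in zip(sa, sb)
        )
    return [
        c for c in candidates
        if not any(dominates(c, o) for o in candidates if o != c)
    ]

def frequency_weights_sketch(table: ScoreTable) -> Dict[str, int]:
    """Count how many instances each candidate wins (ties count for all)."""
    counts = {cid: 0 for cid in table}
    n_instances = len(next(iter(table.values()), []))
    for i in range(n_instances):
        best = max(scores[i] for scores in table.values())
        for cid, scores in table.items():
            if scores[i] == best:
                counts[cid] += 1
    return counts
```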
```diff
@@ -443,25 +528,13 @@ class GEPARunner:
     def _draw_minibatch(
         self, d_feedback: Union[List["Golden"], List["ConversationalGolden"]]
     ) -> Union[List["Golden"], List["ConversationalGolden"]]:
-        # Determine effective minibatch size
+        # Determine effective minibatch size, bounded by the
         # available feedback set.
         n_feedback = len(d_feedback)
         if n_feedback <= 0:
             return []
 
-
-            size = self.config.minibatch_size
-        else:
-            # Dynamic sizing from ratio, bounded between min and max.
-            dynamic = max(
-                1, int(round(n_feedback * self.config.minibatch_ratio))
-            )
-            size = max(
-                self.config.minibatch_min_size,
-                min(dynamic, self.config.minibatch_max_size),
-            )
-
-        size = max(1, min(size, n_feedback))
+        size = min(self.minibatch_size, n_feedback)
 
         return [
             d_feedback[self.random_state.randrange(0, n_feedback)]
```
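The `_draw_minibatch` rewrite drops the old config-driven dynamic sizing (`minibatch_ratio` with min/max bounds) in favor of a single clamp, `size = min(self.minibatch_size, n_feedback)`, and keeps drawing items by `randrange` — i.e. sampling with replacement, so one golden can appear more than once per minibatch. A standalone restatement of the new behavior (hypothetical helper, same logic):

```python
import random

def draw_minibatch_sketch(d_feedback, minibatch_size: int, rng: random.Random):
    n = len(d_feedback)
    if n <= 0:
        return []
    size = min(minibatch_size, n)
    # Independent index draws: the same golden can appear more than once.
    return [d_feedback[rng.randrange(0, n)] for _ in range(size)]

# draw_minibatch_sketch(list("abcde"), 8, random.Random(0)) -> 5 items, repeats possible
```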
```diff
@@ -479,7 +552,6 @@ class GEPARunner:
         )
 
         new_prompt = await self._rewriter.a_rewrite(
-            model_callback=self.model_callback,
             module_id=selected_module_id,
             old_prompt=old_prompt,
             feedback_text=feedback_text,
@@ -504,7 +576,6 @@ class GEPARunner:
         )
 
         new_prompt = self._rewriter.rewrite(
-            model_callback=self.model_callback,
             module_id=selected_module_id,
             old_prompt=old_prompt,
             feedback_text=feedback_text,
@@ -535,7 +606,7 @@ class GEPARunner:
         self, parent_score: float, child_score: float
     ) -> bool:
         jitter = 1e-6
-        return child_score >= parent_score + max(
+        return child_score >= parent_score + max(GEPA_MIN_DELTA, jitter)
 
     def _accept_child(
         self,
```
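`_should_accept_child` now reads its margin from the module-level constant `GEPA_MIN_DELTA` instead of a per-run config value, floored by a `1e-6` jitter: a child is kept only when it beats its parent by at least `max(GEPA_MIN_DELTA, 1e-6)` on the minibatch. The constant's value lives in `algorithms/configs.py` and is not shown in this diff; a worked example under the assumption `GEPA_MIN_DELTA = 0.01`:

```python
GEPA_MIN_DELTA = 0.01  # assumed for illustration; see algorithms/configs.py
jitter = 1e-6
threshold = max(GEPA_MIN_DELTA, jitter)  # 0.01 under this assumption

def should_accept(parent_score: float, child_score: float) -> bool:
    return child_score >= parent_score + threshold

assert should_accept(0.70, 0.75)        # +0.05 clears the margin
assert not should_accept(0.70, 0.705)   # +0.005 is inside the margin
```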
```diff
@@ -548,9 +619,7 @@ class GEPARunner:
     ) -> AcceptedIterationDict:
         self._add_prompt_configuration(child_prompt_configuration)
         self.pareto_score_table[child_prompt_configuration.id] = (
-            self.
-                child_prompt_configuration, d_pareto
-            )
+            self.scorer.score_pareto(child_prompt_configuration, d_pareto)
         )
 
         return AcceptedIterationDict(
@@ -572,7 +641,7 @@ class GEPARunner:
     ) -> AcceptedIterationDict:
         self._add_prompt_configuration(child_prompt_configuration)
         self.pareto_score_table[child_prompt_configuration.id] = (
-            await self.
+            await self.scorer.a_score_pareto(
                 child_prompt_configuration, d_pareto
             )
         )
@@ -590,13 +659,12 @@ class GEPARunner:
         total_iterations: int,
         iteration: int,
         remaining_iterations: int,
-        elapsed: float,
     ):
         if self.status_callback is not None:
             detail = (
                 f"(iterations={total_iterations}) "
                 f"• iteration {iteration}/{total_iterations} "
-                f"•
+                f"• remaining={remaining_iterations}"
             )
             self.status_callback(
                 RunnerStatusType.PROGRESS,
@@ -626,52 +694,44 @@ class GEPARunner:
         self,
         gepa_iteration: Callable[[], bool],
     ) -> None:
-        total_iterations = self.
+        total_iterations = self.iterations
         remaining_iterations = total_iterations
         iteration = 0
-        self._update_progress(
-            total_iterations, iteration, remaining_iterations, 0
-        )
+        self._update_progress(total_iterations, iteration, remaining_iterations)
         while remaining_iterations > 0:
             iteration += 1
-            start_time = time.perf_counter()
             try:
                 ok = gepa_iteration()
             except Exception as exc:
                 # Report a user facing error event and halt optimization.
                 self._update_error(total_iterations, iteration, exc)
                 break
-            elapsed = time.perf_counter() - start_time
             if not ok:
                 break
             remaining_iterations -= 1
             self._update_progress(
-                total_iterations, iteration, remaining_iterations
+                total_iterations, iteration, remaining_iterations
             )
 
     async def _a_run_loop_iteration(
         self,
         a_gepa_iteration: Callable[[], Awaitable[bool]],
     ) -> None:
-        total_iterations = self.
+        total_iterations = self.iterations
         remaining_iterations = total_iterations
         iteration = 0
-        self._update_progress(
-            total_iterations, iteration, remaining_iterations, 0
-        )
+        self._update_progress(total_iterations, iteration, remaining_iterations)
         while remaining_iterations > 0:
             iteration += 1
-            start_time = time.perf_counter()
             try:
                 ok = await a_gepa_iteration()
             except Exception as exc:
                 # Report a user facing error event and halt optimization.
                 self._update_error(total_iterations, iteration, exc)
                 break
-            elapsed = time.perf_counter() - start_time
             if not ok:
                 break
             remaining_iterations -= 1
             self._update_progress(
-                total_iterations, iteration, remaining_iterations
+                total_iterations, iteration, remaining_iterations
             )
```
deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0 (new file)

```diff
@@ -0,0 +1,17 @@
+from .miprov2 import MIPROV2
+from .proposer import InstructionProposer
+from .bootstrapper import (
+    Demo,
+    DemoSet,
+    DemoBootstrapper,
+    render_prompt_with_demos,
+)
+
+__all__ = [
+    "MIPROV2",
+    "InstructionProposer",
+    "Demo",
+    "DemoSet",
+    "DemoBootstrapper",
+    "render_prompt_with_demos",
+]
```
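Since the new `__init__.py` re-exports the MIPROv2 building blocks, they can be imported at the subpackage level (only the names listed in `__all__` above are confirmed by this diff):

```python
from deepeval.optimizer.algorithms.miprov2 import (
    MIPROV2,
    InstructionProposer,
    DemoBootstrapper,
    render_prompt_with_demos,
)
```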