deepeval-3.7.4-py3-none-any.whl → deepeval-3.7.6-py3-none-any.whl
- deepeval/_version.py +1 -1
- deepeval/config/settings.py +35 -1
- deepeval/dataset/api.py +23 -1
- deepeval/dataset/golden.py +139 -2
- deepeval/evaluate/evaluate.py +16 -11
- deepeval/evaluate/execute.py +13 -181
- deepeval/evaluate/utils.py +6 -26
- deepeval/integrations/pydantic_ai/agent.py +19 -2
- deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
- deepeval/key_handler.py +3 -0
- deepeval/metrics/__init__.py +14 -16
- deepeval/metrics/answer_relevancy/answer_relevancy.py +118 -116
- deepeval/metrics/answer_relevancy/template.py +22 -3
- deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
- deepeval/metrics/arena_g_eval/template.py +17 -1
- deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
- deepeval/metrics/argument_correctness/template.py +19 -2
- deepeval/metrics/base_metric.py +13 -44
- deepeval/metrics/bias/bias.py +102 -108
- deepeval/metrics/bias/template.py +14 -2
- deepeval/metrics/contextual_precision/contextual_precision.py +96 -94
- deepeval/metrics/contextual_precision/template.py +115 -66
- deepeval/metrics/contextual_recall/contextual_recall.py +94 -84
- deepeval/metrics/contextual_recall/template.py +106 -55
- deepeval/metrics/contextual_relevancy/contextual_relevancy.py +86 -84
- deepeval/metrics/contextual_relevancy/template.py +87 -58
- deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
- deepeval/metrics/conversation_completeness/template.py +23 -3
- deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
- deepeval/metrics/conversational_dag/nodes.py +66 -123
- deepeval/metrics/conversational_dag/templates.py +16 -0
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
- deepeval/metrics/dag/dag.py +10 -0
- deepeval/metrics/dag/nodes.py +63 -126
- deepeval/metrics/dag/templates.py +16 -2
- deepeval/metrics/exact_match/exact_match.py +9 -1
- deepeval/metrics/faithfulness/faithfulness.py +138 -149
- deepeval/metrics/faithfulness/schema.py +1 -1
- deepeval/metrics/faithfulness/template.py +200 -115
- deepeval/metrics/g_eval/g_eval.py +87 -78
- deepeval/metrics/g_eval/template.py +18 -1
- deepeval/metrics/g_eval/utils.py +7 -6
- deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
- deepeval/metrics/goal_accuracy/template.py +21 -3
- deepeval/metrics/hallucination/hallucination.py +60 -75
- deepeval/metrics/hallucination/template.py +13 -0
- deepeval/metrics/indicator.py +7 -10
- deepeval/metrics/json_correctness/json_correctness.py +40 -38
- deepeval/metrics/json_correctness/template.py +10 -0
- deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
- deepeval/metrics/knowledge_retention/schema.py +9 -3
- deepeval/metrics/knowledge_retention/template.py +12 -0
- deepeval/metrics/mcp/mcp_task_completion.py +68 -38
- deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +92 -74
- deepeval/metrics/mcp/template.py +52 -0
- deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
- deepeval/metrics/mcp_use_metric/template.py +12 -0
- deepeval/metrics/misuse/misuse.py +77 -97
- deepeval/metrics/misuse/template.py +15 -0
- deepeval/metrics/multimodal_metrics/__init__.py +0 -19
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +59 -53
- deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +79 -95
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +59 -53
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +59 -53
- deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +111 -109
- deepeval/metrics/non_advice/non_advice.py +79 -105
- deepeval/metrics/non_advice/template.py +12 -0
- deepeval/metrics/pattern_match/pattern_match.py +12 -4
- deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
- deepeval/metrics/pii_leakage/template.py +14 -0
- deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
- deepeval/metrics/plan_adherence/template.py +11 -0
- deepeval/metrics/plan_quality/plan_quality.py +63 -87
- deepeval/metrics/plan_quality/template.py +9 -0
- deepeval/metrics/prompt_alignment/prompt_alignment.py +72 -83
- deepeval/metrics/prompt_alignment/template.py +12 -0
- deepeval/metrics/ragas.py +3 -3
- deepeval/metrics/role_adherence/role_adherence.py +48 -71
- deepeval/metrics/role_adherence/template.py +14 -0
- deepeval/metrics/role_violation/role_violation.py +75 -108
- deepeval/metrics/role_violation/template.py +12 -0
- deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
- deepeval/metrics/step_efficiency/template.py +11 -0
- deepeval/metrics/summarization/summarization.py +115 -183
- deepeval/metrics/summarization/template.py +19 -0
- deepeval/metrics/task_completion/task_completion.py +67 -73
- deepeval/metrics/tool_correctness/tool_correctness.py +45 -44
- deepeval/metrics/tool_use/tool_use.py +42 -66
- deepeval/metrics/topic_adherence/template.py +13 -0
- deepeval/metrics/topic_adherence/topic_adherence.py +53 -67
- deepeval/metrics/toxicity/template.py +13 -0
- deepeval/metrics/toxicity/toxicity.py +80 -99
- deepeval/metrics/turn_contextual_precision/schema.py +21 -0
- deepeval/metrics/turn_contextual_precision/template.py +187 -0
- deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +592 -0
- deepeval/metrics/turn_contextual_recall/schema.py +21 -0
- deepeval/metrics/turn_contextual_recall/template.py +178 -0
- deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +563 -0
- deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
- deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
- deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +576 -0
- deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
- deepeval/metrics/turn_faithfulness/template.py +218 -0
- deepeval/metrics/turn_faithfulness/turn_faithfulness.py +627 -0
- deepeval/metrics/turn_relevancy/template.py +14 -0
- deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
- deepeval/metrics/utils.py +158 -122
- deepeval/models/__init__.py +0 -12
- deepeval/models/base_model.py +49 -33
- deepeval/models/embedding_models/__init__.py +7 -0
- deepeval/models/embedding_models/azure_embedding_model.py +79 -33
- deepeval/models/embedding_models/local_embedding_model.py +39 -20
- deepeval/models/embedding_models/ollama_embedding_model.py +52 -19
- deepeval/models/embedding_models/openai_embedding_model.py +42 -22
- deepeval/models/llms/amazon_bedrock_model.py +226 -72
- deepeval/models/llms/anthropic_model.py +178 -63
- deepeval/models/llms/azure_model.py +218 -60
- deepeval/models/llms/constants.py +2032 -0
- deepeval/models/llms/deepseek_model.py +95 -40
- deepeval/models/llms/gemini_model.py +209 -64
- deepeval/models/llms/grok_model.py +139 -68
- deepeval/models/llms/kimi_model.py +140 -90
- deepeval/models/llms/litellm_model.py +131 -37
- deepeval/models/llms/local_model.py +125 -21
- deepeval/models/llms/ollama_model.py +147 -24
- deepeval/models/llms/openai_model.py +222 -269
- deepeval/models/llms/portkey_model.py +81 -22
- deepeval/models/llms/utils.py +8 -3
- deepeval/models/retry_policy.py +17 -14
- deepeval/models/utils.py +106 -5
- deepeval/optimizer/__init__.py +5 -0
- deepeval/optimizer/algorithms/__init__.py +6 -0
- deepeval/optimizer/algorithms/base.py +29 -0
- deepeval/optimizer/algorithms/configs.py +18 -0
- deepeval/optimizer/algorithms/copro/__init__.py +5 -0
- deepeval/{optimization/copro/loop.py → optimizer/algorithms/copro/copro.py} +112 -113
- deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
- deepeval/{optimization/gepa/loop.py → optimizer/algorithms/gepa/gepa.py} +175 -115
- deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
- deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
- deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
- deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
- deepeval/optimizer/algorithms/simba/__init__.py +5 -0
- deepeval/{optimization/simba/loop.py → optimizer/algorithms/simba/simba.py} +128 -112
- deepeval/{optimization → optimizer}/configs.py +5 -8
- deepeval/{optimization/policies/selection.py → optimizer/policies.py} +63 -2
- deepeval/optimizer/prompt_optimizer.py +263 -0
- deepeval/optimizer/rewriter/__init__.py +5 -0
- deepeval/optimizer/rewriter/rewriter.py +124 -0
- deepeval/optimizer/rewriter/utils.py +214 -0
- deepeval/optimizer/scorer/__init__.py +5 -0
- deepeval/optimizer/scorer/base.py +86 -0
- deepeval/optimizer/scorer/scorer.py +316 -0
- deepeval/optimizer/scorer/utils.py +30 -0
- deepeval/optimizer/types.py +148 -0
- deepeval/{optimization → optimizer}/utils.py +47 -165
- deepeval/prompt/prompt.py +5 -9
- deepeval/simulator/conversation_simulator.py +43 -0
- deepeval/simulator/template.py +13 -0
- deepeval/test_case/__init__.py +1 -3
- deepeval/test_case/api.py +26 -45
- deepeval/test_case/arena_test_case.py +7 -2
- deepeval/test_case/conversational_test_case.py +68 -1
- deepeval/test_case/llm_test_case.py +206 -1
- deepeval/test_case/utils.py +4 -8
- deepeval/test_run/api.py +18 -14
- deepeval/test_run/test_run.py +3 -3
- deepeval/tracing/patchers.py +9 -4
- deepeval/tracing/tracing.py +2 -2
- deepeval/utils.py +65 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/METADATA +1 -4
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/RECORD +180 -193
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -148
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
- deepeval/models/mlllms/__init__.py +0 -4
- deepeval/models/mlllms/azure_model.py +0 -343
- deepeval/models/mlllms/gemini_model.py +0 -313
- deepeval/models/mlllms/ollama_model.py +0 -175
- deepeval/models/mlllms/openai_model.py +0 -309
- deepeval/optimization/__init__.py +0 -13
- deepeval/optimization/adapters/__init__.py +0 -2
- deepeval/optimization/adapters/deepeval_scoring_adapter.py +0 -588
- deepeval/optimization/aggregates.py +0 -14
- deepeval/optimization/copro/configs.py +0 -31
- deepeval/optimization/gepa/__init__.py +0 -7
- deepeval/optimization/gepa/configs.py +0 -115
- deepeval/optimization/miprov2/configs.py +0 -134
- deepeval/optimization/miprov2/loop.py +0 -785
- deepeval/optimization/mutations/__init__.py +0 -0
- deepeval/optimization/mutations/prompt_rewriter.py +0 -458
- deepeval/optimization/policies/__init__.py +0 -16
- deepeval/optimization/policies/tie_breaker.py +0 -67
- deepeval/optimization/prompt_optimizer.py +0 -462
- deepeval/optimization/simba/__init__.py +0 -0
- deepeval/optimization/simba/configs.py +0 -33
- deepeval/optimization/types.py +0 -361
- deepeval/test_case/mllm_test_case.py +0 -170
- /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
- /deepeval/{optimization → optimizer/algorithms}/simba/types.py +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/LICENSE.md +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/WHEEL +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/entry_points.txt +0 -0
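The dominant change in this release is the restructuring of the prompt-optimization package: deepeval/optimization/* is deleted and replaced by deepeval/optimizer/*, with the per-algorithm loops relocated under optimizer/algorithms/ (see the copro, gepa, and simba rename entries above, plus the new miprov2 files). Below is a minimal sketch of the import migration this implies. The 3.7.6 import paths are inferred from the file moves, not verified against the new source, so treat them as assumptions.

    # 3.7.4 (removed in this release):
    from deepeval.optimization.prompt_optimizer import PromptOptimizer
    from deepeval.optimization.gepa.loop import GEPARunner

    # 3.7.6 (paths inferred from the rename entries; class names assumed unchanged):
    from deepeval.optimizer.prompt_optimizer import PromptOptimizer
    from deepeval.optimizer.algorithms.gepa.gepa import GEPARunner  # gepa/loop.py -> algorithms/gepa/gepa.py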
deepeval/optimization/prompt_optimizer.py
@@ -1,462 +0,0 @@
-from typing import (
-    Callable,
-    Dict,
-    List,
-    Optional,
-    Tuple,
-    Union,
-)
-
-from rich.progress import Progress, SpinnerColumn, BarColumn, TextColumn
-
-from deepeval.dataset.golden import Golden, ConversationalGolden
-from deepeval.errors import DeepEvalError
-from deepeval.metrics import BaseConversationalMetric, BaseMetric
-from deepeval.evaluate.configs import AsyncConfig
-from deepeval.optimization.adapters.deepeval_scoring_adapter import (
-    DeepEvalScoringAdapter,
-)
-from deepeval.optimization.mutations.prompt_rewriter import (
-    PromptRewriter,
-)
-from deepeval.optimization.types import (
-    OptimizationReport,
-    RunnerProtocol,
-    RunnerStatusType,
-)
-from deepeval.optimization.utils import (
-    validate_callback,
-    validate_metrics,
-    validate_instance,
-    validate_sequence_of,
-)
-from deepeval.optimization.configs import (
-    OptimizerDisplayConfig,
-    PromptListMutationConfig,
-)
-from deepeval.prompt.prompt import Prompt
-from deepeval.utils import get_or_create_event_loop
-from deepeval.optimization.gepa.configs import GEPAConfig
-from deepeval.optimization.gepa.loop import GEPARunner
-from deepeval.optimization.miprov2.configs import MIPROConfig
-from deepeval.optimization.miprov2.loop import MIPRORunner
-from deepeval.optimization.copro.configs import COPROConfig
-from deepeval.optimization.copro.loop import COPRORunner
-from deepeval.optimization.simba.configs import SIMBAConfig
-from deepeval.optimization.simba.loop import SIMBARunner
-
-
-class PromptOptimizer:
-    """
-    High-level entrypoint for prompt optimization.
-
-    Typical usage:
-
-        optimizer = PromptOptimizer(
-            metrics=[AnswerRelevancyMetric()],
-            model_callback=model_callback,
-        )
-
-        optimized_prompt = optimizer.optimize(
-            prompt=Prompt(text_template="Respond to the query."),
-            goldens=goldens,
-        )
-
-    By default, this constructs and uses a GEPA based runner internally.
-    Advanced users can construct their own runner with a custom config
-    (GEPAConfig) and attach it via `set_runner(...)`.
-    """
-
-    def __init__(
-        self,
-        *,
-        model_callback: Callable[
-            ...,
-            Union[
-                str,
-                Dict,
-                Tuple[Union[str, Dict], float],
-            ],
-        ],
-        metrics: Union[List[BaseMetric], List[BaseConversationalMetric]],
-        async_config: Optional[AsyncConfig] = None,
-        display_config: Optional[OptimizerDisplayConfig] = None,
-        prompt_list_mutation_config: Optional[PromptListMutationConfig] = None,
-        list_input_role: str = "user",
-        algorithm: str = "gepa",
-    ):
-        # Validate and store the callback
-        self.model_callback = validate_callback(
-            component="PromptOptimizer",
-            model_callback=model_callback,
-        )
-        self.metrics = validate_metrics(
-            component="PromptOptimizer", metrics=metrics
-        )
-        # Validate async_config
-        async_config = async_config or AsyncConfig()
-        validate_instance(
-            component="PromptOptimizer.__init__",
-            param_name="async_config",
-            value=async_config,
-            expected_types=AsyncConfig,
-        )
-        self.async_config = async_config
-
-        # validate display_config
-        display_config = display_config or OptimizerDisplayConfig()
-        validate_instance(
-            component="PromptOptimizer.__init__",
-            param_name="display_config",
-            value=display_config,
-            expected_types=OptimizerDisplayConfig,
-        )
-        self.display_config = display_config
-
-        # validate prompt_list_mutation_config
-        prompt_list_mutation_config = (
-            prompt_list_mutation_config or PromptListMutationConfig()
-        )
-        validate_instance(
-            component="PromptOptimizer.__init__",
-            param_name="prompt_list_mutation_config",
-            value=prompt_list_mutation_config,
-            expected_types=PromptListMutationConfig,
-        )
-        self.prompt_list_mutation_config = prompt_list_mutation_config
-
-        # validate list_input_role
-        validate_instance(
-            component="PromptOptimizer.__init__",
-            param_name="list_input_role",
-            value=list_input_role,
-            expected_types=str,
-        )
-        self.list_input_role = list_input_role
-
-        # Validate algorithm
-        algo_raw = algorithm or "gepa"
-        if not isinstance(algo_raw, str):
-            raise DeepEvalError(
-                "PromptOptimizer.__init__ expected `algorithm` to be a string "
-                f"(e.g. 'gepa'), but received {type(algorithm).__name__!r} instead."
-            )
-
-        algo_normalized = (algo_raw.strip() or "gepa").lower()
-        if algo_normalized in {"mipro", "miprov2"}:
-            algo_normalized = "miprov2"
-
-        self._allowed_algorithms = {"gepa", "miprov2", "copro", "simba"}
-
-        if algo_normalized not in self._allowed_algorithms:
-            raise DeepEvalError(
-                "PromptOptimizer.__init__ received unsupported `algorithm` "
-                f"value {algorithm!r}. Supported algorithms are: "
-                + ", ".join(sorted(self._allowed_algorithms))
-            )
-
-        self.algorithm = algo_normalized
-
-        # Internal state used only when a progress indicator is active.
-        # Tuple is (Progress instance, task_id).
-        self._progress_state: Optional[Tuple[Progress, int]] = None
-
-        self.runner: Optional[RunnerProtocol] = None
-
-    ##############
-    # Public API #
-    ##############
-
-    def optimize(
-        self,
-        *,
-        prompt: Prompt,
-        goldens: Union[List["Golden"], List["ConversationalGolden"]],
-    ) -> Prompt:
-        """
-        Run the configured optimization algorithm and return an optimized Prompt.
-
-        The returned Prompt will have an OptimizationReport attached as
-        `prompt.optimization_report`.
-        """
-        # Validate prompt
-        validate_instance(
-            component="PromptOptimizer.optimize",
-            param_name="prompt",
-            value=prompt,
-            expected_types=Prompt,
-        )
-
-        # Validate goldens: must be a list of Golden or ConversationalGolden
-        validate_sequence_of(
-            component="PromptOptimizer.optimize",
-            param_name="goldens",
-            value=goldens,
-            expected_item_types=(Golden, ConversationalGolden),
-            sequence_types=(list,),
-        )
-
-        if self.runner is None:
-            self.set_runner(self._build_default_runner())
-
-        if not self.display_config.show_indicator:
-            best_prompt, report_dict = (
-                self._run_optimization_with_error_handling(
-                    prompt=prompt,
-                    goldens=goldens,
-                )
-            )
-        else:
-            with Progress(
-                SpinnerColumn(style="rgb(106,0,255)"),
-                BarColumn(bar_width=60),
-                TextColumn("[progress.description]{task.description}"),
-                transient=True,
-            ) as progress:
-                # Total will be provided by the runner via the
-                # progress status_callback. Start at 0 and update later.
-                task = progress.add_task(
-                    f"Optimizing prompt with {self.algorithm.upper()}..."
-                )
-                self._progress_state = (progress, task)
-
-                try:
-                    best_prompt, report_dict = (
-                        self._run_optimization_with_error_handling(
-                            prompt=prompt,
-                            goldens=goldens,
-                        )
-                    )
-                finally:
-                    # Clear progress state even if an error occurs
-                    self._progress_state = None
-
-        best_prompt.optimization_report = OptimizationReport.from_runtime(
-            report_dict
-        )
-        return best_prompt
-
-    def set_runner(self, runner: RunnerProtocol):
-        self._set_runner_callbacks(runner)
-        scoring_adapter = getattr(runner, "scoring_adapter", None)
-        if scoring_adapter is None:
-            runner.scoring_adapter = self._build_default_scoring_adapter()
-        else:
-            if not len(runner.scoring_adapter.metrics):
-                runner.scoring_adapter.set_metrics(self.metrics)
-            if runner.scoring_adapter.model_callback is None:
-                runner.scoring_adapter.model_callback = self.model_callback
-        self.runner = runner
-
-    ####################
-    # Internal helpers #
-    ####################
-
-    def _run_optimization(
-        self,
-        *,
-        prompt: Prompt,
-        goldens: Union[List["Golden"], List["ConversationalGolden"]],
-    ) -> Tuple[Prompt, Dict]:
-        if self.async_config.run_async:
-            loop = get_or_create_event_loop()
-            return loop.run_until_complete(
-                self.runner.a_execute(prompt=prompt, goldens=goldens)
-            )
-        return self.runner.execute(prompt=prompt, goldens=goldens)
-
-    def _run_optimization_with_error_handling(
-        self,
-        *,
-        prompt: Prompt,
-        goldens: Union[List["Golden"], List["ConversationalGolden"]],
-    ) -> Tuple[Prompt, Dict]:
-        """
-        Run optimization and convert uncaught exceptions into a concise
-        user facing error message.
-
-        This is a fallback for errors that occur before the runner
-        enters its main iteration loop, which would otherwise surface
-        as a full traceback.
-        """
-        try:
-            return self._run_optimization(prompt=prompt, goldens=goldens)
-        except Exception as exc:
-            # Try to recover iteration count from the runner config
-            total_steps: Optional[int] = None
-            iterations: Optional[int] = None
-            runner_config = getattr(self.runner, "config", None)
-            if runner_config is not None:
-                iterations = getattr(runner_config, "iterations", None)
-                if iterations is not None:
-                    total_steps = int(iterations)
-
-            prefix = (
-                f"(iterations={iterations}) " if iterations is not None else ""
-            )
-            detail = (
-                f"{prefix}• error {exc.__class__.__name__}: {exc} "
-                "• halted before first iteration"
-            )
-
-            self._on_status(
-                RunnerStatusType.ERROR,
-                detail=detail,
-                step_index=None,
-                total_steps=total_steps,
-            )
-
-            algo = self.algorithm.upper()
-
-            # using `from None` avoids a long chained stack trace while keeping
-            # the error message readable.
-            raise DeepEvalError(f"[{algo}] {detail}") from None
-
-    def _on_status(
-        self,
-        kind: RunnerStatusType,
-        *,
-        detail: str,
-        step_index: Optional[int] = None,
-        total_steps: Optional[int] = None,
-    ) -> None:
-        """
-        Unified status callback used by the configured runner.
-
-        - PROGRESS: update the progress bar description and position
-        - TIE: optionally print a tie message
-        - ERROR: print a concise error message and allow the run to halt
-        """
-        algo = self.algorithm.upper()
-
-        # ERROR: always print, optionally update progress bar
-        if kind is RunnerStatusType.ERROR:
-            if (
-                self.display_config.show_indicator
-                and self._progress_state is not None
-            ):
-                progress, task = self._progress_state
-
-                if total_steps is not None:
-                    progress.update(task, total=total_steps)
-
-                description = self._format_progress_description(detail)
-                progress.update(task, description=description)
-
-            # Print a concise, error line regardless of indicator state
-            print(f"[{algo}] {detail}")
-            return
-
-        # TIE: optional one line message, no progress bar changes
-        if kind is RunnerStatusType.TIE:
-            if not self.display_config.announce_ties:
-                return
-            print(f"[{algo}] {detail}")
-            return
-
-        if kind is not RunnerStatusType.PROGRESS:
-            return
-
-        if not self.display_config.show_indicator:
-            return
-
-        if self._progress_state is None:
-            return
-
-        progress, task = self._progress_state
-
-        # Allow the runner to set or update the total steps.
-        if total_steps is not None:
-            progress.update(task, total=total_steps)
-
-        # iteration 0 shouldn't advance the bar
-        if step_index is not None and step_index > 0:
-            progress.advance(task, 1)
-
-        description = self._format_progress_description(detail)
-        progress.update(task, description=description)
-
-    def _format_progress_description(self, detail: str) -> str:
-        """
-        Compose a human readable progress line using an algorithm agnostic
-        prefix and an algorithm specific detail string provided by the runner.
-        """
-        algo = self.algorithm.upper()
-        base = f"Optimizing prompt with {algo}"
-        if detail:
-            return f"{base} [rgb(25,227,160)]{detail}[/]"
-        return base
-
-    def _build_default_scoring_adapter(self) -> DeepEvalScoringAdapter:
-        scoring_adapter = DeepEvalScoringAdapter(
-            list_input_role=self.list_input_role
-        )
-        scoring_adapter.set_model_callback(self.model_callback)
-        scoring_adapter.set_metrics(self.metrics)
-        return scoring_adapter
-
-    def _set_runner_callbacks(self, runner: RunnerProtocol):
-        runner.model_callback = (
-            self.model_callback
-            if runner.model_callback is None
-            else runner.model_callback
-        )
-        runner.status_callback = (
-            self._on_status
-            if runner.status_callback is None
-            else runner.status_callback
-        )
-
-    def _build_default_runner(self) -> RunnerProtocol:
-        if self.algorithm not in self._allowed_algorithms:
-            raise DeepEvalError(
-                f"Unsupported optimization algorithm: {self.algorithm!r}. "
-                "Supported algorithms are: 'gepa', 'miprov2' (alias 'mipro'), "
-                "'copro', 'simba'."
-            )
-
-        scoring_adapter = self._build_default_scoring_adapter()
-
-        if hasattr(scoring_adapter, "configure_async"):
-            scoring_adapter.configure_async(
-                max_concurrent=self.async_config.max_concurrent,
-                throttle_seconds=float(self.async_config.throttle_value),
-            )
-
-        if self.algorithm == "gepa":
-            config = GEPAConfig()
-            runner: RunnerProtocol = GEPARunner(
-                config=config,
-                scoring_adapter=scoring_adapter,
-            )
-        elif self.algorithm == "miprov2":
-            # MIPROv2 0-shot, instruction-only
-            config = MIPROConfig()
-            runner = MIPRORunner(
-                config=config,
-                scoring_adapter=scoring_adapter,
-            )
-        elif self.algorithm == "copro":
-            # COPRO cooperative multi-proposal variant
-            config = COPROConfig()
-            runner = COPRORunner(
-                config=config,
-                scoring_adapter=scoring_adapter,
-            )
-        else:
-            config = SIMBAConfig()
-            runner = SIMBARunner(
-                config=config,
-                scoring_adapter=scoring_adapter,
-            )
-
-        # Attach a PromptRewriter to the runner so that it has mutation behavior
-        runner._rewriter = PromptRewriter(
-            max_chars=config.rewrite_instruction_max_chars,
-            list_mutation_config=self.prompt_list_mutation_config,
-            random_state=runner.random_state,
-        )
-
-        self._set_runner_callbacks(runner)
-
-        return runner
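The PromptOptimizer docstring above points at an advanced path: build a runner with a custom config and attach it via set_runner(...). A minimal sketch of that path against this removed 3.7.4 API follows. It assumes GEPARunner accepts scoring_adapter=None (set_runner then installs a default DeepEvalScoringAdapter, per the code above) and that GEPAConfig has a field such as iterations, which is only implied by the error-handling code.

    from deepeval.optimization.gepa.configs import GEPAConfig
    from deepeval.optimization.gepa.loop import GEPARunner
    from deepeval.optimization.prompt_optimizer import PromptOptimizer
    from deepeval.metrics import AnswerRelevancyMetric
    from deepeval.prompt.prompt import Prompt

    def model_callback(prompt: str) -> str:
        # Call your LLM with the rendered prompt and return its text output.
        ...

    optimizer = PromptOptimizer(
        metrics=[AnswerRelevancyMetric()],
        model_callback=model_callback,
    )
    runner = GEPARunner(
        config=GEPAConfig(),   # assumption: construct with defaults, then tweak fields
        scoring_adapter=None,  # set_runner() swaps None for a default scoring adapter
    )
    optimizer.set_runner(runner)

    goldens = [...]  # a list of Golden or ConversationalGolden, as validated in optimize()
    optimized_prompt = optimizer.optimize(
        prompt=Prompt(text_template="Respond to the query."),
        goldens=goldens,
    )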
File without changes

deepeval/optimization/simba/configs.py
@@ -1,33 +0,0 @@
-from __future__ import annotations
-from pydantic import Field, PositiveInt, conint
-
-from deepeval.optimization.copro.configs import COPROConfig
-
-
-class SIMBAConfig(COPROConfig):
-    """
-    Configuration for SIMBA style cooperative prompt optimization.
-
-    Extends `COPROConfig` with strategy specific controls:
-
-    - How many minibatch examples are surfaced as demos and how long
-      those snippets can be (`max_demos_per_proposal`,
-      `demo_input_max_chars`).
-    """
-
-    max_demos_per_proposal: conint(ge=0) = Field(
-        default=3,
-        description=(
-            "Maximum number of goldens from the current minibatch that are "
-            "converted into concrete input/output demos when using the "
-            "APPEND_DEMO strategy."
-        ),
-    )
-
-    demo_input_max_chars: PositiveInt = Field(
-        default=256,
-        description=(
-            "Maximum number of characters taken from the golden input and "
-            "expected output when constructing demo snippets for APPEND_DEMO."
-        ),
-    )