deepeval 3.7.4__py3-none-any.whl → 3.7.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/config/settings.py +35 -1
- deepeval/dataset/api.py +23 -1
- deepeval/dataset/golden.py +139 -2
- deepeval/evaluate/evaluate.py +16 -11
- deepeval/evaluate/execute.py +13 -181
- deepeval/evaluate/utils.py +6 -26
- deepeval/integrations/pydantic_ai/agent.py +19 -2
- deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
- deepeval/key_handler.py +3 -0
- deepeval/metrics/__init__.py +14 -16
- deepeval/metrics/answer_relevancy/answer_relevancy.py +118 -116
- deepeval/metrics/answer_relevancy/template.py +22 -3
- deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
- deepeval/metrics/arena_g_eval/template.py +17 -1
- deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
- deepeval/metrics/argument_correctness/template.py +19 -2
- deepeval/metrics/base_metric.py +13 -44
- deepeval/metrics/bias/bias.py +102 -108
- deepeval/metrics/bias/template.py +14 -2
- deepeval/metrics/contextual_precision/contextual_precision.py +96 -94
- deepeval/metrics/contextual_precision/template.py +115 -66
- deepeval/metrics/contextual_recall/contextual_recall.py +94 -84
- deepeval/metrics/contextual_recall/template.py +106 -55
- deepeval/metrics/contextual_relevancy/contextual_relevancy.py +86 -84
- deepeval/metrics/contextual_relevancy/template.py +87 -58
- deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
- deepeval/metrics/conversation_completeness/template.py +23 -3
- deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
- deepeval/metrics/conversational_dag/nodes.py +66 -123
- deepeval/metrics/conversational_dag/templates.py +16 -0
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
- deepeval/metrics/dag/dag.py +10 -0
- deepeval/metrics/dag/nodes.py +63 -126
- deepeval/metrics/dag/templates.py +16 -2
- deepeval/metrics/exact_match/exact_match.py +9 -1
- deepeval/metrics/faithfulness/faithfulness.py +138 -149
- deepeval/metrics/faithfulness/schema.py +1 -1
- deepeval/metrics/faithfulness/template.py +200 -115
- deepeval/metrics/g_eval/g_eval.py +87 -78
- deepeval/metrics/g_eval/template.py +18 -1
- deepeval/metrics/g_eval/utils.py +7 -6
- deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
- deepeval/metrics/goal_accuracy/template.py +21 -3
- deepeval/metrics/hallucination/hallucination.py +60 -75
- deepeval/metrics/hallucination/template.py +13 -0
- deepeval/metrics/indicator.py +7 -10
- deepeval/metrics/json_correctness/json_correctness.py +40 -38
- deepeval/metrics/json_correctness/template.py +10 -0
- deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
- deepeval/metrics/knowledge_retention/schema.py +9 -3
- deepeval/metrics/knowledge_retention/template.py +12 -0
- deepeval/metrics/mcp/mcp_task_completion.py +68 -38
- deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +92 -74
- deepeval/metrics/mcp/template.py +52 -0
- deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
- deepeval/metrics/mcp_use_metric/template.py +12 -0
- deepeval/metrics/misuse/misuse.py +77 -97
- deepeval/metrics/misuse/template.py +15 -0
- deepeval/metrics/multimodal_metrics/__init__.py +0 -19
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +59 -53
- deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +79 -95
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +59 -53
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +59 -53
- deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +111 -109
- deepeval/metrics/non_advice/non_advice.py +79 -105
- deepeval/metrics/non_advice/template.py +12 -0
- deepeval/metrics/pattern_match/pattern_match.py +12 -4
- deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
- deepeval/metrics/pii_leakage/template.py +14 -0
- deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
- deepeval/metrics/plan_adherence/template.py +11 -0
- deepeval/metrics/plan_quality/plan_quality.py +63 -87
- deepeval/metrics/plan_quality/template.py +9 -0
- deepeval/metrics/prompt_alignment/prompt_alignment.py +72 -83
- deepeval/metrics/prompt_alignment/template.py +12 -0
- deepeval/metrics/ragas.py +3 -3
- deepeval/metrics/role_adherence/role_adherence.py +48 -71
- deepeval/metrics/role_adherence/template.py +14 -0
- deepeval/metrics/role_violation/role_violation.py +75 -108
- deepeval/metrics/role_violation/template.py +12 -0
- deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
- deepeval/metrics/step_efficiency/template.py +11 -0
- deepeval/metrics/summarization/summarization.py +115 -183
- deepeval/metrics/summarization/template.py +19 -0
- deepeval/metrics/task_completion/task_completion.py +67 -73
- deepeval/metrics/tool_correctness/tool_correctness.py +45 -44
- deepeval/metrics/tool_use/tool_use.py +42 -66
- deepeval/metrics/topic_adherence/template.py +13 -0
- deepeval/metrics/topic_adherence/topic_adherence.py +53 -67
- deepeval/metrics/toxicity/template.py +13 -0
- deepeval/metrics/toxicity/toxicity.py +80 -99
- deepeval/metrics/turn_contextual_precision/schema.py +21 -0
- deepeval/metrics/turn_contextual_precision/template.py +187 -0
- deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +592 -0
- deepeval/metrics/turn_contextual_recall/schema.py +21 -0
- deepeval/metrics/turn_contextual_recall/template.py +178 -0
- deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +563 -0
- deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
- deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
- deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +576 -0
- deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
- deepeval/metrics/turn_faithfulness/template.py +218 -0
- deepeval/metrics/turn_faithfulness/turn_faithfulness.py +627 -0
- deepeval/metrics/turn_relevancy/template.py +14 -0
- deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
- deepeval/metrics/utils.py +158 -122
- deepeval/models/__init__.py +0 -12
- deepeval/models/base_model.py +49 -33
- deepeval/models/embedding_models/__init__.py +7 -0
- deepeval/models/embedding_models/azure_embedding_model.py +79 -33
- deepeval/models/embedding_models/local_embedding_model.py +39 -20
- deepeval/models/embedding_models/ollama_embedding_model.py +52 -19
- deepeval/models/embedding_models/openai_embedding_model.py +42 -22
- deepeval/models/llms/amazon_bedrock_model.py +226 -72
- deepeval/models/llms/anthropic_model.py +178 -63
- deepeval/models/llms/azure_model.py +218 -60
- deepeval/models/llms/constants.py +2032 -0
- deepeval/models/llms/deepseek_model.py +95 -40
- deepeval/models/llms/gemini_model.py +209 -64
- deepeval/models/llms/grok_model.py +139 -68
- deepeval/models/llms/kimi_model.py +140 -90
- deepeval/models/llms/litellm_model.py +131 -37
- deepeval/models/llms/local_model.py +125 -21
- deepeval/models/llms/ollama_model.py +147 -24
- deepeval/models/llms/openai_model.py +222 -269
- deepeval/models/llms/portkey_model.py +81 -22
- deepeval/models/llms/utils.py +8 -3
- deepeval/models/retry_policy.py +17 -14
- deepeval/models/utils.py +106 -5
- deepeval/optimizer/__init__.py +5 -0
- deepeval/optimizer/algorithms/__init__.py +6 -0
- deepeval/optimizer/algorithms/base.py +29 -0
- deepeval/optimizer/algorithms/configs.py +18 -0
- deepeval/optimizer/algorithms/copro/__init__.py +5 -0
- deepeval/{optimization/copro/loop.py → optimizer/algorithms/copro/copro.py} +112 -113
- deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
- deepeval/{optimization/gepa/loop.py → optimizer/algorithms/gepa/gepa.py} +175 -115
- deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
- deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
- deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
- deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
- deepeval/optimizer/algorithms/simba/__init__.py +5 -0
- deepeval/{optimization/simba/loop.py → optimizer/algorithms/simba/simba.py} +128 -112
- deepeval/{optimization → optimizer}/configs.py +5 -8
- deepeval/{optimization/policies/selection.py → optimizer/policies.py} +63 -2
- deepeval/optimizer/prompt_optimizer.py +263 -0
- deepeval/optimizer/rewriter/__init__.py +5 -0
- deepeval/optimizer/rewriter/rewriter.py +124 -0
- deepeval/optimizer/rewriter/utils.py +214 -0
- deepeval/optimizer/scorer/__init__.py +5 -0
- deepeval/optimizer/scorer/base.py +86 -0
- deepeval/optimizer/scorer/scorer.py +316 -0
- deepeval/optimizer/scorer/utils.py +30 -0
- deepeval/optimizer/types.py +148 -0
- deepeval/{optimization → optimizer}/utils.py +47 -165
- deepeval/prompt/prompt.py +5 -9
- deepeval/simulator/conversation_simulator.py +43 -0
- deepeval/simulator/template.py +13 -0
- deepeval/test_case/__init__.py +1 -3
- deepeval/test_case/api.py +26 -45
- deepeval/test_case/arena_test_case.py +7 -2
- deepeval/test_case/conversational_test_case.py +68 -1
- deepeval/test_case/llm_test_case.py +206 -1
- deepeval/test_case/utils.py +4 -8
- deepeval/test_run/api.py +18 -14
- deepeval/test_run/test_run.py +3 -3
- deepeval/tracing/patchers.py +9 -4
- deepeval/tracing/tracing.py +2 -2
- deepeval/utils.py +65 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/METADATA +1 -4
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/RECORD +180 -193
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -148
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
- deepeval/models/mlllms/__init__.py +0 -4
- deepeval/models/mlllms/azure_model.py +0 -343
- deepeval/models/mlllms/gemini_model.py +0 -313
- deepeval/models/mlllms/ollama_model.py +0 -175
- deepeval/models/mlllms/openai_model.py +0 -309
- deepeval/optimization/__init__.py +0 -13
- deepeval/optimization/adapters/__init__.py +0 -2
- deepeval/optimization/adapters/deepeval_scoring_adapter.py +0 -588
- deepeval/optimization/aggregates.py +0 -14
- deepeval/optimization/copro/configs.py +0 -31
- deepeval/optimization/gepa/__init__.py +0 -7
- deepeval/optimization/gepa/configs.py +0 -115
- deepeval/optimization/miprov2/configs.py +0 -134
- deepeval/optimization/miprov2/loop.py +0 -785
- deepeval/optimization/mutations/__init__.py +0 -0
- deepeval/optimization/mutations/prompt_rewriter.py +0 -458
- deepeval/optimization/policies/__init__.py +0 -16
- deepeval/optimization/policies/tie_breaker.py +0 -67
- deepeval/optimization/prompt_optimizer.py +0 -462
- deepeval/optimization/simba/__init__.py +0 -0
- deepeval/optimization/simba/configs.py +0 -33
- deepeval/optimization/types.py +0 -361
- deepeval/test_case/mllm_test_case.py +0 -170
- /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
- /deepeval/{optimization → optimizer/algorithms}/simba/types.py +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/LICENSE.md +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/WHEEL +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/entry_points.txt +0 -0
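
Note the package-level reorganization visible in the summary above: the deepeval/optimization package is replaced by deepeval/optimizer, each algorithm's loop.py becomes a module of its own (copro.py, gepa.py, simba.py), and a new MIPROv2 implementation is added. A minimal sketch of what this implies for import paths, assuming the public classes simply follow the file moves (the exported names below are assumptions, not confirmed by this diff):

# Hypothetical import change implied by the file moves above (3.7.4 -> 3.7.6).
# Old layout, removed in 3.7.6:
#   from deepeval.optimization.prompt_optimizer import PromptOptimizer
# New layout added in 3.7.6 (class name assumed to follow the old module):
from deepeval.optimizer.prompt_optimizer import PromptOptimizer
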
deepeval/optimization/types.py
DELETED
@@ -1,361 +0,0 @@
from __future__ import annotations
import uuid

from dataclasses import dataclass
from typing import (
    Any,
    Callable,
    Dict,
    List,
    Literal,
    Optional,
    Protocol,
    TYPE_CHECKING,
    TypedDict,
    Tuple,
    Union,
)
from enum import Enum
from pydantic import BaseModel as PydanticBaseModel, Field, AliasChoices

from deepeval.prompt.prompt import Prompt
from deepeval.models.base_model import DeepEvalBaseLLM


if TYPE_CHECKING:
    from deepeval.dataset.golden import Golden, ConversationalGolden

PromptConfigurationId = str
ModuleId = str
ScoreVector = List[float]  # scores per instance on D_pareto, aligned order
ScoreTable = Dict[PromptConfigurationId, ScoreVector]


@dataclass
class PromptConfiguration:
    id: PromptConfigurationId
    parent: Optional[PromptConfigurationId]
    prompts: Dict[ModuleId, Prompt]

    @staticmethod
    def new(
        prompts: Dict[ModuleId, Prompt],
        parent: Optional[PromptConfigurationId] = None,
    ) -> "PromptConfiguration":
        return PromptConfiguration(
            id=str(uuid.uuid4()), parent=parent, prompts=dict(prompts)
        )


class ScoringAdapter(Protocol):
    """
    Scoring adapter contract used by optimization runners.

    Runners call into this adapter to:
    - compute scores per-instance on some subset (score_on_pareto),
    - compute minibatch means for selection and acceptance,
    - generate feedback text used by the PromptRewriter.
    """

    # Sync
    def score_on_pareto(
        self,
        prompt_configuration: PromptConfiguration,
        d_pareto: Union[List[Golden], List[ConversationalGolden]],
    ) -> ScoreVector:
        """Return per-instance scores on D_pareto."""
        ...

    def minibatch_score(
        self,
        prompt_configuration: PromptConfiguration,
        minibatch: Union[List[Golden], List[ConversationalGolden]],
    ) -> float:
        """Return average score μ on a minibatch from D_feedback."""
        ...

    def minibatch_feedback(
        self,
        prompt_configuration: PromptConfiguration,
        module: ModuleId,
        minibatch: Union[List[Golden], List[ConversationalGolden]],
    ) -> str:
        """Return μ_f text for the module (metric.reason + traces, etc.)."""
        ...

    def select_module(
        self, prompt_configuration: PromptConfiguration
    ) -> ModuleId:
        """Pick a module to mutate."""
        ...

    # Async
    async def a_score_on_pareto(
        self,
        prompt_configuration: PromptConfiguration,
        d_pareto: Union[List[Golden], List[ConversationalGolden]],
    ) -> ScoreVector: ...
    async def a_minibatch_score(
        self,
        prompt_configuration: PromptConfiguration,
        minibatch: Union[List[Golden], List[ConversationalGolden]],
    ) -> float: ...
    async def a_minibatch_feedback(
        self,
        prompt_configuration: PromptConfiguration,
        module: ModuleId,
        minibatch: Union[List[Golden], List[ConversationalGolden]],
    ) -> str: ...
    async def a_select_module(
        self, prompt_configuration: PromptConfiguration
    ) -> ModuleId: ...


class PromptRewriterProtocol(Protocol):
    def rewrite(
        self,
        *,
        module_id: ModuleId,
        model: Optional[DeepEvalBaseLLM] = None,
        model_schema: Optional[PydanticBaseModel] = None,
        model_callback: Optional[
            Callable[
                ...,
                Union[
                    str,
                    Dict,
                    Tuple[Union[str, Dict], float],
                ],
            ]
        ] = None,
        old_prompt: Prompt,
        feedback_text: str,
    ) -> Prompt: ...

    async def a_rewrite(
        self,
        *,
        module_id: ModuleId,
        model: Optional[DeepEvalBaseLLM] = None,
        model_schema: Optional[PydanticBaseModel] = None,
        model_callback: Optional[
            Callable[
                ...,
                Union[
                    str,
                    Dict,
                    Tuple[Union[str, Dict], float],
                ],
            ]
        ] = None,
        old_prompt: Prompt,
        feedback_text: str,
    ) -> Prompt: ...


class RunnerStatusType(str, Enum):
    """Status events emitted by optimization runners."""

    PROGRESS = "progress"
    TIE = "tie"
    ERROR = "error"


class RunnerStatusCallbackProtocol(Protocol):
    def __call__(
        self,
        kind: RunnerStatusType,
        *,
        detail: str,
        step_index: Optional[int] = None,
        total_steps: Optional[int] = None,
    ) -> None: ...


class RunnerProtocol(Protocol):
    """
    Contract for prompt optimization runners used by PromptOptimizer.

    Runners are responsible for executing the optimization algorithm
    and returning an optimized Prompt plus a report dict.
    """

    # status_callback is injected by PromptOptimizer
    # A runner may call this to report:
    # progress, ties, or errors during execution.
    status_callback: Optional[RunnerStatusCallbackProtocol]
    model_callback: Optional[
        Callable[
            ...,
            Union[
                str,
                Dict,
                Tuple[Union[str, Dict], float],
            ],
        ]
    ]

    scoring_adapter: Optional[ScoringAdapter]

    def execute(
        self,
        *,
        prompt: Prompt,
        goldens: Union[List["Golden"], List["ConversationalGolden"]],
    ) -> Tuple[Prompt, Dict]: ...

    async def a_execute(
        self,
        *,
        prompt: Prompt,
        goldens: Union[List["Golden"], List["ConversationalGolden"]],
    ) -> Tuple[Prompt, Dict]: ...


class Objective(Protocol):
    """Strategy for reducing scores per-metric to a single scalar value.

    Implementations receive a mapping from metric name to score
    (for example, {"AnswerRelevancyMetric": 0.82}) and return a
    single float used for comparisons inside the optimizer.
    """

    def scalarize(self, scores_by_metric: Dict[str, float]) -> float: ...


class MeanObjective(Objective):
    """Default scalarizer: unweighted arithmetic mean.

    - If `scores_by_metric` is non-empty, returns the arithmetic
      mean of all metric scores.
    - If `scores_by_metric` is empty, returns 0.0.
    """

    def scalarize(self, scores_by_metric: Dict[str, float]) -> float:
        if not scores_by_metric:
            return 0.0
        return sum(scores_by_metric.values()) / len(scores_by_metric)


class WeightedObjective(Objective):
    """
    Objective that scales each metric's score by a user-provided weight and sums them.

    - `weights_by_metric` keys should match the names of the metrics passed to the
      metric class names passed to the PromptOptimizer.
    - Metrics not present in `weights_by_metric` receive `default_weight`.
      This makes it easy to emphasize a subset of metrics while keeping
      everything else at a baseline weight of 1.0, e.g.:

        WeightedObjective({"AnswerRelevancyMetric": 2.0})

    which treats AnswerRelevancy as 2x as important as the other metrics.
    """

    def __init__(
        self,
        weights_by_metric: Optional[Dict[str, float]] = None,
        default_weight: float = 1.0,
    ):
        self.weights_by_metric: Dict[str, float] = dict(weights_by_metric or {})
        self.default_weight: float = float(default_weight)

    def scalarize(self, scores_by_metric: Dict[str, float]) -> float:
        return sum(
            self.weights_by_metric.get(name, self.default_weight) * score
            for name, score in scores_by_metric.items()
        )


@dataclass
class MetricInfo:
    name: str
    rubric: Optional[str] = None


class AcceptedIterationDict(TypedDict):
    parent: PromptConfigurationId
    child: PromptConfigurationId
    module: ModuleId
    before: float
    after: float


class AcceptedIteration(PydanticBaseModel):
    parent: str
    child: str
    module: str
    before: float
    after: float


class PromptMessageSnapshot(PydanticBaseModel):
    role: str
    content: str


class PromptModuleSnapshot(PydanticBaseModel):
    type: Literal["TEXT", "LIST"]
    # Only used when type == "TEXT"
    text_template: Optional[str] = None
    # Only used when type == "LIST"
    messages: Optional[List[PromptMessageSnapshot]] = None


class PromptConfigSnapshot(PydanticBaseModel):
    parent: Optional[str]
    prompts: Dict[str, PromptModuleSnapshot]


@dataclass
class OptimizationResult:
    optimization_id: str
    best_id: PromptConfigurationId
    accepted_iterations: List[Dict]
    pareto_scores: Dict[PromptConfigurationId, List[float]]
    parents: Dict[PromptConfigurationId, Optional[PromptConfigurationId]]
    prompt_configurations: Dict[PromptConfigurationId, Dict[str, Any]]

    def as_dict(self) -> Dict:
        return dict(
            optimization_id=self.optimization_id,
            best_id=self.best_id,
            accepted_iterations=self.accepted_iterations,
            pareto_scores=self.pareto_scores,
            parents=self.parents,
            prompt_configurations=self.prompt_configurations,
        )


class OptimizationReport(PydanticBaseModel):
    optimization_id: str = Field(
        alias="optimizationId",
        validation_alias=AliasChoices("optimizationId", "optimization_id"),
    )
    best_id: str = Field(
        alias="bestId",
        validation_alias=AliasChoices("bestId", "best_id"),
    )
    accepted_iterations: list[AcceptedIteration] = Field(
        default_factory=list,
        alias="acceptedIterations",
        validation_alias=AliasChoices(
            "acceptedIterations", "accepted_iterations"
        ),
    )
    pareto_scores: dict[str, list[float]] = Field(
        alias="paretoScores",
        validation_alias=AliasChoices("paretoScores", "pareto_scores"),
    )
    parents: dict[str, str | None]
    prompt_configurations: dict[str, PromptConfigSnapshot] = Field(
        alias="promptConfigurations",
        validation_alias=AliasChoices(
            "promptConfigurations", "prompt_configurations"
        ),
    )

    @classmethod
    def from_runtime(cls, result: dict) -> "OptimizationReport":
        # accepts the dict from OptimizationResult.as_dict()
        return cls(**result)
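
For context on what the removed module provided, below is a small usage sketch of the two scalarizers defined above. The class names and arithmetic come directly from the deleted code; the import path exists only in 3.7.4 and earlier, and the metric names in the dictionary are illustrative.

# Illustrative usage of the removed scalarizers (works against deepeval 3.7.4).
from deepeval.optimization.types import MeanObjective, WeightedObjective

scores = {"AnswerRelevancyMetric": 0.82, "FaithfulnessMetric": 0.64}

mean = MeanObjective().scalarize(scores)  # (0.82 + 0.64) / 2 = 0.73
weighted = WeightedObjective({"AnswerRelevancyMetric": 2.0}).scalarize(scores)
# 2.0 * 0.82 + 1.0 * 0.64 = 2.28, since unlisted metrics keep the default weight of 1.0

print(mean, weighted)
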
deepeval/test_case/mllm_test_case.py
DELETED
@@ -1,170 +0,0 @@
from typing import List, Optional, Dict, Union
from urllib.parse import urlparse, unquote
from dataclasses import dataclass, field
from enum import Enum
import mimetypes
import base64
import os

from deepeval.test_case import ToolCall


@dataclass
class MLLMImage:
    dataBase64: Optional[str] = None
    mimeType: Optional[str] = None
    url: Optional[str] = None
    local: Optional[bool] = None
    filename: Optional[str] = None

    def __post_init__(self):

        if self.url and self.dataBase64:
            raise ValueError(
                "You cannot provide both 'url' and 'dataBase64' at the same time when creating an MLLMImage."
            )

        if not self.url and not self.dataBase64:
            raise ValueError(
                "You must provide either a 'url' or both 'dataBase64' and 'mimeType' to create an MLLMImage."
            )

        if self.dataBase64 is not None:
            if self.mimeType is None:
                raise ValueError(
                    "mimeType must be provided when initializing from Base64 data."
                )
        else:
            is_local = self.is_local_path(self.url)
            if self.local is not None:
                assert self.local == is_local, "Local path mismatch"
            else:
                self.local = is_local

            # compute filename, mime_type, and Base64 data
            if self.local:
                path = self.process_url(self.url)
                self.filename = os.path.basename(path)
                self.mimeType = (
                    mimetypes.guess_type(path)[0] or "application/octet-stream"
                )
                with open(path, "rb") as f:
                    raw = f.read()
                self.dataBase64 = base64.b64encode(raw).decode("ascii")
            else:
                self.filename = None
                self.mimeType = None
                self.dataBase64 = None

    @staticmethod
    def process_url(url: str) -> str:
        if os.path.exists(url):
            return url
        parsed = urlparse(url)
        if parsed.scheme == "file":
            raw_path = (
                f"//{parsed.netloc}{parsed.path}"
                if parsed.netloc
                else parsed.path
            )
            path = unquote(raw_path)
            return path
        return url

    @staticmethod
    def is_local_path(url: str) -> bool:
        if os.path.exists(url):
            return True
        parsed = urlparse(url)
        if parsed.scheme == "file":
            raw_path = (
                f"//{parsed.netloc}{parsed.path}"
                if parsed.netloc
                else parsed.path
            )
            path = unquote(raw_path)
            return os.path.exists(path)
        return False

    def as_data_uri(self) -> Optional[str]:
        """Return the image as a data URI string, if Base64 data is available."""
        if not self.dataBase64 or not self.mimeType:
            return None
        return f"data:{self.mimeType};base64,{self.dataBase64}"


class MLLMTestCaseParams(Enum):
    INPUT = "input"
    ACTUAL_OUTPUT = "actual_output"
    EXPECTED_OUTPUT = "expected_output"
    CONTEXT = "context"
    RETRIEVAL_CONTEXT = "retrieval_context"
    TOOLS_CALLED = "tools_called"
    EXPECTED_TOOLS = "expected_tools"


@dataclass
class MLLMTestCase:
    input: List[Union[str, MLLMImage]]
    actual_output: List[Union[str, MLLMImage]]
    expected_output: Optional[List[Union[str, MLLMImage]]] = None
    context: Optional[List[Union[str, MLLMImage]]] = None
    retrieval_context: Optional[List[Union[str, MLLMImage]]] = None
    additional_metadata: Optional[Dict] = None
    comments: Optional[str] = None
    tools_called: Optional[List[ToolCall]] = None
    expected_tools: Optional[List[ToolCall]] = None
    token_cost: Optional[float] = None
    completion_time: Optional[float] = None
    name: Optional[str] = field(default=None)
    _dataset_rank: Optional[int] = field(default=None, repr=False)
    _dataset_alias: Optional[str] = field(default=None, repr=False)
    _dataset_id: Optional[str] = field(default=None, repr=False)

    def __post_init__(self):
        # Ensure `expected_output` is None or a list of strings or MLLMImage instances
        if self.expected_output is not None:
            if not isinstance(self.expected_output, list) or not all(
                isinstance(item, (str, MLLMImage))
                for item in self.expected_output
            ):
                raise TypeError(
                    "'expected_output' must be None or a list of strings or MLLMImage instances"
                )

        # Ensure `context` is None or a list of strings or MLLMImage instances
        if self.context is not None:
            if not isinstance(self.context, list) or not all(
                isinstance(item, (str, MLLMImage)) for item in self.context
            ):
                raise TypeError(
                    "'context' must be None or a list of strings or MLLMImage instances"
                )

        # Ensure `retrieval_context` is None or a list of strings or MLLMImage instances
        if self.retrieval_context is not None:
            if not isinstance(self.retrieval_context, list) or not all(
                isinstance(item, (str, MLLMImage))
                for item in self.retrieval_context
            ):
                raise TypeError(
                    "'retrieval_context' must be None or a list of strings or MLLMImage instances"
                )

        # Ensure `tools_called` is None or a list of strings
        if self.tools_called is not None:
            if not isinstance(self.tools_called, list) or not all(
                isinstance(item, ToolCall) for item in self.tools_called
            ):
                raise TypeError(
                    "'tools_called' must be None or a list of `ToolCall`"
                )

        # Ensure `expected_tools` is None or a list of strings
        if self.expected_tools is not None:
            if not isinstance(self.expected_tools, list) or not all(
                isinstance(item, ToolCall) for item in self.expected_tools
            ):
                raise TypeError(
                    "'expected_tools' must be None or a list of `ToolCall`"
                )
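
Similarly, a short sketch of how the removed MLLMImage and MLLMTestCase classes were constructed, based only on the fields and validation shown above; the import path and example URL are illustrative and apply to 3.7.4 only.

# Illustrative construction of the removed multimodal test case types (deepeval 3.7.4).
from deepeval.test_case.mllm_test_case import MLLMImage, MLLMTestCase

# A remote image: __post_init__ keeps only the URL and leaves the Base64 fields unset.
image = MLLMImage(url="https://example.com/chart.png")

test_case = MLLMTestCase(
    input=["Describe the attached chart.", image],
    actual_output=["The chart shows revenue rising quarter over quarter."],
)

print(image.as_data_uri())  # None here, because no Base64 data / mimeType is available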