deepeval 3.7.3__py3-none-any.whl → 3.7.5__py3-none-any.whl
This diff shows the changes between publicly released package versions as they appear in their respective registries; it is provided for informational purposes only.
- deepeval/_version.py +1 -1
- deepeval/cli/test.py +1 -1
- deepeval/config/settings.py +102 -13
- deepeval/dataset/golden.py +54 -2
- deepeval/evaluate/configs.py +1 -1
- deepeval/evaluate/evaluate.py +16 -8
- deepeval/evaluate/execute.py +74 -27
- deepeval/evaluate/utils.py +26 -22
- deepeval/integrations/pydantic_ai/agent.py +19 -2
- deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
- deepeval/metrics/__init__.py +14 -12
- deepeval/metrics/answer_relevancy/answer_relevancy.py +74 -29
- deepeval/metrics/answer_relevancy/template.py +188 -92
- deepeval/metrics/argument_correctness/template.py +2 -2
- deepeval/metrics/base_metric.py +2 -5
- deepeval/metrics/bias/template.py +3 -3
- deepeval/metrics/contextual_precision/contextual_precision.py +53 -15
- deepeval/metrics/contextual_precision/template.py +115 -66
- deepeval/metrics/contextual_recall/contextual_recall.py +50 -13
- deepeval/metrics/contextual_recall/template.py +106 -55
- deepeval/metrics/contextual_relevancy/contextual_relevancy.py +47 -15
- deepeval/metrics/contextual_relevancy/template.py +87 -58
- deepeval/metrics/conversation_completeness/template.py +2 -2
- deepeval/metrics/conversational_dag/templates.py +4 -4
- deepeval/metrics/conversational_g_eval/template.py +4 -3
- deepeval/metrics/dag/templates.py +5 -5
- deepeval/metrics/faithfulness/faithfulness.py +70 -27
- deepeval/metrics/faithfulness/schema.py +1 -1
- deepeval/metrics/faithfulness/template.py +200 -115
- deepeval/metrics/g_eval/utils.py +2 -2
- deepeval/metrics/hallucination/template.py +4 -4
- deepeval/metrics/indicator.py +4 -4
- deepeval/metrics/misuse/template.py +2 -2
- deepeval/metrics/multimodal_metrics/__init__.py +0 -18
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +24 -17
- deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +26 -21
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +24 -17
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +24 -17
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +19 -19
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +63 -78
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +20 -20
- deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +71 -50
- deepeval/metrics/non_advice/template.py +2 -2
- deepeval/metrics/pii_leakage/template.py +2 -2
- deepeval/metrics/prompt_alignment/template.py +4 -4
- deepeval/metrics/ragas.py +3 -3
- deepeval/metrics/role_violation/template.py +2 -2
- deepeval/metrics/step_efficiency/step_efficiency.py +1 -1
- deepeval/metrics/tool_correctness/tool_correctness.py +2 -2
- deepeval/metrics/toxicity/template.py +4 -4
- deepeval/metrics/turn_contextual_precision/schema.py +21 -0
- deepeval/metrics/turn_contextual_precision/template.py +187 -0
- deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +550 -0
- deepeval/metrics/turn_contextual_recall/schema.py +21 -0
- deepeval/metrics/turn_contextual_recall/template.py +178 -0
- deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +520 -0
- deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
- deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
- deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +535 -0
- deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
- deepeval/metrics/turn_faithfulness/template.py +218 -0
- deepeval/metrics/turn_faithfulness/turn_faithfulness.py +596 -0
- deepeval/metrics/turn_relevancy/template.py +2 -2
- deepeval/metrics/utils.py +39 -58
- deepeval/models/__init__.py +0 -12
- deepeval/models/base_model.py +16 -38
- deepeval/models/embedding_models/__init__.py +7 -0
- deepeval/models/embedding_models/azure_embedding_model.py +69 -32
- deepeval/models/embedding_models/local_embedding_model.py +39 -22
- deepeval/models/embedding_models/ollama_embedding_model.py +42 -18
- deepeval/models/embedding_models/openai_embedding_model.py +50 -15
- deepeval/models/llms/amazon_bedrock_model.py +1 -2
- deepeval/models/llms/anthropic_model.py +53 -20
- deepeval/models/llms/azure_model.py +140 -43
- deepeval/models/llms/deepseek_model.py +38 -23
- deepeval/models/llms/gemini_model.py +222 -103
- deepeval/models/llms/grok_model.py +39 -27
- deepeval/models/llms/kimi_model.py +39 -23
- deepeval/models/llms/litellm_model.py +103 -45
- deepeval/models/llms/local_model.py +35 -22
- deepeval/models/llms/ollama_model.py +129 -17
- deepeval/models/llms/openai_model.py +151 -50
- deepeval/models/llms/portkey_model.py +149 -0
- deepeval/models/llms/utils.py +5 -3
- deepeval/models/retry_policy.py +17 -14
- deepeval/models/utils.py +94 -4
- deepeval/optimizer/__init__.py +5 -0
- deepeval/optimizer/algorithms/__init__.py +6 -0
- deepeval/optimizer/algorithms/base.py +29 -0
- deepeval/optimizer/algorithms/configs.py +18 -0
- deepeval/optimizer/algorithms/copro/__init__.py +5 -0
- deepeval/optimizer/algorithms/copro/copro.py +836 -0
- deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
- deepeval/optimizer/algorithms/gepa/gepa.py +737 -0
- deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
- deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
- deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
- deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
- deepeval/optimizer/algorithms/simba/__init__.py +5 -0
- deepeval/optimizer/algorithms/simba/simba.py +999 -0
- deepeval/optimizer/algorithms/simba/types.py +15 -0
- deepeval/optimizer/configs.py +31 -0
- deepeval/optimizer/policies.py +227 -0
- deepeval/optimizer/prompt_optimizer.py +263 -0
- deepeval/optimizer/rewriter/__init__.py +5 -0
- deepeval/optimizer/rewriter/rewriter.py +124 -0
- deepeval/optimizer/rewriter/utils.py +214 -0
- deepeval/optimizer/scorer/__init__.py +5 -0
- deepeval/optimizer/scorer/base.py +86 -0
- deepeval/optimizer/scorer/scorer.py +316 -0
- deepeval/optimizer/scorer/utils.py +30 -0
- deepeval/optimizer/types.py +148 -0
- deepeval/optimizer/utils.py +480 -0
- deepeval/prompt/prompt.py +7 -6
- deepeval/test_case/__init__.py +1 -3
- deepeval/test_case/api.py +12 -10
- deepeval/test_case/conversational_test_case.py +19 -1
- deepeval/test_case/llm_test_case.py +152 -1
- deepeval/test_case/utils.py +4 -8
- deepeval/test_run/api.py +15 -14
- deepeval/test_run/cache.py +2 -0
- deepeval/test_run/test_run.py +9 -4
- deepeval/tracing/patchers.py +9 -4
- deepeval/tracing/tracing.py +2 -2
- deepeval/utils.py +89 -0
- {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/METADATA +1 -4
- {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/RECORD +134 -118
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
- deepeval/models/mlllms/__init__.py +0 -4
- deepeval/models/mlllms/azure_model.py +0 -334
- deepeval/models/mlllms/gemini_model.py +0 -284
- deepeval/models/mlllms/ollama_model.py +0 -144
- deepeval/models/mlllms/openai_model.py +0 -258
- deepeval/test_case/mllm_test_case.py +0 -170
- /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
- {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/LICENSE.md +0 -0
- {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/WHEEL +0 -0
- {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/entry_points.txt +0 -0
deepeval/optimizer/types.py ADDED
@@ -0,0 +1,148 @@
+from __future__ import annotations
+import uuid
+from abc import ABC, abstractmethod
+
+from dataclasses import dataclass
+from typing import (
+    Callable,
+    Dict,
+    List,
+    Optional,
+    TypedDict,
+    TYPE_CHECKING,
+    Union,
+)
+from enum import Enum
+from pydantic import BaseModel, ConfigDict
+
+from deepeval.prompt.prompt import Prompt
+
+if TYPE_CHECKING:
+    from deepeval.dataset.golden import Golden, ConversationalGolden
+
+PromptConfigurationId = str
+ModuleId = str
+ScoreVector = List[float]  # scores per instance on D_pareto, aligned order
+ScoreTable = Dict[PromptConfigurationId, ScoreVector]
+
+# Type alias for model callback function
+ModelCallback = Callable[[Prompt, Union["Golden", "ConversationalGolden"]], str]
+
+
+@dataclass
+class PromptConfiguration:
+    id: PromptConfigurationId
+    parent: Optional[PromptConfigurationId]
+    prompts: Dict[ModuleId, Prompt]
+
+    @staticmethod
+    def new(
+        prompts: Dict[ModuleId, Prompt],
+        parent: Optional[PromptConfigurationId] = None,
+    ) -> "PromptConfiguration":
+        return PromptConfiguration(
+            id=str(uuid.uuid4()), parent=parent, prompts=dict(prompts)
+        )
+
+
+class RunnerStatusType(str, Enum):
+    """Status events emitted by optimization runners."""
+
+    PROGRESS = "progress"
+    TIE = "tie"
+    ERROR = "error"
+
+
+# Type alias for status callback function
+RunnerStatusCallback = Callable[..., None]
+
+
+class Objective(ABC):
+    """Strategy for reducing scores per-metric to a single scalar value.
+
+    Implementations receive a mapping from metric name to score
+    (for example, {"AnswerRelevancyMetric": 0.82}) and return a
+    single float used for comparisons inside the optimizer.
+    """
+
+    @abstractmethod
+    def scalarize(self, scores_by_metric: Dict[str, float]) -> float:
+        raise NotImplementedError
+
+
+class MeanObjective(Objective):
+    """Default scalarizer: unweighted arithmetic mean.
+
+    - If `scores_by_metric` is non-empty, returns the arithmetic
+      mean of all metric scores.
+    - If `scores_by_metric` is empty, returns 0.0.
+    """
+
+    def scalarize(self, scores_by_metric: Dict[str, float]) -> float:
+        if not scores_by_metric:
+            return 0.0
+        return sum(scores_by_metric.values()) / len(scores_by_metric)
+
+
+class WeightedObjective(Objective):
+    """
+    Objective that scales each metric's score by a user-provided weight and sums them.
+
+    - `weights_by_metric` keys should match the names of the metrics passed to the
+      metric class names passed to the PromptOptimizer.
+    - Metrics not present in `weights_by_metric` receive `default_weight`.
+      This makes it easy to emphasize a subset of metrics while keeping
+      everything else at a baseline weight of 1.0, e.g.:
+
+        WeightedObjective({"AnswerRelevancyMetric": 2.0})
+
+    which treats AnswerRelevancy as 2x as important as the other metrics.
+    """
+
+    def __init__(
+        self,
+        weights_by_metric: Optional[Dict[str, float]] = None,
+        default_weight: float = 1.0,
+    ):
+        self.weights_by_metric: Dict[str, float] = dict(weights_by_metric or {})
+        self.default_weight: float = float(default_weight)
+
+    def scalarize(self, scores_by_metric: Dict[str, float]) -> float:
+        return sum(
+            self.weights_by_metric.get(name, self.default_weight) * score
+            for name, score in scores_by_metric.items()
+        )
+
+
+class AcceptedIterationDict(TypedDict):
+    parent: PromptConfigurationId
+    child: PromptConfigurationId
+    module: ModuleId
+    before: float
+    after: float
+
+
+class AcceptedIteration(BaseModel):
+    parent: str
+    child: str
+    module: str
+    before: float
+    after: float
+
+
+class PromptConfigSnapshot(BaseModel):
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+
+    parent: Optional[str]
+    prompts: Dict[str, Prompt]
+
+
+class OptimizationReport(BaseModel):
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+
+    optimization_id: str
+    best_id: str
+    accepted_iterations: List[AcceptedIteration]
+    pareto_scores: Dict[str, List[float]]
+    parents: Dict[str, Optional[str]]
+    prompt_configurations: Dict[str, PromptConfigSnapshot]
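For orientation, the sketch below (not part of the released diff) exercises the two `Objective` implementations introduced above; the import path is assumed from the `deepeval/optimizer/types.py` entry in the file list.

```python
# Minimal sketch, not part of the released diff: scalarizing per-metric scores
# with the new Objective classes. The import path is assumed from the file list
# (deepeval/optimizer/types.py).
from deepeval.optimizer.types import MeanObjective, WeightedObjective

scores = {"AnswerRelevancyMetric": 0.82, "FaithfulnessMetric": 0.64}

# Unweighted arithmetic mean: (0.82 + 0.64) / 2 = 0.73
print(MeanObjective().scalarize(scores))

# AnswerRelevancyMetric counts twice; unlisted metrics keep default_weight=1.0:
# 2.0 * 0.82 + 1.0 * 0.64 = 2.28
print(WeightedObjective({"AnswerRelevancyMetric": 2.0}).scalarize(scores))
```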
deepeval/optimizer/utils.py ADDED
@@ -0,0 +1,480 @@
+from __future__ import annotations
+import inspect
+import random
+import re
+import statistics
+from typing import (
+    Any,
+    Callable,
+    List,
+    Optional,
+    Protocol,
+    Sequence,
+    Tuple,
+    TYPE_CHECKING,
+    Union,
+    Dict,
+    Set,
+)
+
+from deepeval.errors import DeepEvalError
+from deepeval.metrics.base_metric import BaseMetric, BaseConversationalMetric
+from deepeval.prompt.prompt import Prompt
+from deepeval.prompt.api import PromptMessage
+from deepeval.optimizer.types import (
+    ModelCallback,
+    ModuleId,
+    PromptConfigurationId,
+    PromptConfiguration,
+    PromptConfigSnapshot,
+    OptimizationReport,
+)
+
+
+if TYPE_CHECKING:
+    from deepeval.dataset.golden import Golden, ConversationalGolden
+    from deepeval.prompt.api import PromptMessage
+
+
+def split_goldens(
+    goldens: Union[List[Golden], List[ConversationalGolden]],
+    pareto_size: int,
+    *,
+    random_state: random.Random,
+) -> Tuple[
+    Union[List[Golden], List[ConversationalGolden]],
+    Union[List[Golden], List[ConversationalGolden]],
+]:
+    """
+    Split `goldens` into two disjoint parts:
+
+    - d_feedback: items not selected for the Pareto validation set
+    - d_pareto: `pareto_size` items for instance-wise Pareto scoring
+
+    The selection is deterministic given `seed`. Within each split, the
+    original order from `goldens` is preserved.
+
+    Args:
+        goldens: Full list/sequence of examples.
+        pareto_size: Number of items to allocate to the Pareto set bound between [0, len(goldens)].
+        random_state: A shared `random.Random` instance that provides the source
+            of randomness. For reproducible runs, pass the same object used by
+            the GEPA loop constructed from `GEPA.random_seed`
+
+    Returns:
+        (d_feedback, d_pareto)
+    """
+    if pareto_size < 0:
+        raise ValueError("pareto_size must be >= 0")
+
+    total = len(goldens)
+
+    if total == 0:
+        # nothing to split
+        return [], []
+
+    # With a single example, we cannot form a meaningful feedback set.
+    # callers like GEPARunner should enforce a minimum of 2 goldens for
+    # optimization.
+    if total == 1:
+        return [], list(goldens)
+
+    # For total >= 2, ensure that we always leave at least one example
+    # for d_feedback. This keeps the splits disjoint while still honoring
+    # pareto_size as a target up to (total - 1).
+    chosen_size = min(pareto_size, total - 1)
+
+    indices = list(range(total))
+    random_state.shuffle(indices)
+
+    pareto_indices = set(indices[:chosen_size])
+
+    d_pareto = [goldens[i] for i in range(total) if i in pareto_indices]
+    d_feedback = [goldens[i] for i in range(total) if i not in pareto_indices]
+
+    return d_feedback, d_pareto
+
+
+################################
+# Prompt normalization helpers #
+################################
+
+
+def _slug(text: str) -> str:
+    slug = text.lower()
+    slug = re.sub(r"[^a-z0-9]+", "-", slug)
+    return slug.strip("-")
+
+
+def generate_module_id(prompt: Prompt, index: int, existing: Set[str]) -> str:
+    """
+    Build a human readable module id stable within a single optimization run.
+    Prefers alias/label; enrich with model settings provider and name; dedupe; cap to 64 chars.
+    """
+    parts: List[str] = []
+    if prompt.alias:
+        parts.append(str(prompt.alias))
+    if prompt.label:
+        parts.append(str(prompt.label))
+
+    ms = prompt.model_settings
+    if ms is not None:
+        if ms.provider is not None:
+            parts.append(ms.provider.value)
+        if ms.name:
+            parts.append(ms.name)
+
+    base = "-".join(_slug(p) for p in parts if p) or f"module-{index+1}"
+    base = base[:64] or f"module-{index+1}"
+
+    candidate = base
+    suffix = 2
+    while candidate in existing:
+        candidate = f"{base}-{suffix}"
+        candidate = candidate[:64]
+        suffix += 1
+
+    existing.add(candidate)
+    return candidate
+
+
+def normalize_seed_prompts(
+    seed_prompts: Union[Dict[ModuleId, Prompt], List[Prompt]],
+) -> Dict[ModuleId, Prompt]:
+    """
+    Accept either {module_id: Prompt} or List[Prompt].
+    If a list is given, generate human readable module ids.
+    """
+    if isinstance(seed_prompts, dict):
+        return dict(seed_prompts)  # shallow copy
+
+    mapping: Dict[ModuleId, Prompt] = {}
+    used: Set[str] = set()
+    for i, prompt in enumerate(seed_prompts):
+        module_id = generate_module_id(prompt, i, used)
+        mapping[module_id] = prompt
+    return mapping
+
+
+def invoke_model_callback(
+    *,
+    model_callback: ModelCallback,
+    prompt: Prompt,
+    golden: Union["Golden", "ConversationalGolden"],
+) -> str:
+    """
+    Call a user provided model_callback in a synchronous context.
+
+    Raises if the callback returns an awaitable.
+    """
+    result = model_callback(prompt, golden)
+    if inspect.isawaitable(result):
+        raise DeepEvalError(
+            "model_callback returned an awaitable from a synchronous context. "
+            "Either declare the callback as `async def` and use async optimization, or call "
+            "`model.generate(...)` instead of `model.a_generate(...)` inside a sync callback."
+        )
+    return result
+
+
+async def a_invoke_model_callback(
+    *,
+    model_callback: ModelCallback,
+    prompt: Prompt,
+    golden: Union["Golden", "ConversationalGolden"],
+) -> str:
+    """
+    Call a user provided model_callback in an async context.
+
+    Supports both sync and async callbacks.
+    """
+    result = model_callback(prompt, golden)
+    if inspect.isawaitable(result):
+        return await result
+    return result
+
+
+###########
+# Reports #
+###########
+
+
+def build_prompt_config_snapshots(
+    prompt_configurations_by_id: Dict[
+        PromptConfigurationId, "PromptConfiguration"
+    ],
+) -> Dict[PromptConfigurationId, PromptConfigSnapshot]:
+    """
+    Build snapshots of all prompt configurations.
+    """
+    snapshots: Dict[PromptConfigurationId, PromptConfigSnapshot] = {}
+
+    for cfg_id, cfg in prompt_configurations_by_id.items():
+        snapshots[cfg_id] = PromptConfigSnapshot(
+            parent=cfg.parent,
+            prompts=dict(cfg.prompts),
+        )
+
+    return snapshots
+
+
+def inflate_prompts_from_report(
+    report: OptimizationReport,
+) -> Dict[str, Dict[str, Prompt]]:
+    """
+    Build a mapping from configuration id -> { module_id -> Prompt }.
+
+    This is a convenience for users who want to work with real Prompt
+    instances instead of raw snapshots.
+
+    Returns:
+        {
+            "<config_id>": {
+                "<module_id>": Prompt(...),
+                ...
+            },
+            ...
+        }
+    """
+    inflated: Dict[str, Dict[str, Prompt]] = {}
+
+    for cfg_id, cfg_snapshot in report.prompt_configurations.items():
+        module_prompts: Dict[str, Prompt] = {}
+
+        for module_id, module_snapshot in cfg_snapshot.prompts.items():
+            if module_snapshot.type == "TEXT":
+                module_prompts[module_id] = Prompt(
+                    text_template=module_snapshot.text_template or ""
+                )
+            else:  # "LIST"
+                messages = [
+                    PromptMessage(role=m.role, content=m.content)
+                    for m in module_snapshot.messages or []
+                ]
+                module_prompts[module_id] = Prompt(messages_template=messages)
+
+        inflated[cfg_id] = module_prompts
+
+    return inflated
+
+
+def get_best_prompts_from_report(
+    report: OptimizationReport,
+) -> Dict[str, Prompt]:
+    """
+    Convenience wrapper returning the best configuration's module prompts.
+    """
+    all_prompts = inflate_prompts_from_report(report)
+    return all_prompts.get(report.best_id, {})
+
+
+##############
+# Validation #
+##############
+def _format_type_names(types: Tuple[type, ...]) -> str:
+    names = [t.__name__ for t in types]
+    if len(names) == 1:
+        return names[0]
+    if len(names) == 2:
+        return f"{names[0]} or {names[1]}"
+    return ", ".join(names[:-1]) + f", or {names[-1]}"
+
+
+def validate_instance(
+    *,
+    component: str,
+    param_name: str,
+    value: Any,
+    expected_types: Union[type, Tuple[type, ...]],
+    allow_none: bool = False,
+) -> Any:
+    """
+    Generic type validator.
+
+    - component: Intended to help identify what is being validated.
+        e.g. "PromptOptimizer.__init__", "PromptOptimizer.optimize", etc.
+    - param_name: the name of the parameter being validated
+    - value: the actual value passed.
+    - expected_types: a type or tuple of types to accept.
+    - allow_none: if True, None is allowed and returned as-is.
+    """
+    if value is None and allow_none:
+        return value
+
+    if not isinstance(expected_types, tuple):
+        expected_types = (expected_types,)
+
+    if not isinstance(value, expected_types):
+        expected_desc = _format_type_names(expected_types)
+        raise DeepEvalError(
+            f"{component} expected `{param_name}` to be an instance of "
+            f"{expected_desc}, but received {type(value).__name__!r} instead."
+        )
+    return value
+
+
+def validate_sequence_of(
+    *,
+    component: str,
+    param_name: str,
+    value: Any,
+    expected_item_types: Union[type, Tuple[type, ...]],
+    sequence_types: Tuple[type, ...] = (list, tuple),
+    allow_none: bool = False,
+) -> Any:
+    """
+    Generic container validator.
+
+    - Ensures `value` is one of `sequence_types` (list by default).
+    - Ensures each item is an instance of `expected_item_types`.
+
+    Returns the original `value` on success.
+    """
+    if value is None:
+        if allow_none:
+            return value
+        raise DeepEvalError(
+            f"{component} expected `{param_name}` to be a "
+            f"{_format_type_names(sequence_types)} of "
+            f"{_format_type_names(expected_item_types if isinstance(expected_item_types, tuple) else (expected_item_types,))}, "
+            "but received None instead."
+        )
+
+    if not isinstance(sequence_types, tuple):
+        sequence_types = (sequence_types,)
+
+    if not isinstance(value, sequence_types):
+        expected_seq = _format_type_names(sequence_types)
+        raise DeepEvalError(
+            f"{component} expected `{param_name}` to be a {expected_seq}, "
+            f"but received {type(value).__name__!r} instead."
+        )
+
+    if not isinstance(expected_item_types, tuple):
+        expected_item_types = (expected_item_types,)
+
+    for index, item in enumerate(value):
+        if not isinstance(item, expected_item_types):
+            expected_items = _format_type_names(expected_item_types)
+            raise DeepEvalError(
+                f"{component} expected all elements of `{param_name}` to be "
+                f"instances of {expected_items}, but element at index {index} "
+                f"has type {type(item).__name__!r}."
+            )
+
+    return value
+
+
+def validate_callback(
+    *,
+    component: str,
+    model_callback: Optional[ModelCallback],
+) -> ModelCallback:
+    """
+    Ensure that `model_callback` is provided.
+
+    - `model_callback` should be a callable that performs generation and
+      returns the model output.
+
+    Returns `model_callback` unchanged on success.
+    """
+    if model_callback is None:
+        raise DeepEvalError(
+            f"{component} requires a `model_callback`.\n\n"
+            "supply a custom callable via `model_callback=` that performs "
+            "generation and returns the model output."
+        )
+    return model_callback
+
+
+def validate_metrics(
+    *,
+    component: str,
+    metrics: Union[List[BaseMetric], List[BaseConversationalMetric]],
+) -> Union[List[BaseMetric], List[BaseConversationalMetric]]:
+
+    if metrics is None or not len(metrics):
+        raise DeepEvalError(
+            f"{component} requires a `metrics`.\n\n"
+            "supply one or more DeepEval metrics via `metrics=`"
+        )
+
+    validate_sequence_of(
+        component=component,
+        param_name="metrics",
+        value=metrics,
+        expected_item_types=(BaseMetric, BaseConversationalMetric),
+        sequence_types=(list, tuple),
+    )
+    return list(metrics)
+
+
+def validate_int_in_range(
+    *,
+    component: str,
+    param_name: str,
+    value: int,
+    min_inclusive: Optional[int] = None,
+    max_exclusive: Optional[int] = None,
+) -> int:
+    """
+    Validate that an int is within range [min_inclusive, max_exclusive).
+
+    - If `min_inclusive` is not None, value must be >= min_inclusive.
+    - If `max_exclusive` is not None, value must be < max_exclusive.
+
+    Returns the validated int on success.
+    """
+    value = validate_instance(
+        component=component,
+        param_name=param_name,
+        value=value,
+        expected_types=int,
+    )
+
+    # Lower bound check
+    if min_inclusive is not None and value < min_inclusive:
+        if max_exclusive is None:
+            raise DeepEvalError(
+                f"{component} expected `{param_name}` to be >= {min_inclusive}, "
+                f"but received {value!r} instead."
+            )
+        max_inclusive = max_exclusive - 1
+        raise DeepEvalError(
+            f"{component} expected `{param_name}` to be between "
+            f"{min_inclusive} and {max_inclusive} (inclusive), "
+            f"but received {value!r} instead."
+        )
+
+    # Upper bound check (half-open, < max_exclusive)
+    if max_exclusive is not None and value >= max_exclusive:
+        if min_inclusive is None:
+            raise DeepEvalError(
+                f"{component} expected `{param_name}` to be < {max_exclusive}, "
+                f"but received {value!r} instead."
+            )
+        max_inclusive = max_exclusive - 1
+        raise DeepEvalError(
+            f"{component} expected `{param_name}` to be between "
+            f"{min_inclusive} and {max_inclusive} (inclusive), "
+            f"but received {value!r} instead."
+        )
+
+    return value
+
+
+##############
+# Aggregates #
+##############
+
+
+class Aggregator(Protocol):
+    def __call__(self, scores: Sequence[float]) -> float: ...
+
+
+def mean_of_all(scores: Sequence[float]) -> float:
+    return statistics.fmean(scores) if scores else 0.0
+
+
+def median_of_all(scores: Sequence[float]) -> float:
+    return statistics.median(scores) if scores else 0.0
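The sketch below (not part of the released diff) shows how `split_goldens` divides a dataset, assuming the helper is importable from `deepeval.optimizer.utils` as the file list suggests.

```python
# Minimal sketch, not part of the released diff. The import path of
# split_goldens is assumed from the file list (deepeval/optimizer/utils.py).
import random

from deepeval.dataset.golden import Golden
from deepeval.optimizer.utils import split_goldens

goldens = [Golden(input=f"question {i}") for i in range(10)]

# A shared, seeded RNG makes the split deterministic across the run.
rng = random.Random(42)
d_feedback, d_pareto = split_goldens(goldens, pareto_size=3, random_state=rng)

# 3 goldens land in the Pareto validation set, the remaining 7 in the
# feedback set, each preserving the original ordering.
assert len(d_pareto) == 3 and len(d_feedback) == 7
```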
deepeval/prompt/prompt.py CHANGED
@@ -7,9 +7,6 @@ from enum import Enum
 from typing import Optional, List, Dict, Type, Literal
 from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn
 from rich.console import Console
-import time
-import json
-import os
 from pydantic import BaseModel, ValidationError
 import asyncio
 import threading
@@ -37,7 +34,6 @@ from deepeval.prompt.utils import (
 from deepeval.confident.api import Api, Endpoints, HttpMethods
 from deepeval.constants import HIDDEN_DIR
 
-
 logger = logging.getLogger(__name__)
 
 portalocker = None
@@ -117,6 +113,7 @@ class Prompt:
         model_settings: Optional[ModelSettings] = None,
         output_type: Optional[OutputType] = None,
         output_schema: Optional[Type[BaseModel]] = None,
+        interpolation_type: Optional[PromptInterpolationType] = None,
     ):
         if text_template and messages_template:
             raise TypeError(
@@ -129,7 +126,9 @@ class Prompt:
         self.output_type: Optional[OutputType] = output_type
         self.output_schema: Optional[Type[BaseModel]] = output_schema
         self.label: Optional[str] = None
-        self.interpolation_type:
+        self.interpolation_type: PromptInterpolationType = (
+            interpolation_type or PromptInterpolationType.FSTRING
+        )
 
         self._version = None
         self._prompt_version_id: Optional[str] = None
@@ -178,7 +177,7 @@ class Prompt:
             content = f.read()
             try:
                 data = json.loads(content)
-            except (json.JSONDecodeError
+            except (TypeError, json.JSONDecodeError):
                 self.text_template = content
                 return content
 
@@ -364,6 +363,8 @@ class Prompt:
                     f.seek(0)
                     f.truncate()
                     json.dump(cache_data, f, cls=CustomEncoder)
+                    f.flush()
+                    os.fsync(f.fileno())
         except portalocker.exceptions.LockException:
             # If we can't acquire the lock, silently skip caching
             pass
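A sketch of the new `interpolation_type` keyword follows (not part of the released diff); the import location of `PromptInterpolationType` is an assumption, since the hunks above only show it referenced inside `prompt.py`.

```python
# Minimal sketch, not part of the released diff. The import path of
# PromptInterpolationType is assumed; only FSTRING is used because that is
# the default named in the hunk above.
from deepeval.prompt.prompt import Prompt
from deepeval.prompt.api import PromptInterpolationType  # assumed import path

# Omitting the new argument falls back to FSTRING interpolation.
prompt = Prompt(text_template="Answer concisely: {question}")
assert prompt.interpolation_type == PromptInterpolationType.FSTRING

# The interpolation type can also be set explicitly at construction time.
explicit = Prompt(
    text_template="Answer concisely: {question}",
    interpolation_type=PromptInterpolationType.FSTRING,
)
```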
deepeval/test_case/__init__.py
CHANGED
|
@@ -3,13 +3,13 @@ from .llm_test_case import (
|
|
|
3
3
|
LLMTestCaseParams,
|
|
4
4
|
ToolCall,
|
|
5
5
|
ToolCallParams,
|
|
6
|
+
MLLMImage,
|
|
6
7
|
)
|
|
7
8
|
from .conversational_test_case import (
|
|
8
9
|
ConversationalTestCase,
|
|
9
10
|
Turn,
|
|
10
11
|
TurnParams,
|
|
11
12
|
)
|
|
12
|
-
from .mllm_test_case import MLLMTestCase, MLLMTestCaseParams, MLLMImage
|
|
13
13
|
from .arena_test_case import ArenaTestCase, Contestant
|
|
14
14
|
from .mcp import (
|
|
15
15
|
MCPServer,
|
|
@@ -31,8 +31,6 @@ __all__ = [
|
|
|
31
31
|
"MCPPromptCall",
|
|
32
32
|
"MCPResourceCall",
|
|
33
33
|
"MCPToolCall",
|
|
34
|
-
"MLLMTestCase",
|
|
35
|
-
"MLLMTestCaseParams",
|
|
36
34
|
"MLLMImage",
|
|
37
35
|
"ArenaTestCase",
|
|
38
36
|
"Contestant",
|