deepeval-3.7.4-py3-none-any.whl → deepeval-3.7.6-py3-none-any.whl
This diff compares two publicly released versions of the package as published to their public registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in the registry.
- deepeval/_version.py +1 -1
- deepeval/config/settings.py +35 -1
- deepeval/dataset/api.py +23 -1
- deepeval/dataset/golden.py +139 -2
- deepeval/evaluate/evaluate.py +16 -11
- deepeval/evaluate/execute.py +13 -181
- deepeval/evaluate/utils.py +6 -26
- deepeval/integrations/pydantic_ai/agent.py +19 -2
- deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
- deepeval/key_handler.py +3 -0
- deepeval/metrics/__init__.py +14 -16
- deepeval/metrics/answer_relevancy/answer_relevancy.py +118 -116
- deepeval/metrics/answer_relevancy/template.py +22 -3
- deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
- deepeval/metrics/arena_g_eval/template.py +17 -1
- deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
- deepeval/metrics/argument_correctness/template.py +19 -2
- deepeval/metrics/base_metric.py +13 -44
- deepeval/metrics/bias/bias.py +102 -108
- deepeval/metrics/bias/template.py +14 -2
- deepeval/metrics/contextual_precision/contextual_precision.py +96 -94
- deepeval/metrics/contextual_precision/template.py +115 -66
- deepeval/metrics/contextual_recall/contextual_recall.py +94 -84
- deepeval/metrics/contextual_recall/template.py +106 -55
- deepeval/metrics/contextual_relevancy/contextual_relevancy.py +86 -84
- deepeval/metrics/contextual_relevancy/template.py +87 -58
- deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
- deepeval/metrics/conversation_completeness/template.py +23 -3
- deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
- deepeval/metrics/conversational_dag/nodes.py +66 -123
- deepeval/metrics/conversational_dag/templates.py +16 -0
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
- deepeval/metrics/dag/dag.py +10 -0
- deepeval/metrics/dag/nodes.py +63 -126
- deepeval/metrics/dag/templates.py +16 -2
- deepeval/metrics/exact_match/exact_match.py +9 -1
- deepeval/metrics/faithfulness/faithfulness.py +138 -149
- deepeval/metrics/faithfulness/schema.py +1 -1
- deepeval/metrics/faithfulness/template.py +200 -115
- deepeval/metrics/g_eval/g_eval.py +87 -78
- deepeval/metrics/g_eval/template.py +18 -1
- deepeval/metrics/g_eval/utils.py +7 -6
- deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
- deepeval/metrics/goal_accuracy/template.py +21 -3
- deepeval/metrics/hallucination/hallucination.py +60 -75
- deepeval/metrics/hallucination/template.py +13 -0
- deepeval/metrics/indicator.py +7 -10
- deepeval/metrics/json_correctness/json_correctness.py +40 -38
- deepeval/metrics/json_correctness/template.py +10 -0
- deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
- deepeval/metrics/knowledge_retention/schema.py +9 -3
- deepeval/metrics/knowledge_retention/template.py +12 -0
- deepeval/metrics/mcp/mcp_task_completion.py +68 -38
- deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +92 -74
- deepeval/metrics/mcp/template.py +52 -0
- deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
- deepeval/metrics/mcp_use_metric/template.py +12 -0
- deepeval/metrics/misuse/misuse.py +77 -97
- deepeval/metrics/misuse/template.py +15 -0
- deepeval/metrics/multimodal_metrics/__init__.py +0 -19
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +59 -53
- deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +79 -95
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +59 -53
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +59 -53
- deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +111 -109
- deepeval/metrics/non_advice/non_advice.py +79 -105
- deepeval/metrics/non_advice/template.py +12 -0
- deepeval/metrics/pattern_match/pattern_match.py +12 -4
- deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
- deepeval/metrics/pii_leakage/template.py +14 -0
- deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
- deepeval/metrics/plan_adherence/template.py +11 -0
- deepeval/metrics/plan_quality/plan_quality.py +63 -87
- deepeval/metrics/plan_quality/template.py +9 -0
- deepeval/metrics/prompt_alignment/prompt_alignment.py +72 -83
- deepeval/metrics/prompt_alignment/template.py +12 -0
- deepeval/metrics/ragas.py +3 -3
- deepeval/metrics/role_adherence/role_adherence.py +48 -71
- deepeval/metrics/role_adherence/template.py +14 -0
- deepeval/metrics/role_violation/role_violation.py +75 -108
- deepeval/metrics/role_violation/template.py +12 -0
- deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
- deepeval/metrics/step_efficiency/template.py +11 -0
- deepeval/metrics/summarization/summarization.py +115 -183
- deepeval/metrics/summarization/template.py +19 -0
- deepeval/metrics/task_completion/task_completion.py +67 -73
- deepeval/metrics/tool_correctness/tool_correctness.py +45 -44
- deepeval/metrics/tool_use/tool_use.py +42 -66
- deepeval/metrics/topic_adherence/template.py +13 -0
- deepeval/metrics/topic_adherence/topic_adherence.py +53 -67
- deepeval/metrics/toxicity/template.py +13 -0
- deepeval/metrics/toxicity/toxicity.py +80 -99
- deepeval/metrics/turn_contextual_precision/schema.py +21 -0
- deepeval/metrics/turn_contextual_precision/template.py +187 -0
- deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +592 -0
- deepeval/metrics/turn_contextual_recall/schema.py +21 -0
- deepeval/metrics/turn_contextual_recall/template.py +178 -0
- deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +563 -0
- deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
- deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
- deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +576 -0
- deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
- deepeval/metrics/turn_faithfulness/template.py +218 -0
- deepeval/metrics/turn_faithfulness/turn_faithfulness.py +627 -0
- deepeval/metrics/turn_relevancy/template.py +14 -0
- deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
- deepeval/metrics/utils.py +158 -122
- deepeval/models/__init__.py +0 -12
- deepeval/models/base_model.py +49 -33
- deepeval/models/embedding_models/__init__.py +7 -0
- deepeval/models/embedding_models/azure_embedding_model.py +79 -33
- deepeval/models/embedding_models/local_embedding_model.py +39 -20
- deepeval/models/embedding_models/ollama_embedding_model.py +52 -19
- deepeval/models/embedding_models/openai_embedding_model.py +42 -22
- deepeval/models/llms/amazon_bedrock_model.py +226 -72
- deepeval/models/llms/anthropic_model.py +178 -63
- deepeval/models/llms/azure_model.py +218 -60
- deepeval/models/llms/constants.py +2032 -0
- deepeval/models/llms/deepseek_model.py +95 -40
- deepeval/models/llms/gemini_model.py +209 -64
- deepeval/models/llms/grok_model.py +139 -68
- deepeval/models/llms/kimi_model.py +140 -90
- deepeval/models/llms/litellm_model.py +131 -37
- deepeval/models/llms/local_model.py +125 -21
- deepeval/models/llms/ollama_model.py +147 -24
- deepeval/models/llms/openai_model.py +222 -269
- deepeval/models/llms/portkey_model.py +81 -22
- deepeval/models/llms/utils.py +8 -3
- deepeval/models/retry_policy.py +17 -14
- deepeval/models/utils.py +106 -5
- deepeval/optimizer/__init__.py +5 -0
- deepeval/optimizer/algorithms/__init__.py +6 -0
- deepeval/optimizer/algorithms/base.py +29 -0
- deepeval/optimizer/algorithms/configs.py +18 -0
- deepeval/optimizer/algorithms/copro/__init__.py +5 -0
- deepeval/{optimization/copro/loop.py → optimizer/algorithms/copro/copro.py} +112 -113
- deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
- deepeval/{optimization/gepa/loop.py → optimizer/algorithms/gepa/gepa.py} +175 -115
- deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
- deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
- deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
- deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
- deepeval/optimizer/algorithms/simba/__init__.py +5 -0
- deepeval/{optimization/simba/loop.py → optimizer/algorithms/simba/simba.py} +128 -112
- deepeval/{optimization → optimizer}/configs.py +5 -8
- deepeval/{optimization/policies/selection.py → optimizer/policies.py} +63 -2
- deepeval/optimizer/prompt_optimizer.py +263 -0
- deepeval/optimizer/rewriter/__init__.py +5 -0
- deepeval/optimizer/rewriter/rewriter.py +124 -0
- deepeval/optimizer/rewriter/utils.py +214 -0
- deepeval/optimizer/scorer/__init__.py +5 -0
- deepeval/optimizer/scorer/base.py +86 -0
- deepeval/optimizer/scorer/scorer.py +316 -0
- deepeval/optimizer/scorer/utils.py +30 -0
- deepeval/optimizer/types.py +148 -0
- deepeval/{optimization → optimizer}/utils.py +47 -165
- deepeval/prompt/prompt.py +5 -9
- deepeval/simulator/conversation_simulator.py +43 -0
- deepeval/simulator/template.py +13 -0
- deepeval/test_case/__init__.py +1 -3
- deepeval/test_case/api.py +26 -45
- deepeval/test_case/arena_test_case.py +7 -2
- deepeval/test_case/conversational_test_case.py +68 -1
- deepeval/test_case/llm_test_case.py +206 -1
- deepeval/test_case/utils.py +4 -8
- deepeval/test_run/api.py +18 -14
- deepeval/test_run/test_run.py +3 -3
- deepeval/tracing/patchers.py +9 -4
- deepeval/tracing/tracing.py +2 -2
- deepeval/utils.py +65 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/METADATA +1 -4
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/RECORD +180 -193
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -148
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
- deepeval/models/mlllms/__init__.py +0 -4
- deepeval/models/mlllms/azure_model.py +0 -343
- deepeval/models/mlllms/gemini_model.py +0 -313
- deepeval/models/mlllms/ollama_model.py +0 -175
- deepeval/models/mlllms/openai_model.py +0 -309
- deepeval/optimization/__init__.py +0 -13
- deepeval/optimization/adapters/__init__.py +0 -2
- deepeval/optimization/adapters/deepeval_scoring_adapter.py +0 -588
- deepeval/optimization/aggregates.py +0 -14
- deepeval/optimization/copro/configs.py +0 -31
- deepeval/optimization/gepa/__init__.py +0 -7
- deepeval/optimization/gepa/configs.py +0 -115
- deepeval/optimization/miprov2/configs.py +0 -134
- deepeval/optimization/miprov2/loop.py +0 -785
- deepeval/optimization/mutations/__init__.py +0 -0
- deepeval/optimization/mutations/prompt_rewriter.py +0 -458
- deepeval/optimization/policies/__init__.py +0 -16
- deepeval/optimization/policies/tie_breaker.py +0 -67
- deepeval/optimization/prompt_optimizer.py +0 -462
- deepeval/optimization/simba/__init__.py +0 -0
- deepeval/optimization/simba/configs.py +0 -33
- deepeval/optimization/types.py +0 -361
- deepeval/test_case/mllm_test_case.py +0 -170
- /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
- /deepeval/{optimization → optimizer/algorithms}/simba/types.py +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/LICENSE.md +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/WHEEL +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/entry_points.txt +0 -0
deepeval/optimizer/scorer/scorer.py (new file)
@@ -0,0 +1,316 @@
+from __future__ import annotations
+import asyncio
+import copy
+from typing import (
+    Callable,
+    Dict,
+    List,
+    Optional,
+    Union,
+)
+
+from deepeval.dataset.golden import Golden, ConversationalGolden
+from deepeval.dataset.utils import (
+    convert_goldens_to_test_cases,
+    convert_convo_goldens_to_convo_test_cases,
+)
+from deepeval.errors import DeepEvalError
+from deepeval.metrics import (
+    BaseMetric,
+    BaseConversationalMetric,
+)
+from deepeval.metrics.utils import copy_metrics
+from deepeval.test_case import (
+    LLMTestCase,
+    ConversationalTestCase,
+    Turn,
+)
+from deepeval.prompt.prompt import Prompt
+
+from deepeval.optimizer.types import (
+    ModelCallback,
+    PromptConfiguration,
+    Objective,
+    MeanObjective,
+    ModuleId,
+)
+from deepeval.optimizer.scorer.base import BaseScorer
+from deepeval.optimizer.utils import (
+    validate_callback,
+    validate_metrics,
+    invoke_model_callback,
+    a_invoke_model_callback,
+)
+from deepeval.optimizer.scorer.utils import (
+    _measure_no_indicator,
+    _a_measure_no_indicator,
+)
+
+
+class Scorer(BaseScorer):
+    """
+    Scores prompts by running model_callback, building test cases,
+    running metrics, and aggregating scores.
+    """
+
+    DEFAULT_MODULE_ID: ModuleId = "__module__"
+
+    def __init__(
+        self,
+        model_callback: ModelCallback,
+        metrics: Union[List[BaseMetric], List[BaseConversationalMetric]],
+        max_concurrent: int,
+        throttle_seconds: float,
+        objective_scalar: Objective = MeanObjective(),
+    ):
+        self.model_callback = validate_callback(
+            component="Scorer",
+            model_callback=model_callback,
+        )
+        self.metrics = validate_metrics(component="Scorer", metrics=metrics)
+        self.objective_scalar = objective_scalar
+        self._semaphore = asyncio.Semaphore(max_concurrent)
+        self._throttle = float(throttle_seconds)
+
+    ########################
+    # generation & scoring #
+    ########################
+
+    def generate(
+        self,
+        prompts_by_module: Dict[ModuleId, Prompt],
+        golden: Union[Golden, ConversationalGolden],
+    ) -> str:
+        module_id = self._select_module_id_from_prompts(prompts_by_module)
+        prompt = prompts_by_module.get(module_id) or next(
+            iter(prompts_by_module.values())
+        )
+
+        return invoke_model_callback(
+            model_callback=self.model_callback,
+            prompt=prompt,
+            golden=golden,
+        )
+
+    async def a_generate(
+        self,
+        prompts_by_module: Dict[ModuleId, Prompt],
+        golden: Union[Golden, ConversationalGolden],
+    ) -> str:
+        module_id = self._select_module_id_from_prompts(prompts_by_module)
+        prompt = prompts_by_module.get(module_id) or next(
+            iter(prompts_by_module.values())
+        )
+
+        return await a_invoke_model_callback(
+            model_callback=self.model_callback,
+            prompt=prompt,
+            golden=golden,
+        )
+
+    def score_pareto(
+        self,
+        prompt_configuration: PromptConfiguration,
+        d_pareto: Union[List[Golden], List[ConversationalGolden]],
+    ) -> List[float]:
+        return [
+            self._score_one(prompt_configuration, golden) for golden in d_pareto
+        ]
+
+    def score_minibatch(
+        self,
+        prompt_configuration: PromptConfiguration,
+        minibatch: Union[List[Golden], List[ConversationalGolden]],
+    ) -> float:
+        if not minibatch:
+            return 0.0
+
+        scores = [
+            self._score_one(prompt_configuration, golden)
+            for golden in minibatch
+        ]
+        return sum(scores) / len(scores)
+
+    def get_minibatch_feedback(
+        self,
+        prompt_configuration: PromptConfiguration,
+        module: ModuleId,
+        minibatch: Union[List[Golden], List[ConversationalGolden]],
+    ) -> str:
+        # default metric feedback (μ_f): concat metric.reason across minibatch and cap length
+        reasons: List[str] = []
+        for golden in minibatch:
+            actual = self.generate(prompt_configuration.prompts, golden)
+            test_case = self._golden_to_test_case(golden, actual)
+            for metric in copy_metrics(self.metrics):
+                _measure_no_indicator(metric=metric, test_case=test_case)
+                if metric.reason:
+                    reasons.append(str(metric.reason))
+        if not reasons:
+            return ""
+        unique: List[str] = []
+        seen = set()
+        for reason in reasons:
+            if reason not in seen:
+                unique.append(reason)
+                seen.add(reason)
+        return "\n---\n".join(
+            unique[:8]
+        )  # TODO: Make how much feedback configurable
+
+    async def a_score_pareto(
+        self,
+        prompt_configuration: PromptConfiguration,
+        d_pareto: Union[List[Golden], List[ConversationalGolden]],
+    ) -> List[float]:
+        tasks = [
+            self._bounded(self._a_score_one(prompt_configuration, golden))
+            for golden in d_pareto
+        ]
+        return await asyncio.gather(*tasks)
+
+    async def a_score_minibatch(
+        self,
+        prompt_configuration: PromptConfiguration,
+        minibatch: Union[List[Golden], List[ConversationalGolden]],
+    ) -> float:
+        tasks = [
+            self._bounded(self._a_score_one(prompt_configuration, golden))
+            for golden in minibatch
+        ]
+        scores = await asyncio.gather(*tasks)
+        return sum(scores) / len(scores) if scores else 0.0
+
+    async def a_get_minibatch_feedback(
+        self,
+        prompt_configuration: PromptConfiguration,
+        module: ModuleId,
+        minibatch: Union[List[Golden], List[ConversationalGolden]],
+    ) -> str:
+        async def reasons_one(golden) -> List[str]:
+            # Clone per task to avoid shared state
+            metrics = copy_metrics(self.metrics)
+            # metrics = self.metrics
+            actual = await self.a_generate(prompt_configuration.prompts, golden)
+            test_case = self._golden_to_test_case(golden, actual)
+            out: List[str] = []
+            for metric in metrics:
+                await _a_measure_no_indicator(metric, test_case)
+                if metric.reason:
+                    out.append(str(metric.reason))
+            return out
+
+        tasks = [self._bounded(reasons_one(golden)) for golden in minibatch]
+        nested = await asyncio.gather(*tasks)
+        reasons: List[str] = [reason for sub in nested for reason in sub]
+        if not reasons:
+            return ""
+        unique: List[str] = []
+        seen = set()
+        for reason in reasons:
+            if reason not in seen:
+                unique.append(reason)
+                seen.add(reason)
+        return "\n---\n".join(unique[:8])
+
+    ###################
+    # scoring helpers #
+    ###################
+
+    def _golden_to_test_case(
+        self,
+        golden: Union[Golden, ConversationalGolden],
+        actual: str,
+    ) -> Union[LLMTestCase, ConversationalTestCase]:
+        """Convert a golden + actual output into a test case for metrics."""
+        if isinstance(golden, Golden):
+            golden.actual_output = actual
+            return convert_goldens_to_test_cases([golden])[0]
+
+        if isinstance(golden, ConversationalGolden):
+            # Build turns with actual output as assistant response
+            turns: List[Turn] = list(golden.turns or [])
+            if turns and turns[-1].role == "assistant":
+                turns[-1] = Turn(role="assistant", content=actual)
+            elif turns:
+                turns.append(Turn(role="assistant", content=actual))
+            else:
+                turns = [
+                    Turn(role="assistant", content=actual),
+                ]
+
+            golden.turns = turns
+            return convert_convo_goldens_to_convo_test_cases([golden])[0]
+
+    async def _bounded(self, coro):
+        if self._semaphore is None:
+            return await coro
+        async with self._semaphore:
+            res = await coro
+            if self._throttle:
+                await asyncio.sleep(self._throttle)
+            return res
+
+    async def _a_score_one(
+        self,
+        prompt_configuration: PromptConfiguration,
+        golden: Union[Golden, ConversationalGolden],
+    ) -> float:
+        # Clone metrics to avoid shared-state
+        metrics = copy_metrics(self.metrics)
+        actual = await self.a_generate(prompt_configuration.prompts, golden)
+        test_case = self._golden_to_test_case(golden, actual)
+
+        per_metric: Dict[str, float] = {}
+        for metric in metrics:
+            score = await _a_measure_no_indicator(metric, test_case)
+            per_metric[metric.__class__.__name__] = float(score)
+        return self.objective_scalar.scalarize(per_metric)
+
+    def _score_one(
+        self,
+        prompt_configuration: PromptConfiguration,
+        golden: Union[Golden, ConversationalGolden],
+    ) -> float:
+        metrics = copy_metrics(self.metrics)
+        actual = self.generate(prompt_configuration.prompts, golden)
+        test_case = self._golden_to_test_case(golden, actual)
+
+        per_metric: Dict[str, float] = {}
+        for metric in metrics:
+            score = _measure_no_indicator(metric, test_case)
+            per_metric[metric.__class__.__name__] = float(score)
+        return self.objective_scalar.scalarize(per_metric)
+
+    def _select_module_id_from_prompts(
+        self, prompts_by_module: Dict[ModuleId, Prompt]
+    ) -> ModuleId:
+        """
+        Default module selection strategy:
+
+        - Prefer the synthetic '__module__' key when present
+        - Otherwise fall back to the first key in prompts_by_module.
+
+        Assumes `prompts_by_module` is non-empty; callers should validate that.
+        """
+        if self.DEFAULT_MODULE_ID in prompts_by_module:
+            return self.DEFAULT_MODULE_ID
+
+        # At this point we expect at least one key.
+        try:
+            return next(iter(prompts_by_module.keys()))
+        except StopIteration:
+            raise DeepEvalError(
+                "Scorer._select_module_id_from_prompts(...) "
+                "received an empty `prompts_by_module`. At least one Prompt is required."
+            )
+
+    def select_module(
+        self, prompt_configuration: PromptConfiguration
+    ) -> ModuleId:
+        return self._select_module_id_from_prompts(prompt_configuration.prompts)
+
+    async def a_select_module(
+        self, prompt_configuration: PromptConfiguration
+    ) -> ModuleId:
+        return self.select_module(prompt_configuration)
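
For context, a minimal sketch (not part of this diff) of how the new Scorer might be constructed, assuming deepeval 3.7.6 is installed. The callback shape follows the ModelCallback alias added in deepeval/optimizer/types.py (last hunk below); the canned response and metric choice are illustrative only, and the exact checks performed by validate_callback are not shown in this diff. Running the scoring methods additionally requires a configured evaluation model.

from deepeval.dataset.golden import Golden
from deepeval.metrics import AnswerRelevancyMetric
from deepeval.prompt.prompt import Prompt
from deepeval.optimizer.scorer.scorer import Scorer

def model_callback(prompt: Prompt, golden: Golden) -> str:
    # Placeholder: a real callback would run the application's LLM pipeline
    # with `prompt` and the golden's input, then return the model's answer.
    return f"Answer to: {golden.input}"

scorer = Scorer(
    model_callback=model_callback,
    metrics=[AnswerRelevancyMetric()],
    max_concurrent=5,
    throttle_seconds=0.0,
)
# scorer.score_minibatch(...) / scorer.a_score_minibatch(...) then run the
# metrics over generated outputs and reduce scores with MeanObjective by default.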
deepeval/optimizer/scorer/utils.py (new file)
@@ -0,0 +1,30 @@
+import inspect
+from typing import Callable, Union
+
+from deepeval.metrics import BaseConversationalMetric, BaseMetric
+from deepeval.test_case import ConversationalTestCase, LLMTestCase
+
+
+def _build_measure_kwargs(func: Callable) -> dict:
+    params = inspect.signature(func).parameters
+    kwargs = {}
+    for key in ("_show_indicator", "_in_component", "_log_metric_to_confident"):
+        if key in params:
+            kwargs[key] = False
+    return kwargs
+
+
+def _measure_no_indicator(
+    metric: Union[BaseMetric, BaseConversationalMetric],
+    test_case: Union[LLMTestCase, ConversationalTestCase],
+):
+    kwargs = _build_measure_kwargs(metric.measure)
+    return metric.measure(test_case, **kwargs)
+
+
+async def _a_measure_no_indicator(
+    metric: Union[BaseMetric, BaseConversationalMetric],
+    test_case: Union[LLMTestCase, ConversationalTestCase],
+):
+    kwargs = _build_measure_kwargs(metric.a_measure)
+    return await metric.a_measure(test_case, **kwargs)
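
A small illustration (not part of the diff) of what _build_measure_kwargs does: it inspects a measure callable's signature and disables only the private flags that the signature actually accepts. fake_measure below is a stand-in, not a real deepeval metric.

from deepeval.optimizer.scorer.utils import _build_measure_kwargs

def fake_measure(test_case, _show_indicator: bool = True) -> float:
    # Only one of the three recognized flags appears in this signature.
    return 1.0

print(_build_measure_kwargs(fake_measure))
# {'_show_indicator': False}; flags missing from the signature are simply omitted.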
deepeval/optimizer/types.py (new file)
@@ -0,0 +1,148 @@
+from __future__ import annotations
+import uuid
+from abc import ABC, abstractmethod
+
+from dataclasses import dataclass
+from typing import (
+    Callable,
+    Dict,
+    List,
+    Optional,
+    TypedDict,
+    TYPE_CHECKING,
+    Union,
+)
+from enum import Enum
+from pydantic import BaseModel, ConfigDict
+
+from deepeval.prompt.prompt import Prompt
+
+if TYPE_CHECKING:
+    from deepeval.dataset.golden import Golden, ConversationalGolden
+
+PromptConfigurationId = str
+ModuleId = str
+ScoreVector = List[float]  # scores per instance on D_pareto, aligned order
+ScoreTable = Dict[PromptConfigurationId, ScoreVector]
+
+# Type alias for model callback function
+ModelCallback = Callable[[Prompt, Union["Golden", "ConversationalGolden"]], str]
+
+
+@dataclass
+class PromptConfiguration:
+    id: PromptConfigurationId
+    parent: Optional[PromptConfigurationId]
+    prompts: Dict[ModuleId, Prompt]
+
+    @staticmethod
+    def new(
+        prompts: Dict[ModuleId, Prompt],
+        parent: Optional[PromptConfigurationId] = None,
+    ) -> "PromptConfiguration":
+        return PromptConfiguration(
+            id=str(uuid.uuid4()), parent=parent, prompts=dict(prompts)
+        )
+
+
+class RunnerStatusType(str, Enum):
+    """Status events emitted by optimization runners."""
+
+    PROGRESS = "progress"
+    TIE = "tie"
+    ERROR = "error"
+
+
+# Type alias for status callback function
+RunnerStatusCallback = Callable[..., None]
+
+
+class Objective(ABC):
+    """Strategy for reducing scores per-metric to a single scalar value.
+
+    Implementations receive a mapping from metric name to score
+    (for example, {"AnswerRelevancyMetric": 0.82}) and return a
+    single float used for comparisons inside the optimizer.
+    """
+
+    @abstractmethod
+    def scalarize(self, scores_by_metric: Dict[str, float]) -> float:
+        raise NotImplementedError
+
+
+class MeanObjective(Objective):
+    """Default scalarizer: unweighted arithmetic mean.
+
+    - If `scores_by_metric` is non-empty, returns the arithmetic
+      mean of all metric scores.
+    - If `scores_by_metric` is empty, returns 0.0.
+    """
+
+    def scalarize(self, scores_by_metric: Dict[str, float]) -> float:
+        if not scores_by_metric:
+            return 0.0
+        return sum(scores_by_metric.values()) / len(scores_by_metric)
+
+
+class WeightedObjective(Objective):
+    """
+    Objective that scales each metric's score by a user-provided weight and sums them.
+
+    - `weights_by_metric` keys should match the names of the metrics passed to the
+      metric class names passed to the PromptOptimizer.
+    - Metrics not present in `weights_by_metric` receive `default_weight`.
+      This makes it easy to emphasize a subset of metrics while keeping
+      everything else at a baseline weight of 1.0, e.g.:
+
+        WeightedObjective({"AnswerRelevancyMetric": 2.0})
+
+    which treats AnswerRelevancy as 2x as important as the other metrics.
+    """
+
+    def __init__(
+        self,
+        weights_by_metric: Optional[Dict[str, float]] = None,
+        default_weight: float = 1.0,
+    ):
+        self.weights_by_metric: Dict[str, float] = dict(weights_by_metric or {})
+        self.default_weight: float = float(default_weight)
+
+    def scalarize(self, scores_by_metric: Dict[str, float]) -> float:
+        return sum(
+            self.weights_by_metric.get(name, self.default_weight) * score
+            for name, score in scores_by_metric.items()
+        )
+
+
+class AcceptedIterationDict(TypedDict):
+    parent: PromptConfigurationId
+    child: PromptConfigurationId
+    module: ModuleId
+    before: float
+    after: float
+
+
+class AcceptedIteration(BaseModel):
+    parent: str
+    child: str
+    module: str
+    before: float
+    after: float
+
+
+class PromptConfigSnapshot(BaseModel):
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+
+    parent: Optional[str]
+    prompts: Dict[str, Prompt]
+
+
+class OptimizationReport(BaseModel):
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+
+    optimization_id: str
+    best_id: str
+    accepted_iterations: List[AcceptedIteration]
+    pareto_scores: Dict[str, List[float]]
+    parents: Dict[str, Optional[str]]
+    prompt_configurations: Dict[str, PromptConfigSnapshot]