deepeval 3.7.4__py3-none-any.whl → 3.7.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/config/settings.py +35 -1
- deepeval/dataset/api.py +23 -1
- deepeval/dataset/golden.py +139 -2
- deepeval/evaluate/evaluate.py +16 -11
- deepeval/evaluate/execute.py +13 -181
- deepeval/evaluate/utils.py +6 -26
- deepeval/integrations/pydantic_ai/agent.py +19 -2
- deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
- deepeval/key_handler.py +3 -0
- deepeval/metrics/__init__.py +14 -16
- deepeval/metrics/answer_relevancy/answer_relevancy.py +118 -116
- deepeval/metrics/answer_relevancy/template.py +22 -3
- deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
- deepeval/metrics/arena_g_eval/template.py +17 -1
- deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
- deepeval/metrics/argument_correctness/template.py +19 -2
- deepeval/metrics/base_metric.py +13 -44
- deepeval/metrics/bias/bias.py +102 -108
- deepeval/metrics/bias/template.py +14 -2
- deepeval/metrics/contextual_precision/contextual_precision.py +96 -94
- deepeval/metrics/contextual_precision/template.py +115 -66
- deepeval/metrics/contextual_recall/contextual_recall.py +94 -84
- deepeval/metrics/contextual_recall/template.py +106 -55
- deepeval/metrics/contextual_relevancy/contextual_relevancy.py +86 -84
- deepeval/metrics/contextual_relevancy/template.py +87 -58
- deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
- deepeval/metrics/conversation_completeness/template.py +23 -3
- deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
- deepeval/metrics/conversational_dag/nodes.py +66 -123
- deepeval/metrics/conversational_dag/templates.py +16 -0
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
- deepeval/metrics/dag/dag.py +10 -0
- deepeval/metrics/dag/nodes.py +63 -126
- deepeval/metrics/dag/templates.py +16 -2
- deepeval/metrics/exact_match/exact_match.py +9 -1
- deepeval/metrics/faithfulness/faithfulness.py +138 -149
- deepeval/metrics/faithfulness/schema.py +1 -1
- deepeval/metrics/faithfulness/template.py +200 -115
- deepeval/metrics/g_eval/g_eval.py +87 -78
- deepeval/metrics/g_eval/template.py +18 -1
- deepeval/metrics/g_eval/utils.py +7 -6
- deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
- deepeval/metrics/goal_accuracy/template.py +21 -3
- deepeval/metrics/hallucination/hallucination.py +60 -75
- deepeval/metrics/hallucination/template.py +13 -0
- deepeval/metrics/indicator.py +7 -10
- deepeval/metrics/json_correctness/json_correctness.py +40 -38
- deepeval/metrics/json_correctness/template.py +10 -0
- deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
- deepeval/metrics/knowledge_retention/schema.py +9 -3
- deepeval/metrics/knowledge_retention/template.py +12 -0
- deepeval/metrics/mcp/mcp_task_completion.py +68 -38
- deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +92 -74
- deepeval/metrics/mcp/template.py +52 -0
- deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
- deepeval/metrics/mcp_use_metric/template.py +12 -0
- deepeval/metrics/misuse/misuse.py +77 -97
- deepeval/metrics/misuse/template.py +15 -0
- deepeval/metrics/multimodal_metrics/__init__.py +0 -19
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +59 -53
- deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +79 -95
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +59 -53
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +59 -53
- deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +111 -109
- deepeval/metrics/non_advice/non_advice.py +79 -105
- deepeval/metrics/non_advice/template.py +12 -0
- deepeval/metrics/pattern_match/pattern_match.py +12 -4
- deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
- deepeval/metrics/pii_leakage/template.py +14 -0
- deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
- deepeval/metrics/plan_adherence/template.py +11 -0
- deepeval/metrics/plan_quality/plan_quality.py +63 -87
- deepeval/metrics/plan_quality/template.py +9 -0
- deepeval/metrics/prompt_alignment/prompt_alignment.py +72 -83
- deepeval/metrics/prompt_alignment/template.py +12 -0
- deepeval/metrics/ragas.py +3 -3
- deepeval/metrics/role_adherence/role_adherence.py +48 -71
- deepeval/metrics/role_adherence/template.py +14 -0
- deepeval/metrics/role_violation/role_violation.py +75 -108
- deepeval/metrics/role_violation/template.py +12 -0
- deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
- deepeval/metrics/step_efficiency/template.py +11 -0
- deepeval/metrics/summarization/summarization.py +115 -183
- deepeval/metrics/summarization/template.py +19 -0
- deepeval/metrics/task_completion/task_completion.py +67 -73
- deepeval/metrics/tool_correctness/tool_correctness.py +45 -44
- deepeval/metrics/tool_use/tool_use.py +42 -66
- deepeval/metrics/topic_adherence/template.py +13 -0
- deepeval/metrics/topic_adherence/topic_adherence.py +53 -67
- deepeval/metrics/toxicity/template.py +13 -0
- deepeval/metrics/toxicity/toxicity.py +80 -99
- deepeval/metrics/turn_contextual_precision/schema.py +21 -0
- deepeval/metrics/turn_contextual_precision/template.py +187 -0
- deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +592 -0
- deepeval/metrics/turn_contextual_recall/schema.py +21 -0
- deepeval/metrics/turn_contextual_recall/template.py +178 -0
- deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +563 -0
- deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
- deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
- deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +576 -0
- deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
- deepeval/metrics/turn_faithfulness/template.py +218 -0
- deepeval/metrics/turn_faithfulness/turn_faithfulness.py +627 -0
- deepeval/metrics/turn_relevancy/template.py +14 -0
- deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
- deepeval/metrics/utils.py +158 -122
- deepeval/models/__init__.py +0 -12
- deepeval/models/base_model.py +49 -33
- deepeval/models/embedding_models/__init__.py +7 -0
- deepeval/models/embedding_models/azure_embedding_model.py +79 -33
- deepeval/models/embedding_models/local_embedding_model.py +39 -20
- deepeval/models/embedding_models/ollama_embedding_model.py +52 -19
- deepeval/models/embedding_models/openai_embedding_model.py +42 -22
- deepeval/models/llms/amazon_bedrock_model.py +226 -72
- deepeval/models/llms/anthropic_model.py +178 -63
- deepeval/models/llms/azure_model.py +218 -60
- deepeval/models/llms/constants.py +2032 -0
- deepeval/models/llms/deepseek_model.py +95 -40
- deepeval/models/llms/gemini_model.py +209 -64
- deepeval/models/llms/grok_model.py +139 -68
- deepeval/models/llms/kimi_model.py +140 -90
- deepeval/models/llms/litellm_model.py +131 -37
- deepeval/models/llms/local_model.py +125 -21
- deepeval/models/llms/ollama_model.py +147 -24
- deepeval/models/llms/openai_model.py +222 -269
- deepeval/models/llms/portkey_model.py +81 -22
- deepeval/models/llms/utils.py +8 -3
- deepeval/models/retry_policy.py +17 -14
- deepeval/models/utils.py +106 -5
- deepeval/optimizer/__init__.py +5 -0
- deepeval/optimizer/algorithms/__init__.py +6 -0
- deepeval/optimizer/algorithms/base.py +29 -0
- deepeval/optimizer/algorithms/configs.py +18 -0
- deepeval/optimizer/algorithms/copro/__init__.py +5 -0
- deepeval/{optimization/copro/loop.py → optimizer/algorithms/copro/copro.py} +112 -113
- deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
- deepeval/{optimization/gepa/loop.py → optimizer/algorithms/gepa/gepa.py} +175 -115
- deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
- deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
- deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
- deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
- deepeval/optimizer/algorithms/simba/__init__.py +5 -0
- deepeval/{optimization/simba/loop.py → optimizer/algorithms/simba/simba.py} +128 -112
- deepeval/{optimization → optimizer}/configs.py +5 -8
- deepeval/{optimization/policies/selection.py → optimizer/policies.py} +63 -2
- deepeval/optimizer/prompt_optimizer.py +263 -0
- deepeval/optimizer/rewriter/__init__.py +5 -0
- deepeval/optimizer/rewriter/rewriter.py +124 -0
- deepeval/optimizer/rewriter/utils.py +214 -0
- deepeval/optimizer/scorer/__init__.py +5 -0
- deepeval/optimizer/scorer/base.py +86 -0
- deepeval/optimizer/scorer/scorer.py +316 -0
- deepeval/optimizer/scorer/utils.py +30 -0
- deepeval/optimizer/types.py +148 -0
- deepeval/{optimization → optimizer}/utils.py +47 -165
- deepeval/prompt/prompt.py +5 -9
- deepeval/simulator/conversation_simulator.py +43 -0
- deepeval/simulator/template.py +13 -0
- deepeval/test_case/__init__.py +1 -3
- deepeval/test_case/api.py +26 -45
- deepeval/test_case/arena_test_case.py +7 -2
- deepeval/test_case/conversational_test_case.py +68 -1
- deepeval/test_case/llm_test_case.py +206 -1
- deepeval/test_case/utils.py +4 -8
- deepeval/test_run/api.py +18 -14
- deepeval/test_run/test_run.py +3 -3
- deepeval/tracing/patchers.py +9 -4
- deepeval/tracing/tracing.py +2 -2
- deepeval/utils.py +65 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/METADATA +1 -4
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/RECORD +180 -193
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -148
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
- deepeval/models/mlllms/__init__.py +0 -4
- deepeval/models/mlllms/azure_model.py +0 -343
- deepeval/models/mlllms/gemini_model.py +0 -313
- deepeval/models/mlllms/ollama_model.py +0 -175
- deepeval/models/mlllms/openai_model.py +0 -309
- deepeval/optimization/__init__.py +0 -13
- deepeval/optimization/adapters/__init__.py +0 -2
- deepeval/optimization/adapters/deepeval_scoring_adapter.py +0 -588
- deepeval/optimization/aggregates.py +0 -14
- deepeval/optimization/copro/configs.py +0 -31
- deepeval/optimization/gepa/__init__.py +0 -7
- deepeval/optimization/gepa/configs.py +0 -115
- deepeval/optimization/miprov2/configs.py +0 -134
- deepeval/optimization/miprov2/loop.py +0 -785
- deepeval/optimization/mutations/__init__.py +0 -0
- deepeval/optimization/mutations/prompt_rewriter.py +0 -458
- deepeval/optimization/policies/__init__.py +0 -16
- deepeval/optimization/policies/tie_breaker.py +0 -67
- deepeval/optimization/prompt_optimizer.py +0 -462
- deepeval/optimization/simba/__init__.py +0 -0
- deepeval/optimization/simba/configs.py +0 -33
- deepeval/optimization/types.py +0 -361
- deepeval/test_case/mllm_test_case.py +0 -170
- /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
- /deepeval/{optimization → optimizer/algorithms}/simba/types.py +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/LICENSE.md +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/WHEEL +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/entry_points.txt +0 -0
|
@@ -0,0 +1,435 @@
|
|
|
1
|
+
# Demo Bootstrapper for MIPROv2
|
|
2
|
+
#
|
|
3
|
+
# This module implements few-shot demonstration bootstrapping following
|
|
4
|
+
# the original MIPROv2 paper. It runs the prompt on training examples
|
|
5
|
+
# and collects successful outputs as demonstrations.
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
import asyncio
|
|
9
|
+
import random
|
|
10
|
+
from dataclasses import dataclass, field
|
|
11
|
+
from typing import List, Optional, Union, TYPE_CHECKING, Callable, Tuple
|
|
12
|
+
|
|
13
|
+
from deepeval.prompt.prompt import Prompt
|
|
14
|
+
|
|
15
|
+
if TYPE_CHECKING:
|
|
16
|
+
from deepeval.dataset.golden import Golden, ConversationalGolden
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
@dataclass
class Demo:
    """One few-shot demonstration example used when rendering prompts.

    Attributes:
        input_text: The input/question taken from the source golden.
        output_text: The output kept for this input (model-generated or labeled).
        golden_index: Index of the originating golden; -1 when untracked.
    """

    input_text: str
    output_text: str
    golden_index: int = -1
@dataclass
class DemoSet:
    """
    A set of demonstrations to be included in a prompt.

    Attributes:
        demos: List of Demo objects
        id: Unique identifier for this demo set (auto-generated when empty)
    """

    demos: List[Demo] = field(default_factory=list)
    id: str = ""

    def __post_init__(self):
        # Lazily assign a unique id so callers may omit one.
        if not self.id:
            import uuid

            self.id = str(uuid.uuid4())

    def to_text(self, max_demos: Optional[int] = None) -> str:
        """Render demos as text for inclusion in prompts.

        Args:
            max_demos: Optional cap on how many demos to render.
                ``None`` (default) renders all demos.

        Returns:
            The rendered few-shot block, or "" when no demos are selected.
        """
        # Compare against None explicitly: a plain truthiness test would make
        # max_demos=0 (falsy) render ALL demos instead of none.
        demos_to_use = (
            self.demos[:max_demos] if max_demos is not None else self.demos
        )
        if not demos_to_use:
            return ""

        lines = ["Here are some examples:", ""]
        for i, demo in enumerate(demos_to_use, 1):
            lines.append(f"Example {i}:")
            lines.append(f"Input: {demo.input_text}")
            lines.append(f"Output: {demo.output_text}")
            lines.append("")

        lines.append("Now, please respond to the following:")
        return "\n".join(lines)
class DemoBootstrapper:
    """
    Bootstraps few-shot demonstrations by running the prompt on
    training examples and keeping successful outputs.

    Following MIPROv2, this:
    1. Samples examples from the training set
    2. Runs them through the model with the current prompt
    3. Evaluates outputs using a simple success check
    4. Keeps successful outputs as demonstration candidates
    5. Creates multiple demo sets for variety

    Parameters
    ----------
    max_bootstrapped_demos : int
        Maximum demos per set from bootstrapping. Default is 4.
    max_labeled_demos : int
        Maximum demos per set from labeled data (golden expected_output). Default is 4.
    num_demo_sets : int
        Number of different demo sets to create. Default is 5.
    random_state : random.Random, optional
        Random state for reproducibility. An int is treated as a seed.
    """

    def __init__(
        self,
        max_bootstrapped_demos: int = 4,
        max_labeled_demos: int = 4,
        num_demo_sets: int = 5,
        random_state: Optional[Union[int, random.Random]] = None,
    ):
        self.max_bootstrapped_demos = max_bootstrapped_demos
        self.max_labeled_demos = max_labeled_demos
        self.num_demo_sets = num_demo_sets

        # Accept either a seed or a pre-built Random instance.
        if isinstance(random_state, int):
            self.random_state = random.Random(random_state)
        else:
            self.random_state = random_state or random.Random()

    def _extract_input(
        self,
        golden: Union["Golden", "ConversationalGolden"],
    ) -> str:
        """Extract input text from a golden; "" when none is found."""
        if hasattr(golden, "input") and golden.input:
            return str(golden.input)
        if hasattr(golden, "messages") and golden.messages:
            # For conversational goldens, use the last user message.
            for msg in reversed(golden.messages):
                if hasattr(msg, "role") and msg.role == "user":
                    return (
                        str(msg.content)
                        if hasattr(msg, "content")
                        else str(msg)
                    )
            # No user message at all: fall back to the final message.
            return str(golden.messages[-1])
        return ""

    def _extract_expected_output(
        self,
        golden: Union["Golden", "ConversationalGolden"],
    ) -> Optional[str]:
        """Extract expected output from a golden if available."""
        if hasattr(golden, "expected_output") and golden.expected_output:
            return str(golden.expected_output)
        return None

    def _is_successful(
        self,
        actual_output: str,
        expected_output: Optional[str],
    ) -> bool:
        """
        Simple success check for bootstrapping.

        An output is considered successful when:
        - It's non-empty
        - If expected_output exists, actual has some word overlap with it

        This is a simplified heuristic. In full MIPROv2, you'd use
        the actual metric to validate.
        """
        if not actual_output or not actual_output.strip():
            return False

        if expected_output:
            # Simple overlap check - could be more sophisticated
            actual_words = set(actual_output.lower().split())
            expected_words = set(expected_output.lower().split())
            if actual_words and expected_words:
                overlap = len(actual_words & expected_words) / len(
                    expected_words
                )
                return overlap > 0.3  # At least 30% word overlap

        # If no expected output, just check it's non-trivially short.
        return len(actual_output.strip()) > 10

    def bootstrap(
        self,
        prompt: Prompt,
        goldens: Union[List["Golden"], List["ConversationalGolden"]],
        generate_fn: Callable[
            [Prompt, Union["Golden", "ConversationalGolden"]], str
        ],
    ) -> List[DemoSet]:
        """
        Bootstrap demonstration sets synchronously.

        Args:
            prompt: The prompt to use for generation
            goldens: Training examples to bootstrap from
            generate_fn: Function that takes (prompt, golden) and returns output

        Returns:
            List of DemoSet objects, each containing a different set of demos
        """
        all_demos: List[Demo] = []
        labeled_demos: List[Demo] = []

        # Shuffle goldens for variety.
        shuffled_indices = list(range(len(goldens)))
        self.random_state.shuffle(shuffled_indices)

        # The slice below bounds the number of generation attempts; no
        # separate attempt counter is needed.
        max_attempts = min(len(goldens), self.max_bootstrapped_demos * 3)

        for idx in shuffled_indices[:max_attempts]:
            golden = goldens[idx]
            input_text = self._extract_input(golden)
            expected = self._extract_expected_output(golden)

            if not input_text:
                continue

            # If we have expected output, use it as a labeled demo.
            if (
                expected
                and len(labeled_demos)
                < self.max_labeled_demos * self.num_demo_sets
            ):
                labeled_demos.append(
                    Demo(
                        input_text=input_text,
                        output_text=expected,
                        golden_index=idx,
                    )
                )

            # Try to bootstrap a model-generated demo.
            if (
                len(all_demos)
                < self.max_bootstrapped_demos * self.num_demo_sets
            ):
                try:
                    output = generate_fn(prompt, golden)
                    if self._is_successful(output, expected):
                        all_demos.append(
                            Demo(
                                input_text=input_text,
                                output_text=output,
                                golden_index=idx,
                            )
                        )
                except Exception:
                    # Best-effort: a failed generation skips this golden
                    # entirely (including the early-exit check below).
                    continue

            # Stop early once both demo pools are full.
            if (
                len(all_demos)
                >= self.max_bootstrapped_demos * self.num_demo_sets
                and len(labeled_demos)
                >= self.max_labeled_demos * self.num_demo_sets
            ):
                break

        # Create diverse demo sets.
        return self._create_demo_sets(all_demos, labeled_demos)

    async def a_bootstrap(
        self,
        prompt: Prompt,
        goldens: Union[List["Golden"], List["ConversationalGolden"]],
        a_generate_fn: Callable,
    ) -> List[DemoSet]:
        """
        Bootstrap demonstration sets asynchronously (concurrently).
        """
        labeled_demos: List[Demo] = []

        shuffled_indices = list(range(len(goldens)))
        self.random_state.shuffle(shuffled_indices)

        max_attempts = min(len(goldens), self.max_bootstrapped_demos * 3)
        selected_indices = shuffled_indices[:max_attempts]

        # First pass: collect labeled demos (no async needed) and prepare bootstrap tasks
        tasks_info: List[Tuple[int, str, Optional[str]]] = (
            []
        )  # (idx, input_text, expected)

        for idx in selected_indices:
            golden = goldens[idx]
            input_text = self._extract_input(golden)
            expected = self._extract_expected_output(golden)

            if not input_text:
                continue

            # Collect labeled demos
            if (
                expected
                and len(labeled_demos)
                < self.max_labeled_demos * self.num_demo_sets
            ):
                labeled_demos.append(
                    Demo(
                        input_text=input_text,
                        output_text=expected,
                        golden_index=idx,
                    )
                )

            # Queue for bootstrapping
            tasks_info.append((idx, input_text, expected))

        # Limit how many we need to bootstrap
        max_bootstrapped = self.max_bootstrapped_demos * self.num_demo_sets
        tasks_info = tasks_info[:max_bootstrapped]

        # Run all bootstrap generations concurrently
        async def generate_one(
            idx: int,
            input_text: str,
            expected: Optional[str],
        ) -> Optional[Demo]:
            golden = goldens[idx]
            try:
                output = await a_generate_fn(prompt, golden)
                if self._is_successful(output, expected):
                    return Demo(
                        input_text=input_text,
                        output_text=output,
                        golden_index=idx,
                    )
            except Exception:
                # Best-effort: failed generations simply yield no demo.
                pass
            return None

        results = await asyncio.gather(
            *[generate_one(idx, inp, exp) for idx, inp, exp in tasks_info]
        )

        # Collect successful demos
        all_demos = [demo for demo in results if demo is not None]

        return self._create_demo_sets(all_demos, labeled_demos)

    def _create_demo_sets(
        self,
        bootstrapped_demos: List[Demo],
        labeled_demos: List[Demo],
    ) -> List[DemoSet]:
        """
        Create multiple demo sets from bootstrapped and labeled demos.

        Each set contains a mix of bootstrapped and labeled demos,
        selected randomly for diversity.
        """
        demo_sets: List[DemoSet] = []

        # Always include an empty demo set (0-shot option)
        demo_sets.append(DemoSet(demos=[], id="0-shot"))

        # Create varied demo sets
        for i in range(self.num_demo_sets):
            demos: List[Demo] = []

            # Sample from bootstrapped demos
            if bootstrapped_demos:
                n_boot = min(
                    self.max_bootstrapped_demos, len(bootstrapped_demos)
                )
                boot_sample = self.random_state.sample(
                    bootstrapped_demos, n_boot
                )
                demos.extend(boot_sample)

            # Sample from labeled demos
            if labeled_demos:
                n_labeled = min(self.max_labeled_demos, len(labeled_demos))
                labeled_sample = self.random_state.sample(
                    labeled_demos, n_labeled
                )
                # Avoid duplicates (same source golden in both pools)
                existing_indices = {d.golden_index for d in demos}
                for demo in labeled_sample:
                    if demo.golden_index not in existing_indices:
                        demos.append(demo)
                        existing_indices.add(demo.golden_index)

            if demos:
                self.random_state.shuffle(demos)
                demo_sets.append(DemoSet(demos=demos))

        return demo_sets
def render_prompt_with_demos(
    prompt: Prompt,
    demo_set: Optional[DemoSet],
    max_demos: int = 8,
) -> Prompt:
    """
    Create a new Prompt that includes demonstrations.

    This prepends the demo text to the prompt's content.

    Args:
        prompt: The base prompt
        demo_set: The demonstration set to include
        max_demos: Maximum number of demos to include

    Returns:
        A new Prompt with demos included
    """
    from deepeval.prompt.api import PromptType, PromptMessage

    # Nothing to render: hand the original prompt back untouched.
    if not demo_set or not demo_set.demos:
        return prompt

    demo_text = demo_set.to_text(max_demos=max_demos)

    if prompt.type != PromptType.LIST:
        # TEXT prompts: demos go in front of the template text.
        return Prompt(text_template=f"{demo_text}\n\n{prompt.text_template}")

    # LIST prompts: append demos to the first system message if present,
    # otherwise prepend them to the very first message.
    messages = list(prompt.messages_template)
    system_idx = next(
        (i for i, m in enumerate(messages) if m.role == "system"), None
    )

    if system_idx is not None:
        target = messages[system_idx]
        messages[system_idx] = PromptMessage(
            role=target.role,
            content=f"{target.content}\n\n{demo_text}",
        )
    elif messages:
        target = messages[0]
        messages[0] = PromptMessage(
            role=target.role,
            content=f"{demo_text}\n\n{target.content}",
        )

    return Prompt(messages_template=messages)
|