deepeval 3.7.4__py3-none-any.whl → 3.7.5__py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in their public registries, and is provided for informational purposes only.
Files changed (155)
  1. deepeval/_version.py +1 -1
  2. deepeval/dataset/golden.py +54 -2
  3. deepeval/evaluate/evaluate.py +16 -8
  4. deepeval/evaluate/execute.py +70 -26
  5. deepeval/evaluate/utils.py +26 -22
  6. deepeval/integrations/pydantic_ai/agent.py +19 -2
  7. deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
  8. deepeval/metrics/__init__.py +14 -12
  9. deepeval/metrics/answer_relevancy/answer_relevancy.py +74 -29
  10. deepeval/metrics/answer_relevancy/template.py +188 -92
  11. deepeval/metrics/base_metric.py +2 -5
  12. deepeval/metrics/contextual_precision/contextual_precision.py +53 -15
  13. deepeval/metrics/contextual_precision/template.py +115 -66
  14. deepeval/metrics/contextual_recall/contextual_recall.py +50 -13
  15. deepeval/metrics/contextual_recall/template.py +106 -55
  16. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +47 -15
  17. deepeval/metrics/contextual_relevancy/template.py +87 -58
  18. deepeval/metrics/dag/templates.py +2 -2
  19. deepeval/metrics/faithfulness/faithfulness.py +70 -27
  20. deepeval/metrics/faithfulness/schema.py +1 -1
  21. deepeval/metrics/faithfulness/template.py +200 -115
  22. deepeval/metrics/g_eval/utils.py +2 -2
  23. deepeval/metrics/indicator.py +4 -4
  24. deepeval/metrics/multimodal_metrics/__init__.py +0 -18
  25. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +24 -17
  26. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +26 -21
  27. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +24 -17
  28. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +24 -17
  29. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +19 -19
  30. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +63 -78
  31. deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +20 -20
  32. deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +71 -50
  33. deepeval/metrics/ragas.py +3 -3
  34. deepeval/metrics/tool_correctness/tool_correctness.py +2 -2
  35. deepeval/metrics/turn_contextual_precision/schema.py +21 -0
  36. deepeval/metrics/turn_contextual_precision/template.py +187 -0
  37. deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +550 -0
  38. deepeval/metrics/turn_contextual_recall/schema.py +21 -0
  39. deepeval/metrics/turn_contextual_recall/template.py +178 -0
  40. deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +520 -0
  41. deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
  42. deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
  43. deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +535 -0
  44. deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
  45. deepeval/metrics/turn_faithfulness/template.py +218 -0
  46. deepeval/metrics/turn_faithfulness/turn_faithfulness.py +596 -0
  47. deepeval/metrics/utils.py +39 -58
  48. deepeval/models/__init__.py +0 -12
  49. deepeval/models/base_model.py +16 -38
  50. deepeval/models/embedding_models/__init__.py +7 -0
  51. deepeval/models/embedding_models/azure_embedding_model.py +52 -28
  52. deepeval/models/embedding_models/local_embedding_model.py +18 -14
  53. deepeval/models/embedding_models/ollama_embedding_model.py +38 -16
  54. deepeval/models/embedding_models/openai_embedding_model.py +40 -21
  55. deepeval/models/llms/amazon_bedrock_model.py +1 -2
  56. deepeval/models/llms/anthropic_model.py +44 -23
  57. deepeval/models/llms/azure_model.py +121 -36
  58. deepeval/models/llms/deepseek_model.py +18 -13
  59. deepeval/models/llms/gemini_model.py +129 -43
  60. deepeval/models/llms/grok_model.py +18 -13
  61. deepeval/models/llms/kimi_model.py +18 -13
  62. deepeval/models/llms/litellm_model.py +42 -22
  63. deepeval/models/llms/local_model.py +12 -7
  64. deepeval/models/llms/ollama_model.py +114 -12
  65. deepeval/models/llms/openai_model.py +137 -41
  66. deepeval/models/llms/portkey_model.py +24 -7
  67. deepeval/models/llms/utils.py +5 -3
  68. deepeval/models/retry_policy.py +17 -14
  69. deepeval/models/utils.py +46 -1
  70. deepeval/optimizer/__init__.py +5 -0
  71. deepeval/optimizer/algorithms/__init__.py +6 -0
  72. deepeval/optimizer/algorithms/base.py +29 -0
  73. deepeval/optimizer/algorithms/configs.py +18 -0
  74. deepeval/optimizer/algorithms/copro/__init__.py +5 -0
  75. deepeval/{optimization/copro/loop.py → optimizer/algorithms/copro/copro.py} +112 -113
  76. deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
  77. deepeval/{optimization/gepa/loop.py → optimizer/algorithms/gepa/gepa.py} +175 -115
  78. deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
  79. deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
  80. deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
  81. deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
  82. deepeval/optimizer/algorithms/simba/__init__.py +5 -0
  83. deepeval/{optimization/simba/loop.py → optimizer/algorithms/simba/simba.py} +128 -112
  84. deepeval/{optimization → optimizer}/configs.py +5 -8
  85. deepeval/{optimization/policies/selection.py → optimizer/policies.py} +63 -2
  86. deepeval/optimizer/prompt_optimizer.py +263 -0
  87. deepeval/optimizer/rewriter/__init__.py +5 -0
  88. deepeval/optimizer/rewriter/rewriter.py +124 -0
  89. deepeval/optimizer/rewriter/utils.py +214 -0
  90. deepeval/optimizer/scorer/__init__.py +5 -0
  91. deepeval/optimizer/scorer/base.py +86 -0
  92. deepeval/optimizer/scorer/scorer.py +316 -0
  93. deepeval/optimizer/scorer/utils.py +30 -0
  94. deepeval/optimizer/types.py +148 -0
  95. deepeval/{optimization → optimizer}/utils.py +47 -165
  96. deepeval/prompt/prompt.py +5 -9
  97. deepeval/test_case/__init__.py +1 -3
  98. deepeval/test_case/api.py +12 -10
  99. deepeval/test_case/conversational_test_case.py +19 -1
  100. deepeval/test_case/llm_test_case.py +152 -1
  101. deepeval/test_case/utils.py +4 -8
  102. deepeval/test_run/api.py +15 -14
  103. deepeval/test_run/test_run.py +3 -3
  104. deepeval/tracing/patchers.py +9 -4
  105. deepeval/tracing/tracing.py +2 -2
  106. deepeval/utils.py +65 -0
  107. {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/METADATA +1 -4
  108. {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/RECORD +116 -125
  109. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
  110. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
  111. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
  112. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
  113. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
  114. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
  115. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
  116. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
  117. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
  118. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
  119. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
  120. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
  121. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
  122. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
  123. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
  124. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
  125. deepeval/models/mlllms/__init__.py +0 -4
  126. deepeval/models/mlllms/azure_model.py +0 -343
  127. deepeval/models/mlllms/gemini_model.py +0 -313
  128. deepeval/models/mlllms/ollama_model.py +0 -175
  129. deepeval/models/mlllms/openai_model.py +0 -309
  130. deepeval/optimization/__init__.py +0 -13
  131. deepeval/optimization/adapters/__init__.py +0 -2
  132. deepeval/optimization/adapters/deepeval_scoring_adapter.py +0 -588
  133. deepeval/optimization/aggregates.py +0 -14
  134. deepeval/optimization/copro/configs.py +0 -31
  135. deepeval/optimization/gepa/__init__.py +0 -7
  136. deepeval/optimization/gepa/configs.py +0 -115
  137. deepeval/optimization/miprov2/configs.py +0 -134
  138. deepeval/optimization/miprov2/loop.py +0 -785
  139. deepeval/optimization/mutations/__init__.py +0 -0
  140. deepeval/optimization/mutations/prompt_rewriter.py +0 -458
  141. deepeval/optimization/policies/__init__.py +0 -16
  142. deepeval/optimization/policies/tie_breaker.py +0 -67
  143. deepeval/optimization/prompt_optimizer.py +0 -462
  144. deepeval/optimization/simba/__init__.py +0 -0
  145. deepeval/optimization/simba/configs.py +0 -33
  146. deepeval/optimization/types.py +0 -361
  147. deepeval/test_case/mllm_test_case.py +0 -170
  148. /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
  149. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
  150. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
  151. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
  152. /deepeval/{optimization → optimizer/algorithms}/simba/types.py +0 -0
  153. {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/LICENSE.md +0 -0
  154. {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/WHEEL +0 -0
  155. {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/entry_points.txt +0 -0
deepeval/optimizer/algorithms/miprov2/bootstrapper.py (new file)
@@ -0,0 +1,435 @@
+# Demo Bootstrapper for MIPROv2
+#
+# This module implements few-shot demonstration bootstrapping following
+# the original MIPROv2 paper. It runs the prompt on training examples
+# and collects successful outputs as demonstrations.
+
+from __future__ import annotations
+import asyncio
+import random
+from dataclasses import dataclass, field
+from typing import List, Optional, Union, TYPE_CHECKING, Callable, Tuple
+
+from deepeval.prompt.prompt import Prompt
+
+if TYPE_CHECKING:
+    from deepeval.dataset.golden import Golden, ConversationalGolden
+
+
+@dataclass
+class Demo:
+    """
+    A single demonstration example for few-shot prompting.
+
+    Attributes:
+        input_text: The input/question from the golden
+        output_text: The successful output from the model
+        golden_index: Index of the source golden (for tracking)
+    """
+
+    input_text: str
+    output_text: str
+    golden_index: int = -1
+
+
+@dataclass
+class DemoSet:
+    """
+    A set of demonstrations to be included in a prompt.
+
+    Attributes:
+        demos: List of Demo objects
+        id: Unique identifier for this demo set
+    """
+
+    demos: List[Demo] = field(default_factory=list)
+    id: str = ""
+
+    def __post_init__(self):
+        if not self.id:
+            import uuid
+
+            self.id = str(uuid.uuid4())
+
+    def to_text(self, max_demos: Optional[int] = None) -> str:
+        """Render demos as text for inclusion in prompts."""
+        demos_to_use = self.demos[:max_demos] if max_demos else self.demos
+        if not demos_to_use:
+            return ""
+
+        lines = ["Here are some examples:", ""]
+        for i, demo in enumerate(demos_to_use, 1):
+            lines.append(f"Example {i}:")
+            lines.append(f"Input: {demo.input_text}")
+            lines.append(f"Output: {demo.output_text}")
+            lines.append("")
+
+        lines.append("Now, please respond to the following:")
+        return "\n".join(lines)
+
+
+class DemoBootstrapper:
+    """
+    Bootstraps few-shot demonstrations by running the prompt on
+    training examples and keeping successful outputs.
+
+    Following MIPROv2, this:
+    1. Samples examples from the training set
+    2. Runs them through the model with the current prompt
+    3. Evaluates outputs using a simple success check
+    4. Keeps successful outputs as demonstration candidates
+    5. Creates multiple demo sets for variety
+
+    Parameters
+    ----------
+    max_bootstrapped_demos : int
+        Maximum demos per set from bootstrapping. Default is 4.
+    max_labeled_demos : int
+        Maximum demos per set from labeled data (golden expected_output). Default is 4.
+    num_demo_sets : int
+        Number of different demo sets to create. Default is 5.
+    random_state : random.Random, optional
+        Random state for reproducibility.
+    """
+
+    def __init__(
+        self,
+        max_bootstrapped_demos: int = 4,
+        max_labeled_demos: int = 4,
+        num_demo_sets: int = 5,
+        random_state: Optional[Union[int, random.Random]] = None,
+    ):
+        self.max_bootstrapped_demos = max_bootstrapped_demos
+        self.max_labeled_demos = max_labeled_demos
+        self.num_demo_sets = num_demo_sets
+
+        if isinstance(random_state, int):
+            self.random_state = random.Random(random_state)
+        else:
+            self.random_state = random_state or random.Random()
+
+    def _extract_input(
+        self,
+        golden: Union["Golden", "ConversationalGolden"],
+    ) -> str:
+        """Extract input text from a golden."""
+        if hasattr(golden, "input") and golden.input:
+            return str(golden.input)
+        if hasattr(golden, "messages") and golden.messages:
+            # For conversational, use the last user message
+            for msg in reversed(golden.messages):
+                if hasattr(msg, "role") and msg.role == "user":
+                    return (
+                        str(msg.content)
+                        if hasattr(msg, "content")
+                        else str(msg)
+                    )
+            return str(golden.messages[-1])
+        return ""
+
+    def _extract_expected_output(
+        self,
+        golden: Union["Golden", "ConversationalGolden"],
+    ) -> Optional[str]:
+        """Extract expected output from a golden if available."""
+        if hasattr(golden, "expected_output") and golden.expected_output:
+            return str(golden.expected_output)
+        return None
+
+    def _is_successful(
+        self,
+        actual_output: str,
+        expected_output: Optional[str],
+    ) -> bool:
+        """
+        Simple success check for bootstrapping.
+
+        For now, we consider an output successful if:
+        - It's non-empty
+        - If expected_output exists, actual has some overlap
+
+        This is a simplified heuristic. In full MIPROv2, you'd use
+        the actual metric to validate.
+        """
+        if not actual_output or not actual_output.strip():
+            return False
+
+        if expected_output:
+            # Simple overlap check - could be more sophisticated
+            actual_words = set(actual_output.lower().split())
+            expected_words = set(expected_output.lower().split())
+            if actual_words and expected_words:
+                overlap = len(actual_words & expected_words) / len(
+                    expected_words
+                )
+                return overlap > 0.3  # At least 30% word overlap
+
+        # If no expected output, just check it's non-empty
+        return len(actual_output.strip()) > 10
+
+    def bootstrap(
+        self,
+        prompt: Prompt,
+        goldens: Union[List["Golden"], List["ConversationalGolden"]],
+        generate_fn: Callable[
+            [Prompt, Union["Golden", "ConversationalGolden"]], str
+        ],
+    ) -> List[DemoSet]:
+        """
+        Bootstrap demonstration sets synchronously.
+
+        Args:
+            prompt: The prompt to use for generation
+            goldens: Training examples to bootstrap from
+            generate_fn: Function that takes (prompt, golden) and returns output
+
+        Returns:
+            List of DemoSet objects, each containing a different set of demos
+        """
+        # Collect all successful demos
+        all_demos: List[Demo] = []
+        labeled_demos: List[Demo] = []
+
+        # Shuffle goldens for variety
+        shuffled_indices = list(range(len(goldens)))
+        self.random_state.shuffle(shuffled_indices)
+
+        # Try to bootstrap demos
+        attempts = 0
+        max_attempts = min(len(goldens), self.max_bootstrapped_demos * 3)
+
+        for idx in shuffled_indices[:max_attempts]:
+            golden = goldens[idx]
+            input_text = self._extract_input(golden)
+            expected = self._extract_expected_output(golden)
+
+            if not input_text:
+                continue
+
+            # If we have expected output, use it as a labeled demo
+            if (
+                expected
+                and len(labeled_demos)
+                < self.max_labeled_demos * self.num_demo_sets
+            ):
+                labeled_demos.append(
+                    Demo(
+                        input_text=input_text,
+                        output_text=expected,
+                        golden_index=idx,
+                    )
+                )
+
+            # Try to bootstrap
+            if (
+                len(all_demos)
+                < self.max_bootstrapped_demos * self.num_demo_sets
+            ):
+                try:
+                    output = generate_fn(prompt, golden)
+                    if self._is_successful(output, expected):
+                        all_demos.append(
+                            Demo(
+                                input_text=input_text,
+                                output_text=output,
+                                golden_index=idx,
+                            )
+                        )
+                except Exception:
+                    continue
+
+            attempts += 1
+            if (
+                len(all_demos)
+                >= self.max_bootstrapped_demos * self.num_demo_sets
+                and len(labeled_demos)
+                >= self.max_labeled_demos * self.num_demo_sets
+            ):
+                break
+
+        # Create diverse demo sets
+        return self._create_demo_sets(all_demos, labeled_demos)
+
+    async def a_bootstrap(
+        self,
+        prompt: Prompt,
+        goldens: Union[List["Golden"], List["ConversationalGolden"]],
+        a_generate_fn: Callable,
+    ) -> List[DemoSet]:
+        """
+        Bootstrap demonstration sets asynchronously (concurrently).
+        """
+        labeled_demos: List[Demo] = []
+
+        shuffled_indices = list(range(len(goldens)))
+        self.random_state.shuffle(shuffled_indices)
+
+        max_attempts = min(len(goldens), self.max_bootstrapped_demos * 3)
+        selected_indices = shuffled_indices[:max_attempts]
+
+        # First pass: collect labeled demos (no async needed) and prepare bootstrap tasks
+        tasks_info: List[Tuple[int, str, Optional[str]]] = (
+            []
+        )  # (idx, input_text, expected)
+
+        for idx in selected_indices:
+            golden = goldens[idx]
+            input_text = self._extract_input(golden)
+            expected = self._extract_expected_output(golden)
+
+            if not input_text:
+                continue
+
+            # Collect labeled demos
+            if (
+                expected
+                and len(labeled_demos)
+                < self.max_labeled_demos * self.num_demo_sets
+            ):
+                labeled_demos.append(
+                    Demo(
+                        input_text=input_text,
+                        output_text=expected,
+                        golden_index=idx,
+                    )
+                )
+
+            # Queue for bootstrapping
+            tasks_info.append((idx, input_text, expected))
+
+        # Limit how many we need to bootstrap
+        max_bootstrapped = self.max_bootstrapped_demos * self.num_demo_sets
+        tasks_info = tasks_info[:max_bootstrapped]
+
+        # Run all bootstrap generations concurrently
+        async def generate_one(
+            idx: int,
+            input_text: str,
+            expected: Optional[str],
+        ) -> Optional[Demo]:
+            golden = goldens[idx]
+            try:
+                output = await a_generate_fn(prompt, golden)
+                if self._is_successful(output, expected):
+                    return Demo(
+                        input_text=input_text,
+                        output_text=output,
+                        golden_index=idx,
+                    )
+            except Exception:
+                pass
+            return None
+
+        results = await asyncio.gather(
+            *[generate_one(idx, inp, exp) for idx, inp, exp in tasks_info]
+        )
+
+        # Collect successful demos
+        all_demos = [demo for demo in results if demo is not None]
+
+        return self._create_demo_sets(all_demos, labeled_demos)
+
+    def _create_demo_sets(
+        self,
+        bootstrapped_demos: List[Demo],
+        labeled_demos: List[Demo],
+    ) -> List[DemoSet]:
+        """
+        Create multiple demo sets from bootstrapped and labeled demos.
+
+        Each set contains a mix of bootstrapped and labeled demos,
+        selected randomly for diversity.
+        """
+        demo_sets: List[DemoSet] = []
+
+        # Always include an empty demo set (0-shot option)
+        demo_sets.append(DemoSet(demos=[], id="0-shot"))
+
+        # Create varied demo sets
+        for i in range(self.num_demo_sets):
+            demos: List[Demo] = []
+
+            # Sample from bootstrapped demos
+            if bootstrapped_demos:
+                n_boot = min(
+                    self.max_bootstrapped_demos, len(bootstrapped_demos)
+                )
+                boot_sample = self.random_state.sample(
+                    bootstrapped_demos, n_boot
+                )
+                demos.extend(boot_sample)
+
+            # Sample from labeled demos
+            if labeled_demos:
+                n_labeled = min(self.max_labeled_demos, len(labeled_demos))
+                labeled_sample = self.random_state.sample(
+                    labeled_demos, n_labeled
+                )
+                # Avoid duplicates
+                existing_indices = {d.golden_index for d in demos}
+                for demo in labeled_sample:
+                    if demo.golden_index not in existing_indices:
+                        demos.append(demo)
+                        existing_indices.add(demo.golden_index)
+
+            if demos:
+                self.random_state.shuffle(demos)
+                demo_sets.append(DemoSet(demos=demos))
+
+        return demo_sets
+
+
+def render_prompt_with_demos(
+    prompt: Prompt,
+    demo_set: Optional[DemoSet],
+    max_demos: int = 8,
+) -> Prompt:
+    """
+    Create a new Prompt that includes demonstrations.
+
+    This prepends the demo text to the prompt's content.
+
+    Args:
+        prompt: The base prompt
+        demo_set: The demonstration set to include
+        max_demos: Maximum number of demos to include
+
+    Returns:
+        A new Prompt with demos included
+    """
+    from deepeval.prompt.api import PromptType, PromptMessage
+
+    if not demo_set or not demo_set.demos:
+        return prompt
+
+    demo_text = demo_set.to_text(max_demos=max_demos)
+
+    if prompt.type == PromptType.LIST:
+        # For LIST prompts, prepend demos to the system message or first message
+        new_messages = []
+        demo_added = False
+
+        for msg in prompt.messages_template:
+            if not demo_added and msg.role == "system":
+                # Add demos to system message
+                new_content = f"{msg.content}\n\n{demo_text}"
+                new_messages.append(
+                    PromptMessage(role=msg.role, content=new_content)
+                )
+                demo_added = True
+            else:
+                new_messages.append(msg)
+
+        if not demo_added and new_messages:
+            # No system message, add demos to first message
+            first = new_messages[0]
+            new_content = f"{demo_text}\n\n{first.content}"
+            new_messages[0] = PromptMessage(
+                role=first.role, content=new_content
+            )
+
+        return Prompt(messages_template=new_messages)
+    else:
+        # For TEXT prompts, prepend demos
+        new_text = f"{demo_text}\n\n{prompt.text_template}"
+        return Prompt(text_template=new_text)
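
For orientation (not part of the diff), a minimal usage sketch of the new bootstrapper follows. It assumes Golden(input=..., expected_output=...) and Prompt(text_template=...) construct as the diffed code implies, and the toy generate_fn simply echoes the expected output in place of a real model call.

# Hypothetical usage sketch; module path and names mirror the diffed file above.
from deepeval.dataset.golden import Golden
from deepeval.prompt.prompt import Prompt
from deepeval.optimizer.algorithms.miprov2.bootstrapper import (
    DemoBootstrapper,
    render_prompt_with_demos,
)

goldens = [
    Golden(input="What is 2 + 2?", expected_output="4"),
    Golden(input="Name the capital of France.", expected_output="Paris"),
]
base_prompt = Prompt(text_template="Answer the question concisely.")

def generate_fn(prompt: Prompt, golden: Golden) -> str:
    # Stand-in for a real model call: a real generate_fn would run the
    # prompt plus golden.input through an LLM and return its output.
    return golden.expected_output or ""

bootstrapper = DemoBootstrapper(
    max_bootstrapped_demos=2,
    max_labeled_demos=2,
    num_demo_sets=3,
    random_state=42,
)
demo_sets = bootstrapper.bootstrap(base_prompt, goldens, generate_fn)

# demo_sets[0] is always the 0-shot set; later sets mix bootstrapped and
# labeled demos. Render one of them into a few-shot prompt.
few_shot_prompt = render_prompt_with_demos(base_prompt, demo_sets[-1], max_demos=4)
print(few_shot_prompt.text_template)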