deepeval 3.7.2__py3-none-any.whl → 3.7.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92)
  1. deepeval/_version.py +1 -1
  2. deepeval/benchmarks/human_eval/human_eval.py +2 -1
  3. deepeval/cli/test.py +1 -1
  4. deepeval/config/settings.py +102 -13
  5. deepeval/dataset/dataset.py +35 -11
  6. deepeval/dataset/utils.py +2 -0
  7. deepeval/evaluate/configs.py +1 -1
  8. deepeval/evaluate/execute.py +4 -1
  9. deepeval/metrics/answer_relevancy/template.py +4 -4
  10. deepeval/metrics/argument_correctness/template.py +2 -2
  11. deepeval/metrics/bias/template.py +3 -3
  12. deepeval/metrics/contextual_precision/template.py +6 -6
  13. deepeval/metrics/contextual_recall/template.py +2 -2
  14. deepeval/metrics/contextual_relevancy/template.py +3 -3
  15. deepeval/metrics/conversation_completeness/template.py +2 -2
  16. deepeval/metrics/conversational_dag/templates.py +4 -4
  17. deepeval/metrics/conversational_g_eval/template.py +4 -3
  18. deepeval/metrics/dag/templates.py +4 -4
  19. deepeval/metrics/faithfulness/template.py +4 -4
  20. deepeval/metrics/hallucination/template.py +4 -4
  21. deepeval/metrics/misuse/template.py +2 -2
  22. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +7 -7
  23. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +6 -6
  24. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +2 -2
  25. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +3 -3
  26. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +9 -9
  27. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +4 -4
  28. deepeval/metrics/non_advice/template.py +2 -2
  29. deepeval/metrics/pii_leakage/template.py +2 -2
  30. deepeval/metrics/prompt_alignment/template.py +4 -4
  31. deepeval/metrics/role_violation/template.py +2 -2
  32. deepeval/metrics/step_efficiency/step_efficiency.py +1 -1
  33. deepeval/metrics/toxicity/template.py +4 -4
  34. deepeval/metrics/turn_relevancy/template.py +2 -2
  35. deepeval/metrics/utils.py +3 -0
  36. deepeval/models/__init__.py +2 -0
  37. deepeval/models/embedding_models/azure_embedding_model.py +28 -15
  38. deepeval/models/embedding_models/local_embedding_model.py +23 -10
  39. deepeval/models/embedding_models/ollama_embedding_model.py +8 -6
  40. deepeval/models/embedding_models/openai_embedding_model.py +18 -2
  41. deepeval/models/llms/anthropic_model.py +17 -5
  42. deepeval/models/llms/azure_model.py +30 -18
  43. deepeval/models/llms/deepseek_model.py +22 -12
  44. deepeval/models/llms/gemini_model.py +120 -87
  45. deepeval/models/llms/grok_model.py +23 -16
  46. deepeval/models/llms/kimi_model.py +23 -12
  47. deepeval/models/llms/litellm_model.py +63 -25
  48. deepeval/models/llms/local_model.py +26 -18
  49. deepeval/models/llms/ollama_model.py +17 -7
  50. deepeval/models/llms/openai_model.py +22 -17
  51. deepeval/models/llms/portkey_model.py +132 -0
  52. deepeval/models/mlllms/__init__.py +1 -0
  53. deepeval/models/mlllms/azure_model.py +343 -0
  54. deepeval/models/mlllms/gemini_model.py +102 -73
  55. deepeval/models/mlllms/ollama_model.py +40 -9
  56. deepeval/models/mlllms/openai_model.py +65 -14
  57. deepeval/models/utils.py +48 -3
  58. deepeval/optimization/__init__.py +13 -0
  59. deepeval/optimization/adapters/__init__.py +2 -0
  60. deepeval/optimization/adapters/deepeval_scoring_adapter.py +588 -0
  61. deepeval/optimization/aggregates.py +14 -0
  62. deepeval/optimization/configs.py +34 -0
  63. deepeval/optimization/copro/configs.py +31 -0
  64. deepeval/optimization/copro/loop.py +837 -0
  65. deepeval/optimization/gepa/__init__.py +7 -0
  66. deepeval/optimization/gepa/configs.py +115 -0
  67. deepeval/optimization/gepa/loop.py +677 -0
  68. deepeval/optimization/miprov2/configs.py +134 -0
  69. deepeval/optimization/miprov2/loop.py +785 -0
  70. deepeval/optimization/mutations/__init__.py +0 -0
  71. deepeval/optimization/mutations/prompt_rewriter.py +458 -0
  72. deepeval/optimization/policies/__init__.py +16 -0
  73. deepeval/optimization/policies/selection.py +166 -0
  74. deepeval/optimization/policies/tie_breaker.py +67 -0
  75. deepeval/optimization/prompt_optimizer.py +462 -0
  76. deepeval/optimization/simba/__init__.py +0 -0
  77. deepeval/optimization/simba/configs.py +33 -0
  78. deepeval/optimization/simba/loop.py +983 -0
  79. deepeval/optimization/simba/types.py +15 -0
  80. deepeval/optimization/types.py +361 -0
  81. deepeval/optimization/utils.py +598 -0
  82. deepeval/prompt/prompt.py +10 -5
  83. deepeval/test_run/cache.py +2 -0
  84. deepeval/test_run/test_run.py +6 -1
  85. deepeval/tracing/context.py +3 -0
  86. deepeval/tracing/tracing.py +22 -11
  87. deepeval/utils.py +24 -0
  88. {deepeval-3.7.2.dist-info → deepeval-3.7.4.dist-info}/METADATA +1 -1
  89. {deepeval-3.7.2.dist-info → deepeval-3.7.4.dist-info}/RECORD +92 -66
  90. {deepeval-3.7.2.dist-info → deepeval-3.7.4.dist-info}/entry_points.txt +1 -1
  91. {deepeval-3.7.2.dist-info → deepeval-3.7.4.dist-info}/LICENSE.md +0 -0
  92. {deepeval-3.7.2.dist-info → deepeval-3.7.4.dist-info}/WHEEL +0 -0
deepeval/optimization/utils.py ADDED
@@ -0,0 +1,598 @@
+ from __future__ import annotations
+ import inspect
+ import random
+ import re
+ from typing import (
+     Any,
+     Callable,
+     List,
+     Optional,
+     Tuple,
+     TYPE_CHECKING,
+     Union,
+     Dict,
+     Set,
+ )
+
+ from deepeval.errors import DeepEvalError
+ from deepeval.metrics.base_metric import BaseMetric, BaseConversationalMetric
+ from deepeval.prompt.prompt import Prompt
+ from deepeval.prompt.api import PromptType, PromptMessage
+ from deepeval.optimization.types import (
+     ModuleId,
+     PromptConfigurationId,
+     PromptConfiguration,
+     OptimizationReport,
+ )
+
+
+ if TYPE_CHECKING:
+     from deepeval.dataset.golden import Golden, ConversationalGolden
+     from deepeval.prompt.api import PromptMessage
+
+
+ def split_goldens(
+     goldens: Union[List[Golden], List[ConversationalGolden]],
+     pareto_size: int,
+     *,
+     random_state: random.Random,
+ ) -> Tuple[
+     Union[List[Golden], List[ConversationalGolden]],
+     Union[List[Golden], List[ConversationalGolden]],
+ ]:
+     """
+     Split `goldens` into two disjoint parts:
+
+     - d_feedback: items not selected for the Pareto validation set
+     - d_pareto: `pareto_size` items for instance-wise Pareto scoring
+
+     The selection is deterministic given `random_state`. Within each split, the
+     original order from `goldens` is preserved.
+
+     Args:
+         goldens: Full list/sequence of examples.
+         pareto_size: Number of items to allocate to the Pareto set, bounded to [0, len(goldens)].
+         random_state: A shared `random.Random` instance that provides the source
+             of randomness. For reproducible runs, pass the same object used by
+             the GEPA loop, constructed from `GEPAConfig.random_seed`.
+
+     Returns:
+         (d_feedback, d_pareto)
+     """
+     if pareto_size < 0:
+         raise ValueError("pareto_size must be >= 0")
+
+     total = len(goldens)
+
+     if total == 0:
+         # nothing to split
+         return [], []
+
+     # With a single example, we cannot form a meaningful feedback set.
+     # Callers like GEPARunner should enforce a minimum of 2 goldens for
+     # optimization.
+     if total == 1:
+         return [], list(goldens)
+
+     # For total >= 2, ensure that we always leave at least one example
+     # for d_feedback. This keeps the splits disjoint while still honoring
+     # pareto_size as a target up to (total - 1).
+     chosen_size = min(pareto_size, total - 1)
+
+     indices = list(range(total))
+     random_state.shuffle(indices)
+
+     pareto_indices = set(indices[:chosen_size])
+
+     d_pareto = [goldens[i] for i in range(total) if i in pareto_indices]
+     d_feedback = [goldens[i] for i in range(total) if i not in pareto_indices]
+
+     return d_feedback, d_pareto
+
+
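For context, a minimal usage sketch (illustrative only, not part of the diff); it assumes `Golden` accepts an `input` field and shows that passing the same seeded `random.Random` reproduces the same split:

import random
from deepeval.dataset.golden import Golden
from deepeval.optimization.utils import split_goldens

goldens = [Golden(input=f"question {i}") for i in range(10)]
rng = random.Random(42)  # same seed -> same split
d_feedback, d_pareto = split_goldens(goldens, pareto_size=3, random_state=rng)
assert len(d_pareto) == 3 and len(d_feedback) == 7  # disjoint; order preserved within each split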
+ ################################
+ # Prompt normalization helpers #
+ ################################
+
+
+ def _slug(text: str) -> str:
+     slug = text.lower()
+     slug = re.sub(r"[^a-z0-9]+", "-", slug)
+     return slug.strip("-")
+
+
+ def generate_module_id(prompt: Prompt, index: int, existing: Set[str]) -> str:
+     """
+     Build a human-readable module id that is stable within a single optimization run.
+     Prefers alias/label; enriches with the model settings provider and name; dedupes; caps to 64 chars.
+     """
+     parts: List[str] = []
+     if prompt.alias:
+         parts.append(str(prompt.alias))
+     if prompt.label:
+         parts.append(str(prompt.label))
+
+     ms = prompt.model_settings
+     if ms is not None:
+         if ms.provider is not None:
+             parts.append(ms.provider.value)
+         if ms.name:
+             parts.append(ms.name)
+
+     base = "-".join(_slug(p) for p in parts if p) or f"module-{index+1}"
+     base = base[:64] or f"module-{index+1}"
+
+     candidate = base
+     suffix = 2
+     while candidate in existing:
+         candidate = f"{base}-{suffix}"
+         candidate = candidate[:64]
+         suffix += 1
+
+     existing.add(candidate)
+     return candidate
+
+
+ def normalize_seed_prompts(
+     seed_prompts: Union[Dict[ModuleId, Prompt], List[Prompt]],
+ ) -> Dict[ModuleId, Prompt]:
+     """
+     Accept either {module_id: Prompt} or List[Prompt].
+     If a list is given, generate human-readable module ids.
+     """
+     if isinstance(seed_prompts, dict):
+         return dict(seed_prompts)  # shallow copy
+
+     mapping: Dict[ModuleId, Prompt] = {}
+     used: Set[str] = set()
+     for i, prompt in enumerate(seed_prompts):
+         module_id = generate_module_id(prompt, i, used)
+         mapping[module_id] = prompt
+     return mapping
+
+
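A short illustrative sketch (not part of the diff) of how a list of seed prompts is normalized into module ids; it assumes `Prompt` accepts `alias` and `text_template` keyword arguments (the latter is relied on by `inflate_prompts_from_report` below):

from deepeval.prompt.prompt import Prompt
from deepeval.optimization.utils import normalize_seed_prompts

seed = [
    Prompt(alias="Summarizer", text_template="Summarize: {text}"),
    Prompt(alias="Summarizer", text_template="Summarize briefly: {text}"),
]
modules = normalize_seed_prompts(seed)
# Duplicate ids are deduped with a numeric suffix,
# e.g. {"summarizer": <Prompt>, "summarizer-2": <Prompt>}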
+ def build_model_callback_kwargs(
+     *,
+     # scoring context
+     golden: Optional[Union["Golden", "ConversationalGolden"]] = None,
+     # rewriter context
+     feedback_text: Optional[str] = None,
+     # shared
+     prompt: Optional[Prompt] = None,
+     prompt_type: Optional[str] = None,
+     prompt_text: Optional[str] = None,
+     prompt_messages: Optional[List["PromptMessage"]] = None,
+ ) -> Dict[str, Any]:
+     """
+     Build a superset of kwargs for GEPA model callbacks.
+
+     All keys are present in the dict so callbacks can declare any subset of:
+
+         hook: str  # injected by (a_)invoke_model_callback
+         prompt: Prompt
+         prompt_type: str
+         prompt_text: str
+         prompt_messages: List[PromptMessage]
+         golden: Golden | ConversationalGolden
+         feedback_text: str
+
+     Non applicable fields are set to None.
+     """
+     return {
+         # scoring context
+         "golden": golden,
+         # rewriter context
+         "feedback_text": feedback_text,
+         # shared
+         "prompt": prompt,
+         "prompt_text": prompt_text,
+         "prompt_messages": prompt_messages,
+     }
+
+
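An illustrative sketch (not part of the diff) of a callback that declares only the kwargs it needs, plus the superset built for it; the callback body is a canned stand-in for a real model call:

from deepeval.optimization.utils import build_model_callback_kwargs

def my_callback(prompt_text: str, golden=None, hook: str = "") -> str:
    # Only the parameters declared here are passed in by the invoke helpers
    # below; `hook` is injected to identify the call site.
    # A real callback would call an LLM here; return a canned string instead.
    return f"[{hook}] echo: {prompt_text}"

kwargs = build_model_callback_kwargs(prompt_text="Answer concisely: {question}")
# kwargs contains every supported key; non-applicable ones are None.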
+ def invoke_model_callback(
+     *,
+     hook: str,
+     model_callback: Callable[
+         ...,
+         Union[
+             str,
+             Dict,
+             Tuple[Union[str, Dict], float],
+         ],
+     ],
+     candidate_kwargs: Dict[str, Any],
+ ) -> Union[
+     str,
+     Dict,
+     Tuple[Union[str, Dict], float],
+ ]:
+     """
+     Call a user provided model_callback in a synchronous context.
+
+     - Filters kwargs to only those the callback accepts.
+     - Injects `hook` if the callback declares it.
+     - Raises if the callback returns an awaitable; callers must use async
+       helpers for async callbacks.
+     """
+     sig = inspect.signature(model_callback)
+     supported = set(sig.parameters.keys())
+
+     filtered = {
+         key: value
+         for key, value in candidate_kwargs.items()
+         if key in supported
+     }
+
+     if "hook" in supported:
+         filtered["hook"] = hook
+
+     result = model_callback(**filtered)
+     if inspect.isawaitable(result):
+         raise DeepEvalError(
+             "model_callback returned an awaitable from a synchronous context. "
+             "Either declare the callback as `async def` and use async GEPA, or call "
+             "`model.generate(...)` instead of `model.a_generate(...)` inside a sync callback."
+         )
+     return result
+
+
+ async def a_invoke_model_callback(
+     *,
+     hook: str,
+     model_callback: Callable[
+         ...,
+         Union[
+             str,
+             Dict,
+             Tuple[Union[str, Dict], float],
+         ],
+     ],
+     candidate_kwargs: Dict[str, Any],
+ ) -> Union[
+     str,
+     Dict,
+     Tuple[Union[str, Dict], float],
+ ]:
+     """
+     Call a user provided model_callback in an async context.
+
+     - Filters kwargs to only those the callback accepts.
+     - Injects `hook` if the callback declares it.
+     - Supports both sync and async callbacks.
+     """
+     sig = inspect.signature(model_callback)
+     supported = set(sig.parameters.keys())
+
+     filtered = {
+         key: value
+         for key, value in candidate_kwargs.items()
+         if key in supported
+     }
+
+     if "hook" in supported:
+         filtered["hook"] = hook
+
+     result = model_callback(**filtered)
+     if inspect.isawaitable(result):
+         return await result
+     return result
+
+
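Continuing the sketch above (still illustrative, not part of the diff); the hook label is an arbitrary string chosen here:

from deepeval.optimization.utils import (
    invoke_model_callback,
    a_invoke_model_callback,
)

result = invoke_model_callback(
    hook="scoring",  # arbitrary label for the call site
    model_callback=my_callback,
    candidate_kwargs=kwargs,
)

# If my_callback were declared `async def`, the async helper awaits it:
# result = await a_invoke_model_callback(
#     hook="scoring", model_callback=my_callback, candidate_kwargs=kwargs
# )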
+ ###########
+ # Reports #
+ ###########
+
+
+ def build_prompt_config_snapshots(
+     prompt_configurations_by_id: Dict[
+         PromptConfigurationId, "PromptConfiguration"
+     ],
+ ) -> Dict[PromptConfigurationId, Dict[str, Any]]:
+     """
+     Build a serializable snapshot of all prompt configurations.
+
+     Shape matches the docs for `prompt_configurations`:
+
+     {
+         "<config_id>": {
+             "parent": "<parent_id or None>",
+             "prompts": {
+                 "<module_id>": {
+                     "type": "TEXT",
+                     "text_template": "...",
+                 }
+                 # or
+                 "<module_id>": {
+                     "type": "LIST",
+                     "messages": [
+                         {"role": "system", "content": "..."},
+                         ...
+                     ],
+                 },
+             },
+         },
+         ...
+     }
+     """
+     snapshots: Dict[PromptConfigurationId, Dict[str, Any]] = {}
+
+     for cfg_id, cfg in prompt_configurations_by_id.items():
+         prompts_snapshot: Dict[str, Any] = {}
+
+         for module_id, prompt in cfg.prompts.items():
+             if prompt.type is PromptType.LIST:
+                 messages = [
+                     {"role": msg.role, "content": (msg.content or "")}
+                     for msg in (prompt.messages_template or [])
+                 ]
+                 prompts_snapshot[module_id] = {
+                     "type": "LIST",
+                     "messages": messages,
+                 }
+             else:
+                 prompts_snapshot[module_id] = {
+                     "type": "TEXT",
+                     "text_template": (prompt.text_template or ""),
+                 }
+
+         snapshots[cfg_id] = {
+             "parent": cfg.parent,
+             "prompts": prompts_snapshot,
+         }
+
+     return snapshots
+
+
+ def inflate_prompts_from_report(
+     report: OptimizationReport,
+ ) -> Dict[str, Dict[str, Prompt]]:
+     """
+     Build a mapping from configuration id -> { module_id -> Prompt }.
+
+     This is a convenience for users who want to work with real Prompt
+     instances instead of raw snapshots.
+
+     Returns:
+         {
+             "<config_id>": {
+                 "<module_id>": Prompt(...),
+                 ...
+             },
+             ...
+         }
+     """
+     inflated: Dict[str, Dict[str, Prompt]] = {}
+
+     for cfg_id, cfg_snapshot in report.prompt_configurations.items():
+         module_prompts: Dict[str, Prompt] = {}
+
+         for module_id, module_snapshot in cfg_snapshot.prompts.items():
+             if module_snapshot.type == "TEXT":
+                 module_prompts[module_id] = Prompt(
+                     text_template=module_snapshot.text_template or ""
+                 )
+             else:  # "LIST"
+                 messages = [
+                     PromptMessage(role=m.role, content=m.content)
+                     for m in module_snapshot.messages or []
+                 ]
+                 module_prompts[module_id] = Prompt(messages_template=messages)
+
+         inflated[cfg_id] = module_prompts
+
+     return inflated
+
+
+ def get_best_prompts_from_report(
+     report: OptimizationReport,
+ ) -> Dict[str, Prompt]:
+     """
+     Convenience wrapper returning the best configuration's module prompts.
+     """
+     all_prompts = inflate_prompts_from_report(report)
+     return all_prompts.get(report.best_id, {})
+
+
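An illustrative sketch (not part of the diff) of reading prompts back out of a finished report; `report` is assumed to be an `OptimizationReport` produced elsewhere (for example by the new prompt_optimizer module listed above):

from deepeval.optimization.utils import (
    inflate_prompts_from_report,
    get_best_prompts_from_report,
)

# report: OptimizationReport produced by an optimization run (assumed available)
best_prompts = get_best_prompts_from_report(report)  # {module_id: Prompt} for report.best_id
all_configs = inflate_prompts_from_report(report)    # {config_id: {module_id: Prompt}}
for module_id, prompt in best_prompts.items():
    print(module_id, prompt.type)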
+ ##############
+ # Validation #
+ ##############
+ def _format_type_names(types: Tuple[type, ...]) -> str:
+     names = [t.__name__ for t in types]
+     if len(names) == 1:
+         return names[0]
+     if len(names) == 2:
+         return f"{names[0]} or {names[1]}"
+     return ", ".join(names[:-1]) + f", or {names[-1]}"
+
+
+ def validate_instance(
+     *,
+     component: str,
+     param_name: str,
+     value: Any,
+     expected_types: Union[type, Tuple[type, ...]],
+     allow_none: bool = False,
+ ) -> Any:
+     """
+     Generic type validator.
+
+     - component: intended to help identify what is being validated,
+       e.g. "PromptOptimizer.__init__", "PromptOptimizer.optimize", etc.
+     - param_name: the name of the parameter being validated.
+     - value: the actual value passed.
+     - expected_types: a type or tuple of types to accept.
+     - allow_none: if True, None is allowed and returned as-is.
+     """
+     if value is None and allow_none:
+         return value
+
+     if not isinstance(expected_types, tuple):
+         expected_types = (expected_types,)
+
+     if not isinstance(value, expected_types):
+         expected_desc = _format_type_names(expected_types)
+         raise DeepEvalError(
+             f"{component} expected `{param_name}` to be an instance of "
+             f"{expected_desc}, but received {type(value).__name__!r} instead."
+         )
+     return value
+
+
+ def validate_sequence_of(
+     *,
+     component: str,
+     param_name: str,
+     value: Any,
+     expected_item_types: Union[type, Tuple[type, ...]],
+     sequence_types: Tuple[type, ...] = (list, tuple),
+     allow_none: bool = False,
+ ) -> Any:
+     """
+     Generic container validator.
+
+     - Ensures `value` is one of `sequence_types` (list or tuple by default).
+     - Ensures each item is an instance of `expected_item_types`.
+
+     Returns the original `value` on success.
+     """
+     if value is None:
+         if allow_none:
+             return value
+         raise DeepEvalError(
+             f"{component} expected `{param_name}` to be a "
+             f"{_format_type_names(sequence_types)} of "
+             f"{_format_type_names(expected_item_types if isinstance(expected_item_types, tuple) else (expected_item_types,))}, "
+             "but received None instead."
+         )
+
+     if not isinstance(sequence_types, tuple):
+         sequence_types = (sequence_types,)
+
+     if not isinstance(value, sequence_types):
+         expected_seq = _format_type_names(sequence_types)
+         raise DeepEvalError(
+             f"{component} expected `{param_name}` to be a {expected_seq}, "
+             f"but received {type(value).__name__!r} instead."
+         )
+
+     if not isinstance(expected_item_types, tuple):
+         expected_item_types = (expected_item_types,)
+
+     for index, item in enumerate(value):
+         if not isinstance(item, expected_item_types):
+             expected_items = _format_type_names(expected_item_types)
+             raise DeepEvalError(
+                 f"{component} expected all elements of `{param_name}` to be "
+                 f"instances of {expected_items}, but element at index {index} "
+                 f"has type {type(item).__name__!r}."
+             )
+
+     return value
+
+
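An illustrative sketch of the two generic validators (not part of the diff; the component and parameter names are arbitrary):

from deepeval.errors import DeepEvalError
from deepeval.optimization.utils import validate_instance, validate_sequence_of

validate_instance(
    component="PromptOptimizer.optimize",
    param_name="pareto_size",
    value=8,
    expected_types=int,
)  # returns 8

try:
    validate_sequence_of(
        component="PromptOptimizer.optimize",
        param_name="goldens",
        value=["not-a-golden"],
        expected_item_types=dict,
    )
except DeepEvalError as e:
    print(e)  # "... element at index 0 has type 'str'."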
+ def validate_callback(
+     *,
+     component: str,
+     model_callback: Optional[
+         Callable[
+             ...,
+             Union[
+                 str,
+                 Dict,
+                 Tuple[Union[str, Dict], float],
+             ],
+         ]
+     ],
+ ) -> Callable[..., Union[str, Dict, Tuple[Union[str, Dict], float]]]:
+     """
+     Ensure that `model_callback` is provided.
+
+     - `model_callback` should be a callable that performs generation and
+       returns the model output.
+
+     Returns `model_callback` unchanged on success.
+     """
+     if model_callback is None:
+         raise DeepEvalError(
+             f"{component} requires a `model_callback`.\n\n"
+             "supply a custom callable via `model_callback=` that performs "
+             "generation and returns the model output."
+         )
+     return model_callback
+
+
+ def validate_metrics(
+     *,
+     component: str,
+     metrics: Union[List[BaseMetric], List[BaseConversationalMetric]],
+ ) -> Union[List[BaseMetric], List[BaseConversationalMetric]]:
+
+     if metrics is None or not len(metrics):
+         raise DeepEvalError(
+             f"{component} requires a `metrics`.\n\n"
+             "supply one or more DeepEval metrics via `metrics=`"
+         )
+
+     validate_sequence_of(
+         component=component,
+         param_name="metrics",
+         value=metrics,
+         expected_item_types=(BaseMetric, BaseConversationalMetric),
+         sequence_types=(list, tuple),
+     )
+     return list(metrics)
+
+
+ def validate_int_in_range(
+     *,
+     component: str,
+     param_name: str,
+     value: int,
+     min_inclusive: Optional[int] = None,
+     max_exclusive: Optional[int] = None,
+ ) -> int:
+     """
+     Validate that an int is within range [min_inclusive, max_exclusive).
+
+     - If `min_inclusive` is not None, value must be >= min_inclusive.
+     - If `max_exclusive` is not None, value must be < max_exclusive.
+
+     Returns the validated int on success.
+     """
+     value = validate_instance(
+         component=component,
+         param_name=param_name,
+         value=value,
+         expected_types=int,
+     )
+
+     # Lower bound check
+     if min_inclusive is not None and value < min_inclusive:
+         if max_exclusive is None:
+             raise DeepEvalError(
+                 f"{component} expected `{param_name}` to be >= {min_inclusive}, "
+                 f"but received {value!r} instead."
+             )
+         max_inclusive = max_exclusive - 1
+         raise DeepEvalError(
+             f"{component} expected `{param_name}` to be between "
+             f"{min_inclusive} and {max_inclusive} (inclusive), "
+             f"but received {value!r} instead."
+         )
+
+     # Upper bound check (half-open, < max_exclusive)
+     if max_exclusive is not None and value >= max_exclusive:
+         if min_inclusive is None:
+             raise DeepEvalError(
+                 f"{component} expected `{param_name}` to be < {max_exclusive}, "
+                 f"but received {value!r} instead."
+             )
+         max_inclusive = max_exclusive - 1
+         raise DeepEvalError(
+             f"{component} expected `{param_name}` to be between "
+             f"{min_inclusive} and {max_inclusive} (inclusive), "
+             f"but received {value!r} instead."
+         )
+
+     return value
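A brief illustrative sketch of the half-open range check (not part of the diff; the component and parameter names are arbitrary):

from deepeval.optimization.utils import validate_int_in_range

validate_int_in_range(
    component="GEPAConfig",
    param_name="pareto_size",
    value=4,
    min_inclusive=1,
    max_exclusive=33,
)  # OK: 1 <= 4 < 33; a value of 33 would raise "between 1 and 32 (inclusive)"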
deepeval/prompt/prompt.py CHANGED
@@ -4,12 +4,9 @@ import json
  import os
 
  from enum import Enum
- from typing import Optional, List, Dict, Type, Literal
+ from typing import Optional, List, Dict, Type, Literal, TYPE_CHECKING
  from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn
  from rich.console import Console
- import time
- import json
- import os
  from pydantic import BaseModel, ValidationError
  import asyncio
  import threading
@@ -38,6 +35,9 @@ from deepeval.confident.api import Api, Endpoints, HttpMethods
  from deepeval.constants import HIDDEN_DIR
 
 
+ if TYPE_CHECKING:
+     from deepeval.optimization.types import OptimizationReport
+
  logger = logging.getLogger(__name__)
 
  portalocker = None
@@ -145,6 +145,9 @@ class Prompt:
          elif messages_template:
              self.type = PromptType.LIST
 
+         # updated after optimization runs
+         self.optimization_report: Optional["OptimizationReport"] = None
+
      def __del__(self):
          """Cleanup polling tasks when instance is destroyed"""
          try:
@@ -178,7 +181,7 @@ class Prompt:
              content = f.read()
              try:
                  data = json.loads(content)
-             except (json.JSONDecodeError, TypeError):
+             except (TypeError, json.JSONDecodeError):
                  self.text_template = content
                  return content
 
@@ -364,6 +367,8 @@ class Prompt:
                  f.seek(0)
                  f.truncate()
                  json.dump(cache_data, f, cls=CustomEncoder)
+                 f.flush()
+                 os.fsync(f.fileno())
          except portalocker.exceptions.LockException:
              # If we can't acquire the lock, silently skip caching
              pass
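The new `optimization_report` attribute gives `Prompt` a place to hold results from the optimization package added in this release. A hedged sketch (not part of the diff; the exact API that populates the attribute is not shown here):

from deepeval.prompt.prompt import Prompt
from deepeval.optimization.utils import get_best_prompts_from_report

prompt = Prompt(text_template="Answer the question: {question}")
# ... an optimization run (see deepeval/optimization/prompt_optimizer.py) is
# expected to populate prompt.optimization_report ...
if prompt.optimization_report is not None:
    best = get_best_prompts_from_report(prompt.optimization_report)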
deepeval/test_run/cache.py CHANGED
@@ -90,6 +90,8 @@ class CachedTestRun(BaseModel):
              # Pydantic version below 2.0
              body = self.dict(by_alias=True, exclude_none=True)
          json.dump(body, f, cls=CustomEncoder)
+         f.flush()
+         os.fsync(f.fileno())
          return self
 
      # load from file (this happens initially during a test run)
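The flush/fsync pairs added here, in test_run.py, and in prompt.py follow a standard durability idiom: `flush()` pushes Python's userspace buffer to the OS, and `os.fsync()` asks the OS to commit the bytes to disk before the lock is released. A generic sketch of the pattern (not deepeval-specific):

import json
import os

def write_json_durably(path: str, payload: dict) -> None:
    with open(path, "w") as f:
        json.dump(payload, f)
        f.flush()              # flush Python's buffer to the OS
        os.fsync(f.fileno())   # ask the OS to persist the file contents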
deepeval/test_run/test_run.py CHANGED
@@ -406,9 +406,10 @@
          try:
              body = self.model_dump(by_alias=True, exclude_none=True)
          except AttributeError:
-             # Pydantic version below 2.0
              body = self.dict(by_alias=True, exclude_none=True)
          json.dump(body, f, cls=TestRunEncoder)
+         f.flush()
+         os.fsync(f.fileno())
          return self
 
      @classmethod
@@ -515,6 +516,8 @@ class TestRunManager:
                  )
                  wrapper_data = {save_under_key: test_run_data}
                  json.dump(wrapper_data, file, cls=TestRunEncoder)
+                 file.flush()
+                 os.fsync(file.fileno())
              else:
                  self.test_run.save(file)
          except portalocker.exceptions.LockException:
@@ -527,6 +530,8 @@
                  LATEST_TEST_RUN_FILE_PATH, mode="w"
              ) as file:
                  json.dump({LATEST_TEST_RUN_LINK_KEY: link}, file)
+                 file.flush()
+                 os.fsync(file.fileno())
          except portalocker.exceptions.LockException:
              pass
 
deepeval/tracing/context.py CHANGED
@@ -73,6 +73,7 @@ def update_current_trace(
      tools_called: Optional[List[ToolCall]] = None,
      expected_tools: Optional[List[ToolCall]] = None,
      test_case: Optional[LLMTestCase] = None,
+     confident_api_key: Optional[str] = None,
  ):
      current_trace = current_trace_context.get()
      if not current_trace:
@@ -109,6 +110,8 @@ def update_current_trace(
          current_trace.tools_called = tools_called
      if expected_tools:
          current_trace.expected_tools = expected_tools
+     if confident_api_key:
+         current_trace.confident_api_key = confident_api_key
 
 
  def update_llm_span(
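An illustrative sketch (not part of the diff) of the new `confident_api_key` parameter; the `observe`/`update_current_trace` import path is the usual deepeval tracing entry point and is assumed here:

from deepeval.tracing import observe, update_current_trace

@observe()
def my_llm_app(question: str) -> str:
    answer = "..."  # call your LLM / agent here
    # Added in this diff: route the current trace to a specific Confident AI API key
    update_current_trace(confident_api_key="<your-confident-api-key>")
    return answer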