deepeval 3.7.4__py3-none-any.whl → 3.7.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (224)
  1. deepeval/_version.py +1 -1
  2. deepeval/config/settings.py +35 -1
  3. deepeval/dataset/api.py +23 -1
  4. deepeval/dataset/golden.py +139 -2
  5. deepeval/evaluate/evaluate.py +16 -11
  6. deepeval/evaluate/execute.py +13 -181
  7. deepeval/evaluate/utils.py +6 -26
  8. deepeval/integrations/pydantic_ai/agent.py +19 -2
  9. deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
  10. deepeval/key_handler.py +3 -0
  11. deepeval/metrics/__init__.py +14 -16
  12. deepeval/metrics/answer_relevancy/answer_relevancy.py +118 -116
  13. deepeval/metrics/answer_relevancy/template.py +22 -3
  14. deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
  15. deepeval/metrics/arena_g_eval/template.py +17 -1
  16. deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
  17. deepeval/metrics/argument_correctness/template.py +19 -2
  18. deepeval/metrics/base_metric.py +13 -44
  19. deepeval/metrics/bias/bias.py +102 -108
  20. deepeval/metrics/bias/template.py +14 -2
  21. deepeval/metrics/contextual_precision/contextual_precision.py +96 -94
  22. deepeval/metrics/contextual_precision/template.py +115 -66
  23. deepeval/metrics/contextual_recall/contextual_recall.py +94 -84
  24. deepeval/metrics/contextual_recall/template.py +106 -55
  25. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +86 -84
  26. deepeval/metrics/contextual_relevancy/template.py +87 -58
  27. deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
  28. deepeval/metrics/conversation_completeness/template.py +23 -3
  29. deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
  30. deepeval/metrics/conversational_dag/nodes.py +66 -123
  31. deepeval/metrics/conversational_dag/templates.py +16 -0
  32. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
  33. deepeval/metrics/dag/dag.py +10 -0
  34. deepeval/metrics/dag/nodes.py +63 -126
  35. deepeval/metrics/dag/templates.py +16 -2
  36. deepeval/metrics/exact_match/exact_match.py +9 -1
  37. deepeval/metrics/faithfulness/faithfulness.py +138 -149
  38. deepeval/metrics/faithfulness/schema.py +1 -1
  39. deepeval/metrics/faithfulness/template.py +200 -115
  40. deepeval/metrics/g_eval/g_eval.py +87 -78
  41. deepeval/metrics/g_eval/template.py +18 -1
  42. deepeval/metrics/g_eval/utils.py +7 -6
  43. deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
  44. deepeval/metrics/goal_accuracy/template.py +21 -3
  45. deepeval/metrics/hallucination/hallucination.py +60 -75
  46. deepeval/metrics/hallucination/template.py +13 -0
  47. deepeval/metrics/indicator.py +7 -10
  48. deepeval/metrics/json_correctness/json_correctness.py +40 -38
  49. deepeval/metrics/json_correctness/template.py +10 -0
  50. deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
  51. deepeval/metrics/knowledge_retention/schema.py +9 -3
  52. deepeval/metrics/knowledge_retention/template.py +12 -0
  53. deepeval/metrics/mcp/mcp_task_completion.py +68 -38
  54. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +92 -74
  55. deepeval/metrics/mcp/template.py +52 -0
  56. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
  57. deepeval/metrics/mcp_use_metric/template.py +12 -0
  58. deepeval/metrics/misuse/misuse.py +77 -97
  59. deepeval/metrics/misuse/template.py +15 -0
  60. deepeval/metrics/multimodal_metrics/__init__.py +0 -19
  61. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +59 -53
  62. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +79 -95
  63. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +59 -53
  64. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +59 -53
  65. deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +111 -109
  66. deepeval/metrics/non_advice/non_advice.py +79 -105
  67. deepeval/metrics/non_advice/template.py +12 -0
  68. deepeval/metrics/pattern_match/pattern_match.py +12 -4
  69. deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
  70. deepeval/metrics/pii_leakage/template.py +14 -0
  71. deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
  72. deepeval/metrics/plan_adherence/template.py +11 -0
  73. deepeval/metrics/plan_quality/plan_quality.py +63 -87
  74. deepeval/metrics/plan_quality/template.py +9 -0
  75. deepeval/metrics/prompt_alignment/prompt_alignment.py +72 -83
  76. deepeval/metrics/prompt_alignment/template.py +12 -0
  77. deepeval/metrics/ragas.py +3 -3
  78. deepeval/metrics/role_adherence/role_adherence.py +48 -71
  79. deepeval/metrics/role_adherence/template.py +14 -0
  80. deepeval/metrics/role_violation/role_violation.py +75 -108
  81. deepeval/metrics/role_violation/template.py +12 -0
  82. deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
  83. deepeval/metrics/step_efficiency/template.py +11 -0
  84. deepeval/metrics/summarization/summarization.py +115 -183
  85. deepeval/metrics/summarization/template.py +19 -0
  86. deepeval/metrics/task_completion/task_completion.py +67 -73
  87. deepeval/metrics/tool_correctness/tool_correctness.py +45 -44
  88. deepeval/metrics/tool_use/tool_use.py +42 -66
  89. deepeval/metrics/topic_adherence/template.py +13 -0
  90. deepeval/metrics/topic_adherence/topic_adherence.py +53 -67
  91. deepeval/metrics/toxicity/template.py +13 -0
  92. deepeval/metrics/toxicity/toxicity.py +80 -99
  93. deepeval/metrics/turn_contextual_precision/schema.py +21 -0
  94. deepeval/metrics/turn_contextual_precision/template.py +187 -0
  95. deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +592 -0
  96. deepeval/metrics/turn_contextual_recall/schema.py +21 -0
  97. deepeval/metrics/turn_contextual_recall/template.py +178 -0
  98. deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +563 -0
  99. deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
  100. deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
  101. deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +576 -0
  102. deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
  103. deepeval/metrics/turn_faithfulness/template.py +218 -0
  104. deepeval/metrics/turn_faithfulness/turn_faithfulness.py +627 -0
  105. deepeval/metrics/turn_relevancy/template.py +14 -0
  106. deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
  107. deepeval/metrics/utils.py +158 -122
  108. deepeval/models/__init__.py +0 -12
  109. deepeval/models/base_model.py +49 -33
  110. deepeval/models/embedding_models/__init__.py +7 -0
  111. deepeval/models/embedding_models/azure_embedding_model.py +79 -33
  112. deepeval/models/embedding_models/local_embedding_model.py +39 -20
  113. deepeval/models/embedding_models/ollama_embedding_model.py +52 -19
  114. deepeval/models/embedding_models/openai_embedding_model.py +42 -22
  115. deepeval/models/llms/amazon_bedrock_model.py +226 -72
  116. deepeval/models/llms/anthropic_model.py +178 -63
  117. deepeval/models/llms/azure_model.py +218 -60
  118. deepeval/models/llms/constants.py +2032 -0
  119. deepeval/models/llms/deepseek_model.py +95 -40
  120. deepeval/models/llms/gemini_model.py +209 -64
  121. deepeval/models/llms/grok_model.py +139 -68
  122. deepeval/models/llms/kimi_model.py +140 -90
  123. deepeval/models/llms/litellm_model.py +131 -37
  124. deepeval/models/llms/local_model.py +125 -21
  125. deepeval/models/llms/ollama_model.py +147 -24
  126. deepeval/models/llms/openai_model.py +222 -269
  127. deepeval/models/llms/portkey_model.py +81 -22
  128. deepeval/models/llms/utils.py +8 -3
  129. deepeval/models/retry_policy.py +17 -14
  130. deepeval/models/utils.py +106 -5
  131. deepeval/optimizer/__init__.py +5 -0
  132. deepeval/optimizer/algorithms/__init__.py +6 -0
  133. deepeval/optimizer/algorithms/base.py +29 -0
  134. deepeval/optimizer/algorithms/configs.py +18 -0
  135. deepeval/optimizer/algorithms/copro/__init__.py +5 -0
  136. deepeval/{optimization/copro/loop.py → optimizer/algorithms/copro/copro.py} +112 -113
  137. deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
  138. deepeval/{optimization/gepa/loop.py → optimizer/algorithms/gepa/gepa.py} +175 -115
  139. deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
  140. deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
  141. deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
  142. deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
  143. deepeval/optimizer/algorithms/simba/__init__.py +5 -0
  144. deepeval/{optimization/simba/loop.py → optimizer/algorithms/simba/simba.py} +128 -112
  145. deepeval/{optimization → optimizer}/configs.py +5 -8
  146. deepeval/{optimization/policies/selection.py → optimizer/policies.py} +63 -2
  147. deepeval/optimizer/prompt_optimizer.py +263 -0
  148. deepeval/optimizer/rewriter/__init__.py +5 -0
  149. deepeval/optimizer/rewriter/rewriter.py +124 -0
  150. deepeval/optimizer/rewriter/utils.py +214 -0
  151. deepeval/optimizer/scorer/__init__.py +5 -0
  152. deepeval/optimizer/scorer/base.py +86 -0
  153. deepeval/optimizer/scorer/scorer.py +316 -0
  154. deepeval/optimizer/scorer/utils.py +30 -0
  155. deepeval/optimizer/types.py +148 -0
  156. deepeval/{optimization → optimizer}/utils.py +47 -165
  157. deepeval/prompt/prompt.py +5 -9
  158. deepeval/simulator/conversation_simulator.py +43 -0
  159. deepeval/simulator/template.py +13 -0
  160. deepeval/test_case/__init__.py +1 -3
  161. deepeval/test_case/api.py +26 -45
  162. deepeval/test_case/arena_test_case.py +7 -2
  163. deepeval/test_case/conversational_test_case.py +68 -1
  164. deepeval/test_case/llm_test_case.py +206 -1
  165. deepeval/test_case/utils.py +4 -8
  166. deepeval/test_run/api.py +18 -14
  167. deepeval/test_run/test_run.py +3 -3
  168. deepeval/tracing/patchers.py +9 -4
  169. deepeval/tracing/tracing.py +2 -2
  170. deepeval/utils.py +65 -0
  171. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/METADATA +1 -4
  172. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/RECORD +180 -193
  173. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
  174. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
  175. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
  176. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
  177. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
  178. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
  179. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
  180. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
  181. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
  182. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
  183. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
  184. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
  185. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
  186. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
  187. deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
  188. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
  189. deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
  190. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -148
  191. deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
  192. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
  193. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
  194. deepeval/models/mlllms/__init__.py +0 -4
  195. deepeval/models/mlllms/azure_model.py +0 -343
  196. deepeval/models/mlllms/gemini_model.py +0 -313
  197. deepeval/models/mlllms/ollama_model.py +0 -175
  198. deepeval/models/mlllms/openai_model.py +0 -309
  199. deepeval/optimization/__init__.py +0 -13
  200. deepeval/optimization/adapters/__init__.py +0 -2
  201. deepeval/optimization/adapters/deepeval_scoring_adapter.py +0 -588
  202. deepeval/optimization/aggregates.py +0 -14
  203. deepeval/optimization/copro/configs.py +0 -31
  204. deepeval/optimization/gepa/__init__.py +0 -7
  205. deepeval/optimization/gepa/configs.py +0 -115
  206. deepeval/optimization/miprov2/configs.py +0 -134
  207. deepeval/optimization/miprov2/loop.py +0 -785
  208. deepeval/optimization/mutations/__init__.py +0 -0
  209. deepeval/optimization/mutations/prompt_rewriter.py +0 -458
  210. deepeval/optimization/policies/__init__.py +0 -16
  211. deepeval/optimization/policies/tie_breaker.py +0 -67
  212. deepeval/optimization/prompt_optimizer.py +0 -462
  213. deepeval/optimization/simba/__init__.py +0 -0
  214. deepeval/optimization/simba/configs.py +0 -33
  215. deepeval/optimization/types.py +0 -361
  216. deepeval/test_case/mllm_test_case.py +0 -170
  217. /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
  218. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
  219. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
  220. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
  221. /deepeval/{optimization → optimizer/algorithms}/simba/types.py +0 -0
  222. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/LICENSE.md +0 -0
  223. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/WHEEL +0 -0
  224. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/entry_points.txt +0 -0
@@ -1,115 +0,0 @@ deepeval/optimization/gepa/configs.py (file removed)
1
- from __future__ import annotations
2
- import time
3
- from typing import Optional
4
- from pydantic import (
5
- BaseModel,
6
- confloat,
7
- conint,
8
- Field,
9
- field_validator,
10
- PositiveInt,
11
- )
12
-
13
- from deepeval.optimization.policies.tie_breaker import (
14
- TieBreaker as TieBreakerPolicy,
15
- )
16
-
17
-
18
- class GEPAConfig(BaseModel):
19
- """
20
- Core configuration for the GEPA optimization loop.
21
-
22
- This controls:
23
- - The iteration budget and acceptance threshold (iterations, min_delta).
24
- - How D_train is split into a Pareto validation subset (D_pareto)
25
- versus a feedback subset (D_feedback) (pareto_size).
26
- - How minibatches are drawn from D_feedback, either with a fixed size
27
- or dynamically from a ratio and min/max bounds (minibatch_* fields).
28
- - How ties on aggregate scores are treated (tie_tolerance, tie_breaker).
29
- - Randomness and rewrite instruction length (random_seed,
30
- rewrite_instruction_max_chars).
31
-
32
- See individual field descriptions for precise behavior.
33
- """
34
-
35
- iterations: PositiveInt = Field(
36
- default=5,
37
- description="Total number of GEPA loop iterations (mutation attempts). "
38
- "This acts as the optimization budget B in the GEPA paper.",
39
- )
40
- minibatch_size: Optional[conint(ge=1)] = Field(
41
- default=None,
42
- description="Fixed minibatch size drawn from D_feedback. When set, this "
43
- "overrides dynamic sizing based on `minibatch_ratio`, "
44
- "`minibatch_min_size`, and `minibatch_max_size`.",
45
- )
46
- minibatch_min_size: conint(ge=1) = Field(
47
- default=4,
48
- description="Hard lower bound on the minibatch size used for D_feedback "
49
- "when dynamic sizing is in effect.",
50
- )
51
- minibatch_max_size: PositiveInt = Field(
52
- default=32,
53
- description="Hard upper bound on the minibatch size used for D_feedback "
54
- "when dynamic sizing is in effect.",
55
- )
56
- minibatch_ratio: confloat(gt=0.0, le=1.0) = Field(
57
- default=0.05,
58
- description=(
59
- "Target fraction of |D_feedback| used to compute a dynamic "
60
- "minibatch size when `minibatch_size` is None. The effective "
61
- "size is round(len(D_feedback) * minibatch_ratio) bounded "
62
- "between `minibatch_min_size` and `minibatch_max_size` and not "
63
- "exceeding len(D_feedback). D_feedback is the subset of the "
64
- "provided goldens that is not allocated to D_pareto by "
65
- "`split_goldens(...)`."
66
- ),
67
- )
68
- pareto_size: conint(ge=1) = Field(
69
- default=3,
70
- description="Size of the Pareto validation subset D_pareto. The splitter "
71
- "will bind this between [0, len(goldens)], and the runner requires "
72
- "at least 2 total goldens to run GEPA.",
73
- )
74
- random_seed: conint(ge=0) = Field(
75
- default=0,
76
- description="Non-negative RNG seed for reproducibility. "
77
- "If you explicitly pass None, it is replaced with a seed "
78
- "derived from time.time_ns() via the field validator.",
79
- )
80
- min_delta: confloat(ge=0.0) = Field(
81
- default=0.0,
82
- description="Minimum improvement required for a child configuration to be "
83
- "accepted, e.g. σ_child >= σ_parent + min_delta. A small jitter "
84
- "is applied internally to avoid floating-point edge cases.",
85
- )
86
- # Two candidates are considered tied if their aggregate scores are within tie_tolerance.
87
- tie_tolerance: confloat(ge=0.0) = Field(
88
- 1e-9,
89
- description="Two candidates are considered tied on aggregate score if "
90
- "their values differ by at most this tolerance.",
91
- )
92
- tie_breaker: TieBreakerPolicy = Field(
93
- TieBreakerPolicy.PREFER_CHILD,
94
- description="Policy used to break ties when multiple prompt configurations "
95
- "share the best aggregate score. See `GEPAConfig.TieBreaker` "
96
- "for the available options. ",
97
- )
98
- rewrite_instruction_max_chars: PositiveInt = Field(
99
- default=4096,
100
- description=(
101
- "Maximum number of characters from prompt, feedback, and related text "
102
- "included in rewrite instructions."
103
- ),
104
- )
105
-
106
- @field_validator("random_seed", mode="before")
107
- @classmethod
108
- def _coerce_random_seed(cls, seed):
109
- if seed is None:
110
- return time.time_ns()
111
- else:
112
- return seed
113
-
114
-
115
- GEPAConfig.TieBreaker = TieBreakerPolicy
@@ -1,134 +0,0 @@ deepeval/optimization/miprov2/configs.py (file removed)
1
- from __future__ import annotations
2
- import time
3
- from typing import Optional
4
-
5
- from pydantic import (
6
- BaseModel,
7
- Field,
8
- PositiveInt,
9
- conint,
10
- confloat,
11
- field_validator,
12
- )
13
-
14
-
15
- class MIPROConfig(BaseModel):
16
- """
17
- Configuration for 0-shot MIPRO style prompt optimization.
18
-
19
- This is adapted to the DeepEval setting where we optimize a single Prompt
20
- (instruction) against a list of Goldens, using mini-batch evaluation and a
21
- simple surrogate over prompt candidates.
22
-
23
- Fields
24
- ------
25
- iterations:
26
- Total number of optimization trials. Each iteration selects
27
- a parent candidate, proposes a child via the PromptRewriter,
28
- evaluates it on a mini-batch, and updates the surrogate stats.
29
-
30
- minibatch_size:
31
- Fixed minibatch size drawn from the full set of goldens. When set,
32
- this overrides dynamic sizing based on `minibatch_ratio`,
33
- `minibatch_min_size`, and `minibatch_max_size`.
34
-
35
- minibatch_min_size:
36
- Hard lower bound on minibatch size when dynamic sizing is in effect.
37
-
38
- minibatch_max_size:
39
- Hard upper bound on minibatch size when dynamic sizing is in effect.
40
-
41
- minibatch_ratio:
42
- Target fraction of len(goldens) used to compute a dynamic minibatch
43
- size. The final size is bounded between `minibatch_min_size` and
44
- `minibatch_max_size`.
45
-
46
- random_seed:
47
- RNG seed for reproducibility. If set to None, a seed is derived from
48
- time.time_ns() by the validator.
49
-
50
- exploration_probability:
51
- Epsilon greedy exploration rate for candidate selection. With this
52
- probability the runner picks a random candidate; otherwise it picks
53
- the candidate with the highest mean minibatch score.
54
-
55
- full_eval_every:
56
- If set, every `full_eval_every` trials the runner fully evaluates the
57
- current best candidate (by mean minibatch score) on the full set of
58
- goldens, storing scores per-instance. If None, only a final full
59
- evaluation is done at the end.
60
-
61
- rewrite_instruction_max_chars:
62
- Maximum number of characters pulled into rewrite instructions
63
- (prompt text + feedback) when using PromptRewriter.
64
-
65
- min_delta:
66
- Minimum improvement on minibatch mean required for a child
67
- configuration to be accepted over its parent.
68
- """
69
-
70
- iterations: PositiveInt = Field(
71
- default=5,
72
- description="Total number of MIPRO trials or prompt proposals.",
73
- )
74
- minibatch_size: Optional[conint(ge=1)] = Field(
75
- default=None,
76
- description=(
77
- "Fixed minibatch size for goldens; when set, overrides dynamic sizing."
78
- ),
79
- )
80
- minibatch_min_size: conint(ge=1) = Field(
81
- default=4,
82
- description="Hard lower bound on minibatch size.",
83
- )
84
- minibatch_max_size: PositiveInt = Field(
85
- default=32,
86
- description="Hard upper bound on minibatch size.",
87
- )
88
- minibatch_ratio: confloat(gt=0.0, le=1.0) = Field(
89
- default=0.05,
90
- description=(
91
- "Target fraction of len(goldens) used to compute a dynamic minibatch "
92
- "size; bounded between minibatch_min_size and minibatch_max_size."
93
- ),
94
- )
95
- random_seed: conint(ge=0) = 0
96
- min_delta: confloat(ge=0.0) = Field(
97
- default=0.0,
98
- description=(
99
- "Minimum improvement in minibatch score required for a child "
100
- "prompt to be accepted over its parent."
101
- ),
102
- )
103
-
104
- exploration_probability: confloat(ge=0.0, le=1.0) = Field(
105
- default=0.2,
106
- description=(
107
- "Probability of sampling a random candidate instead of "
108
- "the best-by-mean minibatch score."
109
- ),
110
- )
111
-
112
- full_eval_every: Optional[PositiveInt] = Field(
113
- default=5,
114
- description=(
115
- "If set, the runner fully evaluates the current best candidate on the "
116
- "full goldens every N trials. If None, only a single full evaluation "
117
- "is performed at the end."
118
- ),
119
- )
120
-
121
- rewrite_instruction_max_chars: PositiveInt = Field(
122
- default=4096,
123
- description=(
124
- "Maximum number of characters from prompt, feedback, and related "
125
- "text included in rewrite instructions."
126
- ),
127
- )
128
-
129
- @field_validator("random_seed", mode="before")
130
- @classmethod
131
- def _coerce_random_seed(cls, seed):
132
- if seed is None:
133
- return time.time_ns()
134
- return seed