deepeval 3.7.2__py3-none-any.whl → 3.7.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/benchmarks/human_eval/human_eval.py +2 -1
- deepeval/cli/test.py +1 -1
- deepeval/config/settings.py +102 -13
- deepeval/dataset/dataset.py +35 -11
- deepeval/dataset/utils.py +2 -0
- deepeval/evaluate/configs.py +1 -1
- deepeval/evaluate/execute.py +4 -1
- deepeval/metrics/answer_relevancy/template.py +4 -4
- deepeval/metrics/argument_correctness/template.py +2 -2
- deepeval/metrics/bias/template.py +3 -3
- deepeval/metrics/contextual_precision/template.py +6 -6
- deepeval/metrics/contextual_recall/template.py +2 -2
- deepeval/metrics/contextual_relevancy/template.py +3 -3
- deepeval/metrics/conversation_completeness/template.py +2 -2
- deepeval/metrics/conversational_dag/templates.py +4 -4
- deepeval/metrics/conversational_g_eval/template.py +4 -3
- deepeval/metrics/dag/templates.py +4 -4
- deepeval/metrics/faithfulness/template.py +4 -4
- deepeval/metrics/hallucination/template.py +4 -4
- deepeval/metrics/misuse/template.py +2 -2
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +7 -7
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +6 -6
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +2 -2
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +3 -3
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +9 -9
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +4 -4
- deepeval/metrics/non_advice/template.py +2 -2
- deepeval/metrics/pii_leakage/template.py +2 -2
- deepeval/metrics/prompt_alignment/template.py +4 -4
- deepeval/metrics/role_violation/template.py +2 -2
- deepeval/metrics/step_efficiency/step_efficiency.py +1 -1
- deepeval/metrics/toxicity/template.py +4 -4
- deepeval/metrics/turn_relevancy/template.py +2 -2
- deepeval/metrics/utils.py +3 -0
- deepeval/models/__init__.py +2 -0
- deepeval/models/embedding_models/azure_embedding_model.py +28 -15
- deepeval/models/embedding_models/local_embedding_model.py +23 -10
- deepeval/models/embedding_models/ollama_embedding_model.py +8 -6
- deepeval/models/embedding_models/openai_embedding_model.py +18 -2
- deepeval/models/llms/anthropic_model.py +17 -5
- deepeval/models/llms/azure_model.py +30 -18
- deepeval/models/llms/deepseek_model.py +22 -12
- deepeval/models/llms/gemini_model.py +120 -87
- deepeval/models/llms/grok_model.py +23 -16
- deepeval/models/llms/kimi_model.py +23 -12
- deepeval/models/llms/litellm_model.py +63 -25
- deepeval/models/llms/local_model.py +26 -18
- deepeval/models/llms/ollama_model.py +17 -7
- deepeval/models/llms/openai_model.py +22 -17
- deepeval/models/llms/portkey_model.py +132 -0
- deepeval/models/mlllms/__init__.py +1 -0
- deepeval/models/mlllms/azure_model.py +343 -0
- deepeval/models/mlllms/gemini_model.py +102 -73
- deepeval/models/mlllms/ollama_model.py +40 -9
- deepeval/models/mlllms/openai_model.py +65 -14
- deepeval/models/utils.py +48 -3
- deepeval/optimization/__init__.py +13 -0
- deepeval/optimization/adapters/__init__.py +2 -0
- deepeval/optimization/adapters/deepeval_scoring_adapter.py +588 -0
- deepeval/optimization/aggregates.py +14 -0
- deepeval/optimization/configs.py +34 -0
- deepeval/optimization/copro/configs.py +31 -0
- deepeval/optimization/copro/loop.py +837 -0
- deepeval/optimization/gepa/__init__.py +7 -0
- deepeval/optimization/gepa/configs.py +115 -0
- deepeval/optimization/gepa/loop.py +677 -0
- deepeval/optimization/miprov2/configs.py +134 -0
- deepeval/optimization/miprov2/loop.py +785 -0
- deepeval/optimization/mutations/__init__.py +0 -0
- deepeval/optimization/mutations/prompt_rewriter.py +458 -0
- deepeval/optimization/policies/__init__.py +16 -0
- deepeval/optimization/policies/selection.py +166 -0
- deepeval/optimization/policies/tie_breaker.py +67 -0
- deepeval/optimization/prompt_optimizer.py +462 -0
- deepeval/optimization/simba/__init__.py +0 -0
- deepeval/optimization/simba/configs.py +33 -0
- deepeval/optimization/simba/loop.py +983 -0
- deepeval/optimization/simba/types.py +15 -0
- deepeval/optimization/types.py +361 -0
- deepeval/optimization/utils.py +598 -0
- deepeval/prompt/prompt.py +10 -5
- deepeval/test_run/cache.py +2 -0
- deepeval/test_run/test_run.py +6 -1
- deepeval/tracing/context.py +3 -0
- deepeval/tracing/tracing.py +22 -11
- deepeval/utils.py +24 -0
- {deepeval-3.7.2.dist-info → deepeval-3.7.4.dist-info}/METADATA +1 -1
- {deepeval-3.7.2.dist-info → deepeval-3.7.4.dist-info}/RECORD +92 -66
- {deepeval-3.7.2.dist-info → deepeval-3.7.4.dist-info}/entry_points.txt +1 -1
- {deepeval-3.7.2.dist-info → deepeval-3.7.4.dist-info}/LICENSE.md +0 -0
- {deepeval-3.7.2.dist-info → deepeval-3.7.4.dist-info}/WHEEL +0 -0
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
import time
|
|
3
|
+
from typing import Optional
|
|
4
|
+
|
|
5
|
+
from pydantic import (
|
|
6
|
+
BaseModel,
|
|
7
|
+
Field,
|
|
8
|
+
PositiveInt,
|
|
9
|
+
conint,
|
|
10
|
+
confloat,
|
|
11
|
+
field_validator,
|
|
12
|
+
)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class MIPROConfig(BaseModel):
    """
    Configuration for 0-shot MIPRO style prompt optimization.

    This is adapted to the DeepEval setting where we optimize a single Prompt
    (instruction) against a list of Goldens, using mini-batch evaluation and a
    simple surrogate over prompt candidates.

    Fields
    ------
    iterations:
        Total number of optimization trials. Each iteration selects
        a parent candidate, proposes a child via the PromptRewriter,
        evaluates it on a mini-batch, and updates the surrogate stats.

    minibatch_size:
        Fixed minibatch size drawn from the full set of goldens. When set,
        this overrides dynamic sizing based on `minibatch_ratio`,
        `minibatch_min_size`, and `minibatch_max_size`.

    minibatch_min_size:
        Hard lower bound on minibatch size when dynamic sizing is in effect.

    minibatch_max_size:
        Hard upper bound on minibatch size when dynamic sizing is in effect.

    minibatch_ratio:
        Target fraction of len(goldens) used to compute a dynamic minibatch
        size. The final size is bounded between `minibatch_min_size` and
        `minibatch_max_size`.

    random_seed:
        RNG seed for reproducibility. If set to None, a seed is derived from
        time.time_ns() by the validator.

    exploration_probability:
        Epsilon greedy exploration rate for candidate selection. With this
        probability the runner picks a random candidate; otherwise it picks
        the candidate with the highest mean minibatch score.

    full_eval_every:
        If set, every `full_eval_every` trials the runner fully evaluates the
        current best candidate (by mean minibatch score) on the full set of
        goldens, storing scores per-instance. If None, only a final full
        evaluation is done at the end.

    rewrite_instruction_max_chars:
        Maximum number of characters pulled into rewrite instructions
        (prompt text + feedback) when using PromptRewriter.

    min_delta:
        Minimum improvement on minibatch mean required for a child
        configuration to be accepted over its parent.
    """

    iterations: PositiveInt = Field(
        default=5,
        description="Total number of MIPRO trials or prompt proposals.",
    )
    minibatch_size: Optional[conint(ge=1)] = Field(
        default=None,
        description=(
            "Fixed minibatch size for goldens; when set, overrides dynamic sizing."
        ),
    )
    minibatch_min_size: conint(ge=1) = Field(
        default=4,
        description="Hard lower bound on minibatch size.",
    )
    minibatch_max_size: PositiveInt = Field(
        default=32,
        description="Hard upper bound on minibatch size.",
    )
    minibatch_ratio: confloat(gt=0.0, le=1.0) = Field(
        default=0.05,
        description=(
            "Target fraction of len(goldens) used to compute a dynamic minibatch "
            "size; bounded between minibatch_min_size and minibatch_max_size."
        ),
    )
    # Declared with Field(...) like every other attribute so the seed also
    # carries a runtime description in the generated schema. The `before`
    # validator below turns an explicit None into a wall-clock derived seed,
    # which is why the annotation itself stays non-optional.
    random_seed: conint(ge=0) = Field(
        default=0,
        description=(
            "RNG seed for reproducibility; a value of None is replaced with "
            "time.time_ns() by the validator."
        ),
    )
    min_delta: confloat(ge=0.0) = Field(
        default=0.0,
        description=(
            "Minimum improvement in minibatch score required for a child "
            "prompt to be accepted over its parent."
        ),
    )

    exploration_probability: confloat(ge=0.0, le=1.0) = Field(
        default=0.2,
        description=(
            "Probability of sampling a random candidate instead of "
            "the best-by-mean minibatch score."
        ),
    )

    full_eval_every: Optional[PositiveInt] = Field(
        default=5,
        description=(
            "If set, the runner fully evaluates the current best candidate on the "
            "full goldens every N trials. If None, only a single full evaluation "
            "is performed at the end."
        ),
    )

    rewrite_instruction_max_chars: PositiveInt = Field(
        default=4096,
        description=(
            "Maximum number of characters from prompt, feedback, and related "
            "text included in rewrite instructions."
        ),
    )

    @field_validator("random_seed", mode="before")
    @classmethod
    def _coerce_random_seed(cls, seed):
        """Replace an explicit None seed with a fresh time.time_ns() value.

        Runs in `before` mode so the substitution happens prior to the
        `conint(ge=0)` constraint check on `random_seed`.
        """
        if seed is None:
            return time.time_ns()
        return seed