deepeval 3.7.4__py3-none-any.whl → 3.7.6__py3-none-any.whl
This diff reflects the contents of publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.
- deepeval/_version.py +1 -1
- deepeval/config/settings.py +35 -1
- deepeval/dataset/api.py +23 -1
- deepeval/dataset/golden.py +139 -2
- deepeval/evaluate/evaluate.py +16 -11
- deepeval/evaluate/execute.py +13 -181
- deepeval/evaluate/utils.py +6 -26
- deepeval/integrations/pydantic_ai/agent.py +19 -2
- deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
- deepeval/key_handler.py +3 -0
- deepeval/metrics/__init__.py +14 -16
- deepeval/metrics/answer_relevancy/answer_relevancy.py +118 -116
- deepeval/metrics/answer_relevancy/template.py +22 -3
- deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
- deepeval/metrics/arena_g_eval/template.py +17 -1
- deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
- deepeval/metrics/argument_correctness/template.py +19 -2
- deepeval/metrics/base_metric.py +13 -44
- deepeval/metrics/bias/bias.py +102 -108
- deepeval/metrics/bias/template.py +14 -2
- deepeval/metrics/contextual_precision/contextual_precision.py +96 -94
- deepeval/metrics/contextual_precision/template.py +115 -66
- deepeval/metrics/contextual_recall/contextual_recall.py +94 -84
- deepeval/metrics/contextual_recall/template.py +106 -55
- deepeval/metrics/contextual_relevancy/contextual_relevancy.py +86 -84
- deepeval/metrics/contextual_relevancy/template.py +87 -58
- deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
- deepeval/metrics/conversation_completeness/template.py +23 -3
- deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
- deepeval/metrics/conversational_dag/nodes.py +66 -123
- deepeval/metrics/conversational_dag/templates.py +16 -0
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
- deepeval/metrics/dag/dag.py +10 -0
- deepeval/metrics/dag/nodes.py +63 -126
- deepeval/metrics/dag/templates.py +16 -2
- deepeval/metrics/exact_match/exact_match.py +9 -1
- deepeval/metrics/faithfulness/faithfulness.py +138 -149
- deepeval/metrics/faithfulness/schema.py +1 -1
- deepeval/metrics/faithfulness/template.py +200 -115
- deepeval/metrics/g_eval/g_eval.py +87 -78
- deepeval/metrics/g_eval/template.py +18 -1
- deepeval/metrics/g_eval/utils.py +7 -6
- deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
- deepeval/metrics/goal_accuracy/template.py +21 -3
- deepeval/metrics/hallucination/hallucination.py +60 -75
- deepeval/metrics/hallucination/template.py +13 -0
- deepeval/metrics/indicator.py +7 -10
- deepeval/metrics/json_correctness/json_correctness.py +40 -38
- deepeval/metrics/json_correctness/template.py +10 -0
- deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
- deepeval/metrics/knowledge_retention/schema.py +9 -3
- deepeval/metrics/knowledge_retention/template.py +12 -0
- deepeval/metrics/mcp/mcp_task_completion.py +68 -38
- deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +92 -74
- deepeval/metrics/mcp/template.py +52 -0
- deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
- deepeval/metrics/mcp_use_metric/template.py +12 -0
- deepeval/metrics/misuse/misuse.py +77 -97
- deepeval/metrics/misuse/template.py +15 -0
- deepeval/metrics/multimodal_metrics/__init__.py +0 -19
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +59 -53
- deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +79 -95
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +59 -53
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +59 -53
- deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +111 -109
- deepeval/metrics/non_advice/non_advice.py +79 -105
- deepeval/metrics/non_advice/template.py +12 -0
- deepeval/metrics/pattern_match/pattern_match.py +12 -4
- deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
- deepeval/metrics/pii_leakage/template.py +14 -0
- deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
- deepeval/metrics/plan_adherence/template.py +11 -0
- deepeval/metrics/plan_quality/plan_quality.py +63 -87
- deepeval/metrics/plan_quality/template.py +9 -0
- deepeval/metrics/prompt_alignment/prompt_alignment.py +72 -83
- deepeval/metrics/prompt_alignment/template.py +12 -0
- deepeval/metrics/ragas.py +3 -3
- deepeval/metrics/role_adherence/role_adherence.py +48 -71
- deepeval/metrics/role_adherence/template.py +14 -0
- deepeval/metrics/role_violation/role_violation.py +75 -108
- deepeval/metrics/role_violation/template.py +12 -0
- deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
- deepeval/metrics/step_efficiency/template.py +11 -0
- deepeval/metrics/summarization/summarization.py +115 -183
- deepeval/metrics/summarization/template.py +19 -0
- deepeval/metrics/task_completion/task_completion.py +67 -73
- deepeval/metrics/tool_correctness/tool_correctness.py +45 -44
- deepeval/metrics/tool_use/tool_use.py +42 -66
- deepeval/metrics/topic_adherence/template.py +13 -0
- deepeval/metrics/topic_adherence/topic_adherence.py +53 -67
- deepeval/metrics/toxicity/template.py +13 -0
- deepeval/metrics/toxicity/toxicity.py +80 -99
- deepeval/metrics/turn_contextual_precision/schema.py +21 -0
- deepeval/metrics/turn_contextual_precision/template.py +187 -0
- deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +592 -0
- deepeval/metrics/turn_contextual_recall/schema.py +21 -0
- deepeval/metrics/turn_contextual_recall/template.py +178 -0
- deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +563 -0
- deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
- deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
- deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +576 -0
- deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
- deepeval/metrics/turn_faithfulness/template.py +218 -0
- deepeval/metrics/turn_faithfulness/turn_faithfulness.py +627 -0
- deepeval/metrics/turn_relevancy/template.py +14 -0
- deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
- deepeval/metrics/utils.py +158 -122
- deepeval/models/__init__.py +0 -12
- deepeval/models/base_model.py +49 -33
- deepeval/models/embedding_models/__init__.py +7 -0
- deepeval/models/embedding_models/azure_embedding_model.py +79 -33
- deepeval/models/embedding_models/local_embedding_model.py +39 -20
- deepeval/models/embedding_models/ollama_embedding_model.py +52 -19
- deepeval/models/embedding_models/openai_embedding_model.py +42 -22
- deepeval/models/llms/amazon_bedrock_model.py +226 -72
- deepeval/models/llms/anthropic_model.py +178 -63
- deepeval/models/llms/azure_model.py +218 -60
- deepeval/models/llms/constants.py +2032 -0
- deepeval/models/llms/deepseek_model.py +95 -40
- deepeval/models/llms/gemini_model.py +209 -64
- deepeval/models/llms/grok_model.py +139 -68
- deepeval/models/llms/kimi_model.py +140 -90
- deepeval/models/llms/litellm_model.py +131 -37
- deepeval/models/llms/local_model.py +125 -21
- deepeval/models/llms/ollama_model.py +147 -24
- deepeval/models/llms/openai_model.py +222 -269
- deepeval/models/llms/portkey_model.py +81 -22
- deepeval/models/llms/utils.py +8 -3
- deepeval/models/retry_policy.py +17 -14
- deepeval/models/utils.py +106 -5
- deepeval/optimizer/__init__.py +5 -0
- deepeval/optimizer/algorithms/__init__.py +6 -0
- deepeval/optimizer/algorithms/base.py +29 -0
- deepeval/optimizer/algorithms/configs.py +18 -0
- deepeval/optimizer/algorithms/copro/__init__.py +5 -0
- deepeval/{optimization/copro/loop.py → optimizer/algorithms/copro/copro.py} +112 -113
- deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
- deepeval/{optimization/gepa/loop.py → optimizer/algorithms/gepa/gepa.py} +175 -115
- deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
- deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
- deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
- deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
- deepeval/optimizer/algorithms/simba/__init__.py +5 -0
- deepeval/{optimization/simba/loop.py → optimizer/algorithms/simba/simba.py} +128 -112
- deepeval/{optimization → optimizer}/configs.py +5 -8
- deepeval/{optimization/policies/selection.py → optimizer/policies.py} +63 -2
- deepeval/optimizer/prompt_optimizer.py +263 -0
- deepeval/optimizer/rewriter/__init__.py +5 -0
- deepeval/optimizer/rewriter/rewriter.py +124 -0
- deepeval/optimizer/rewriter/utils.py +214 -0
- deepeval/optimizer/scorer/__init__.py +5 -0
- deepeval/optimizer/scorer/base.py +86 -0
- deepeval/optimizer/scorer/scorer.py +316 -0
- deepeval/optimizer/scorer/utils.py +30 -0
- deepeval/optimizer/types.py +148 -0
- deepeval/{optimization → optimizer}/utils.py +47 -165
- deepeval/prompt/prompt.py +5 -9
- deepeval/simulator/conversation_simulator.py +43 -0
- deepeval/simulator/template.py +13 -0
- deepeval/test_case/__init__.py +1 -3
- deepeval/test_case/api.py +26 -45
- deepeval/test_case/arena_test_case.py +7 -2
- deepeval/test_case/conversational_test_case.py +68 -1
- deepeval/test_case/llm_test_case.py +206 -1
- deepeval/test_case/utils.py +4 -8
- deepeval/test_run/api.py +18 -14
- deepeval/test_run/test_run.py +3 -3
- deepeval/tracing/patchers.py +9 -4
- deepeval/tracing/tracing.py +2 -2
- deepeval/utils.py +65 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/METADATA +1 -4
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/RECORD +180 -193
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -148
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
- deepeval/models/mlllms/__init__.py +0 -4
- deepeval/models/mlllms/azure_model.py +0 -343
- deepeval/models/mlllms/gemini_model.py +0 -313
- deepeval/models/mlllms/ollama_model.py +0 -175
- deepeval/models/mlllms/openai_model.py +0 -309
- deepeval/optimization/__init__.py +0 -13
- deepeval/optimization/adapters/__init__.py +0 -2
- deepeval/optimization/adapters/deepeval_scoring_adapter.py +0 -588
- deepeval/optimization/aggregates.py +0 -14
- deepeval/optimization/copro/configs.py +0 -31
- deepeval/optimization/gepa/__init__.py +0 -7
- deepeval/optimization/gepa/configs.py +0 -115
- deepeval/optimization/miprov2/configs.py +0 -134
- deepeval/optimization/miprov2/loop.py +0 -785
- deepeval/optimization/mutations/__init__.py +0 -0
- deepeval/optimization/mutations/prompt_rewriter.py +0 -458
- deepeval/optimization/policies/__init__.py +0 -16
- deepeval/optimization/policies/tie_breaker.py +0 -67
- deepeval/optimization/prompt_optimizer.py +0 -462
- deepeval/optimization/simba/__init__.py +0 -0
- deepeval/optimization/simba/configs.py +0 -33
- deepeval/optimization/types.py +0 -361
- deepeval/test_case/mllm_test_case.py +0 -170
- /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
- /deepeval/{optimization → optimizer/algorithms}/simba/types.py +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/LICENSE.md +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/WHEEL +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/entry_points.txt +0 -0
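The largest addition in this release is the new `deepeval/optimizer/` package (replacing `deepeval/optimization/`), whose biggest new file, `deepeval/optimizer/algorithms/miprov2/miprov2.py`, is reproduced in full below. As a rough sketch of how the new class might be driven: the constructor parameters and `execute()` signature come from the diff itself, but assigning `scorer` and `optimizer_model` directly is an assumption inferred from the docstrings (which say `PromptOptimizer` normally sets both), and `my_scorer`, `my_model`, `my_prompt`, and `my_goldens` are hypothetical stand-ins.

```python
# Hedged usage sketch for the new MIPROV2 algorithm shown below.
# The constructor parameters and execute() signature come from the diff;
# assigning scorer/optimizer_model directly is an ASSUMPTION (the
# docstrings say PromptOptimizer sets these), and my_scorer, my_model,
# my_prompt, my_goldens are hypothetical placeholders.
from deepeval.optimizer.algorithms.miprov2.miprov2 import MIPROV2

algorithm = MIPROV2(
    num_candidates=10,             # instruction candidates proposed upfront
    num_trials=20,                 # Bayesian Optimization (TPE) trials
    minibatch_size=25,             # goldens scored per trial
    minibatch_full_eval_steps=10,  # full evaluation every N trials
    random_seed=42,                # reproducible sampling
)
algorithm.scorer = my_scorer           # a BaseScorer implementation
algorithm.optimizer_model = my_model   # a DeepEvalBaseLLM for proposals

best_prompt, report = algorithm.execute(prompt=my_prompt, goldens=my_goldens)
print(report.best_id)  # id of the winning (instruction, demo set) configuration
```

The returned `OptimizationReport` also carries the trial history, parent links between prompt configurations, and the per-golden scores from full evaluations.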
deepeval/optimizer/algorithms/miprov2/miprov2.py (new file)

@@ -0,0 +1,752 @@

# MIPROv2 - Multiprompt Instruction PRoposal Optimizer Version 2
#
# This implementation follows the original MIPROv2 paper and DSPy implementation:
# https://arxiv.org/pdf/2406.11695
# https://dspy.ai/api/optimizers/MIPROv2/
#
# The algorithm works in two phases:
#
# 1. PROPOSAL PHASE:
#    a) Generate N diverse instruction candidates upfront
#    b) Bootstrap few-shot demonstration sets from training data
#
# 2. OPTIMIZATION PHASE: Use Bayesian Optimization (Optuna TPE) to search
#    over the joint space of (instruction_candidate, demo_set). Each trial:
#    - Samples an instruction candidate index
#    - Samples a demo set index
#    - Renders the prompt with demos
#    - Evaluates on a minibatch of examples
#    - Uses the score to guide the Bayesian surrogate model
#
# Periodic full evaluation is performed every `minibatch_full_eval_steps`
# to get accurate scores on the complete validation set.


from __future__ import annotations
import asyncio
import uuid
import random
import time
import logging
from typing import (
    Dict,
    List,
    Tuple,
    TYPE_CHECKING,
    Union,
    Optional,
    Callable,
)

try:
    import optuna
    from optuna.samplers import TPESampler

    OPTUNA_AVAILABLE = True
except ImportError:
    OPTUNA_AVAILABLE = False
    optuna = None
    TPESampler = None

from deepeval.models.base_model import DeepEvalBaseLLM
from deepeval.errors import DeepEvalError
from deepeval.optimizer.utils import Aggregator, mean_of_all
from deepeval.optimizer.types import (
    PromptConfiguration,
    PromptConfigurationId,
    ModuleId,
    ScoreTable,
    OptimizationReport,
    RunnerStatusType,
    RunnerStatusCallback,
)
from deepeval.optimizer.scorer.base import BaseScorer
from deepeval.optimizer.algorithms.base import BaseAlgorithm
from deepeval.optimizer.utils import build_prompt_config_snapshots
from deepeval.prompt.prompt import Prompt
from deepeval.optimizer.algorithms.miprov2.proposer import InstructionProposer
from deepeval.optimizer.algorithms.miprov2.bootstrapper import (
    DemoBootstrapper,
    DemoSet,
    render_prompt_with_demos,
)
from deepeval.optimizer.algorithms.configs import (
    MIPROV2_DEFAULT_NUM_CANDIDATES,
    MIPROV2_DEFAULT_NUM_TRIALS,
    MIPROV2_DEFAULT_MINIBATCH_SIZE,
    MIPROV2_DEFAULT_MINIBATCH_FULL_EVAL_STEPS,
    MIPROV2_DEFAULT_MAX_BOOTSTRAPPED_DEMOS,
    MIPROV2_DEFAULT_MAX_LABELED_DEMOS,
    MIPROV2_DEFAULT_NUM_DEMO_SETS,
)

if TYPE_CHECKING:
    from deepeval.dataset.golden import Golden, ConversationalGolden


# Suppress Optuna's verbose logging
logging.getLogger("optuna").setLevel(logging.WARNING)


class MIPROV2(BaseAlgorithm):
    """
    MIPROv2 (Multiprompt Instruction PRoposal Optimizer Version 2)

    A prompt optimizer that uses Bayesian Optimization to find the best
    combination of instruction and few-shot demonstrations. Follows the
    original MIPROv2 paper approach.

    The optimization process:
    1. Generate N diverse instruction candidates upfront
    2. Bootstrap M demo sets from training examples
    3. Use Optuna's TPE sampler for Bayesian Optimization over (instruction, demos)
    4. Each trial evaluates a combination on a minibatch
    5. Periodically evaluate the best combination on the full dataset

    Parameters
    ----------
    num_candidates : int
        Number of instruction candidates to propose. Default is 10.
    num_trials : int
        Number of Bayesian Optimization trials. Default is 20.
    minibatch_size : int
        Number of examples per minibatch evaluation. Default is 25.
    minibatch_full_eval_steps : int
        Evaluate best on full dataset every N trials. Default is 10.
    max_bootstrapped_demos : int
        Maximum bootstrapped demos per demo set. Default is 4.
    max_labeled_demos : int
        Maximum labeled demos (from expected_output) per set. Default is 4.
    num_demo_sets : int
        Number of demo sets to create. Default is 5.
    random_seed : int, optional
        RNG seed for reproducibility. If None, derived from time.time_ns().
    aggregate_instances : Aggregator
        Function to aggregate per-instance scores. Default is mean_of_all.
    scorer : BaseScorer, optional
        Scorer for evaluating prompts. Set by PromptOptimizer.
    """

    name = "MIPROv2"
    SINGLE_MODULE_ID: ModuleId = "__module__"

    def __init__(
        self,
        num_candidates: int = MIPROV2_DEFAULT_NUM_CANDIDATES,
        num_trials: int = MIPROV2_DEFAULT_NUM_TRIALS,
        minibatch_size: int = MIPROV2_DEFAULT_MINIBATCH_SIZE,
        minibatch_full_eval_steps: int = MIPROV2_DEFAULT_MINIBATCH_FULL_EVAL_STEPS,
        max_bootstrapped_demos: int = MIPROV2_DEFAULT_MAX_BOOTSTRAPPED_DEMOS,
        max_labeled_demos: int = MIPROV2_DEFAULT_MAX_LABELED_DEMOS,
        num_demo_sets: int = MIPROV2_DEFAULT_NUM_DEMO_SETS,
        random_seed: Optional[int] = None,
        aggregate_instances: Aggregator = mean_of_all,
        scorer: Optional[BaseScorer] = None,
    ) -> None:
        if not OPTUNA_AVAILABLE:
            raise DeepEvalError(
                "MIPROv2 requires the 'optuna' package for Bayesian Optimization. "
                "Install it with: pip install optuna"
            )

        # Validate parameters
        if num_candidates < 1:
            raise ValueError("num_candidates must be >= 1")
        if num_trials < 1:
            raise ValueError("num_trials must be >= 1")
        if minibatch_size < 1:
            raise ValueError("minibatch_size must be >= 1")
        if minibatch_full_eval_steps < 1:
            raise ValueError("minibatch_full_eval_steps must be >= 1")
        if max_bootstrapped_demos < 0:
            raise ValueError("max_bootstrapped_demos must be >= 0")
        if max_labeled_demos < 0:
            raise ValueError("max_labeled_demos must be >= 0")
        if num_demo_sets < 1:
            raise ValueError("num_demo_sets must be >= 1")

        self.num_candidates = num_candidates
        self.num_trials = num_trials
        self.minibatch_size = minibatch_size
        self.minibatch_full_eval_steps = minibatch_full_eval_steps
        self.max_bootstrapped_demos = max_bootstrapped_demos
        self.max_labeled_demos = max_labeled_demos
        self.num_demo_sets = num_demo_sets
        self.aggregate_instances = aggregate_instances
        self.scorer = scorer

        # Random seed handling
        if random_seed is None:
            random_seed = time.time_ns() % (2**31)
        self.random_seed = random_seed
        self.random_state = random.Random(random_seed)

        # Runtime state
        self.reset_state()

        # Callbacks and models (set by PromptOptimizer)
        self.status_callback: Optional[RunnerStatusCallback] = None
        self.optimizer_model: Optional["DeepEvalBaseLLM"] = None

        # Lazy-loaded components
        self._proposer: Optional[InstructionProposer] = None
        self._bootstrapper: Optional[DemoBootstrapper] = None

    ##############
    # Public API #
    ##############

    def execute(
        self,
        prompt: Prompt,
        goldens: Union[List["Golden"], List["ConversationalGolden"]],
    ) -> Tuple[Prompt, OptimizationReport]:
        """
        Synchronous MIPROv2 optimization.

        Phase 1: Propose instruction candidates + Bootstrap demo sets
        Phase 2: Use Bayesian Optimization to find the best combination
        """
        self._validate_inputs(goldens)
        self._ensure_scorer()
        self._ensure_proposer()
        self._ensure_bootstrapper()
        self.reset_state()

        # Phase 1a: Propose instruction candidates
        self._update_status("Phase 1: Proposing instruction candidates...", 0)
        instruction_candidates = self._proposer.propose(
            prompt=prompt,
            goldens=goldens,
            num_candidates=self.num_candidates,
        )
        self._register_instruction_candidates(instruction_candidates)

        # Phase 1b: Bootstrap demo sets
        self._update_status(
            "Phase 1: Bootstrapping few-shot demonstrations...", 0
        )
        self._demo_sets = self._bootstrapper.bootstrap(
            prompt=prompt,
            goldens=goldens,
            generate_fn=self._create_generate_fn(),
        )
        self._update_status(f"Bootstrapped {len(self._demo_sets)} demo sets", 0)

        # Phase 2: Bayesian Optimization over (instruction, demos)
        self._update_status("Phase 2: Starting Bayesian Optimization...", 0)
        best_instr_idx, best_demo_idx = self._run_bayesian_optimization(goldens)

        # Final full evaluation if not already done
        config_key = (best_instr_idx, best_demo_idx)
        if config_key not in self._full_eval_cache:
            best_config = self._get_config_by_index(best_instr_idx)
            best_demo_set = self._demo_sets[best_demo_idx]
            self._full_evaluate(best_config, best_demo_set, goldens)

        # Build report
        best = self._best_by_aggregate()
        return self._build_result(best)

    async def a_execute(
        self,
        prompt: Prompt,
        goldens: Union[List["Golden"], List["ConversationalGolden"]],
    ) -> Tuple[Prompt, OptimizationReport]:
        """
        Asynchronous MIPROv2 optimization.
        """
        self._validate_inputs(goldens)
        self._ensure_scorer()
        self._ensure_proposer()
        self._ensure_bootstrapper()
        self.reset_state()

        # Phase 1: Run proposal and bootstrapping concurrently
        self._update_status(
            "Phase 1: Proposing candidates & bootstrapping demos...", 0
        )

        instruction_candidates, demo_sets = await asyncio.gather(
            self._proposer.a_propose(
                prompt=prompt,
                goldens=goldens,
                num_candidates=self.num_candidates,
            ),
            self._bootstrapper.a_bootstrap(
                prompt=prompt,
                goldens=goldens,
                a_generate_fn=self._create_async_generate_fn(),
            ),
        )

        self._register_instruction_candidates(instruction_candidates)
        self._demo_sets = demo_sets
        self._update_status(
            f"Generated {len(instruction_candidates)} candidates, {len(self._demo_sets)} demo sets",
            0,
        )

        # Phase 2: Bayesian Optimization
        self._update_status("Phase 2: Starting Bayesian Optimization...", 0)
        best_instr_idx, best_demo_idx = await self._a_run_bayesian_optimization(
            goldens
        )

        # Final full evaluation if not already done
        config_key = (best_instr_idx, best_demo_idx)
        if config_key not in self._full_eval_cache:
            best_config = self._get_config_by_index(best_instr_idx)
            best_demo_set = self._demo_sets[best_demo_idx]
            await self._a_full_evaluate(best_config, best_demo_set, goldens)

        # Build report
        best = self._best_by_aggregate()
        return self._build_result(best)

    ###################
    # State & Helpers #
    ###################

    def reset_state(self) -> None:
        """Reset optimization state for a new run."""
        self.optimization_id = str(uuid.uuid4())
        self.prompt_configurations_by_id: Dict[
            PromptConfigurationId, PromptConfiguration
        ] = {}
        self.parents_by_id: Dict[
            PromptConfigurationId, Optional[PromptConfigurationId]
        ] = {}
        self.pareto_score_table: ScoreTable = {}

        # Candidate tracking
        self._instruction_candidates: List[PromptConfiguration] = []
        self._demo_sets: List[DemoSet] = []

        # Score tracking: (instr_idx, demo_idx) -> list of minibatch scores
        self._combination_scores: Dict[Tuple[int, int], List[float]] = {}

        # Full eval cache: (instr_idx, demo_idx) -> config_id
        self._full_eval_cache: Dict[Tuple[int, int], PromptConfigurationId] = {}

        # Trial tracking
        self._trial_history: List[Dict] = []
        self._best_trial_key: Tuple[int, int] = (0, 0)
        self._best_trial_score: float = float("-inf")

    def _validate_inputs(
        self,
        goldens: Union[List["Golden"], List["ConversationalGolden"]],
    ) -> None:
        """Validate input parameters."""
        if len(goldens) < 1:
            raise DeepEvalError(
                "MIPROv2 prompt optimization requires at least 1 golden, but "
                f"received {len(goldens)}. Provide at least one golden to run "
                "the optimizer."
            )

    def _ensure_scorer(self) -> None:
        """Ensure scorer is configured."""
        if self.scorer is None:
            raise DeepEvalError(
                "MIPROv2 requires a `scorer`. "
                "Construct one in PromptOptimizer and assign it to `runner.scorer`."
            )

    def _ensure_proposer(self) -> None:
        """Lazily initialize the instruction proposer."""
        if self._proposer is None:
            if self.optimizer_model is None:
                raise DeepEvalError(
                    "MIPROv2 requires an `optimizer_model` for instruction proposal. "
                    "Set it via PromptOptimizer."
                )
            self._proposer = InstructionProposer(
                optimizer_model=self.optimizer_model,
                random_state=self.random_state,
            )

    def _ensure_bootstrapper(self) -> None:
        """Lazily initialize the demo bootstrapper."""
        if self._bootstrapper is None:
            self._bootstrapper = DemoBootstrapper(
                max_bootstrapped_demos=self.max_bootstrapped_demos,
                max_labeled_demos=self.max_labeled_demos,
                num_demo_sets=self.num_demo_sets,
                random_state=self.random_state,
            )

    def _create_generate_fn(
        self,
    ) -> Callable[[Prompt, Union["Golden", "ConversationalGolden"]], str]:
        """Create a sync generate function for bootstrapping."""

        def generate_fn(
            prompt: Prompt,
            golden: Union["Golden", "ConversationalGolden"],
        ) -> str:
            # Create a temporary config for generation
            temp_config = PromptConfiguration.new(
                prompts={self.SINGLE_MODULE_ID: prompt}
            )
            return self.scorer.generate(temp_config.prompts, golden)

        return generate_fn

    def _create_async_generate_fn(self) -> Callable:
        """Create an async generate function for bootstrapping."""

        async def a_generate_fn(
            prompt: Prompt,
            golden: Union["Golden", "ConversationalGolden"],
        ) -> str:
            temp_config = PromptConfiguration.new(
                prompts={self.SINGLE_MODULE_ID: prompt}
            )
            return await self.scorer.a_generate(temp_config.prompts, golden)

        return a_generate_fn

    def _register_instruction_candidates(
        self, candidates: List[Prompt]
    ) -> None:
        """Register all instruction candidates as configurations."""
        for i, prompt in enumerate(candidates):
            config = PromptConfiguration.new(
                prompts={self.SINGLE_MODULE_ID: prompt},
                parent=None if i == 0 else self._instruction_candidates[0].id,
            )
            self._instruction_candidates.append(config)
            self.prompt_configurations_by_id[config.id] = config
            self.parents_by_id[config.id] = config.parent

    def _get_config_by_index(self, idx: int) -> PromptConfiguration:
        """Get configuration by instruction candidate index."""
        return self._instruction_candidates[idx]

    def _draw_minibatch(
        self,
        goldens: Union[List["Golden"], List["ConversationalGolden"]],
    ) -> Union[List["Golden"], List["ConversationalGolden"]]:
        """Sample a minibatch from goldens."""
        n = len(goldens)
        if n <= 0:
            return []
        size = min(self.minibatch_size, n)
        return [goldens[self.random_state.randrange(0, n)] for _ in range(size)]

    def _render_config_with_demos(
        self,
        config: PromptConfiguration,
        demo_set: DemoSet,
    ) -> PromptConfiguration:
        """Create a new config with demos rendered into the prompt."""
        base_prompt = config.prompts[self.SINGLE_MODULE_ID]
        rendered_prompt = render_prompt_with_demos(
            prompt=base_prompt,
            demo_set=demo_set,
            max_demos=self.max_bootstrapped_demos + self.max_labeled_demos,
        )

        # Create a new config with the rendered prompt
        rendered_config = PromptConfiguration.new(
            prompts={self.SINGLE_MODULE_ID: rendered_prompt},
            parent=config.id,
        )
        return rendered_config

    ############################
    #  Bayesian Optimization   #
    ############################

    def _run_bayesian_optimization(
        self,
        goldens: Union[List["Golden"], List["ConversationalGolden"]],
    ) -> Tuple[int, int]:
        """
        Run Bayesian Optimization using Optuna's TPE sampler.
        Returns the (instruction_idx, demo_set_idx) of the best combination.
        """
        num_instructions = len(self._instruction_candidates)
        num_demo_sets = len(self._demo_sets)

        # Create Optuna study with TPE sampler
        sampler = TPESampler(seed=self.random_seed)
        study = optuna.create_study(
            direction="maximize",
            sampler=sampler,
        )

        def objective(trial: "optuna.Trial") -> float:
            # Sample instruction and demo set indices
            instr_idx = trial.suggest_int("instr_idx", 0, num_instructions - 1)
            demo_idx = trial.suggest_int("demo_idx", 0, num_demo_sets - 1)

            # Get the configuration and demo set
            config = self._get_config_by_index(instr_idx)
            demo_set = self._demo_sets[demo_idx]

            # Render prompt with demos
            rendered_config = self._render_config_with_demos(config, demo_set)

            # Draw minibatch and score
            minibatch = self._draw_minibatch(goldens)
            score = self.scorer.score_minibatch(rendered_config, minibatch)

            # Track scores for this combination
            combo_key = (instr_idx, demo_idx)
            if combo_key not in self._combination_scores:
                self._combination_scores[combo_key] = []
            self._combination_scores[combo_key].append(score)

            # Update best tracking
            if score > self._best_trial_score:
                self._best_trial_score = score
                self._best_trial_key = combo_key

            # Record trial
            trial_num = len(self._trial_history) + 1
            self._trial_history.append(
                {
                    "trial": trial_num,
                    "instr_idx": instr_idx,
                    "demo_idx": demo_idx,
                    "score": score,
                }
            )

            # Progress update
            demo_info = (
                f"{len(demo_set.demos)} demos" if demo_set.demos else "0-shot"
            )
            self._update_status(
                f"Trial {trial_num}/{self.num_trials} - "
                f"Instr {instr_idx}, {demo_info} - Score: {score:.4f}",
                trial_num,
            )

            # Periodic full evaluation
            if trial_num % self.minibatch_full_eval_steps == 0:
                best_instr, best_demo = self._best_trial_key
                if (best_instr, best_demo) not in self._full_eval_cache:
                    best_config = self._get_config_by_index(best_instr)
                    best_demo_set = self._demo_sets[best_demo]
                    self._full_evaluate(best_config, best_demo_set, goldens)

            return score

        # Run optimization
        study.optimize(
            objective,
            n_trials=self.num_trials,
            show_progress_bar=False,
        )

        # Return the best combination
        return (
            study.best_params["instr_idx"],
            study.best_params["demo_idx"],
        )

    async def _a_run_bayesian_optimization(
        self,
        goldens: Union[List["Golden"], List["ConversationalGolden"]],
    ) -> Tuple[int, int]:
        """
        Async version of Bayesian Optimization.
        """
        num_instructions = len(self._instruction_candidates)
        num_demo_sets = len(self._demo_sets)

        sampler = TPESampler(seed=self.random_seed)
        study = optuna.create_study(
            direction="maximize",
            sampler=sampler,
        )

        for trial_num in range(1, self.num_trials + 1):
            trial = study.ask()

            # Sample indices
            instr_idx = trial.suggest_int("instr_idx", 0, num_instructions - 1)
            demo_idx = trial.suggest_int("demo_idx", 0, num_demo_sets - 1)

            # Get config and demos
            config = self._get_config_by_index(instr_idx)
            demo_set = self._demo_sets[demo_idx]
            rendered_config = self._render_config_with_demos(config, demo_set)

            # Score on minibatch
            minibatch = self._draw_minibatch(goldens)
            score = await self.scorer.a_score_minibatch(
                rendered_config, minibatch
            )

            # Track scores
            combo_key = (instr_idx, demo_idx)
            if combo_key not in self._combination_scores:
                self._combination_scores[combo_key] = []
            self._combination_scores[combo_key].append(score)

            # Update best
            if score > self._best_trial_score:
                self._best_trial_score = score
                self._best_trial_key = combo_key

            # Record trial
            self._trial_history.append(
                {
                    "trial": trial_num,
                    "instr_idx": instr_idx,
                    "demo_idx": demo_idx,
                    "score": score,
                }
            )

            # Tell Optuna the result
            study.tell(trial, score)

            # Progress update
            demo_info = (
                f"{len(demo_set.demos)} demos" if demo_set.demos else "0-shot"
            )
            self._update_status(
                f"Trial {trial_num}/{self.num_trials} - "
                f"Instr {instr_idx}, {demo_info} - Score: {score:.4f}",
                trial_num,
            )

            # Periodic full evaluation
            if trial_num % self.minibatch_full_eval_steps == 0:
                best_instr, best_demo = self._best_trial_key
                if (best_instr, best_demo) not in self._full_eval_cache:
                    best_config = self._get_config_by_index(best_instr)
                    best_demo_set = self._demo_sets[best_demo]
                    await self._a_full_evaluate(
                        best_config, best_demo_set, goldens
                    )

        return (
            study.best_params["instr_idx"],
            study.best_params["demo_idx"],
        )

    ############################
    #     Full Evaluation      #
    ############################

    def _full_evaluate(
        self,
        config: PromptConfiguration,
        demo_set: DemoSet,
        goldens: Union[List["Golden"], List["ConversationalGolden"]],
    ) -> None:
        """Perform full evaluation on all goldens."""
        # Find the indices for this combination
        instr_idx = self._instruction_candidates.index(config)
        demo_idx = self._demo_sets.index(demo_set)
        combo_key = (instr_idx, demo_idx)

        if combo_key in self._full_eval_cache:
            return

        # Render with demos
        rendered_config = self._render_config_with_demos(config, demo_set)

        # Register the rendered config
        self.prompt_configurations_by_id[rendered_config.id] = rendered_config
        self.parents_by_id[rendered_config.id] = config.id

        # Score on full set
        scores = self.scorer.score_pareto(rendered_config, goldens)
        self.pareto_score_table[rendered_config.id] = scores

        # Cache the result
        self._full_eval_cache[combo_key] = rendered_config.id

    async def _a_full_evaluate(
        self,
        config: PromptConfiguration,
        demo_set: DemoSet,
        goldens: Union[List["Golden"], List["ConversationalGolden"]],
    ) -> None:
        """Async full evaluation."""
        instr_idx = self._instruction_candidates.index(config)
        demo_idx = self._demo_sets.index(demo_set)
        combo_key = (instr_idx, demo_idx)

        if combo_key in self._full_eval_cache:
            return

        rendered_config = self._render_config_with_demos(config, demo_set)
        self.prompt_configurations_by_id[rendered_config.id] = rendered_config
        self.parents_by_id[rendered_config.id] = config.id

        scores = await self.scorer.a_score_pareto(rendered_config, goldens)
        self.pareto_score_table[rendered_config.id] = scores
        self._full_eval_cache[combo_key] = rendered_config.id

    ############################
    #     Result Building      #
    ############################

    def _best_by_aggregate(self) -> PromptConfiguration:
        """Return the best candidate based on full evaluation scores."""
        if not self.pareto_score_table:
            # Fall back to best by trial scores
            best_instr, best_demo = self._best_trial_key
            config = self._get_config_by_index(best_instr)
            demo_set = self._demo_sets[best_demo]
            return self._render_config_with_demos(config, demo_set)

        best_id: Optional[PromptConfigurationId] = None
        best_score = float("-inf")

        for config_id, scores in self.pareto_score_table.items():
            agg_score = self.aggregate_instances(scores)
            if agg_score > best_score:
                best_score = agg_score
                best_id = config_id

        if best_id is None:
            best_instr, best_demo = self._best_trial_key
            config = self._get_config_by_index(best_instr)
            demo_set = self._demo_sets[best_demo]
            return self._render_config_with_demos(config, demo_set)

        return self.prompt_configurations_by_id[best_id]

    def _build_result(
        self,
        best: PromptConfiguration,
    ) -> Tuple[Prompt, OptimizationReport]:
        """Build the optimization result."""
        prompt_config_snapshots = build_prompt_config_snapshots(
            self.prompt_configurations_by_id
        )

        report = OptimizationReport(
            optimization_id=self.optimization_id,
            best_id=best.id,
            accepted_iterations=self._trial_history,
            pareto_scores=self.pareto_score_table,
            parents=self.parents_by_id,
            prompt_configurations=prompt_config_snapshots,
        )

        return best.prompts[self.SINGLE_MODULE_ID], report

    ############################
    #      Status Updates      #
    ############################

    def _update_status(self, message: str, step: int) -> None:
        """Send status update via callback."""
        if self.status_callback is not None:
            self.status_callback(
                RunnerStatusType.PROGRESS,
                step_index=step,
                total_steps=self.num_trials,
                detail=message,
            )
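For reference, the call sites in the file above imply six scorer entry points. The following minimal stub is a sketch under stated assumptions: that `BaseScorer` (defined in `deepeval/optimizer/scorer/base.py`, which this diff only lists) can be subclassed this way, with signatures inferred from how `MIPROV2` calls them rather than from the base class itself.

```python
# Sketch of the scorer surface MIPROV2 invokes; signatures are inferred
# from the call sites in miprov2.py, NOT from BaseScorer itself (assumption).
from typing import Any, Dict, List

from deepeval.optimizer.scorer.base import BaseScorer


class ConstantScorer(BaseScorer):
    """Returns fixed scores; useful only to illustrate the interface."""

    def generate(self, prompts: Dict[str, Any], golden: Any) -> str:
        # Called (via a wrapper) while bootstrapping few-shot demo sets.
        return "stub output"

    async def a_generate(self, prompts: Dict[str, Any], golden: Any) -> str:
        return "stub output"

    def score_minibatch(self, config: Any, minibatch: List[Any]) -> float:
        # One scalar per trial; this value guides Optuna's TPE sampler.
        return 0.5

    async def a_score_minibatch(self, config: Any, minibatch: List[Any]) -> float:
        return 0.5

    def score_pareto(self, config: Any, goldens: List[Any]) -> List[float]:
        # Per-golden scores stored in pareto_score_table on full evaluations
        # and reduced by aggregate_instances (mean_of_all by default).
        return [0.5 for _ in goldens]

    async def a_score_pareto(self, config: Any, goldens: List[Any]) -> List[float]:
        return [0.5 for _ in goldens]
```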