dao-ai 0.0.36__py3-none-any.whl → 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dao_ai/__init__.py +29 -0
- dao_ai/cli.py +195 -30
- dao_ai/config.py +770 -244
- dao_ai/genie/__init__.py +1 -22
- dao_ai/genie/cache/__init__.py +1 -2
- dao_ai/genie/cache/base.py +20 -70
- dao_ai/genie/cache/core.py +75 -0
- dao_ai/genie/cache/lru.py +44 -21
- dao_ai/genie/cache/semantic.py +390 -109
- dao_ai/genie/core.py +35 -0
- dao_ai/graph.py +27 -253
- dao_ai/hooks/__init__.py +9 -6
- dao_ai/hooks/core.py +22 -190
- dao_ai/memory/__init__.py +10 -0
- dao_ai/memory/core.py +23 -5
- dao_ai/memory/databricks.py +389 -0
- dao_ai/memory/postgres.py +2 -2
- dao_ai/messages.py +6 -4
- dao_ai/middleware/__init__.py +125 -0
- dao_ai/middleware/assertions.py +778 -0
- dao_ai/middleware/base.py +50 -0
- dao_ai/middleware/core.py +61 -0
- dao_ai/middleware/guardrails.py +415 -0
- dao_ai/middleware/human_in_the_loop.py +228 -0
- dao_ai/middleware/message_validation.py +554 -0
- dao_ai/middleware/summarization.py +192 -0
- dao_ai/models.py +1177 -108
- dao_ai/nodes.py +118 -161
- dao_ai/optimization.py +664 -0
- dao_ai/orchestration/__init__.py +52 -0
- dao_ai/orchestration/core.py +287 -0
- dao_ai/orchestration/supervisor.py +264 -0
- dao_ai/orchestration/swarm.py +226 -0
- dao_ai/prompts.py +126 -29
- dao_ai/providers/databricks.py +126 -381
- dao_ai/state.py +139 -21
- dao_ai/tools/__init__.py +8 -5
- dao_ai/tools/core.py +57 -4
- dao_ai/tools/email.py +280 -0
- dao_ai/tools/genie.py +47 -24
- dao_ai/tools/mcp.py +4 -3
- dao_ai/tools/memory.py +50 -0
- dao_ai/tools/python.py +4 -12
- dao_ai/tools/search.py +14 -0
- dao_ai/tools/slack.py +1 -1
- dao_ai/tools/unity_catalog.py +8 -6
- dao_ai/tools/vector_search.py +16 -9
- dao_ai/utils.py +72 -8
- dao_ai-0.1.1.dist-info/METADATA +1878 -0
- dao_ai-0.1.1.dist-info/RECORD +62 -0
- dao_ai/chat_models.py +0 -204
- dao_ai/guardrails.py +0 -112
- dao_ai/tools/genie/__init__.py +0 -236
- dao_ai/tools/human_in_the_loop.py +0 -100
- dao_ai-0.0.36.dist-info/METADATA +0 -951
- dao_ai-0.0.36.dist-info/RECORD +0 -47
- {dao_ai-0.0.36.dist-info → dao_ai-0.1.1.dist-info}/WHEEL +0 -0
- {dao_ai-0.0.36.dist-info → dao_ai-0.1.1.dist-info}/entry_points.txt +0 -0
- {dao_ai-0.0.36.dist-info → dao_ai-0.1.1.dist-info}/licenses/LICENSE +0 -0
dao_ai/optimization.py
ADDED
@@ -0,0 +1,664 @@
"""
Prompt optimization using GEPA (Generative Evolution of Prompts and Agents).

This module provides prompt optimization for DAO AI agents using the GEPA
optimizer, which uses reflective mutation to evolve prompts based on
evaluation feedback.

GEPA is an evolutionary optimizer that:
1. Takes a seed prompt (initial template)
2. Evaluates it against training examples
3. Uses a reflection LM to propose improvements
4. Iteratively evolves the prompt to maximize the metric

Usage:
    from dao_ai.optimization import optimize_prompt

    result = optimize_prompt(
        prompt=my_prompt_model,
        agent=my_agent_model,
        dataset=my_training_dataset,
        num_candidates=50,
    )

    if result.improved:
        print(f"Improved by {result.improvement:.1%}")
        print(f"New template: {result.optimized_template}")
"""

from dataclasses import dataclass, field
from datetime import datetime, timezone
from typing import Any, Callable, Optional, Sequence, Union

import mlflow
from gepa import EvaluationBatch, GEPAAdapter, GEPAResult, optimize
from loguru import logger
from mlflow.entities.model_registry import PromptVersion
from mlflow.types.responses import ResponsesAgentRequest, ResponsesAgentResponse
from mlflow.types.responses_helpers import Message

from dao_ai.config import (
    AgentModel,
    ChatPayload,
    EvaluationDatasetEntryModel,
    EvaluationDatasetModel,
    PromptModel,
)
from dao_ai.utils import dao_ai_version

# Type alias for metric function
MetricFn = Callable[[str, "_TrainingExample"], float]

__all__ = [
    "OptimizationResult",
    "optimize_prompt",
]


@dataclass
class OptimizationResult:
    """Result of prompt optimization.

    Attributes:
        optimized_prompt: The optimized PromptModel with new template
        optimized_template: The optimized template string
        original_score: Score of the original prompt
        optimized_score: Score of the optimized prompt
        improvement: Percentage improvement
        num_evaluations: Number of metric evaluations performed
        registered_version: MLflow prompt version if registered
        metadata: Additional optimization metadata
    """

    optimized_prompt: PromptModel
    optimized_template: str
    original_score: float
    optimized_score: float
    improvement: float
    num_evaluations: int
    registered_version: Optional[PromptVersion] = None
    metadata: dict[str, Any] = field(default_factory=dict)

    @property
    def improved(self) -> bool:
        """Whether the optimization improved the prompt."""
        return self.optimized_score > self.original_score


@dataclass
class _TrainingExample:
    """Internal training example format for GEPA."""

    question: str
    expected_facts: Optional[list[str]] = None
    expected_response: Optional[str] = None
    custom_inputs: Optional[dict[str, Any]] = None


@dataclass
class _Trajectory:
    """Trajectory data for reflection."""

    question: str
    response: str
    expected: Any
    score: float
    error: Optional[str] = None


class DAOAgentAdapter(GEPAAdapter[_TrainingExample, _Trajectory, str]):
    """GEPA adapter for DAO AI agents.

    This adapter bridges GEPA's optimization loop with DAO AI's
    ResponsesAgent interface.
    """

    agent_model: AgentModel
    metric_fn: MetricFn
    _agent: Optional[Any]
    _original_prompt: Optional[Union[PromptModel, str]]

    def __init__(
        self,
        agent_model: AgentModel,
        metric_fn: Optional[MetricFn] = None,
    ) -> None:
        """Initialize the adapter.

        Args:
            agent_model: The DAO AI agent model to optimize
            metric_fn: Optional custom metric function (response, example) -> score
        """
        self.agent_model = agent_model
        self.metric_fn = metric_fn or self._default_metric
        self._agent = None
        self._original_prompt = None

    def _get_agent(self) -> Any:
        """Lazily create the ResponsesAgent.

        Returns:
            The ResponsesAgent instance for the configured agent model.
        """
        if self._agent is None:
            self._agent = self.agent_model.as_responses_agent()
        return self._agent

    def _default_metric(self, response: str, example: _TrainingExample) -> float:
        """Default metric: check if expected facts are present in response."""
        if example.expected_facts:
            facts_found = sum(
                1 for fact in example.expected_facts if fact.lower() in response.lower()
            )
            return facts_found / len(example.expected_facts)
        elif example.expected_response:
            expected_words = set(example.expected_response.lower().split())
            response_words = set(response.lower().split())
            overlap = len(expected_words & response_words)
            return overlap / len(expected_words) if expected_words else 0.0
        return 0.0

    def evaluate(
        self,
        batch: list[_TrainingExample],
        candidate: dict[str, str],
        capture_traces: bool = False,
    ) -> EvaluationBatch[_Trajectory, str]:
        """Evaluate a candidate prompt on a batch of examples.

        Args:
            batch: List of training examples to evaluate
            candidate: Dict mapping component names to text (e.g., {"prompt": "..."})
            capture_traces: Whether to capture trajectories for reflection

        Returns:
            EvaluationBatch with outputs, scores, and optional trajectories
        """
        prompt_template = candidate.get("prompt", "")

        # Create agent with the candidate prompt
        original_prompt = self.agent_model.prompt
        try:
            # Update agent's prompt template
            if isinstance(original_prompt, PromptModel):
                self.agent_model.prompt = PromptModel(
                    name=original_prompt.name,
                    schema=original_prompt.schema_model,
                    default_template=prompt_template,
                    description=original_prompt.description,
                    tags=original_prompt.tags,
                )
            else:
                self.agent_model.prompt = prompt_template

            # Recreate agent with new prompt
            self._agent = None
            agent = self._get_agent()

            outputs: list[str] = []
            scores: list[float] = []
            trajectories: list[_Trajectory] = []

            for example in batch:
                try:
                    # Build request
                    messages = [Message(role="user", content=example.question)]
                    request = ResponsesAgentRequest(
                        input=messages,
                        custom_inputs=example.custom_inputs or {},
                    )

                    # Get response
                    response: ResponsesAgentResponse = agent.predict(request)

                    # Extract response text
                    response_text = ""
                    if response.output and len(response.output) > 0:
                        content = response.output[0].content
                        if isinstance(content, str):
                            response_text = content
                        elif isinstance(content, list):
                            response_text = "".join(
                                item.get("text", str(item))
                                if isinstance(item, dict)
                                else str(item)
                                for item in content
                            )
                        else:
                            response_text = str(content)

                    # Calculate score
                    score = self.metric_fn(response_text, example)

                    outputs.append(response_text)
                    scores.append(score)

                    if capture_traces:
                        trajectories.append(
                            _Trajectory(
                                question=example.question,
                                response=response_text,
                                expected=example.expected_facts
                                or example.expected_response,
                                score=score,
                            )
                        )

                except Exception as e:
                    logger.warning(f"Error evaluating example: {e}")
                    outputs.append("")
                    scores.append(0.0)

                    if capture_traces:
                        trajectories.append(
                            _Trajectory(
                                question=example.question,
                                response="",
                                expected=example.expected_facts
                                or example.expected_response,
                                score=0.0,
                                error=str(e),
                            )
                        )

            return EvaluationBatch(
                outputs=outputs,
                scores=scores,
                trajectories=trajectories if capture_traces else None,
            )

        finally:
            # Restore original prompt
            self.agent_model.prompt = original_prompt
            self._agent = None

    def make_reflective_dataset(
        self,
        batch: list[_TrainingExample],
        trajectories: list[_Trajectory],
        component_name: str,
    ) -> list[dict[str, str]]:
        """Create a reflective dataset for the optimizer.

        Args:
            batch: Original batch of examples
            trajectories: Trajectories from evaluation
            component_name: Name of component to reflect on

        Returns:
            List of dicts with inputs, outputs, and feedback
        """
        reflective_data: list[dict[str, str]] = []

        for example, trajectory in zip(batch, trajectories):
            feedback_parts: list[str] = []
            feedback_parts.append(f"Input: {trajectory.question}")
            feedback_parts.append(f"Output: {trajectory.response[:500]}")
            feedback_parts.append(f"Expected: {trajectory.expected}")
            feedback_parts.append(f"Score: {trajectory.score:.2f}")

            if trajectory.score < 1.0 and example.expected_facts:
                missing = [
                    f
                    for f in example.expected_facts
                    if f.lower() not in trajectory.response.lower()
                ]
                if missing:
                    feedback_parts.append(f"Missing facts: {missing}")

            if trajectory.error:
                feedback_parts.append(f"Error: {trajectory.error}")

            reflective_data.append(
                {
                    "input": trajectory.question,
                    "output": trajectory.response,
                    "feedback": "\n".join(feedback_parts),
                }
            )

        return reflective_data


def _convert_dataset(
    dataset: EvaluationDatasetModel | Sequence[EvaluationDatasetEntryModel],
) -> list[_TrainingExample]:
    """Convert DAO dataset to internal training examples.

    Args:
        dataset: EvaluationDatasetModel or list of entries

    Returns:
        List of training examples
    """
    entries: Sequence[EvaluationDatasetEntryModel]
    if isinstance(dataset, EvaluationDatasetModel):
        entries = dataset.data
    else:
        entries = dataset

    examples: list[_TrainingExample] = []

    for entry in entries:
        payload: ChatPayload = entry.inputs
        messages = payload.messages

        # Get the user's question from messages
        question = ""
        for msg in messages:
            if msg.role == "user":
                question = msg.content
                break

        example = _TrainingExample(
            question=question,
            expected_facts=entry.expectations.expected_facts
            if entry.expectations
            else None,
            expected_response=entry.expectations.expected_response
            if entry.expectations
            else None,
            custom_inputs=payload.custom_inputs,
        )
        examples.append(example)

    logger.debug(f"Converted {len(examples)} dataset entries to training examples")
    return examples


def _register_optimized_prompt(
    prompt: PromptModel,
    optimized_template: str,
    improvement: float,
    original_score: float,
    optimized_score: float,
    model_name: str,
    agent_name: str,
    num_evaluations: int,
    train_size: int,
    val_size: int,
) -> PromptVersion:
    """Register the optimized prompt in MLflow.

    Args:
        prompt: Original prompt model
        optimized_template: Optimized template string
        improvement: Improvement percentage
        original_score: Original evaluation score
        optimized_score: Optimized evaluation score
        model_name: Model used for reflection/optimization
        agent_name: Name of the agent being optimized
        num_evaluations: Number of metric evaluations performed
        train_size: Size of training dataset
        val_size: Size of validation dataset

    Returns:
        Registered PromptVersion
    """
    mlflow.set_registry_uri("databricks-uc")

    prompt_name: str = prompt.full_name
    optimization_timestamp: str = datetime.now(timezone.utc).isoformat()

    logger.info(f"Registering optimized prompt: {prompt_name}")

    # Build comprehensive tags for the prompt registry
    tags: dict[str, str] = {
        # DAO AI metadata
        "dao_ai_version": dao_ai_version(),
        "created_by": "dao_ai.optimization",
        # Optimization metadata
        "optimizer": "gepa",
        "optimization_timestamp": optimization_timestamp,
        "target_model": model_name,
        "target_agent": agent_name,
        # Performance metrics
        "original_score": f"{original_score:.4f}",
        "optimized_score": f"{optimized_score:.4f}",
        "improvement": f"{improvement:.4f}",
        "improvement_percent": f"{improvement:.1%}",
        # Dataset info
        "num_evaluations": str(num_evaluations),
        "train_size": str(train_size),
        "val_size": str(val_size),
    }

    # Preserve original prompt tags if present
    if prompt.tags:
        for key, value in prompt.tags.items():
            if key not in tags:  # Don't override optimization tags
                tags[f"original_{key}"] = str(value)

    # Register new version with comprehensive metadata
    version: PromptVersion = mlflow.genai.register_prompt(
        name=prompt_name,
        template=optimized_template,
        commit_message=(
            f"Optimized with GEPA for agent '{agent_name}' "
            f"(improvement: {improvement:.1%}, "
            f"score: {original_score:.3f} -> {optimized_score:.3f}, "
            f"model: {model_name})"
        ),
        tags=tags,
    )

    logger.info(f"Registered as version {version.version}")

    # Set 'latest' alias for most recently optimized version
    mlflow.genai.set_prompt_alias(
        name=prompt_name,
        alias="latest",
        version=version.version,
    )
    logger.info(f"Set 'latest' alias for version {version.version}")

    # Set 'champion' alias if there was actual improvement
    if improvement > 0:
        mlflow.genai.set_prompt_alias(
            name=prompt_name,
            alias="champion",
            version=version.version,
        )
        logger.info(f"Set 'champion' alias for version {version.version}")

    return version


def optimize_prompt(
    prompt: PromptModel,
    agent: AgentModel,
    dataset: EvaluationDatasetModel | Sequence[EvaluationDatasetEntryModel],
    reflection_model: Optional[str] = None,
    num_candidates: int = 50,
    metric: Optional[Callable[[str, _TrainingExample], float]] = None,
    register_if_improved: bool = True,
    min_improvement: float = 0.0,
) -> OptimizationResult:
    """
    Optimize a prompt using GEPA.

    GEPA (Generative Evolution of Prompts and Agents) is an evolutionary
    optimizer that uses reflective mutation to improve prompts based on
    evaluation feedback.

    Args:
        prompt: The PromptModel to optimize
        agent: The AgentModel that uses this prompt
        dataset: Training data for optimization
        reflection_model: LLM for reflection (defaults to agent's model)
        num_candidates: Maximum metric calls / candidate evaluations
        metric: Optional custom metric function (response, example) -> score
        register_if_improved: Register optimized prompt in MLflow if improved
        min_improvement: Minimum improvement required to register

    Returns:
        OptimizationResult with optimization details

    Example:
        from dao_ai.config import AgentModel, PromptModel, LLMModel
        from dao_ai.optimization import optimize_prompt

        prompt = PromptModel(
            name="my_prompt",
            default_template="Answer the question: {question}"
        )
        agent = AgentModel(
            name="my_agent",
            model=LLMModel(name="databricks-meta-llama-3-3-70b-instruct"),
            prompt=prompt,
        )

        result = optimize_prompt(
            prompt=prompt,
            agent=agent,
            dataset=training_data,
            num_candidates=50,
        )

        if result.improved:
            print(f"Improved by {result.improvement:.1%}")
    """
    logger.info(f"Starting GEPA optimization for prompt: {prompt.name}")

    # Get the original template
    original_template = prompt.template
    if not original_template:
        raise ValueError(f"Prompt '{prompt.name}' has no template to optimize")

    # Convert dataset
    examples = _convert_dataset(dataset)
    if not examples:
        raise ValueError("Dataset is empty")

    # Split into train/val
    split_idx = max(1, len(examples) * 4 // 5)
    trainset = examples[:split_idx]
    valset = examples[split_idx:] if split_idx < len(examples) else examples

    logger.info(f"Using {len(trainset)} train, {len(valset)} val examples")

    # Get reflection model
    reflection_model_name = reflection_model or agent.model.uri
    logger.info(f"Using reflection model: {reflection_model_name}")

    # Create adapter
    adapter = DAOAgentAdapter(agent_model=agent, metric_fn=metric)

    # Seed candidate
    seed_candidate = {"prompt": original_template}

    # Run GEPA optimization
    logger.info(f"Running GEPA optimization (max {num_candidates} evaluations)...")

    try:
        result: GEPAResult = optimize(
            seed_candidate=seed_candidate,
            trainset=trainset,
            valset=valset,
            adapter=adapter,
            reflection_lm=reflection_model_name,
            max_metric_calls=num_candidates,
            display_progress_bar=True,
            skip_perfect_score=True,
        )
    except Exception as e:
        logger.error(f"GEPA optimization failed: {e}")
        return OptimizationResult(
            optimized_prompt=prompt,
            optimized_template=original_template,
            original_score=0.0,
            optimized_score=0.0,
            improvement=0.0,
            num_evaluations=0,
            metadata={"error": str(e)},
        )

    # Extract results from GEPAResult
    # GEPAResult has:
    # - candidates: list of candidate dicts
    # - val_aggregate_scores: list of scores (index 0 is seed)
    # - best_idx: index of best candidate
    # - best_candidate: dict for best candidate
    # - total_metric_calls: number of metric evaluations
    best_candidate: dict[str, str] = result.best_candidate
    optimized_template: str = best_candidate.get("prompt", original_template)

    # Get scores from result - val_aggregate_scores[0] is the seed candidate score
    val_scores: list[float] = result.val_aggregate_scores
    original_score: float = val_scores[0] if val_scores else 0.0
    best_idx: int = result.best_idx
    optimized_score: float = val_scores[best_idx] if val_scores else 0.0
    num_evaluations: int = result.total_metric_calls or num_candidates

    improvement: float = (
        (optimized_score - original_score) / original_score
        if original_score > 0
        else 0.0
    )

    logger.info("Optimization complete!")
    logger.info(f"Original score: {original_score:.3f}")
    logger.info(f"Optimized score: {optimized_score:.3f}")
    logger.info(f"Improvement: {improvement:.1%}")

    # Register if improved
    registered_version: Optional[PromptVersion] = None
    if (
        register_if_improved
        and improvement >= min_improvement
        and optimized_score > original_score
        and optimized_template != original_template
    ):
        try:
            registered_version = _register_optimized_prompt(
                prompt=prompt,
                optimized_template=optimized_template,
                improvement=improvement,
                original_score=original_score,
                optimized_score=optimized_score,
                model_name=reflection_model_name,
                agent_name=agent.name,
                num_evaluations=num_evaluations,
                train_size=len(trainset),
                val_size=len(valset),
            )
        except Exception as e:
            logger.error(f"Failed to register optimized prompt: {e}")

    # Build optimized prompt model with comprehensive tags
    optimized_tags: dict[str, str] = {
        **(prompt.tags or {}),
        "dao_ai_version": dao_ai_version(),
        "optimizer": "gepa",
        "target_model": reflection_model_name,
        "target_agent": agent.name,
        "original_score": f"{original_score:.4f}",
        "optimized_score": f"{optimized_score:.4f}",
        "improvement": f"{improvement:.4f}",
        "num_evaluations": str(num_evaluations),
    }

    optimized_prompt = PromptModel(
        name=prompt.name,
        schema=prompt.schema_model,
        default_template=optimized_template,
        description=f"Optimized with GEPA for agent '{agent.name}' (improvement: {improvement:.1%})",
        alias="champion" if improvement > min_improvement else "latest",
        tags=optimized_tags,
    )

    return OptimizationResult(
        optimized_prompt=optimized_prompt,
        optimized_template=optimized_template,
        original_score=original_score,
        optimized_score=optimized_score,
        improvement=improvement,
        num_evaluations=num_evaluations,
        registered_version=registered_version,
        metadata={
            "optimizer": "gepa",
            "reflection_model": reflection_model_name,
            "train_size": len(trainset),
            "val_size": len(valset),
        },
    )
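
For orientation (this is editorial illustration, not part of the wheel contents): besides the docstring usage shown above, the new module accepts a custom metric via the `metric` argument, whose contract is the `MetricFn` alias `(response, example) -> float`. Below is a minimal sketch of wiring one in; `prompt`, `agent`, and `training_data` are assumed to be configured as in the module docstring, and the case-insensitive substring check is only a stand-in for a real grader.

# Hypothetical usage sketch -- assumes `prompt`, `agent`, and `training_data`
# exist as in the docstring example above; nothing here ships in the package.
from dao_ai.optimization import optimize_prompt

def exact_phrase_metric(response: str, example) -> float:
    # Follows the MetricFn contract: score in [0, 1] per example.
    # `example` is the module's internal training example; its
    # `expected_response` may be None, so guard before comparing.
    if not example.expected_response:
        return 0.0
    return 1.0 if example.expected_response.lower() in response.lower() else 0.0

result = optimize_prompt(
    prompt=prompt,
    agent=agent,
    dataset=training_data,
    metric=exact_phrase_metric,     # overrides the default facts/word-overlap metric
    num_candidates=25,              # caps GEPA's metric calls
    register_if_improved=False,     # skip MLflow registration while experimenting
)
print(f"{result.original_score:.3f} -> {result.optimized_score:.3f}")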