dao-ai 0.0.25__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. dao_ai/__init__.py +29 -0
  2. dao_ai/agent_as_code.py +5 -5
  3. dao_ai/cli.py +245 -40
  4. dao_ai/config.py +1863 -338
  5. dao_ai/genie/__init__.py +38 -0
  6. dao_ai/genie/cache/__init__.py +43 -0
  7. dao_ai/genie/cache/base.py +72 -0
  8. dao_ai/genie/cache/core.py +79 -0
  9. dao_ai/genie/cache/lru.py +347 -0
  10. dao_ai/genie/cache/semantic.py +970 -0
  11. dao_ai/genie/core.py +35 -0
  12. dao_ai/graph.py +27 -228
  13. dao_ai/hooks/__init__.py +9 -6
  14. dao_ai/hooks/core.py +27 -195
  15. dao_ai/logging.py +56 -0
  16. dao_ai/memory/__init__.py +10 -0
  17. dao_ai/memory/core.py +65 -30
  18. dao_ai/memory/databricks.py +402 -0
  19. dao_ai/memory/postgres.py +79 -38
  20. dao_ai/messages.py +6 -4
  21. dao_ai/middleware/__init__.py +125 -0
  22. dao_ai/middleware/assertions.py +806 -0
  23. dao_ai/middleware/base.py +50 -0
  24. dao_ai/middleware/core.py +67 -0
  25. dao_ai/middleware/guardrails.py +420 -0
  26. dao_ai/middleware/human_in_the_loop.py +232 -0
  27. dao_ai/middleware/message_validation.py +586 -0
  28. dao_ai/middleware/summarization.py +197 -0
  29. dao_ai/models.py +1306 -114
  30. dao_ai/nodes.py +261 -166
  31. dao_ai/optimization.py +674 -0
  32. dao_ai/orchestration/__init__.py +52 -0
  33. dao_ai/orchestration/core.py +294 -0
  34. dao_ai/orchestration/supervisor.py +278 -0
  35. dao_ai/orchestration/swarm.py +271 -0
  36. dao_ai/prompts.py +128 -31
  37. dao_ai/providers/databricks.py +645 -172
  38. dao_ai/state.py +157 -21
  39. dao_ai/tools/__init__.py +13 -5
  40. dao_ai/tools/agent.py +1 -3
  41. dao_ai/tools/core.py +64 -11
  42. dao_ai/tools/email.py +232 -0
  43. dao_ai/tools/genie.py +144 -295
  44. dao_ai/tools/mcp.py +220 -133
  45. dao_ai/tools/memory.py +50 -0
  46. dao_ai/tools/python.py +9 -14
  47. dao_ai/tools/search.py +14 -0
  48. dao_ai/tools/slack.py +22 -10
  49. dao_ai/tools/sql.py +202 -0
  50. dao_ai/tools/time.py +30 -7
  51. dao_ai/tools/unity_catalog.py +165 -88
  52. dao_ai/tools/vector_search.py +360 -40
  53. dao_ai/utils.py +218 -16
  54. dao_ai-0.1.2.dist-info/METADATA +455 -0
  55. dao_ai-0.1.2.dist-info/RECORD +64 -0
  56. {dao_ai-0.0.25.dist-info → dao_ai-0.1.2.dist-info}/WHEEL +1 -1
  57. dao_ai/chat_models.py +0 -204
  58. dao_ai/guardrails.py +0 -112
  59. dao_ai/tools/human_in_the_loop.py +0 -100
  60. dao_ai-0.0.25.dist-info/METADATA +0 -1165
  61. dao_ai-0.0.25.dist-info/RECORD +0 -41
  62. {dao_ai-0.0.25.dist-info → dao_ai-0.1.2.dist-info}/entry_points.txt +0 -0
  63. {dao_ai-0.0.25.dist-info → dao_ai-0.1.2.dist-info}/licenses/LICENSE +0 -0
dao_ai/optimization.py ADDED
@@ -0,0 +1,674 @@
+ """
+ Prompt optimization using GEPA (Generative Evolution of Prompts and Agents).
+
+ This module provides prompt optimization for DAO AI agents using the GEPA
+ optimizer, which uses reflective mutation to evolve prompts based on
+ evaluation feedback.
+
+ GEPA is an evolutionary optimizer that:
+ 1. Takes a seed prompt (initial template)
+ 2. Evaluates it against training examples
+ 3. Uses a reflection LM to propose improvements
+ 4. Iteratively evolves the prompt to maximize the metric
+
+ Usage:
+     from dao_ai.optimization import optimize_prompt
+
+     result = optimize_prompt(
+         prompt=my_prompt_model,
+         agent=my_agent_model,
+         dataset=my_training_dataset,
+         num_candidates=50,
+     )
+
+     if result.improved:
+         print(f"Improved by {result.improvement:.1%}")
+         print(f"New template: {result.optimized_template}")
+ """
+
+ from dataclasses import dataclass, field
+ from datetime import datetime, timezone
+ from typing import Any, Callable, Optional, Sequence, Union
+
+ import mlflow
+ from gepa import EvaluationBatch, GEPAAdapter, GEPAResult, optimize
+ from loguru import logger
+ from mlflow.entities.model_registry import PromptVersion
+ from mlflow.types.responses import ResponsesAgentRequest, ResponsesAgentResponse
+ from mlflow.types.responses_helpers import Message
+
+ from dao_ai.config import (
+     AgentModel,
+     ChatPayload,
+     EvaluationDatasetEntryModel,
+     EvaluationDatasetModel,
+     PromptModel,
+ )
+ from dao_ai.utils import dao_ai_version
+
+ # Type alias for metric function
+ MetricFn = Callable[[str, "_TrainingExample"], float]
+
+ __all__ = [
+     "OptimizationResult",
+     "optimize_prompt",
+ ]
+
+
+ @dataclass
+ class OptimizationResult:
+     """Result of prompt optimization.
+
+     Attributes:
+         optimized_prompt: The optimized PromptModel with new template
+         optimized_template: The optimized template string
+         original_score: Score of the original prompt
+         optimized_score: Score of the optimized prompt
+         improvement: Percentage improvement
+         num_evaluations: Number of metric evaluations performed
+         registered_version: MLflow prompt version if registered
+         metadata: Additional optimization metadata
+     """
+
+     optimized_prompt: PromptModel
+     optimized_template: str
+     original_score: float
+     optimized_score: float
+     improvement: float
+     num_evaluations: int
+     registered_version: Optional[PromptVersion] = None
+     metadata: dict[str, Any] = field(default_factory=dict)
+
+     @property
+     def improved(self) -> bool:
+         """Whether the optimization improved the prompt."""
+         return self.optimized_score > self.original_score
+
+
+ @dataclass
+ class _TrainingExample:
+     """Internal training example format for GEPA."""
+
+     question: str
+     expected_facts: Optional[list[str]] = None
+     expected_response: Optional[str] = None
+     custom_inputs: Optional[dict[str, Any]] = None
+
+
+ @dataclass
+ class _Trajectory:
+     """Trajectory data for reflection."""
+
+     question: str
+     response: str
+     expected: Any
+     score: float
+     error: Optional[str] = None
+
+
+ class DAOAgentAdapter(GEPAAdapter[_TrainingExample, _Trajectory, str]):
+     """GEPA adapter for DAO AI agents.
+
+     This adapter bridges GEPA's optimization loop with DAO AI's
+     ResponsesAgent interface.
+     """
+
+     agent_model: AgentModel
+     metric_fn: MetricFn
+     _agent: Optional[Any]
+     _original_prompt: Optional[Union[PromptModel, str]]
+
+     def __init__(
+         self,
+         agent_model: AgentModel,
+         metric_fn: Optional[MetricFn] = None,
+     ) -> None:
+         """Initialize the adapter.
+
+         Args:
+             agent_model: The DAO AI agent model to optimize
+             metric_fn: Optional custom metric function (response, example) -> score
+         """
+         self.agent_model = agent_model
+         self.metric_fn = metric_fn or self._default_metric
+         self._agent = None
+         self._original_prompt = None
+
+     def _get_agent(self) -> Any:
+         """Lazily create the ResponsesAgent.
+
+         Returns:
+             The ResponsesAgent instance for the configured agent model.
+         """
+         if self._agent is None:
+             self._agent = self.agent_model.as_responses_agent()
+         return self._agent
+
+     def _default_metric(self, response: str, example: _TrainingExample) -> float:
+         """Default metric: check if expected facts are present in response."""
+         if example.expected_facts:
+             facts_found = sum(
+                 1 for fact in example.expected_facts if fact.lower() in response.lower()
+             )
+             return facts_found / len(example.expected_facts)
+         elif example.expected_response:
+             expected_words = set(example.expected_response.lower().split())
+             response_words = set(response.lower().split())
+             overlap = len(expected_words & response_words)
+             return overlap / len(expected_words) if expected_words else 0.0
+         return 0.0
+
+     def evaluate(
+         self,
+         batch: list[_TrainingExample],
+         candidate: dict[str, str],
+         capture_traces: bool = False,
+     ) -> EvaluationBatch[_Trajectory, str]:
+         """Evaluate a candidate prompt on a batch of examples.
+
+         Args:
+             batch: List of training examples to evaluate
+             candidate: Dict mapping component names to text (e.g., {"prompt": "..."})
+             capture_traces: Whether to capture trajectories for reflection
+
+         Returns:
+             EvaluationBatch with outputs, scores, and optional trajectories
+         """
+         prompt_template = candidate.get("prompt", "")
+
+         # Create agent with the candidate prompt
+         original_prompt = self.agent_model.prompt
+         try:
+             # Update agent's prompt template
+             if isinstance(original_prompt, PromptModel):
+                 self.agent_model.prompt = PromptModel(
+                     name=original_prompt.name,
+                     schema=original_prompt.schema_model,
+                     default_template=prompt_template,
+                     description=original_prompt.description,
+                     tags=original_prompt.tags,
+                 )
+             else:
+                 self.agent_model.prompt = prompt_template
+
+             # Recreate agent with new prompt
+             self._agent = None
+             agent = self._get_agent()
+
+             outputs: list[str] = []
+             scores: list[float] = []
+             trajectories: list[_Trajectory] = []
+
+             for example in batch:
+                 try:
+                     # Build request
+                     messages = [Message(role="user", content=example.question)]
+                     request = ResponsesAgentRequest(
+                         input=messages,
+                         custom_inputs=example.custom_inputs or {},
+                     )
+
+                     # Get response
+                     response: ResponsesAgentResponse = agent.predict(request)
+
+                     # Extract response text
+                     response_text = ""
+                     if response.output and len(response.output) > 0:
+                         content = response.output[0].content
+                         if isinstance(content, str):
+                             response_text = content
+                         elif isinstance(content, list):
+                             response_text = "".join(
+                                 item.get("text", str(item))
+                                 if isinstance(item, dict)
+                                 else str(item)
+                                 for item in content
+                             )
+                         else:
+                             response_text = str(content)
+
+                     # Calculate score
+                     score = self.metric_fn(response_text, example)
+
+                     outputs.append(response_text)
+                     scores.append(score)
+
+                     if capture_traces:
+                         trajectories.append(
+                             _Trajectory(
+                                 question=example.question,
+                                 response=response_text,
+                                 expected=example.expected_facts
+                                 or example.expected_response,
+                                 score=score,
+                             )
+                         )
+
+                 except Exception as e:
+                     logger.warning("Error evaluating example", error=str(e))
+                     outputs.append("")
+                     scores.append(0.0)
+
+                     if capture_traces:
+                         trajectories.append(
+                             _Trajectory(
+                                 question=example.question,
+                                 response="",
+                                 expected=example.expected_facts
+                                 or example.expected_response,
+                                 score=0.0,
+                                 error=str(e),
+                             )
+                         )
+
+             return EvaluationBatch(
+                 outputs=outputs,
+                 scores=scores,
+                 trajectories=trajectories if capture_traces else None,
+             )
+
+         finally:
+             # Restore original prompt
+             self.agent_model.prompt = original_prompt
+             self._agent = None
+
+     def make_reflective_dataset(
+         self,
+         batch: list[_TrainingExample],
+         trajectories: list[_Trajectory],
+         component_name: str,
+     ) -> list[dict[str, str]]:
+         """Create a reflective dataset for the optimizer.
+
+         Args:
+             batch: Original batch of examples
+             trajectories: Trajectories from evaluation
+             component_name: Name of component to reflect on
+
+         Returns:
+             List of dicts with inputs, outputs, and feedback
+         """
+         reflective_data: list[dict[str, str]] = []
+
+         for example, trajectory in zip(batch, trajectories):
+             feedback_parts: list[str] = []
+             feedback_parts.append(f"Input: {trajectory.question}")
+             feedback_parts.append(f"Output: {trajectory.response[:500]}")
+             feedback_parts.append(f"Expected: {trajectory.expected}")
+             feedback_parts.append(f"Score: {trajectory.score:.2f}")
+
+             if trajectory.score < 1.0 and example.expected_facts:
+                 missing = [
+                     f
+                     for f in example.expected_facts
+                     if f.lower() not in trajectory.response.lower()
+                 ]
+                 if missing:
+                     feedback_parts.append(f"Missing facts: {missing}")
+
+             if trajectory.error:
+                 feedback_parts.append(f"Error: {trajectory.error}")
+
+             reflective_data.append(
+                 {
+                     "input": trajectory.question,
+                     "output": trajectory.response,
+                     "feedback": "\n".join(feedback_parts),
+                 }
+             )
+
+         return reflective_data
+
+
+ def _convert_dataset(
+     dataset: EvaluationDatasetModel | Sequence[EvaluationDatasetEntryModel],
+ ) -> list[_TrainingExample]:
+     """Convert DAO dataset to internal training examples.
+
+     Args:
+         dataset: EvaluationDatasetModel or list of entries
+
+     Returns:
+         List of training examples
+     """
+     entries: Sequence[EvaluationDatasetEntryModel]
+     if isinstance(dataset, EvaluationDatasetModel):
+         entries = dataset.data
+     else:
+         entries = dataset
+
+     examples: list[_TrainingExample] = []
+
+     for entry in entries:
+         payload: ChatPayload = entry.inputs
+         messages = payload.messages
+
+         # Get the user's question from messages
+         question = ""
+         for msg in messages:
+             if msg.role == "user":
+                 question = msg.content
+                 break
+
+         example = _TrainingExample(
+             question=question,
+             expected_facts=entry.expectations.expected_facts
+             if entry.expectations
+             else None,
+             expected_response=entry.expectations.expected_response
+             if entry.expectations
+             else None,
+             custom_inputs=payload.custom_inputs,
+         )
+         examples.append(example)
+
+     logger.debug(
+         "Converted dataset entries to training examples", examples_count=len(examples)
+     )
+     return examples
+
+
+ def _register_optimized_prompt(
+     prompt: PromptModel,
+     optimized_template: str,
+     improvement: float,
+     original_score: float,
+     optimized_score: float,
+     model_name: str,
+     agent_name: str,
+     num_evaluations: int,
+     train_size: int,
+     val_size: int,
+ ) -> PromptVersion:
+     """Register the optimized prompt in MLflow.
+
+     Args:
+         prompt: Original prompt model
+         optimized_template: Optimized template string
+         improvement: Improvement percentage
+         original_score: Original evaluation score
+         optimized_score: Optimized evaluation score
+         model_name: Model used for reflection/optimization
+         agent_name: Name of the agent being optimized
+         num_evaluations: Number of metric evaluations performed
+         train_size: Size of training dataset
+         val_size: Size of validation dataset
+
+     Returns:
+         Registered PromptVersion
+     """
+     mlflow.set_registry_uri("databricks-uc")
+
+     prompt_name: str = prompt.full_name
+     optimization_timestamp: str = datetime.now(timezone.utc).isoformat()
+
+     logger.info("Registering optimized prompt", prompt_name=prompt_name)
+
+     # Build comprehensive tags for the prompt registry
+     tags: dict[str, str] = {
+         # DAO AI metadata
+         "dao_ai_version": dao_ai_version(),
+         "created_by": "dao_ai.optimization",
+         # Optimization metadata
+         "optimizer": "gepa",
+         "optimization_timestamp": optimization_timestamp,
+         "target_model": model_name,
+         "target_agent": agent_name,
+         # Performance metrics
+         "original_score": f"{original_score:.4f}",
+         "optimized_score": f"{optimized_score:.4f}",
+         "improvement": f"{improvement:.4f}",
+         "improvement_percent": f"{improvement:.1%}",
+         # Dataset info
+         "num_evaluations": str(num_evaluations),
+         "train_size": str(train_size),
+         "val_size": str(val_size),
+     }
+
+     # Preserve original prompt tags if present
+     if prompt.tags:
+         for key, value in prompt.tags.items():
+             if key not in tags:  # Don't override optimization tags
+                 tags[f"original_{key}"] = str(value)
+
+     # Register new version with comprehensive metadata
+     version: PromptVersion = mlflow.genai.register_prompt(
+         name=prompt_name,
+         template=optimized_template,
+         commit_message=(
+             f"Optimized with GEPA for agent '{agent_name}' "
+             f"(improvement: {improvement:.1%}, "
+             f"score: {original_score:.3f} -> {optimized_score:.3f}, "
+             f"model: {model_name})"
+         ),
+         tags=tags,
+     )
+
+     logger.success(
+         "Registered optimized prompt version",
+         prompt_name=prompt_name,
+         version=version.version,
+     )
+
+     # Set 'latest' alias for most recently optimized version
+     mlflow.genai.set_prompt_alias(
+         name=prompt_name,
+         alias="latest",
+         version=version.version,
+     )
+     logger.info("Set 'latest' alias", prompt_name=prompt_name, version=version.version)
+
+     # Set 'champion' alias if there was actual improvement
+     if improvement > 0:
+         mlflow.genai.set_prompt_alias(
+             name=prompt_name,
+             alias="champion",
+             version=version.version,
+         )
+         logger.success(
+             "Set 'champion' alias", prompt_name=prompt_name, version=version.version
+         )
+
+     return version
+
+
+ def optimize_prompt(
+     prompt: PromptModel,
+     agent: AgentModel,
+     dataset: EvaluationDatasetModel | Sequence[EvaluationDatasetEntryModel],
+     reflection_model: Optional[str] = None,
+     num_candidates: int = 50,
+     metric: Optional[Callable[[str, _TrainingExample], float]] = None,
+     register_if_improved: bool = True,
+     min_improvement: float = 0.0,
+ ) -> OptimizationResult:
+     """
+     Optimize a prompt using GEPA.
+
+     GEPA (Generative Evolution of Prompts and Agents) is an evolutionary
+     optimizer that uses reflective mutation to improve prompts based on
+     evaluation feedback.
+
+     Args:
+         prompt: The PromptModel to optimize
+         agent: The AgentModel that uses this prompt
+         dataset: Training data for optimization
+         reflection_model: LLM for reflection (defaults to agent's model)
+         num_candidates: Maximum metric calls / candidate evaluations
+         metric: Optional custom metric function (response, example) -> score
+         register_if_improved: Register optimized prompt in MLflow if improved
+         min_improvement: Minimum improvement required to register
+
+     Returns:
+         OptimizationResult with optimization details
+
+     Example:
+         from dao_ai.config import AgentModel, PromptModel, LLMModel
+         from dao_ai.optimization import optimize_prompt
+
+         prompt = PromptModel(
+             name="my_prompt",
+             default_template="Answer the question: {question}"
+         )
+         agent = AgentModel(
+             name="my_agent",
+             model=LLMModel(name="databricks-meta-llama-3-3-70b-instruct"),
+             prompt=prompt,
+         )
+
+         result = optimize_prompt(
+             prompt=prompt,
+             agent=agent,
+             dataset=training_data,
+             num_candidates=50,
+         )
+
+         if result.improved:
+             print(f"Improved by {result.improvement:.1%}")
+     """
+     logger.info("Starting GEPA optimization", prompt_name=prompt.name)
+
+     # Get the original template
+     original_template = prompt.template
+     if not original_template:
+         raise ValueError(f"Prompt '{prompt.name}' has no template to optimize")
+
+     # Convert dataset
+     examples = _convert_dataset(dataset)
+     if not examples:
+         raise ValueError("Dataset is empty")
+
+     # Split into train/val
+     split_idx = max(1, len(examples) * 4 // 5)
+     trainset = examples[:split_idx]
+     valset = examples[split_idx:] if split_idx < len(examples) else examples
+
+     logger.info("Dataset split", train_size=len(trainset), val_size=len(valset))
+
+     # Get reflection model
+     reflection_model_name = reflection_model or agent.model.uri
+     logger.info("Using reflection model", model=reflection_model_name)
+
+     # Create adapter
+     adapter = DAOAgentAdapter(agent_model=agent, metric_fn=metric)
+
+     # Seed candidate
+     seed_candidate = {"prompt": original_template}
+
+     # Run GEPA optimization
+     logger.info("Running GEPA optimization", max_evaluations=num_candidates)
+
+     try:
+         result: GEPAResult = optimize(
+             seed_candidate=seed_candidate,
+             trainset=trainset,
+             valset=valset,
+             adapter=adapter,
+             reflection_lm=reflection_model_name,
+             max_metric_calls=num_candidates,
+             display_progress_bar=True,
+             skip_perfect_score=True,
+         )
+     except Exception as e:
+         logger.error("GEPA optimization failed", error=str(e))
+         return OptimizationResult(
+             optimized_prompt=prompt,
+             optimized_template=original_template,
+             original_score=0.0,
+             optimized_score=0.0,
+             improvement=0.0,
+             num_evaluations=0,
+             metadata={"error": str(e)},
+         )
+
+     # Extract results from GEPAResult
+     # GEPAResult has:
+     # - candidates: list of candidate dicts
+     # - val_aggregate_scores: list of scores (index 0 is seed)
+     # - best_idx: index of best candidate
+     # - best_candidate: dict for best candidate
+     # - total_metric_calls: number of metric evaluations
+     best_candidate: dict[str, str] = result.best_candidate
+     optimized_template: str = best_candidate.get("prompt", original_template)
+
+     # Get scores from result - val_aggregate_scores[0] is the seed candidate score
+     val_scores: list[float] = result.val_aggregate_scores
+     original_score: float = val_scores[0] if val_scores else 0.0
+     best_idx: int = result.best_idx
+     optimized_score: float = val_scores[best_idx] if val_scores else 0.0
+     num_evaluations: int = result.total_metric_calls or num_candidates
+
+     improvement: float = (
+         (optimized_score - original_score) / original_score
+         if original_score > 0
+         else 0.0
+     )
+
+     logger.success(
+         "Optimization complete",
+         original_score=f"{original_score:.3f}",
+         optimized_score=f"{optimized_score:.3f}",
+         improvement=f"{improvement:.1%}",
+     )
+
+     # Register if improved
+     registered_version: Optional[PromptVersion] = None
+     if (
+         register_if_improved
+         and improvement >= min_improvement
+         and optimized_score > original_score
+         and optimized_template != original_template
+     ):
+         try:
+             registered_version = _register_optimized_prompt(
+                 prompt=prompt,
+                 optimized_template=optimized_template,
+                 improvement=improvement,
+                 original_score=original_score,
+                 optimized_score=optimized_score,
+                 model_name=reflection_model_name,
+                 agent_name=agent.name,
+                 num_evaluations=num_evaluations,
+                 train_size=len(trainset),
+                 val_size=len(valset),
+             )
+         except Exception as e:
+             logger.error("Failed to register optimized prompt", error=str(e))
+
+     # Build optimized prompt model with comprehensive tags
+     optimized_tags: dict[str, str] = {
+         **(prompt.tags or {}),
+         "dao_ai_version": dao_ai_version(),
+         "optimizer": "gepa",
+         "target_model": reflection_model_name,
+         "target_agent": agent.name,
+         "original_score": f"{original_score:.4f}",
+         "optimized_score": f"{optimized_score:.4f}",
+         "improvement": f"{improvement:.4f}",
+         "num_evaluations": str(num_evaluations),
+     }
+
+     optimized_prompt = PromptModel(
+         name=prompt.name,
+         schema=prompt.schema_model,
+         default_template=optimized_template,
+         description=f"Optimized with GEPA for agent '{agent.name}' (improvement: {improvement:.1%})",
+         alias="champion" if improvement > min_improvement else "latest",
+         tags=optimized_tags,
+     )
+
+     return OptimizationResult(
+         optimized_prompt=optimized_prompt,
+         optimized_template=optimized_template,
+         original_score=original_score,
+         optimized_score=optimized_score,
+         improvement=improvement,
+         num_evaluations=num_evaluations,
+         registered_version=registered_version,
+         metadata={
+             "optimizer": "gepa",
+             "reflection_model": reflection_model_name,
+             "train_size": len(trainset),
+             "val_size": len(valset),
+         },
+     )
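
For reference, a minimal sketch of how the new `metric` hook added in this file might be used. The scorer and the `my_prompt` / `my_agent` / `my_dataset` objects below are illustrative assumptions, not part of the package; only the `optimize_prompt` signature and the `expected_facts` field come from the code above:

    from dao_ai.optimization import optimize_prompt

    def strict_fact_metric(response: str, example) -> float:
        # Hypothetical scorer: full credit only when every expected fact
        # appears verbatim, otherwise the fraction of facts found
        # (example.expected_facts comes from _TrainingExample above).
        if not example.expected_facts:
            return 0.0
        hits = [f for f in example.expected_facts if f.lower() in response.lower()]
        return len(hits) / len(example.expected_facts)

    result = optimize_prompt(
        prompt=my_prompt,            # illustrative PromptModel
        agent=my_agent,              # illustrative AgentModel
        dataset=my_dataset,          # illustrative EvaluationDatasetModel
        metric=strict_fact_metric,   # overrides DAOAgentAdapter._default_metric
        num_candidates=50,
        min_improvement=0.05,        # only register if at least 5% better
    )

    if result.improved and result.registered_version:
        print(f"Registered prompt version {result.registered_version.version}")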