dao-ai 0.0.36__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. dao_ai/__init__.py +29 -0
  2. dao_ai/cli.py +195 -30
  3. dao_ai/config.py +770 -244
  4. dao_ai/genie/__init__.py +1 -22
  5. dao_ai/genie/cache/__init__.py +1 -2
  6. dao_ai/genie/cache/base.py +20 -70
  7. dao_ai/genie/cache/core.py +75 -0
  8. dao_ai/genie/cache/lru.py +44 -21
  9. dao_ai/genie/cache/semantic.py +390 -109
  10. dao_ai/genie/core.py +35 -0
  11. dao_ai/graph.py +27 -253
  12. dao_ai/hooks/__init__.py +9 -6
  13. dao_ai/hooks/core.py +22 -190
  14. dao_ai/memory/__init__.py +10 -0
  15. dao_ai/memory/core.py +23 -5
  16. dao_ai/memory/databricks.py +389 -0
  17. dao_ai/memory/postgres.py +2 -2
  18. dao_ai/messages.py +6 -4
  19. dao_ai/middleware/__init__.py +125 -0
  20. dao_ai/middleware/assertions.py +778 -0
  21. dao_ai/middleware/base.py +50 -0
  22. dao_ai/middleware/core.py +61 -0
  23. dao_ai/middleware/guardrails.py +415 -0
  24. dao_ai/middleware/human_in_the_loop.py +228 -0
  25. dao_ai/middleware/message_validation.py +554 -0
  26. dao_ai/middleware/summarization.py +192 -0
  27. dao_ai/models.py +1177 -108
  28. dao_ai/nodes.py +118 -161
  29. dao_ai/optimization.py +664 -0
  30. dao_ai/orchestration/__init__.py +52 -0
  31. dao_ai/orchestration/core.py +287 -0
  32. dao_ai/orchestration/supervisor.py +264 -0
  33. dao_ai/orchestration/swarm.py +226 -0
  34. dao_ai/prompts.py +126 -29
  35. dao_ai/providers/databricks.py +126 -381
  36. dao_ai/state.py +139 -21
  37. dao_ai/tools/__init__.py +8 -5
  38. dao_ai/tools/core.py +57 -4
  39. dao_ai/tools/email.py +280 -0
  40. dao_ai/tools/genie.py +47 -24
  41. dao_ai/tools/mcp.py +4 -3
  42. dao_ai/tools/memory.py +50 -0
  43. dao_ai/tools/python.py +4 -12
  44. dao_ai/tools/search.py +14 -0
  45. dao_ai/tools/slack.py +1 -1
  46. dao_ai/tools/unity_catalog.py +8 -6
  47. dao_ai/tools/vector_search.py +16 -9
  48. dao_ai/utils.py +72 -8
  49. dao_ai-0.1.1.dist-info/METADATA +1878 -0
  50. dao_ai-0.1.1.dist-info/RECORD +62 -0
  51. dao_ai/chat_models.py +0 -204
  52. dao_ai/guardrails.py +0 -112
  53. dao_ai/tools/genie/__init__.py +0 -236
  54. dao_ai/tools/human_in_the_loop.py +0 -100
  55. dao_ai-0.0.36.dist-info/METADATA +0 -951
  56. dao_ai-0.0.36.dist-info/RECORD +0 -47
  57. {dao_ai-0.0.36.dist-info → dao_ai-0.1.1.dist-info}/WHEEL +0 -0
  58. {dao_ai-0.0.36.dist-info → dao_ai-0.1.1.dist-info}/entry_points.txt +0 -0
  59. {dao_ai-0.0.36.dist-info → dao_ai-0.1.1.dist-info}/licenses/LICENSE +0 -0
dao_ai/optimization.py ADDED
@@ -0,0 +1,664 @@
+ """
+ Prompt optimization using GEPA (Generative Evolution of Prompts and Agents).
+
+ This module provides prompt optimization for DAO AI agents using the GEPA
+ optimizer, which uses reflective mutation to evolve prompts based on
+ evaluation feedback.
+
+ GEPA is an evolutionary optimizer that:
+ 1. Takes a seed prompt (initial template)
+ 2. Evaluates it against training examples
+ 3. Uses a reflection LM to propose improvements
+ 4. Iteratively evolves the prompt to maximize the metric
+
+ Usage:
+     from dao_ai.optimization import optimize_prompt
+
+     result = optimize_prompt(
+         prompt=my_prompt_model,
+         agent=my_agent_model,
+         dataset=my_training_dataset,
+         num_candidates=50,
+     )
+
+     if result.improved:
+         print(f"Improved by {result.improvement:.1%}")
+         print(f"New template: {result.optimized_template}")
+ """
+
+ from dataclasses import dataclass, field
+ from datetime import datetime, timezone
+ from typing import Any, Callable, Optional, Sequence, Union
+
+ import mlflow
+ from gepa import EvaluationBatch, GEPAAdapter, GEPAResult, optimize
+ from loguru import logger
+ from mlflow.entities.model_registry import PromptVersion
+ from mlflow.types.responses import ResponsesAgentRequest, ResponsesAgentResponse
+ from mlflow.types.responses_helpers import Message
+
+ from dao_ai.config import (
+     AgentModel,
+     ChatPayload,
+     EvaluationDatasetEntryModel,
+     EvaluationDatasetModel,
+     PromptModel,
+ )
+ from dao_ai.utils import dao_ai_version
+
+ # Type alias for metric function
+ MetricFn = Callable[[str, "_TrainingExample"], float]
+
+ __all__ = [
+     "OptimizationResult",
+     "optimize_prompt",
+ ]
+
+
+ @dataclass
+ class OptimizationResult:
+     """Result of prompt optimization.
+
+     Attributes:
+         optimized_prompt: The optimized PromptModel with new template
+         optimized_template: The optimized template string
+         original_score: Score of the original prompt
+         optimized_score: Score of the optimized prompt
+         improvement: Percentage improvement
+         num_evaluations: Number of metric evaluations performed
+         registered_version: MLflow prompt version if registered
+         metadata: Additional optimization metadata
+     """
+
+     optimized_prompt: PromptModel
+     optimized_template: str
+     original_score: float
+     optimized_score: float
+     improvement: float
+     num_evaluations: int
+     registered_version: Optional[PromptVersion] = None
+     metadata: dict[str, Any] = field(default_factory=dict)
+
+     @property
+     def improved(self) -> bool:
+         """Whether the optimization improved the prompt."""
+         return self.optimized_score > self.original_score
+
+
+ @dataclass
+ class _TrainingExample:
+     """Internal training example format for GEPA."""
+
+     question: str
+     expected_facts: Optional[list[str]] = None
+     expected_response: Optional[str] = None
+     custom_inputs: Optional[dict[str, Any]] = None
+
+
+ @dataclass
+ class _Trajectory:
+     """Trajectory data for reflection."""
+
+     question: str
+     response: str
+     expected: Any
+     score: float
+     error: Optional[str] = None
+
+
+ class DAOAgentAdapter(GEPAAdapter[_TrainingExample, _Trajectory, str]):
+     """GEPA adapter for DAO AI agents.
+
+     This adapter bridges GEPA's optimization loop with DAO AI's
+     ResponsesAgent interface.
+     """
+
+     agent_model: AgentModel
+     metric_fn: MetricFn
+     _agent: Optional[Any]
+     _original_prompt: Optional[Union[PromptModel, str]]
+
+     def __init__(
+         self,
+         agent_model: AgentModel,
+         metric_fn: Optional[MetricFn] = None,
+     ) -> None:
+         """Initialize the adapter.
+
+         Args:
+             agent_model: The DAO AI agent model to optimize
+             metric_fn: Optional custom metric function (response, example) -> score
+         """
+         self.agent_model = agent_model
+         self.metric_fn = metric_fn or self._default_metric
+         self._agent = None
+         self._original_prompt = None
+
+     def _get_agent(self) -> Any:
+         """Lazily create the ResponsesAgent.
+
+         Returns:
+             The ResponsesAgent instance for the configured agent model.
+         """
+         if self._agent is None:
+             self._agent = self.agent_model.as_responses_agent()
+         return self._agent
+
+     def _default_metric(self, response: str, example: _TrainingExample) -> float:
+         """Default metric: check if expected facts are present in response."""
+         if example.expected_facts:
+             facts_found = sum(
+                 1 for fact in example.expected_facts if fact.lower() in response.lower()
+             )
+             return facts_found / len(example.expected_facts)
+         elif example.expected_response:
+             expected_words = set(example.expected_response.lower().split())
+             response_words = set(response.lower().split())
+             overlap = len(expected_words & response_words)
+             return overlap / len(expected_words) if expected_words else 0.0
+         return 0.0
+
+     def evaluate(
+         self,
+         batch: list[_TrainingExample],
+         candidate: dict[str, str],
+         capture_traces: bool = False,
+     ) -> EvaluationBatch[_Trajectory, str]:
+         """Evaluate a candidate prompt on a batch of examples.
+
+         Args:
+             batch: List of training examples to evaluate
+             candidate: Dict mapping component names to text (e.g., {"prompt": "..."})
+             capture_traces: Whether to capture trajectories for reflection
+
+         Returns:
+             EvaluationBatch with outputs, scores, and optional trajectories
+         """
+         prompt_template = candidate.get("prompt", "")
+
+         # Create agent with the candidate prompt
+         original_prompt = self.agent_model.prompt
+         try:
+             # Update agent's prompt template
+             if isinstance(original_prompt, PromptModel):
+                 self.agent_model.prompt = PromptModel(
+                     name=original_prompt.name,
+                     schema=original_prompt.schema_model,
+                     default_template=prompt_template,
+                     description=original_prompt.description,
+                     tags=original_prompt.tags,
+                 )
+             else:
+                 self.agent_model.prompt = prompt_template
+
+             # Recreate agent with new prompt
+             self._agent = None
+             agent = self._get_agent()
+
+             outputs: list[str] = []
+             scores: list[float] = []
+             trajectories: list[_Trajectory] = []
+
+             for example in batch:
+                 try:
+                     # Build request
+                     messages = [Message(role="user", content=example.question)]
+                     request = ResponsesAgentRequest(
+                         input=messages,
+                         custom_inputs=example.custom_inputs or {},
+                     )
+
+                     # Get response
+                     response: ResponsesAgentResponse = agent.predict(request)
+
+                     # Extract response text
+                     response_text = ""
+                     if response.output and len(response.output) > 0:
+                         content = response.output[0].content
+                         if isinstance(content, str):
+                             response_text = content
+                         elif isinstance(content, list):
+                             response_text = "".join(
+                                 item.get("text", str(item))
+                                 if isinstance(item, dict)
+                                 else str(item)
+                                 for item in content
+                             )
+                         else:
+                             response_text = str(content)
+
+                     # Calculate score
+                     score = self.metric_fn(response_text, example)
+
+                     outputs.append(response_text)
+                     scores.append(score)
+
+                     if capture_traces:
+                         trajectories.append(
+                             _Trajectory(
+                                 question=example.question,
+                                 response=response_text,
+                                 expected=example.expected_facts
+                                 or example.expected_response,
+                                 score=score,
+                             )
+                         )
+
+                 except Exception as e:
+                     logger.warning(f"Error evaluating example: {e}")
+                     outputs.append("")
+                     scores.append(0.0)
+
+                     if capture_traces:
+                         trajectories.append(
+                             _Trajectory(
+                                 question=example.question,
+                                 response="",
+                                 expected=example.expected_facts
+                                 or example.expected_response,
+                                 score=0.0,
+                                 error=str(e),
+                             )
+                         )
+
+             return EvaluationBatch(
+                 outputs=outputs,
+                 scores=scores,
+                 trajectories=trajectories if capture_traces else None,
+             )
+
+         finally:
+             # Restore original prompt
+             self.agent_model.prompt = original_prompt
+             self._agent = None
+
+     def make_reflective_dataset(
+         self,
+         batch: list[_TrainingExample],
+         trajectories: list[_Trajectory],
+         component_name: str,
+     ) -> list[dict[str, str]]:
+         """Create a reflective dataset for the optimizer.
+
+         Args:
+             batch: Original batch of examples
+             trajectories: Trajectories from evaluation
+             component_name: Name of component to reflect on
+
+         Returns:
+             List of dicts with inputs, outputs, and feedback
+         """
+         reflective_data: list[dict[str, str]] = []
+
+         for example, trajectory in zip(batch, trajectories):
+             feedback_parts: list[str] = []
+             feedback_parts.append(f"Input: {trajectory.question}")
+             feedback_parts.append(f"Output: {trajectory.response[:500]}")
+             feedback_parts.append(f"Expected: {trajectory.expected}")
+             feedback_parts.append(f"Score: {trajectory.score:.2f}")
+
+             if trajectory.score < 1.0 and example.expected_facts:
+                 missing = [
+                     f
+                     for f in example.expected_facts
+                     if f.lower() not in trajectory.response.lower()
+                 ]
+                 if missing:
+                     feedback_parts.append(f"Missing facts: {missing}")
+
+             if trajectory.error:
+                 feedback_parts.append(f"Error: {trajectory.error}")
+
+             reflective_data.append(
+                 {
+                     "input": trajectory.question,
+                     "output": trajectory.response,
+                     "feedback": "\n".join(feedback_parts),
+                 }
+             )
+
+         return reflective_data
+
+
+ def _convert_dataset(
+     dataset: EvaluationDatasetModel | Sequence[EvaluationDatasetEntryModel],
+ ) -> list[_TrainingExample]:
+     """Convert DAO dataset to internal training examples.
+
+     Args:
+         dataset: EvaluationDatasetModel or list of entries
+
+     Returns:
+         List of training examples
+     """
+     entries: Sequence[EvaluationDatasetEntryModel]
+     if isinstance(dataset, EvaluationDatasetModel):
+         entries = dataset.data
+     else:
+         entries = dataset
+
+     examples: list[_TrainingExample] = []
+
+     for entry in entries:
+         payload: ChatPayload = entry.inputs
+         messages = payload.messages
+
+         # Get the user's question from messages
+         question = ""
+         for msg in messages:
+             if msg.role == "user":
+                 question = msg.content
+                 break
+
+         example = _TrainingExample(
+             question=question,
+             expected_facts=entry.expectations.expected_facts
+             if entry.expectations
+             else None,
+             expected_response=entry.expectations.expected_response
+             if entry.expectations
+             else None,
+             custom_inputs=payload.custom_inputs,
+         )
+         examples.append(example)
+
+     logger.debug(f"Converted {len(examples)} dataset entries to training examples")
+     return examples
+
+
+ def _register_optimized_prompt(
+     prompt: PromptModel,
+     optimized_template: str,
+     improvement: float,
+     original_score: float,
+     optimized_score: float,
+     model_name: str,
+     agent_name: str,
+     num_evaluations: int,
+     train_size: int,
+     val_size: int,
+ ) -> PromptVersion:
+     """Register the optimized prompt in MLflow.
+
+     Args:
+         prompt: Original prompt model
+         optimized_template: Optimized template string
+         improvement: Improvement percentage
+         original_score: Original evaluation score
+         optimized_score: Optimized evaluation score
+         model_name: Model used for reflection/optimization
+         agent_name: Name of the agent being optimized
+         num_evaluations: Number of metric evaluations performed
+         train_size: Size of training dataset
+         val_size: Size of validation dataset
+
+     Returns:
+         Registered PromptVersion
+     """
+     mlflow.set_registry_uri("databricks-uc")
+
+     prompt_name: str = prompt.full_name
+     optimization_timestamp: str = datetime.now(timezone.utc).isoformat()
+
+     logger.info(f"Registering optimized prompt: {prompt_name}")
+
+     # Build comprehensive tags for the prompt registry
+     tags: dict[str, str] = {
+         # DAO AI metadata
+         "dao_ai_version": dao_ai_version(),
+         "created_by": "dao_ai.optimization",
+         # Optimization metadata
+         "optimizer": "gepa",
+         "optimization_timestamp": optimization_timestamp,
+         "target_model": model_name,
+         "target_agent": agent_name,
+         # Performance metrics
+         "original_score": f"{original_score:.4f}",
+         "optimized_score": f"{optimized_score:.4f}",
+         "improvement": f"{improvement:.4f}",
+         "improvement_percent": f"{improvement:.1%}",
+         # Dataset info
+         "num_evaluations": str(num_evaluations),
+         "train_size": str(train_size),
+         "val_size": str(val_size),
+     }
+
+     # Preserve original prompt tags if present
+     if prompt.tags:
+         for key, value in prompt.tags.items():
+             if key not in tags:  # Don't override optimization tags
+                 tags[f"original_{key}"] = str(value)
+
+     # Register new version with comprehensive metadata
+     version: PromptVersion = mlflow.genai.register_prompt(
+         name=prompt_name,
+         template=optimized_template,
+         commit_message=(
+             f"Optimized with GEPA for agent '{agent_name}' "
+             f"(improvement: {improvement:.1%}, "
+             f"score: {original_score:.3f} -> {optimized_score:.3f}, "
+             f"model: {model_name})"
+         ),
+         tags=tags,
+     )
+
+     logger.info(f"Registered as version {version.version}")
+
+     # Set 'latest' alias for most recently optimized version
+     mlflow.genai.set_prompt_alias(
+         name=prompt_name,
+         alias="latest",
+         version=version.version,
+     )
+     logger.info(f"Set 'latest' alias for version {version.version}")
+
+     # Set 'champion' alias if there was actual improvement
+     if improvement > 0:
+         mlflow.genai.set_prompt_alias(
+             name=prompt_name,
+             alias="champion",
+             version=version.version,
+         )
+         logger.info(f"Set 'champion' alias for version {version.version}")
+
+     return version
+
+
+ def optimize_prompt(
+     prompt: PromptModel,
+     agent: AgentModel,
+     dataset: EvaluationDatasetModel | Sequence[EvaluationDatasetEntryModel],
+     reflection_model: Optional[str] = None,
+     num_candidates: int = 50,
+     metric: Optional[Callable[[str, _TrainingExample], float]] = None,
+     register_if_improved: bool = True,
+     min_improvement: float = 0.0,
+ ) -> OptimizationResult:
+     """
+     Optimize a prompt using GEPA.
+
+     GEPA (Generative Evolution of Prompts and Agents) is an evolutionary
+     optimizer that uses reflective mutation to improve prompts based on
+     evaluation feedback.
+
+     Args:
+         prompt: The PromptModel to optimize
+         agent: The AgentModel that uses this prompt
+         dataset: Training data for optimization
+         reflection_model: LLM for reflection (defaults to agent's model)
+         num_candidates: Maximum metric calls / candidate evaluations
+         metric: Optional custom metric function (response, example) -> score
+         register_if_improved: Register optimized prompt in MLflow if improved
+         min_improvement: Minimum improvement required to register
+
+     Returns:
+         OptimizationResult with optimization details
+
+     Example:
+         from dao_ai.config import AgentModel, PromptModel, LLMModel
+         from dao_ai.optimization import optimize_prompt
+
+         prompt = PromptModel(
+             name="my_prompt",
+             default_template="Answer the question: {question}"
+         )
+         agent = AgentModel(
+             name="my_agent",
+             model=LLMModel(name="databricks-meta-llama-3-3-70b-instruct"),
+             prompt=prompt,
+         )
+
+         result = optimize_prompt(
+             prompt=prompt,
+             agent=agent,
+             dataset=training_data,
+             num_candidates=50,
+         )
+
+         if result.improved:
+             print(f"Improved by {result.improvement:.1%}")
+     """
+ logger.info(f"Starting GEPA optimization for prompt: {prompt.name}")
522
+
523
+ # Get the original template
524
+ original_template = prompt.template
525
+ if not original_template:
526
+ raise ValueError(f"Prompt '{prompt.name}' has no template to optimize")
527
+
528
+ # Convert dataset
529
+ examples = _convert_dataset(dataset)
530
+ if not examples:
531
+ raise ValueError("Dataset is empty")
532
+
533
+ # Split into train/val
534
+ split_idx = max(1, len(examples) * 4 // 5)
535
+ trainset = examples[:split_idx]
536
+ valset = examples[split_idx:] if split_idx < len(examples) else examples
537
+
538
+ logger.info(f"Using {len(trainset)} train, {len(valset)} val examples")
539
+
540
+ # Get reflection model
541
+ reflection_model_name = reflection_model or agent.model.uri
542
+ logger.info(f"Using reflection model: {reflection_model_name}")
543
+
544
+ # Create adapter
545
+ adapter = DAOAgentAdapter(agent_model=agent, metric_fn=metric)
546
+
547
+ # Seed candidate
548
+ seed_candidate = {"prompt": original_template}
549
+
550
+ # Run GEPA optimization
551
+ logger.info(f"Running GEPA optimization (max {num_candidates} evaluations)...")
552
+
553
+ try:
554
+         result: GEPAResult = optimize(
+             seed_candidate=seed_candidate,
+             trainset=trainset,
+             valset=valset,
+             adapter=adapter,
+             reflection_lm=reflection_model_name,
+             max_metric_calls=num_candidates,
+             display_progress_bar=True,
+             skip_perfect_score=True,
+         )
+     except Exception as e:
+         logger.error(f"GEPA optimization failed: {e}")
+         return OptimizationResult(
+             optimized_prompt=prompt,
+             optimized_template=original_template,
+             original_score=0.0,
+             optimized_score=0.0,
+             improvement=0.0,
+             num_evaluations=0,
+             metadata={"error": str(e)},
+         )
+
+     # Extract results from GEPAResult
+     # GEPAResult has:
+     # - candidates: list of candidate dicts
+     # - val_aggregate_scores: list of scores (index 0 is seed)
+     # - best_idx: index of best candidate
+     # - best_candidate: dict for best candidate
+     # - total_metric_calls: number of metric evaluations
+     best_candidate: dict[str, str] = result.best_candidate
+     optimized_template: str = best_candidate.get("prompt", original_template)
+
+     # Get scores from result - val_aggregate_scores[0] is the seed candidate score
+     val_scores: list[float] = result.val_aggregate_scores
+     original_score: float = val_scores[0] if val_scores else 0.0
+     best_idx: int = result.best_idx
+     optimized_score: float = val_scores[best_idx] if val_scores else 0.0
+     num_evaluations: int = result.total_metric_calls or num_candidates
+
+     improvement: float = (
+         (optimized_score - original_score) / original_score
+         if original_score > 0
+         else 0.0
+     )
+
+ logger.info("Optimization complete!")
600
+ logger.info(f"Original score: {original_score:.3f}")
601
+ logger.info(f"Optimized score: {optimized_score:.3f}")
602
+ logger.info(f"Improvement: {improvement:.1%}")
603
+
604
+ # Register if improved
605
+ registered_version: Optional[PromptVersion] = None
606
+ if (
607
+ register_if_improved
608
+ and improvement >= min_improvement
609
+ and optimized_score > original_score
610
+ and optimized_template != original_template
611
+ ):
612
+ try:
613
+ registered_version = _register_optimized_prompt(
614
+ prompt=prompt,
615
+ optimized_template=optimized_template,
616
+ improvement=improvement,
617
+ original_score=original_score,
618
+ optimized_score=optimized_score,
619
+ model_name=reflection_model_name,
620
+ agent_name=agent.name,
621
+ num_evaluations=num_evaluations,
622
+ train_size=len(trainset),
623
+ val_size=len(valset),
624
+ )
625
+ except Exception as e:
626
+ logger.error(f"Failed to register optimized prompt: {e}")
627
+
628
+ # Build optimized prompt model with comprehensive tags
629
+ optimized_tags: dict[str, str] = {
630
+ **(prompt.tags or {}),
631
+ "dao_ai_version": dao_ai_version(),
632
+ "optimizer": "gepa",
633
+ "target_model": reflection_model_name,
634
+ "target_agent": agent.name,
635
+ "original_score": f"{original_score:.4f}",
636
+ "optimized_score": f"{optimized_score:.4f}",
637
+ "improvement": f"{improvement:.4f}",
638
+ "num_evaluations": str(num_evaluations),
639
+ }
640
+
641
+ optimized_prompt = PromptModel(
642
+ name=prompt.name,
643
+ schema=prompt.schema_model,
644
+ default_template=optimized_template,
645
+ description=f"Optimized with GEPA for agent '{agent.name}' (improvement: {improvement:.1%})",
646
+ alias="champion" if improvement > min_improvement else "latest",
647
+ tags=optimized_tags,
648
+ )
649
+
650
+ return OptimizationResult(
651
+ optimized_prompt=optimized_prompt,
652
+ optimized_template=optimized_template,
653
+ original_score=original_score,
654
+ optimized_score=optimized_score,
655
+ improvement=improvement,
656
+ num_evaluations=num_evaluations,
657
+ registered_version=registered_version,
658
+ metadata={
659
+ "optimizer": "gepa",
660
+ "reflection_model": reflection_model_name,
661
+ "train_size": len(trainset),
662
+ "val_size": len(valset),
663
+ },
664
+ )
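For readers trying the new API, here is a minimal, illustrative sketch (not part of the package) of calling optimize_prompt with a custom metric. It assumes the prompt, agent, and training_data objects from the optimize_prompt docstring above, and uses a hypothetical exact-match scoring rule in place of the default fact-overlap metric; the parameter names and result attributes come from the signatures shown in the diff.

    # Illustrative only -- assumes the PromptModel/AgentModel/dataset setup from the docstring example.
    from dao_ai.optimization import optimize_prompt

    def exact_match_metric(response: str, example) -> float:
        # Hypothetical metric: 1.0 when the expected response text appears verbatim, else 0.0.
        expected = (example.expected_response or "").strip().lower()
        return 1.0 if expected and expected in response.lower() else 0.0

    result = optimize_prompt(
        prompt=prompt,                  # PromptModel to evolve
        agent=agent,                    # AgentModel that renders this prompt
        dataset=training_data,          # EvaluationDatasetModel or list of entries
        metric=exact_match_metric,      # overrides the default fact-overlap metric
        num_candidates=50,              # budget of metric calls handed to GEPA
        register_if_improved=False,     # skip MLflow registration while experimenting
    )
    print(f"score: {result.original_score:.3f} -> {result.optimized_score:.3f} ({result.improvement:.1%})")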