lm-deluge 0.0.67__py3-none-any.whl → 0.0.90__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lm_deluge/__init__.py +1 -2
- lm_deluge/api_requests/anthropic.py +117 -22
- lm_deluge/api_requests/base.py +84 -11
- lm_deluge/api_requests/bedrock.py +30 -6
- lm_deluge/api_requests/chat_reasoning.py +4 -0
- lm_deluge/api_requests/gemini.py +166 -20
- lm_deluge/api_requests/openai.py +145 -25
- lm_deluge/batches.py +15 -45
- lm_deluge/client.py +309 -50
- lm_deluge/config.py +15 -3
- lm_deluge/models/__init__.py +14 -1
- lm_deluge/models/anthropic.py +29 -14
- lm_deluge/models/arcee.py +16 -0
- lm_deluge/models/deepseek.py +36 -4
- lm_deluge/models/google.py +42 -0
- lm_deluge/models/grok.py +24 -0
- lm_deluge/models/kimi.py +36 -0
- lm_deluge/models/minimax.py +18 -0
- lm_deluge/models/openai.py +100 -0
- lm_deluge/models/openrouter.py +133 -7
- lm_deluge/models/together.py +11 -0
- lm_deluge/models/zai.py +50 -0
- lm_deluge/pipelines/gepa/__init__.py +95 -0
- lm_deluge/pipelines/gepa/core.py +354 -0
- lm_deluge/pipelines/gepa/docs/samples.py +705 -0
- lm_deluge/pipelines/gepa/examples/01_synthetic_keywords.py +140 -0
- lm_deluge/pipelines/gepa/examples/02_gsm8k_math.py +261 -0
- lm_deluge/pipelines/gepa/examples/03_hotpotqa_multihop.py +300 -0
- lm_deluge/pipelines/gepa/examples/04_batch_classification.py +271 -0
- lm_deluge/pipelines/gepa/examples/simple_qa.py +129 -0
- lm_deluge/pipelines/gepa/optimizer.py +435 -0
- lm_deluge/pipelines/gepa/proposer.py +235 -0
- lm_deluge/pipelines/gepa/util.py +165 -0
- lm_deluge/{llm_tools → pipelines}/score.py +2 -2
- lm_deluge/{llm_tools → pipelines}/translate.py +5 -3
- lm_deluge/prompt.py +537 -88
- lm_deluge/request_context.py +7 -2
- lm_deluge/server/__init__.py +24 -0
- lm_deluge/server/__main__.py +144 -0
- lm_deluge/server/adapters.py +369 -0
- lm_deluge/server/app.py +388 -0
- lm_deluge/server/auth.py +71 -0
- lm_deluge/server/model_policy.py +215 -0
- lm_deluge/server/models_anthropic.py +172 -0
- lm_deluge/server/models_openai.py +175 -0
- lm_deluge/tool/__init__.py +1130 -0
- lm_deluge/tool/builtin/anthropic/__init__.py +300 -0
- lm_deluge/tool/builtin/anthropic/bash.py +0 -0
- lm_deluge/tool/builtin/anthropic/computer_use.py +0 -0
- lm_deluge/tool/builtin/gemini.py +59 -0
- lm_deluge/tool/builtin/openai.py +74 -0
- lm_deluge/tool/cua/__init__.py +173 -0
- lm_deluge/tool/cua/actions.py +148 -0
- lm_deluge/tool/cua/base.py +27 -0
- lm_deluge/tool/cua/batch.py +215 -0
- lm_deluge/tool/cua/converters.py +466 -0
- lm_deluge/tool/cua/kernel.py +702 -0
- lm_deluge/tool/cua/trycua.py +989 -0
- lm_deluge/tool/prefab/__init__.py +45 -0
- lm_deluge/tool/prefab/batch_tool.py +156 -0
- lm_deluge/tool/prefab/docs.py +1119 -0
- lm_deluge/tool/prefab/email.py +294 -0
- lm_deluge/tool/prefab/filesystem.py +1711 -0
- lm_deluge/tool/prefab/full_text_search/__init__.py +285 -0
- lm_deluge/tool/prefab/full_text_search/tantivy_index.py +396 -0
- lm_deluge/tool/prefab/memory.py +458 -0
- lm_deluge/tool/prefab/otc/__init__.py +165 -0
- lm_deluge/tool/prefab/otc/executor.py +281 -0
- lm_deluge/tool/prefab/otc/parse.py +188 -0
- lm_deluge/tool/prefab/random.py +212 -0
- lm_deluge/tool/prefab/rlm/__init__.py +296 -0
- lm_deluge/tool/prefab/rlm/executor.py +349 -0
- lm_deluge/tool/prefab/rlm/parse.py +144 -0
- lm_deluge/tool/prefab/sandbox/__init__.py +19 -0
- lm_deluge/tool/prefab/sandbox/daytona_sandbox.py +483 -0
- lm_deluge/tool/prefab/sandbox/docker_sandbox.py +609 -0
- lm_deluge/tool/prefab/sandbox/fargate_sandbox.py +546 -0
- lm_deluge/tool/prefab/sandbox/modal_sandbox.py +469 -0
- lm_deluge/tool/prefab/sandbox/seatbelt_sandbox.py +827 -0
- lm_deluge/tool/prefab/sheets.py +385 -0
- lm_deluge/tool/prefab/skills.py +0 -0
- lm_deluge/tool/prefab/subagents.py +233 -0
- lm_deluge/tool/prefab/todos.py +342 -0
- lm_deluge/tool/prefab/tool_search.py +169 -0
- lm_deluge/tool/prefab/web_search.py +199 -0
- lm_deluge/tracker.py +16 -13
- lm_deluge/util/schema.py +412 -0
- lm_deluge/warnings.py +8 -0
- {lm_deluge-0.0.67.dist-info → lm_deluge-0.0.90.dist-info}/METADATA +23 -9
- lm_deluge-0.0.90.dist-info/RECORD +132 -0
- lm_deluge/built_in_tools/anthropic/__init__.py +0 -128
- lm_deluge/built_in_tools/openai.py +0 -28
- lm_deluge/presets/cerebras.py +0 -17
- lm_deluge/presets/meta.py +0 -13
- lm_deluge/tool.py +0 -849
- lm_deluge-0.0.67.dist-info/RECORD +0 -72
- lm_deluge/{llm_tools → pipelines}/__init__.py +1 -1
- /lm_deluge/{llm_tools → pipelines}/classify.py +0 -0
- /lm_deluge/{llm_tools → pipelines}/extract.py +0 -0
- /lm_deluge/{llm_tools → pipelines}/locate.py +0 -0
- /lm_deluge/{llm_tools → pipelines}/ocr.py +0 -0
- /lm_deluge/{built_in_tools/anthropic/bash.py → skills/anthropic.py} +0 -0
- /lm_deluge/{built_in_tools/anthropic/computer_use.py → skills/compat.py} +0 -0
- /lm_deluge/{built_in_tools → tool/builtin}/anthropic/editor.py +0 -0
- /lm_deluge/{built_in_tools → tool/builtin}/base.py +0 -0
- {lm_deluge-0.0.67.dist-info → lm_deluge-0.0.90.dist-info}/WHEEL +0 -0
- {lm_deluge-0.0.67.dist-info → lm_deluge-0.0.90.dist-info}/licenses/LICENSE +0 -0
- {lm_deluge-0.0.67.dist-info → lm_deluge-0.0.90.dist-info}/top_level.txt +0 -0
lm_deluge/models/zai.py
ADDED
@@ -0,0 +1,50 @@
+ZAI_MODELS = {
+    "glm-4.7": {
+        "id": "glm-4.7",
+        "name": "glm-4.7",
+        "api_base": "https://api.z.ai/api/anthropic/v1",
+        "api_key_env_var": "ZAI_API_KEY",
+        "supports_json": True,
+        "api_spec": "anthropic",
+        "input_cost": 0.6,
+        "cached_input_cost": 0.6,
+        "cache_write_cost": 0.6,
+        "output_cost": 2.20,
+    },
+    "glm-4.6": {
+        "id": "glm-4.6",
+        "name": "glm-4.6",
+        "api_base": "https://api.z.ai/api/anthropic/v1",
+        "api_key_env_var": "ZAI_API_KEY",
+        "supports_json": True,
+        "api_spec": "anthropic",
+        "input_cost": 0.6,
+        "cached_input_cost": 0.6,
+        "cache_write_cost": 0.6,
+        "output_cost": 2.20,
+    },
+    "glm-4.5": {
+        "id": "glm-4.5",
+        "name": "glm-4.5",
+        "api_base": "https://api.z.ai/api/anthropic/v1",
+        "api_key_env_var": "ZAI_API_KEY",
+        "supports_json": True,
+        "api_spec": "anthropic",
+        "input_cost": 0.6,
+        "cached_input_cost": 0.6,
+        "cache_write_cost": 0.6,
+        "output_cost": 2.20,
+    },
+    "glm-4.5-air": {
+        "id": "glm-4.5-air",
+        "name": "glm-4.5-air",
+        "api_base": "https://api.z.ai/api/anthropic/v1",
+        "api_key_env_var": "ZAI_API_KEY",
+        "supports_json": True,
+        "api_spec": "anthropic",
+        "input_cost": 0.6,
+        "cached_input_cost": 0.6,
+        "cache_write_cost": 0.6,
+        "output_cost": 2.20,
+    },
+}
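For reference, a minimal sketch of calling one of these models through the client, mirroring the usage shown in the GEPA docstring below. It assumes the registry keys above are accepted as model names by LLMClient and that ZAI_API_KEY is set in the environment:

from lm_deluge import LLMClient
from lm_deluge.prompt import Conversation, Message

# Hedged sketch: assumes "glm-4.7" (a registry key above) is a valid
# model name for LLMClient and that ZAI_API_KEY is exported.
client = LLMClient("glm-4.7")
conv = Conversation.system("You are a helpful assistant.")
conv = conv.add(Message.user("Say hello in one sentence."))
response = client.process_prompts_sync([conv], show_progress=False)[0]
print(response.completion)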
lm_deluge/pipelines/gepa/__init__.py
ADDED
@@ -0,0 +1,95 @@
+"""
+GEPA (Genetic Pareto) prompt optimizer for lm-deluge.
+
+This module provides an evolutionary optimizer for text components in AI systems.
+It analyzes whole trajectories to propose improvements to prompts, tool descriptions,
+and other text-based configuration.
+
+Example usage:
+    from lm_deluge import LLMClient
+    from lm_deluge.prompt import Conversation, Message
+    from lm_deluge.pipelines.gepa import Component, EvalResult, optimize
+
+    # Define components to optimize
+    components = {
+        "system_prompt": Component(
+            description="Instructions given to the model",
+            value="You are a helpful assistant.",
+        ),
+    }
+
+    # Define how to evaluate one example
+    def evaluate(client: LLMClient, values: dict[str, str], example: dict) -> EvalResult:
+        # Build prompt with current component values
+        conv = Conversation.system(values["system_prompt"])
+        conv = conv.add(Message.user(example["question"]))
+
+        # Run inference
+        response = client.process_prompts_sync([conv], show_progress=False)[0]
+        answer = response.completion
+
+        # Score the result
+        correct = example["answer"].lower() in answer.lower()
+        score = 1.0 if correct else 0.0
+
+        # Build feedback for the proposer
+        feedback = f"Score: {score}. Expected: {example['answer']}"
+
+        # Return full trajectory
+        full_conv = conv.add(Message.ai(answer))
+        return EvalResult(conversation=full_conv, score=score, feedback=feedback)
+
+    # Run optimization
+    result = optimize(
+        components=components,
+        evaluate_fn=evaluate,
+        dataset=train_examples,
+        task_client=LLMClient("gpt-4o-mini"),
+        proposer_client=LLMClient("gpt-4o"),
+        max_iterations=50,
+    )
+
+    print(f"Best score: {result.best_score}")
+    print(f"Best prompt: {result.best_candidate['system_prompt']}")
+"""
+
+from lm_deluge.pipelines.gepa.core import (
+    Component,
+    EvalResult,
+    GEPAResult,
+    GEPAState,
+    Proposal,
+)
+from lm_deluge.pipelines.gepa.optimizer import GEPAEngine, optimize
+from lm_deluge.pipelines.gepa.proposer import (
+    DEFAULT_PROPOSAL_PROMPT,
+    build_proposal_prompt,
+    parse_proposal_response,
+    propose_improvement_sync,
+)
+from lm_deluge.pipelines.gepa.util import (
+    extract_text_from_response,
+    format_components_for_prompt,
+    format_conversation_compact,
+)
+
+__all__ = [
+    # Core types
+    "Component",
+    "EvalResult",
+    "Proposal",
+    "GEPAState",
+    "GEPAResult",
+    # Main API
+    "optimize",
+    "GEPAEngine",
+    # Proposer utilities
+    "DEFAULT_PROPOSAL_PROMPT",
+    "build_proposal_prompt",
+    "parse_proposal_response",
+    "propose_improvement_sync",
+    # Formatting utilities
+    "format_conversation_compact",
+    "format_components_for_prompt",
+    "extract_text_from_response",
+]
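The GEPAState class defined in core.py (the next file below) supports checkpointing, so long optimization runs can be resumed. A minimal sketch, using only the save/load methods shown there; "runs/gepa-demo" is a hypothetical directory:

from lm_deluge.pipelines.gepa import GEPAState

# Resume a prior run from disk ("runs/gepa-demo" is a hypothetical path).
state = GEPAState.load("runs/gepa-demo")
print(state.iteration, state.total_evals)
print(state.candidates[state.best_candidate_idx()])

# ... continue optimizing, then checkpoint again:
state.save("runs/gepa-demo")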
lm_deluge/pipelines/gepa/core.py
ADDED
@@ -0,0 +1,354 @@
+"""
+Core types for GEPA optimization.
+
+This module defines the fundamental data structures used throughout the optimizer.
+"""
+
+from __future__ import annotations
+
+import json
+import pickle
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any
+
+from lm_deluge.prompt import Conversation
+
+
+@dataclass
+class Component:
+    """
+    A text component to optimize.
+
+    Attributes:
+        description: What this component does, shown to the proposer LLM
+            (e.g., "System prompt given to the agent at conversation start")
+        value: The current text value
+    """
+
+    description: str
+    value: str
+
+
+@dataclass
+class EvalResult:
+    """
+    Result of evaluating one example.
+
+    Attributes:
+        conversation: The full trajectory (what actually happened)
+        score: Numeric score, higher is better
+        feedback: Explanation of the result (shown to proposer)
+    """
+
+    conversation: Conversation
+    score: float
+    feedback: str
+
+
+@dataclass
+class Proposal:
+    """
+    A proposed change to one component.
+
+    Attributes:
+        component_name: Which component to change
+        new_value: The proposed new text
+        reasoning: Why the proposer thinks this will help
+    """
+
+    component_name: str
+    new_value: str
+    reasoning: str
+
+
+@dataclass
+class GEPAState:
+    """
+    Mutable optimization state.
+
+    Tracks all candidates, their scores, and the Pareto frontier.
+    """
+
+    # Component info (fixed after init)
+    component_names: list[str] = field(default_factory=list)
+    component_descriptions: dict[str, str] = field(default_factory=dict)
+
+    # Candidates: each is a dict mapping component_name -> text
+    candidates: list[dict[str, str]] = field(default_factory=list)
+    candidate_parents: list[int | None] = field(default_factory=list)
+
+    # Scores: candidate_scores[candidate_idx][example_idx] = score
+    candidate_scores: list[dict[int, float]] = field(default_factory=list)
+
+    # Pareto front tracking
+    # pareto_front[example_idx] = set of candidate indices achieving best score
+    pareto_front: dict[int, set[int]] = field(default_factory=dict)
+    # pareto_scores[example_idx] = best score achieved
+    pareto_scores: dict[int, float] = field(default_factory=dict)
+
+    # Counters
+    iteration: int = 0
+    total_evals: int = 0
+
+    @classmethod
+    def initialize(
+        cls,
+        components: dict[str, Component],
+        seed_scores: dict[int, float],
+    ) -> GEPAState:
+        """
+        Initialize state with seed candidate and its scores.
+
+        Args:
+            components: The components being optimized
+            seed_scores: Scores for seed candidate on each example (example_idx -> score)
+        """
+        state = cls()
+
+        # Store component info
+        state.component_names = list(components.keys())
+        state.component_descriptions = {
+            name: comp.description for name, comp in components.items()
+        }
+
+        # Add seed candidate
+        seed_values = {name: comp.value for name, comp in components.items()}
+        state.candidates = [seed_values]
+        state.candidate_parents = [None]
+        state.candidate_scores = [dict(seed_scores)]
+
+        # Initialize Pareto front with seed
+        state.pareto_front = {ex_idx: {0} for ex_idx in seed_scores}
+        state.pareto_scores = dict(seed_scores)
+
+        state.total_evals = len(seed_scores)
+
+        return state
+
+    def add_candidate(
+        self,
+        values: dict[str, str],
+        parent_idx: int | None,
+        scores: dict[int, float],
+    ) -> int:
+        """
+        Add a new candidate to the population.
+
+        Returns the index of the new candidate.
+        """
+        new_idx = len(self.candidates)
+
+        self.candidates.append(dict(values))
+        self.candidate_parents.append(parent_idx)
+        self.candidate_scores.append(dict(scores))
+
+        # Update Pareto front
+        for ex_idx, score in scores.items():
+            self._update_pareto(ex_idx, score, new_idx)
+
+        return new_idx
+
+    def _update_pareto(
+        self, example_idx: int, score: float, candidate_idx: int
+    ) -> None:
+        """Update Pareto front for one example."""
+        current_best = self.pareto_scores.get(example_idx, float("-inf"))
+
+        if score > current_best:
+            self.pareto_scores[example_idx] = score
+            self.pareto_front[example_idx] = {candidate_idx}
+        elif score == current_best:
+            if example_idx not in self.pareto_front:
+                self.pareto_front[example_idx] = set()
+            self.pareto_front[example_idx].add(candidate_idx)
+
+    def get_frontier_candidates(self) -> set[int]:
+        """Get all candidate indices that are on the Pareto front for any example."""
+        frontier: set[int] = set()
+        for candidates in self.pareto_front.values():
+            frontier.update(candidates)
+        return frontier
+
+    def best_candidate_idx(self) -> int:
+        """Get index of candidate with highest average score."""
+        if not self.candidates:
+            return 0
+
+        best_idx = 0
+        best_avg = float("-inf")
+
+        for idx, scores in enumerate(self.candidate_scores):
+            if scores:
+                avg = sum(scores.values()) / len(scores)
+                if avg > best_avg:
+                    best_avg = avg
+                    best_idx = idx
+
+        return best_idx
+
+    def get_candidate_avg_score(self, idx: int) -> float:
+        """Get average score for a candidate."""
+        scores = self.candidate_scores[idx]
+        if not scores:
+            return 0.0
+        return sum(scores.values()) / len(scores)
+
+    def get_improvable_examples(self, perfect_score: float = 1.0) -> list[int]:
+        """Get example indices where we haven't achieved perfect score."""
+        return [
+            ex_idx
+            for ex_idx, score in self.pareto_scores.items()
+            if score < perfect_score
+        ]
+
+    def save(self, run_dir: str | Path) -> None:
+        """Save state to disk."""
+        run_dir = Path(run_dir)
+        run_dir.mkdir(parents=True, exist_ok=True)
+
+        # Save full state as pickle
+        state_path = run_dir / "gepa_state.pkl"
+        with open(state_path, "wb") as f:
+            pickle.dump(self.__dict__, f)
+
+        # Save human-readable summary
+        summary = {
+            "num_candidates": len(self.candidates),
+            "iteration": self.iteration,
+            "total_evals": self.total_evals,
+            "best_idx": self.best_candidate_idx(),
+            "best_score": self.get_candidate_avg_score(self.best_candidate_idx()),
+            "components": self.component_names,
+            "pareto_size": len(self.get_frontier_candidates()),
+        }
+        summary_path = run_dir / "gepa_summary.json"
+        with open(summary_path, "w") as f:
+            json.dump(summary, f, indent=2)
+
+    @classmethod
+    def load(cls, run_dir: str | Path) -> GEPAState:
+        """Load state from disk."""
+        run_dir = Path(run_dir)
+        state_path = run_dir / "gepa_state.pkl"
+
+        with open(state_path, "rb") as f:
+            data = pickle.load(f)
+
+        state = cls()
+        state.__dict__.update(data)
+        return state
+
+
+@dataclass(frozen=True)
+class GEPAResult:
+    """
+    Immutable snapshot of optimization results.
+
+    Use this to inspect results after optimization completes.
+    """
+
+    candidates: tuple[dict[str, str], ...]
+    candidate_parents: tuple[int | None, ...]
+    candidate_avg_scores: tuple[float, ...]
+
+    best_idx: int
+    best_candidate: dict[str, str]
+    best_score: float
+
+    total_evals: int
+    iterations: int
+
+    component_names: tuple[str, ...]
+    component_descriptions: dict[str, str]
+
+    run_dir: str | None = None
+
+    @classmethod
+    def from_state(cls, state: GEPAState, run_dir: str | None = None) -> GEPAResult:
+        """Create an immutable result from mutable state."""
+        avg_scores = tuple(
+            state.get_candidate_avg_score(i) for i in range(len(state.candidates))
+        )
+        best_idx = state.best_candidate_idx()
+
+        return cls(
+            candidates=tuple(dict(c) for c in state.candidates),
+            candidate_parents=tuple(state.candidate_parents),
+            candidate_avg_scores=avg_scores,
+            best_idx=best_idx,
+            best_candidate=dict(state.candidates[best_idx]),
+            best_score=avg_scores[best_idx] if avg_scores else 0.0,
+            total_evals=state.total_evals,
+            iterations=state.iteration,
+            component_names=tuple(state.component_names),
+            component_descriptions=dict(state.component_descriptions),
+            run_dir=run_dir,
+        )
+
+    @property
+    def num_candidates(self) -> int:
+        return len(self.candidates)
+
+    def best_k(self, k: int = 5) -> list[tuple[int, dict[str, str], float]]:
+        """Get the top k candidates by average score."""
+        indexed = [
+            (i, self.candidates[i], self.candidate_avg_scores[i])
+            for i in range(len(self.candidates))
+        ]
+        indexed.sort(key=lambda x: x[2], reverse=True)
+        return indexed[:k]
+
+    def lineage(self, idx: int) -> list[int]:
+        """Get the ancestry chain for a candidate (oldest first)."""
+        chain = [idx]
+        while True:
+            parent = self.candidate_parents[chain[-1]]
+            if parent is None:
+                break
+            chain.append(parent)
+        return list(reversed(chain))
+
+    def diff(
+        self, parent_idx: int, child_idx: int, only_changed: bool = True
+    ) -> dict[str, tuple[str, str]]:
+        """
+        Show differences between two candidates.
+
+        Returns dict mapping component_name -> (old_value, new_value).
+        """
+        parent = self.candidates[parent_idx]
+        child = self.candidates[child_idx]
+
+        result = {}
+        all_keys = set(parent.keys()) | set(child.keys())
+
+        for key in all_keys:
+            old = parent.get(key, "")
+            new = child.get(key, "")
+            if not only_changed or old != new:
+                result[key] = (old, new)
+
+        return result
+
+    def to_dict(self) -> dict[str, Any]:
+        """Convert to JSON-serializable dict."""
+        return {
+            "candidates": list(self.candidates),
+            "candidate_parents": list(self.candidate_parents),
+            "candidate_avg_scores": list(self.candidate_avg_scores),
+            "best_idx": self.best_idx,
+            "best_candidate": self.best_candidate,
+            "best_score": self.best_score,
+            "total_evals": self.total_evals,
+            "iterations": self.iterations,
+            "component_names": list(self.component_names),
+            "run_dir": self.run_dir,
+        }
+
+    def __repr__(self) -> str:
+        return (
+            f"GEPAResult(candidates={self.num_candidates}, "
+            f"best_score={self.best_score:.4f}, "
+            f"total_evals={self.total_evals})"
+        )
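To make the Pareto bookkeeping above concrete, here is a small self-contained walk-through using only the methods defined in this file (the component texts and scores are toy values):

from lm_deluge.pipelines.gepa import Component, GEPAResult, GEPAState

components = {
    "system_prompt": Component(description="Instructions", value="Be terse.")
}
# Seed candidate scored on three examples (example_idx -> score).
state = GEPAState.initialize(components, seed_scores={0: 0.0, 1: 1.0, 2: 0.5})

# A child that improves example 0, ties on example 1, regresses on example 2.
child = state.add_candidate(
    {"system_prompt": "Be terse. Show your work."},
    parent_idx=0,
    scores={0: 1.0, 1: 1.0, 2: 0.0},
)

print(state.pareto_front)               # {0: {1}, 1: {0, 1}, 2: {0}}
print(state.get_frontier_candidates())  # {0, 1} -- each wins on some example
print(state.best_candidate_idx())       # 1 (avg 0.667 beats 0.5)

result = GEPAResult.from_state(state)
print(result.lineage(child))            # [0, 1]
print(result.diff(0, child))            # {'system_prompt': ('Be terse.', 'Be terse. Show your work.')}

Note how per-example tracking keeps both candidates alive: the child wins outright on example 0 and ties on example 1, while the seed remains the only candidate on the frontier for example 2.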