dslighting 1.3.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dsat/__init__.py +3 -0
- dsat/benchmark/__init__.py +1 -0
- dsat/benchmark/benchmark.py +168 -0
- dsat/benchmark/datasci.py +291 -0
- dsat/benchmark/mle.py +777 -0
- dsat/benchmark/sciencebench.py +304 -0
- dsat/common/__init__.py +0 -0
- dsat/common/constants.py +11 -0
- dsat/common/exceptions.py +48 -0
- dsat/common/typing.py +19 -0
- dsat/config.py +79 -0
- dsat/models/__init__.py +3 -0
- dsat/models/candidates.py +16 -0
- dsat/models/formats.py +52 -0
- dsat/models/task.py +64 -0
- dsat/operators/__init__.py +0 -0
- dsat/operators/aflow_ops.py +90 -0
- dsat/operators/autokaggle_ops.py +170 -0
- dsat/operators/automind_ops.py +38 -0
- dsat/operators/base.py +22 -0
- dsat/operators/code.py +45 -0
- dsat/operators/dsagent_ops.py +123 -0
- dsat/operators/llm_basic.py +84 -0
- dsat/prompts/__init__.py +0 -0
- dsat/prompts/aflow_prompt.py +76 -0
- dsat/prompts/aide_prompt.py +52 -0
- dsat/prompts/autokaggle_prompt.py +290 -0
- dsat/prompts/automind_prompt.py +29 -0
- dsat/prompts/common.py +51 -0
- dsat/prompts/data_interpreter_prompt.py +82 -0
- dsat/prompts/dsagent_prompt.py +88 -0
- dsat/runner.py +554 -0
- dsat/services/__init__.py +0 -0
- dsat/services/data_analyzer.py +387 -0
- dsat/services/llm.py +486 -0
- dsat/services/llm_single.py +421 -0
- dsat/services/sandbox.py +386 -0
- dsat/services/states/__init__.py +0 -0
- dsat/services/states/autokaggle_state.py +43 -0
- dsat/services/states/base.py +14 -0
- dsat/services/states/dsa_log.py +13 -0
- dsat/services/states/experience.py +237 -0
- dsat/services/states/journal.py +153 -0
- dsat/services/states/operator_library.py +290 -0
- dsat/services/vdb.py +76 -0
- dsat/services/workspace.py +178 -0
- dsat/tasks/__init__.py +3 -0
- dsat/tasks/handlers.py +376 -0
- dsat/templates/open_ended/grade_template.py +107 -0
- dsat/tools/__init__.py +4 -0
- dsat/utils/__init__.py +0 -0
- dsat/utils/context.py +172 -0
- dsat/utils/dynamic_import.py +71 -0
- dsat/utils/parsing.py +33 -0
- dsat/workflows/__init__.py +12 -0
- dsat/workflows/base.py +53 -0
- dsat/workflows/factory.py +439 -0
- dsat/workflows/manual/__init__.py +0 -0
- dsat/workflows/manual/autokaggle_workflow.py +148 -0
- dsat/workflows/manual/data_interpreter_workflow.py +153 -0
- dsat/workflows/manual/deepanalyze_workflow.py +484 -0
- dsat/workflows/manual/dsagent_workflow.py +76 -0
- dsat/workflows/search/__init__.py +0 -0
- dsat/workflows/search/aflow_workflow.py +344 -0
- dsat/workflows/search/aide_workflow.py +283 -0
- dsat/workflows/search/automind_workflow.py +237 -0
- dsat/workflows/templates/__init__.py +0 -0
- dsat/workflows/templates/basic_kaggle_loop.py +71 -0
- dslighting/__init__.py +170 -0
- dslighting/core/__init__.py +13 -0
- dslighting/core/agent.py +646 -0
- dslighting/core/config_builder.py +318 -0
- dslighting/core/data_loader.py +422 -0
- dslighting/core/task_detector.py +422 -0
- dslighting/utils/__init__.py +19 -0
- dslighting/utils/defaults.py +151 -0
- dslighting-1.3.9.dist-info/METADATA +554 -0
- dslighting-1.3.9.dist-info/RECORD +80 -0
- dslighting-1.3.9.dist-info/WHEEL +5 -0
- dslighting-1.3.9.dist-info/top_level.txt +2 -0
|
@@ -0,0 +1,237 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Implements Experience, which manages the state of a meta-optimization process.
|
|
3
|
+
This is the core state representation for Paradigm 3 (AFlow-style) evolutionary search.
|
|
4
|
+
"""
|
|
5
|
+
import json
|
|
6
|
+
import logging
|
|
7
|
+
from datetime import datetime, timezone
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import Optional, List, Any, Dict, Tuple
|
|
10
|
+
|
|
11
|
+
import numpy as np
|
|
12
|
+
|
|
13
|
+
from dsat.models.candidates import WorkflowCandidate
|
|
14
|
+
from dsat.services.states.base import State
|
|
15
|
+
from dsat.services.workspace import WorkspaceService
|
|
16
|
+
|
|
17
|
+
logger = logging.getLogger(__name__)
|
|
18
|
+
|
|
19
|
+
class Experience(State):
    """
    Acts as the database for the meta-optimizer. It saves and loads
    workflow scores and modification history to guide the search process,
    persisting state to the filesystem within the run's workspace.

    Files used (locations are obtained from ``workspace.get_path``):

    * ``scores.jsonl`` -- one JSON object per recorded candidate with the keys
      ``round``, ``fitness``, ``code_path``, ``score_type`` and, optionally, ``extra``.
    * ``experience.json`` -- a JSON object keyed by the parent round (as a string);
      each value holds a ``success`` and a ``failure`` list of modification outcomes.
    * ``round_<n>_workflow.py`` -- the workflow code of each recorded candidate.
    """

    def __init__(self, workspace: WorkspaceService):
        """
        Resolve the state/candidate paths and create the state files if missing.

        Args:
            workspace: Service whose ``get_path`` yields the "state" and
                "candidates" directories used by this object.
        """
        self.workspace = workspace
        # Define paths within the managed workspace
        state_dir = workspace.get_path("state")
        self.scores_file = state_dir / "scores.jsonl"
        self.experience_file = state_dir / "experience.json"
        self.candidates_dir = workspace.get_path("candidates")

        # Initialize state files if they don't exist
        self.scores_file.touch()
        if not self.experience_file.exists():
            with open(self.experience_file, "w", encoding="utf-8") as f:
                json.dump({}, f)

    # ------------------------------------------------------------------
    # Small private helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _coerce_list(value: Any) -> List[dict]:
        """Return only the dict items of ``value``; anything that is not a list yields []."""
        if not isinstance(value, list):
            return []
        return [item for item in value if isinstance(item, dict)]

    @staticmethod
    def _score_after(mod: dict) -> float:
        """Read ``score_after`` from an outcome record as a float, falling back to 0.0."""
        try:
            return float(mod.get("score_after", 0.0) or 0.0)
        except (TypeError, ValueError, OverflowError):
            return 0.0

    @staticmethod
    def _delta_suffix(mod: dict) -> str:
        """Format the optional ``delta`` of an outcome record as ', Δ=+0.1234' (or '')."""
        delta = mod.get("delta")
        if isinstance(delta, (int, float)):
            return f", Δ={float(delta):+.4f}"
        return ""

    def _load_all_candidates(self) -> List[WorkflowCandidate]:
        """
        Loads all recorded candidates from the scores file.

        Lines whose ``score_type`` is neither "fitness" nor "fine", and lines whose
        ``code_path`` no longer exists on disk, are skipped. Malformed lines are
        logged and skipped; blank lines are ignored silently.
        """
        if not self.scores_file.exists() or self.scores_file.stat().st_size == 0:
            return []

        candidates: List[WorkflowCandidate] = []
        with open(self.scores_file, "r", encoding="utf-8") as f:
            for line in f:
                if not line.strip():
                    continue
                try:
                    data = json.loads(line)
                    if not isinstance(data, dict):
                        # Valid JSON but not a record (e.g. a bare list or number).
                        logger.warning(
                            "Skipping malformed line in scores.jsonl: %s",
                            "expected a JSON object",
                        )
                        continue
                    score_type = data.get("score_type", "fitness")
                    if score_type not in {"fitness", "fine"}:
                        continue
                    code_path = Path(data["code_path"])
                    if not code_path.exists():
                        continue
                    with open(code_path, "r", encoding="utf-8") as code_file:
                        code = code_file.read()
                    candidates.append(
                        WorkflowCandidate(
                            workflow_code=code,
                            fitness=data["fitness"],
                            round_num=data.get("round"),
                        )
                    )
                except (json.JSONDecodeError, KeyError, TypeError) as e:
                    # TypeError covers non-string code_path / unhashable score_type values.
                    logger.warning("Skipping malformed line in scores.jsonl: %s", e)
        return candidates

    def get_experience_summary(self, parent_round_num: Optional[int]) -> str:
        """
        Loads and formats the experience log for a specific parent candidate.

        Args:
            parent_round_num: Round number of the selected parent; ``None`` is
                treated as -1 ("no specific parent selected").

        Returns:
            A multi-line text with the parent's successful and failed modifications
            followed by the five highest-scoring successful modifications across all
            parents, or a short message if the log is missing or unparseable.
        """
        if parent_round_num is None:
            parent_round_num = -1

        if not self.experience_file.exists():
            return "Experience log not found."

        with open(self.experience_file, "r", encoding="utf-8") as f:
            try:
                all_experience = json.load(f)
            except json.JSONDecodeError:
                return "Could not parse experience log."

        if not isinstance(all_experience, dict):
            return "Could not parse experience log."

        summary_lines: List[str] = []
        if parent_round_num >= 0:
            summary_lines.append(f"History of modifications for parent from round {parent_round_num}:")
        else:
            summary_lines.append("History of modifications (no specific parent selected):")

        parent_exp = all_experience.get(str(parent_round_num))
        if not isinstance(parent_exp, dict):
            parent_exp = {}

        # Per-parent sections share one layout; only the heading and source list differ.
        sections = (
            ("\n### Successful Modifications:", self._coerce_list(parent_exp.get("success"))),
            ("\n### Failed Modifications:", self._coerce_list(parent_exp.get("failure"))),
        )
        for heading, mods in sections:
            summary_lines.append(heading)
            if not mods:
                summary_lines.append("- (none yet)")
                continue
            for mod in mods:
                child = mod.get("child_round")
                score_after = self._score_after(mod)
                delta_str = self._delta_suffix(mod)
                summary_lines.append(
                    f"- (Child Round {child}, New Score: {score_after:.4f}{delta_str}) {mod.get('modification','')}"
                )

        # Add global successful examples to give the optimizer concrete positive patterns,
        # even when the selected parent has no successes yet.
        global_success: List[Tuple[float, str, dict]] = []
        for pkey, pexp in all_experience.items():
            if not isinstance(pexp, dict):
                continue
            for mod in self._coerce_list(pexp.get("success")):
                global_success.append((self._score_after(mod), str(pkey), mod))

        global_success.sort(key=lambda t: t[0], reverse=True)
        top_global = global_success[:5]
        summary_lines.append("\n### Successful Examples (Global Top-5):")
        if top_global:
            for score_after, pkey, mod in top_global:
                child = mod.get("child_round")
                delta_str = self._delta_suffix(mod)
                summary_lines.append(
                    f"- (Parent {pkey} → Child {child}, Score: {score_after:.4f}{delta_str}) {mod.get('modification','')}"
                )
        else:
            summary_lines.append("- (none yet)")

        return "\n".join(summary_lines)

    def select_parent_candidate(self, top_k: int) -> Optional[WorkflowCandidate]:
        """
        Selects a parent candidate using a softmax probability distribution over the
        top_k best-performing candidates, balancing exploration and exploitation.

        Candidates whose fitness is missing or not a finite number are ignored.
        No de-duplication is performed: the same workflow recorded twice counts twice.

        Args:
            top_k: Number of highest-fitness candidates to sample from.

        Returns:
            The sampled candidate, or ``None`` when ``top_k`` is not positive or no
            candidate with a usable fitness has been recorded.
        """
        if top_k <= 0:
            return None

        scored: List[Tuple[float, WorkflowCandidate]] = []
        for candidate in self._load_all_candidates():
            try:
                fitness = float(candidate.fitness)
            except (TypeError, ValueError):
                continue  # fitness is None or otherwise non-numeric
            if np.isfinite(fitness):
                scored.append((fitness, candidate))

        if not scored:
            return None

        # Sort by fitness (higher is better) and take the top k. Sorting on the real
        # value keeps a fitness of exactly 0.0 above negative scores.
        scored.sort(key=lambda pair: pair[0], reverse=True)
        top_candidates = scored[:top_k]

        fitness_scores = np.array([fitness for fitness, _ in top_candidates], dtype=float)
        # Softmax probabilities: e^score / sum(e^scores). Subtracting the maximum leaves
        # the distribution unchanged but prevents overflow for large scores.
        weights = np.exp(fitness_scores - fitness_scores.max())
        probabilities = weights / weights.sum()

        # Sample an index rather than the objects themselves so numpy never has to
        # convert the candidates into an array.
        index = int(np.random.choice(len(top_candidates), p=probabilities))
        return top_candidates[index][1]

    def record_score(
        self,
        round_num: int,
        fitness: float,
        code: str,
        *,
        score_type: str = "fitness",
        extra: Optional[Dict[str, Any]] = None,
    ) -> None:
        """
        Saves the workflow code and appends its score to the log.

        Args:
            round_num: Round the candidate belongs to; also used in the code file name.
            fitness: Score of the candidate.
            code: Workflow source code, written to ``round_<round_num>_workflow.py``.
            score_type: Label stored with the score; an empty value is stored as "fitness".
            extra: Optional additional data stored under the ``extra`` key when non-empty.
        """
        candidate_code_path = self.candidates_dir / f"round_{round_num}_workflow.py"
        with open(candidate_code_path, "w", encoding="utf-8") as f:
            f.write(code)

        payload: Dict[str, Any] = {
            "round": round_num,
            "fitness": fitness,
            "code_path": str(candidate_code_path),
            "score_type": str(score_type or "fitness"),
        }
        if extra:
            payload["extra"] = extra

        with open(self.scores_file, "a", encoding="utf-8") as f:
            f.write(json.dumps(payload) + "\n")

    def record_experience(self, parent_round: int, child_round: int, modification: str, score_before: float, score_after: float):
        """
        Records the outcome of a modification attempt in the experience log.

        The outcome is filed under the parent's ``success`` list when ``score_after``
        is strictly greater than ``score_before`` and under ``failure`` otherwise.
        An unparseable or malformed log is replaced by a fresh one (with a warning)
        instead of aborting the run.

        Args:
            parent_round: Round of the candidate that was modified.
            child_round: Round of the resulting candidate.
            modification: Description of the change that was applied.
            score_before: Score of the parent.
            score_after: Score of the child.
        """
        before = float(score_before)
        after = float(score_after)
        outcome = {
            "child_round": child_round,
            "modification": modification,
            "score_before": before,
            "score_after": after,
            "delta": after - before,
            "recorded_at": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"),
        }

        with open(self.experience_file, "r+", encoding="utf-8") as f:
            try:
                data = json.load(f)
            except json.JSONDecodeError as e:
                logger.warning("Experience log could not be parsed and will be reset: %s", e)
                data = {}
            if not isinstance(data, dict):
                logger.warning("Experience log is not a JSON object and will be reset.")
                data = {}

            parent_key = str(parent_round)
            entry = data.get(parent_key)
            if not isinstance(entry, dict):
                entry = {}
                data[parent_key] = entry
            # Make sure both buckets exist and are lists, even for hand-edited logs.
            for bucket in ("success", "failure"):
                if not isinstance(entry.get(bucket), list):
                    entry[bucket] = []

            entry["success" if after > before else "failure"].append(outcome)

            # Rewrite the file in place: overwrite from the start, then cut off leftovers.
            f.seek(0)
            json.dump(data, f, indent=4)
            f.truncate()
|
|
@@ -0,0 +1,153 @@
|
|
|
1
|
+
# dsat/services/states/journal.py
|
|
2
|
+
|
|
3
|
+
"""
|
|
4
|
+
Implements JournalState, which manages a tree of solution attempts (Nodes).
|
|
5
|
+
This is the core state representation for Paradigm 2 (AIDE/AutoMind-style) search agents.
|
|
6
|
+
"""
|
|
7
|
+
import uuid
|
|
8
|
+
from functools import total_ordering
|
|
9
|
+
from typing import Optional, Any, List, Dict, Set
|
|
10
|
+
|
|
11
|
+
from pydantic import BaseModel, Field
|
|
12
|
+
|
|
13
|
+
from dsat.common.typing import ExecutionResult
|
|
14
|
+
from dsat.utils.context import truncate_output
|
|
15
|
+
from dsat.services.states.base import State
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@total_ordering
class MetricValue(BaseModel):
    """
    Represents a comparable metric that can be configured for maximization or minimization.
    A value of None is considered worse than any numeric value.

    "Greater" means "better": with ``maximize=False`` a smaller number compares as
    greater. The direction of the left-hand operand decides the comparison.
    Equality looks at ``value`` only and ignores ``maximize``. The remaining
    ordering operators are derived by ``functools.total_ordering``.
    """
    value: Optional[float]
    maximize: bool = True

    def __gt__(self, other: Any) -> bool:
        """Return True when this metric is strictly better than ``other``."""
        if not isinstance(other, MetricValue):
            # Let Python try the reflected operation / raise TypeError instead of
            # failing with an AttributeError on ``other.value``.
            return NotImplemented
        if self.value is None:
            return False
        if other.value is None:
            return True
        return (self.value > other.value) if self.maximize else (self.value < other.value)

    def __eq__(self, other: Any) -> bool:
        """Two metrics are equal when both are MetricValue instances with the same value."""
        return isinstance(other, MetricValue) and self.value == other.value

    def __str__(self) -> str:
        """Render as e.g. 'Metric↑(0.9123)'; a missing value is shown as 'N/A'."""
        direction = "↑" if self.maximize else "↓"
        val_str = f"{self.value:.4f}" if self.value is not None else "N/A"
        return f"Metric{direction}({val_str})"
|
|
41
|
+
|
|
42
|
+
class Node(BaseModel):
    """
    One attempt in the solution search tree.

    A node bundles the generated code and plan with the outcome of running it
    (terminal output, exception type, metadata), the recorded LLM exchanges,
    and the result of the review step (analysis text, metric, buggy flag).
    """
    code: str
    plan: str

    # Identity and tree links
    id: str = Field(default_factory=lambda: uuid.uuid4().hex)
    parent_id: Optional[str] = None
    children_ids: Set[str] = Field(default_factory=set)

    # Outcome of executing the code
    term_out: str = ""
    exec_time: float = 0.0
    exc_type: Optional[str] = None
    exec_metadata: Dict[str, Any] = Field(default_factory=dict)

    # Recorded LLM inputs / outputs
    task_context: Dict[str, Any] = Field(default_factory=dict)
    generate_prompt: Optional[str] = None
    llm_generate: Optional[Dict[str, Any]] = None
    review_context: Optional[Dict[str, Any]] = None
    llm_review: Optional[Dict[str, Any]] = None

    # Outcome of the review step
    analysis: str = ""
    metric: MetricValue = Field(default_factory=lambda: MetricValue(value=None))
    is_buggy: bool = True  # a node counts as buggy until an execution succeeds
    step: int = -1  # -1 until the node is appended to a journal

    # Locations of produced artifacts
    code_artifact_path: Optional[str] = None
    final_submission_path: Optional[str] = None

    def absorb_exec_result(self, exec_result: ExecutionResult):
        """Copy the outcome of a sandbox execution onto this node."""
        out_text = exec_result.stdout or ""
        err_text = exec_result.stderr or ""
        # Both streams are merged into one labelled, whitespace-trimmed transcript
        # before truncation.
        transcript = f"STDOUT:\n{out_text}\n\nSTDERR:\n{err_text}"
        self.term_out = truncate_output(transcript.strip())
        self.exc_type = exec_result.exc_type
        self.is_buggy = not exec_result.success
        self.exec_metadata = exec_result.metadata or {}

    class Config:
        """Pydantic configuration."""
        json_encoders = {set: list}  # sets are emitted as lists when encoding to JSON
|
|
90
|
+
|
|
91
|
+
class JournalState(State, BaseModel):
    """
    The "Journal": every solution node of a run, stored by node id.

    Offers helpers to add nodes (optionally linked to a parent), look nodes up,
    pick the best non-buggy node, and summarise the best attempts as text.
    """
    nodes: Dict[str, Node] = Field(default_factory=dict)

    def __len__(self) -> int:
        """Number of nodes currently stored in the journal."""
        return len(self.nodes)

    def append(self, node: Node, parent: Optional[Node] = None):
        """
        Store ``node`` and, when ``parent`` is given, link the two.

        Raises:
            ValueError: if ``parent`` is given but is not part of this journal.
        """
        if parent:
            if parent.id not in self.nodes:
                raise ValueError(f"Parent node with id {parent.id} not in journal.")
            stored_parent = self.nodes[parent.id]
            node.parent_id = parent.id
            stored_parent.children_ids.add(node.id)
        # The step is the journal size before insertion, i.e. a 0-based counter.
        node.step = len(self)
        self.nodes[node.id] = node

    def get_node(self, node_id: str) -> Optional[Node]:
        """Return the node stored under ``node_id``, or None if there is none."""
        return self.nodes.get(node_id)

    def get_best_node(self) -> Optional[Node]:
        """Return the non-buggy node with the best metric, or None if all nodes are buggy."""
        working = (entry for entry in self.nodes.values() if not entry.is_buggy)
        return max(working, key=lambda entry: entry.metric, default=None)

    def generate_summary(self, max_nodes: int = 3) -> str:
        """
        Build a text summary of the best successful attempts for use in prompts.

        Only non-buggy nodes that carry a metric value are considered; the
        ``max_nodes`` best of them are described, best first.
        """
        ranked = [
            entry
            for entry in self.nodes.values()
            if not entry.is_buggy and entry.metric.value is not None
        ]
        # "Greater" MetricValue means better, so descending order puts the best first.
        ranked.sort(key=lambda entry: entry.metric, reverse=True)
        if not ranked:
            return "No successful solutions have been found yet."

        # Windowing: keep only the leading `max_nodes` entries.
        shown = ranked[:max_nodes]
        blocks = [
            f"Attempt #{entry.step}:\n"
            f"Plan: {entry.plan}\n"
            f"Result Analysis: {entry.analysis}\n"
            f"Validation Metric: {entry.metric}\n"
            for entry in shown
        ]

        hidden = len(ranked) - len(shown)
        if hidden > 0:
            prefix = f"[... {hidden} other successful attempts exist ...]\n"
        else:
            prefix = ""

        body = "\n------------------\n".join(blocks)
        return prefix + "Here is a summary of the best performing attempts:\n" + body
|
|
@@ -0,0 +1,290 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Persistent library for discovered operators.
|
|
3
|
+
|
|
4
|
+
This stores LLM-proposed Operator code + metadata across runs, and tracks basic
|
|
5
|
+
usage/success statistics to support simple operator-level selection.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import hashlib
|
|
11
|
+
import json
|
|
12
|
+
import math
|
|
13
|
+
from dataclasses import dataclass
|
|
14
|
+
from datetime import datetime, timezone
|
|
15
|
+
from pathlib import Path
|
|
16
|
+
from typing import Any, Dict, Iterable, List, Optional, Tuple
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _utc_now_iso() -> str:
    """Return the current UTC time in ISO 8601 form with a trailing 'Z' instead of '+00:00'."""
    stamp = datetime.now(timezone.utc).isoformat()
    return stamp.replace("+00:00", "Z")
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def _safe_int(value: Any, default: int = 0) -> int:
    """Convert ``value`` with ``int()``; return ``default`` if the conversion raises."""
    try:
        converted = int(value)
    except Exception:
        # Deliberately broad: stored data may contain anything.
        return default
    return converted
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
@dataclass(frozen=True)
class LibraryOperatorSpec:
    """Immutable snapshot of one stored operator version together with its usage counters."""

    # Identity
    name: str
    version: int

    # Operator source and its free-text metadata
    code: str
    description: str
    inputs: str
    outputs: str
    triggers: str
    task_types: List[str]

    # Usage statistics
    uses: int
    successes: int
    failures: int
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class OperatorLibrary:
    """
    JSON-file-backed store of operator versions and their usage statistics.

    The in-memory layout (``self._data``) mirrors the file:
    ``{"operators": {<name>: {"versions": [<version record>, ...]}}}``.
    Every mutating method writes the whole structure back to ``path``.
    Malformed entries (non-dict records or versions) are skipped by all methods.
    """

    def __init__(self, path: Path):
        """Remember ``path`` and load its content; a missing or unreadable file yields an empty library."""
        self.path = path
        self._data: Dict[str, Any] = {"operators": {}}
        self._load()

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _is_usable(v: Any) -> bool:
        """True for a dict version record that carries non-blank code."""
        return isinstance(v, dict) and bool(str(v.get("code") or "").strip())

    @staticmethod
    def _to_spec(name: str, v: Dict[str, Any]) -> "LibraryOperatorSpec":
        """Build the public, immutable spec from a stored version record."""
        return LibraryOperatorSpec(
            name=name,
            version=_safe_int(v.get("version"), 1),
            code=str(v.get("code") or ""),
            description=str(v.get("description") or "").strip(),
            inputs=str(v.get("inputs") or "").strip(),
            outputs=str(v.get("outputs") or "").strip(),
            triggers=str(v.get("triggers") or "").strip(),
            task_types=list(v.get("task_types") or []),
            uses=_safe_int(v.get("uses"), 0),
            successes=_safe_int(v.get("successes"), 0),
            failures=_safe_int(v.get("failures"), 0),
        )

    def _load(self) -> None:
        """Read the library file; fall back to an empty library on any problem."""
        self._data = {"operators": {}}
        if not self.path.exists():
            return
        try:
            payload = json.loads(self.path.read_text(encoding="utf-8"))
        except Exception:
            # Deliberate best-effort: an unreadable library must not abort a run.
            return

        if isinstance(payload, dict) and isinstance(payload.get("operators"), dict):
            self._data = payload

    def _save(self) -> None:
        """Write the library to a sibling ``.tmp`` file, then move it over ``path``."""
        self.path.parent.mkdir(parents=True, exist_ok=True)
        tmp = self.path.with_suffix(self.path.suffix + ".tmp")
        tmp.write_text(
            json.dumps(self._data, ensure_ascii=False, indent=2, sort_keys=True),
            encoding="utf-8",
        )
        tmp.replace(self.path)

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------
    def has(self, name: str) -> bool:
        """Return True if an operator named ``name`` is stored."""
        return name in self._data.get("operators", {})

    def get_best_version(self, name: str) -> Optional["LibraryOperatorSpec"]:
        """
        Return the best available version for an operator name.

        Selection is based on (success_rate, uses, version) in descending order.
        Returns None if the operator is missing or has no usable versions
        (a usable version is a dict record with non-blank code).
        """
        operators = self._data.get("operators", {})
        if not isinstance(operators, dict):
            return None

        record = operators.get(name)
        if not isinstance(record, dict):
            return None
        versions = record.get("versions", [])
        if not isinstance(versions, list) or not versions:
            return None

        best: Optional[Dict[str, Any]] = None
        best_key: Optional[Tuple[float, int, int]] = None
        for v in versions:
            if not self._is_usable(v):
                continue
            uses = _safe_int(v.get("uses"), 0)
            successes = _safe_int(v.get("successes"), 0)
            version = _safe_int(v.get("version"), 1)
            success_rate = successes / uses if uses > 0 else 0.0
            key = (float(success_rate), uses, version)
            if best_key is None or key > best_key:
                best, best_key = v, key

        if best is None:
            return None
        return self._to_spec(str(name), best)

    def add_version(
        self,
        name: str,
        *,
        code: str,
        description: str,
        inputs: str = "",
        outputs: str = "",
        triggers: str = "",
        task_types: Optional[Iterable[str]] = None,
    ) -> int:
        """
        Store ``code`` as a new version of operator ``name`` and persist the library.

        If a version with identical code (same SHA-256 prefix) already exists, nothing
        is written and that version's number is returned. A malformed stored record
        for ``name`` is replaced by a fresh one.

        Returns:
            The version number assigned to (or already held by) this code.
        """
        operators = self._data.setdefault("operators", {})
        record = operators.get(name)
        if not isinstance(record, dict):
            record = {"versions": []}
            operators[name] = record
        versions = record.get("versions")
        if not isinstance(versions, list):
            versions = []
            record["versions"] = versions

        known = [v for v in versions if isinstance(v, dict)]
        code_hash = hashlib.sha256(code.encode("utf-8")).hexdigest()[:16]
        for v in known:
            if v.get("code_hash") == code_hash:
                return _safe_int(v.get("version"), default=1)

        next_version = 1 + max((_safe_int(v.get("version"), 0) for v in known), default=0)
        versions.append(
            {
                "version": next_version,
                "code_hash": code_hash,
                "code": code,
                "description": (description or "").strip(),
                "inputs": (inputs or "").strip(),
                "outputs": (outputs or "").strip(),
                "triggers": (triggers or "").strip(),
                "task_types": [t for t in (task_types or []) if isinstance(t, str) and t.strip()],
                "created_at": _utc_now_iso(),
                "uses": 0,
                "successes": 0,
                "failures": 0,
                "seen_competition_ids": [],
            }
        )
        self._save()
        return next_version

    def record_outcome(
        self,
        name: str,
        version: int,
        *,
        success: bool,
        competition_ids: Optional[Iterable[str]] = None,
    ) -> None:
        """
        Update the counters of one operator version and persist the library.

        Increments ``uses`` plus either ``successes`` or ``failures``, stamps the
        usage time, and merges ``competition_ids`` into ``seen_competition_ids``.
        Does nothing when the operator or the version is not found.
        """
        operators = self._data.get("operators", {})
        if not isinstance(operators, dict):
            return
        record = operators.get(name)
        if not isinstance(record, dict):
            return
        versions = record.get("versions", [])
        if not isinstance(versions, list):
            return

        for v in versions:
            if not isinstance(v, dict):
                continue
            if _safe_int(v.get("version"), -1) != int(version):
                continue

            v["uses"] = _safe_int(v.get("uses"), 0) + 1
            if success:
                v["successes"] = _safe_int(v.get("successes"), 0) + 1
                v["last_success_at"] = _utc_now_iso()
            else:
                v["failures"] = _safe_int(v.get("failures"), 0) + 1
            v["last_used_at"] = _utc_now_iso()

            if competition_ids:
                # Keep only non-empty strings so the sorted() below cannot hit mixed types.
                seen = {
                    c for c in (v.get("seen_competition_ids") or []) if isinstance(c, str) and c
                }
                seen.update(cid for cid in competition_ids if isinstance(cid, str) and cid)
                v["seen_competition_ids"] = sorted(seen)

            # Only the first matching version is updated.
            self._save()
            return

    def select_for_prompt(
        self,
        max_ops: int,
        *,
        competition_ids: Optional[Iterable[str]] = None,
        task_types: Optional[Iterable[str]] = None,
    ) -> List["LibraryOperatorSpec"]:
        """
        Select up to `max_ops` operators (best version per operator) using a simple UCB score.
        Adds a small bonus when a version has been used on one of the given
        competition_ids (+0.25) or declares one of the given task_types (+0.15).

        Versions without code are never selected, matching ``get_best_version``.
        """
        operators = self._data.get("operators", {})
        if not isinstance(operators, dict) or max_ops <= 0:
            return []

        preferred_competitions = {
            cid for cid in (competition_ids or []) if isinstance(cid, str) and cid.strip()
        }
        preferred_task_types = {
            t.strip().lower() for t in (task_types or []) if isinstance(t, str) and t.strip()
        }

        all_versions: List[Tuple[str, Dict[str, Any]]] = []
        for name, record in operators.items():
            if not isinstance(record, dict):
                continue
            versions = record.get("versions")
            if not isinstance(versions, list):
                continue
            all_versions.extend((name, v) for v in versions if self._is_usable(v))

        total_uses = max(1, sum(_safe_int(v.get("uses"), 0) for _, v in all_versions))
        # Loop-invariant part of the exploration term.
        log_total = math.log(total_uses + 1.0)

        best_by_name: Dict[str, Tuple[float, Dict[str, Any]]] = {}
        for name, v in all_versions:
            uses = _safe_int(v.get("uses"), 0)
            successes = _safe_int(v.get("successes"), 0)
            mean = successes / max(1, uses)
            ucb = mean + math.sqrt(2.0 * log_total / (uses + 1.0))

            bonus = 0.0
            if preferred_competitions:
                seen = {c for c in (v.get("seen_competition_ids") or []) if isinstance(c, str) and c}
                if seen & preferred_competitions:
                    bonus += 0.25
            if preferred_task_types:
                vtypes = {
                    t.strip().lower()
                    for t in (v.get("task_types") or [])
                    if isinstance(t, str) and t.strip()
                }
                if vtypes & preferred_task_types:
                    bonus += 0.15

            score = ucb + bonus
            prev = best_by_name.get(name)
            if prev is None or score > prev[0]:
                best_by_name[name] = (score, v)

        ranked = sorted(best_by_name.items(), key=lambda kv: kv[1][0], reverse=True)
        return [self._to_spec(name, v) for name, (_score, v) in ranked[:max_ops]]
|