gradex 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gradex/__init__.py +3 -0
- gradex/ai/__init__.py +1 -0
- gradex/ai/brief.py +94 -0
- gradex/ai/client.py +232 -0
- gradex/ai/discover.py +280 -0
- gradex/ai/prompts/benchmark_design.md +21 -0
- gradex/ai/prompts/gate_design.md +16 -0
- gradex/ai/prompts/optimize_brief.md +33 -0
- gradex/ai/prompts/repo_analysis.md +21 -0
- gradex/analytics.py +200 -0
- gradex/backends/__init__.py +1 -0
- gradex/backends/base.py +75 -0
- gradex/backends/pool.py +96 -0
- gradex/backends/worktree.py +187 -0
- gradex/cli.py +588 -0
- gradex/config.py +70 -0
- gradex/dashboard/__init__.py +1 -0
- gradex/dashboard/broadcaster.py +75 -0
- gradex/dashboard/server.py +164 -0
- gradex/dashboard/templates/index.html +449 -0
- gradex/doctor.py +138 -0
- gradex/export.py +125 -0
- gradex/hosts/__init__.py +40 -0
- gradex/hosts/base.py +67 -0
- gradex/hosts/claude_code.py +191 -0
- gradex/hosts/cursor.py +142 -0
- gradex/orchestrator.py +246 -0
- gradex/repository.py +201 -0
- gradex/runner/__init__.py +1 -0
- gradex/runner/benchmark.py +127 -0
- gradex/runner/cache.py +106 -0
- gradex/runner/gate.py +71 -0
- gradex/security/__init__.py +1 -0
- gradex/security/scrubber.py +47 -0
- gradex/state.py +87 -0
- gradex/subagent.py +258 -0
- gradex/traces.py +109 -0
- gradex-0.1.0.dist-info/METADATA +92 -0
- gradex-0.1.0.dist-info/RECORD +41 -0
- gradex-0.1.0.dist-info/WHEEL +4 -0
- gradex-0.1.0.dist-info/entry_points.txt +2 -0
gradex/__init__.py
ADDED
gradex/ai/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""evo AI layer — LLM client, discover skill, and brief generator."""
|
gradex/ai/brief.py
ADDED
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
"""Brief generator: render the per-experiment optimisation prompt template."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
import jinja2
|
|
9
|
+
|
|
10
|
+
from gradex.ai.client import LLMClient
|
|
11
|
+
|
|
12
|
+
# Directory of bundled prompt templates, resolved relative to this module's file.
_PROMPTS_DIR = Path(__file__).parent / "prompts"
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@dataclass
class ExperimentSummary:
    """Condensed record of a single earlier experiment, as shown in a brief."""

    # The idea the experiment set out to test.
    hypothesis: str
    # Outcome label: "improved" | "regressed" | "gate_failed" | "failed"
    result: str
    # Short explanation, e.g. "score went from 41.2 to 45.0 (regression)"
    reason: str
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class BriefGenerator:
    """Turn the ``optimize_brief.md`` template into a Markdown subagent prompt.

    ``generate()`` is **synchronous**: it only renders a Jinja2 template and
    never issues an LLM request. The LLM is the subagent that later
    *receives* the rendered brief.
    """

    def __init__(self, client: LLMClient) -> None:
        # Held for future extensions only; template rendering never touches it.
        self._client = client

    def generate(
        self,
        optimization_target: str,
        metric: str,
        metric_direction: str,
        baseline_score: float,
        best_score: float,
        benchmark_cmd: str,
        gate_cmd: str,
        past_experiments: list[ExperimentSummary],
        shared_notes: str = "",
        agent_index: int = 0,
    ) -> str:
        """Build the optimisation brief handed to one parallel agent.

        Args:
            optimization_target: One-sentence description of the target.
            metric: Human-readable metric string.
            metric_direction: ``"higher"`` or ``"lower"``.
            baseline_score: Score at the start of the run.
            best_score: Best score seen so far.
            benchmark_cmd: Shell command to run the benchmark.
            gate_cmd: Shell command to run the gate tests.
            past_experiments: History of previous attempts.
            shared_notes: Cross-agent knowledge to include.
            agent_index: Selects the order in which the history is listed.
                Index 0 keeps the order given; any index > 0 lists the
                history in reverse so agents see it differently.

        Returns the rendered Markdown string; never calls the LLM.
        """
        brief_template = self._load_template()

        # Work on a copy so the caller's list is never reordered in place.
        history = list(past_experiments)
        if agent_index > 0 and history:
            history.reverse()

        # Keys are the variable names the template expects.
        context = {
            "optimization_target": optimization_target,
            "metric": metric,
            "direction": metric_direction,
            "baseline_score": baseline_score,
            "best_score": best_score,
            "benchmark_cmd": benchmark_cmd,
            "gate_cmd": gate_cmd,
            "failed_experiments": history,
            "shared_notes": shared_notes,
        }
        return brief_template.render(**context)

    def _load_template(self) -> jinja2.Template:
        """Return ``optimize_brief.md`` as a Jinja2 :class:`~jinja2.Template`."""
        prompt_loader = jinja2.FileSystemLoader(str(_PROMPTS_DIR))
        environment = jinja2.Environment(
            loader=prompt_loader,
            autoescape=False,
            trim_blocks=True,
            lstrip_blocks=True,
        )
        return environment.get_template("optimize_brief.md")
|
gradex/ai/client.py
ADDED
|
@@ -0,0 +1,232 @@
|
|
|
1
|
+
"""Unified LLM client supporting Anthropic, OpenAI, and Ollama."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import asyncio
|
|
6
|
+
import time
|
|
7
|
+
from dataclasses import dataclass
|
|
8
|
+
from threading import Lock
|
|
9
|
+
|
|
10
|
+
from gradex.config import LLMConfig, load_llm_config
|
|
11
|
+
|
|
12
|
+
# Registry of rate limiters, keyed by the provider string passed to _get_bucket().
_rate_limiters: dict[str, "_TokenBucket"] = {}
|
|
13
|
+
# Held by _get_bucket() while it reads or inserts into _rate_limiters.
_rl_lock = Lock()
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class _TokenBucket:
    """Token-bucket limiter allowing at most ``max_requests`` per 60 seconds.

    The bucket starts full and refills continuously at
    ``max_requests / 60`` tokens per second, never exceeding
    ``max_requests``. All state changes happen under an internal lock.
    """

    def __init__(self, max_requests: int = 50) -> None:
        """Create a full bucket.

        Args:
            max_requests: Bucket capacity and per-minute refill amount.

        Raises:
            ValueError: If ``max_requests`` is smaller than 1. A zero
                capacity would otherwise surface later as a
                ``ZeroDivisionError`` inside :meth:`consume`.
        """
        if max_requests < 1:
            raise ValueError(f"max_requests must be >= 1, got {max_requests!r}")
        self._max = max_requests
        self._tokens = float(max_requests)
        self._last_refill = time.monotonic()
        self._lock = Lock()

    def _refill(self) -> None:
        """Credit tokens for the time elapsed since the previous refill.

        Callers must already hold ``self._lock``.
        """
        now = time.monotonic()
        elapsed = now - self._last_refill
        refill = elapsed * (self._max / 60.0)
        # Cap at capacity so idle periods cannot bank an unbounded burst.
        self._tokens = min(self._max, self._tokens + refill)
        self._last_refill = now

    def consume(self) -> float:
        """Try to take one token.

        Returns ``0.0`` when a token was taken. Otherwise no token is
        taken and the return value is the number of seconds until one
        full token will have accumulated at the refill rate.
        """
        with self._lock:
            self._refill()
            if self._tokens >= 1.0:
                self._tokens -= 1.0
                return 0.0
            # Missing fraction of a token divided by tokens-per-second.
            return (1.0 - self._tokens) / (self._max / 60.0)
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def _get_bucket(provider: str) -> "_TokenBucket":
    """Return the limiter registered for *provider*, creating it on first use.

    The lookup and the insertion both happen while ``_rl_lock`` is held.
    New limiters are created with ``max_requests=50``.
    """
    with _rl_lock:
        bucket = _rate_limiters.get(provider)
        if bucket is None:
            bucket = _TokenBucket(max_requests=50)
            _rate_limiters[provider] = bucket
        return bucket
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
@dataclass
class LLMResponse:
    """Outcome of one LLM completion request."""

    # Assistant text returned by the provider.
    text: str
    # Token counts as reported by the provider for the prompt and the reply.
    input_tokens: int
    output_tokens: int
    # Provider name and model identifier that produced the response.
    provider: str
    model: str
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
class LLMClient:
    """Unified LLM client supporting Anthropic, OpenAI, Ollama, and Groq.

    Every backend implements the same interface: a system prompt plus a
    user prompt produce a text response. Provider SDKs are imported lazily
    inside the method that needs them, so the package can be installed
    without requiring all of them.
    """

    def __init__(self, config: LLMConfig | None = None) -> None:
        # Fall back to load_llm_config() when the caller supplies no config.
        self._config: LLMConfig = config if config is not None else load_llm_config()

    async def complete(
        self,
        system: str,
        user: str,
        max_tokens: int | None = None,
    ) -> LLMResponse:
        """Send a system+user prompt and return the assistant response.

        The call first waits for a slot from the per-provider rate limiter,
        then dispatches to the provider-specific implementation.

        Args:
            system: System prompt text.
            user: User message text.
            max_tokens: Token budget; defaults to ``config.max_tokens``.

        Raises:
            ValueError: When the configured provider is not recognised.
        """
        cfg = self._config
        tokens = max_tokens if max_tokens is not None else cfg.max_tokens
        provider = cfg.provider

        handlers = {
            "anthropic": self._complete_anthropic,
            "openai": self._complete_openai,
            "ollama": self._complete_ollama,
            "groq": self._complete_groq,
        }
        handler = handlers.get(provider)
        if handler is None:
            # Reject a bad provider before throttling, so the caller is
            # not kept waiting only to get an error.
            raise ValueError(
                f"Unknown provider: {provider!r}. "
                f"Choose: anthropic, openai, ollama, groq"
            )

        await self._wait_for_slot(_get_bucket(provider))
        return await handler(system, user, tokens)

    @staticmethod
    async def _wait_for_slot(bucket: "_TokenBucket") -> None:
        """Sleep until *bucket* hands out a token.

        The request is only sent once a token has actually been consumed;
        giving up after a fixed number of attempts would let calls bypass
        the limiter exactly when it is saturated. The sleep is the wait
        reported by the bucket scaled by 1x, 2x, then 4x for every
        further attempt.
        """
        attempt = 0
        while True:
            wait = bucket.consume()
            if wait <= 0.0:
                return
            await asyncio.sleep(wait * (2 ** min(attempt, 2)))
            attempt += 1

    # ------------------------------------------------------------------
    # Provider implementations
    # ------------------------------------------------------------------

    async def _complete_anthropic(
        self, system: str, user: str, max_tokens: int
    ) -> LLMResponse:
        """Call the Anthropic Messages API."""
        import anthropic as ant

        model = self._config.effective_model()
        client = ant.AsyncAnthropic(api_key=self._config.api_key or None)
        msg = await client.messages.create(
            model=model,
            max_tokens=max_tokens,
            system=system,
            messages=[{"role": "user", "content": user}],
        )
        return LLMResponse(
            text=msg.content[0].text,  # type: ignore[union-attr]  # always TextBlock in practice
            input_tokens=msg.usage.input_tokens,
            output_tokens=msg.usage.output_tokens,
            provider="anthropic",
            model=model,
        )

    async def _complete_openai(
        self, system: str, user: str, max_tokens: int
    ) -> LLMResponse:
        """Call the OpenAI Chat Completions API."""
        from openai import AsyncOpenAI

        model = self._config.effective_model()
        client = AsyncOpenAI(api_key=self._config.api_key or None)
        resp = await client.chat.completions.create(
            model=model,
            max_tokens=max_tokens,
            messages=[
                {"role": "system", "content": system},
                {"role": "user", "content": user},
            ],
        )
        choice = resp.choices[0]
        usage = resp.usage
        return LLMResponse(
            # A missing message body or usage block degrades to ""/0.
            text=choice.message.content or "",
            input_tokens=usage.prompt_tokens if usage else 0,
            output_tokens=usage.completion_tokens if usage else 0,
            provider="openai",
            model=model,
        )

    async def _complete_groq(
        self, system: str, user: str, max_tokens: int
    ) -> LLMResponse:
        """Call Groq's OpenAI-compatible endpoint via the OpenAI SDK.

        Groq free tier: 14,400 requests/day.
        Get a key at https://console.groq.com.
        Best free model: ``llama-3.3-70b-versatile``.
        """
        from openai import AsyncOpenAI

        model = self._config.effective_model()
        client = AsyncOpenAI(
            api_key=self._config.api_key or None,
            base_url=self._config.groq_base_url,
        )
        # NOTE(review): this is the only provider path that forwards
        # config.temperature; confirm whether the others should as well.
        resp = await client.chat.completions.create(
            model=model,
            max_tokens=max_tokens,
            messages=[
                {"role": "system", "content": system},
                {"role": "user", "content": user},
            ],
            temperature=self._config.temperature,
        )
        choice = resp.choices[0]
        usage = resp.usage
        return LLMResponse(
            text=choice.message.content or "",
            input_tokens=usage.prompt_tokens if usage else 0,
            output_tokens=usage.completion_tokens if usage else 0,
            provider="groq",
            model=model,
        )

    async def _complete_ollama(
        self, system: str, user: str, max_tokens: int
    ) -> LLMResponse:
        """Call Ollama's OpenAI-compatible endpoint via httpx.

        Ollama exposes ``http://localhost:11434/v1`` — no extra SDK required.
        """
        import httpx

        model = self._config.effective_model()
        payload = {
            "model": model,
            "messages": [
                {"role": "system", "content": system},
                {"role": "user", "content": user},
            ],
            "stream": False,
            # The OpenAI-compatible route takes the cap as ``max_tokens``;
            # ``options.num_predict`` belongs to Ollama's native API.
            "max_tokens": max_tokens,
        }
        async with httpx.AsyncClient(timeout=120.0) as http:
            resp = await http.post(
                f"{self._config.ollama_base_url}/chat/completions",
                json=payload,
            )
            resp.raise_for_status()
            data: dict[str, object] = resp.json()
        choices = data["choices"]
        text: str = choices[0]["message"]["content"]  # type: ignore[index]
        usage: dict[str, int] = data.get("usage", {})  # type: ignore[assignment]  # resp.json() is untyped
        return LLMResponse(
            text=text,
            input_tokens=usage.get("prompt_tokens", 0),
            output_tokens=usage.get("completion_tokens", 0),
            provider="ollama",
            model=model,
        )
|
gradex/ai/discover.py
ADDED
|
@@ -0,0 +1,280 @@
|
|
|
1
|
+
"""Discover skill: analyse a repo and set up a baseline optimization run."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import re
|
|
7
|
+
from dataclasses import dataclass
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import Literal
|
|
10
|
+
|
|
11
|
+
import jinja2
|
|
12
|
+
|
|
13
|
+
from gradex.ai.client import LLMClient
|
|
14
|
+
from gradex.backends.base import Backend
|
|
15
|
+
|
|
16
|
+
# Directory of bundled prompt templates, resolved relative to this module's file.
PROMPTS_DIR = Path(__file__).parent / "prompts"
|
|
17
|
+
|
|
18
|
+
# ---------------------------------------------------------------------------
|
|
19
|
+
# Result type
|
|
20
|
+
# ---------------------------------------------------------------------------
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@dataclass
class DiscoverResult:
    """Bundle of everything the discover flow produced."""

    # What the LLM chose to optimise and how it is measured.
    optimization_target: str
    metric: str
    metric_direction: Literal["higher", "lower"]
    # Generated benchmark source and the file it was written to.
    benchmark_script: str
    benchmark_path: Path
    # Gate commands returned by the LLM.
    gate_cmds: list[str]
    # Score from the single baseline benchmark run.
    baseline_score: float
    # Identifier of the persisted run.
    run_id: str
    # Optional setup notes from the benchmark-design response ("" if none).
    notes: str
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
# ---------------------------------------------------------------------------
|
|
39
|
+
# Skill
|
|
40
|
+
# ---------------------------------------------------------------------------
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
class DiscoverSkill:
    """Analyse a repository, design a benchmark, and establish a baseline score.

    Steps performed by :meth:`run`:

    1. Scan the repo to build a context string and detect existing test files.
    2. Ask the LLM to pick an optimisation target and metric.
    3. Ask the LLM to write a Python benchmark script; save it under ``.gradex/``.
    4. Ask the LLM to identify gate commands.
    5. Run the benchmark once to capture the baseline score.
    6. Persist the run through ``gradex.repository.RunRepository``.
    """

    # Directory names that are neither listed in the tree nor searched for tests.
    _SKIP_DIRS = frozenset(
        {".git", ".gradex", "__pycache__", "node_modules", ".venv", "dist"}
    )
    _MAX_TREE_DEPTH = 3
    _MAX_TREE_ENTRIES = 60
    _MAX_CONTEXT_CHARS = 3000
    _MAX_TEST_FILES = 20

    def __init__(self, client: LLMClient, backend: Backend) -> None:
        self._client = client
        self._backend = backend

    # ------------------------------------------------------------------
    # Main entry point
    # ------------------------------------------------------------------

    async def run(
        self,
        repo_root: Path,
        hint: str = "",
    ) -> DiscoverResult:
        """Run the full discover flow for *repo_root*.

        Args:
            repo_root: Root directory of the repository to analyse.
            hint: Optional user hint, e.g. "make the parser faster".

        Returns a :class:`DiscoverResult` with all artefacts populated.

        Raises:
            ValueError: If a required tag is missing from an LLM response,
                the gate commands are not a JSON list of strings, or the
                baseline benchmark times out / yields no score.
        """
        # 1 — Repo context
        repo_context = self.scan_repo(repo_root)
        test_files = self.detect_test_files(repo_root)

        # 2 — Target + metric
        analysis_system = (PROMPTS_DIR / "repo_analysis.md").read_text(encoding="utf-8")
        analysis_user = repo_context
        if hint:
            analysis_user += f"\n\nUser hint: {hint}"
        resp1 = await self._client.complete(analysis_system, analysis_user)
        optimization_target = self._parse_xml_tag(resp1.text, "optimization_target")
        metric = self._parse_xml_tag(resp1.text, "metric")
        direction = self._infer_direction(metric)

        # 3 — Benchmark script
        bench_system = jinja2.Template(
            (PROMPTS_DIR / "benchmark_design.md").read_text(encoding="utf-8")
        ).render(
            target=optimization_target,
            metric=metric,
            repo_context=repo_context,
        )
        resp2 = await self._client.complete(
            bench_system, "Write the benchmark script now."
        )
        benchmark_script = self._parse_xml_tag(resp2.text, "benchmark_script")
        # <notes> is optional in the response; a missing tag just means no notes.
        try:
            notes = self._parse_xml_tag(resp2.text, "notes")
        except ValueError:
            notes = ""

        # Write benchmark to disk
        evo_dir = repo_root / ".gradex"
        evo_dir.mkdir(parents=True, exist_ok=True)
        benchmark_path = evo_dir / "benchmark.py"
        benchmark_path.write_text(benchmark_script, encoding="utf-8")

        # 4 — Gate commands
        gate_system = jinja2.Template(
            (PROMPTS_DIR / "gate_design.md").read_text(encoding="utf-8")
        ).render(
            target=optimization_target,
            test_files=test_files,
        )
        resp3 = await self._client.complete(gate_system, "Identify the gate commands.")
        gate_cmds = self._parse_gate_cmds(self._parse_xml_tag(resp3.text, "gate_cmds"))

        # 5 — Baseline
        baseline_score = await self._run_baseline(repo_root, benchmark_path)

        # 6 — Persist Run (imported here, as in the rest of this module's lazy imports)
        from gradex.repository import RunRepository

        run = RunRepository().create(
            benchmark_cmd=f"python {benchmark_path}",
            metric_direction=direction,
            gate_cmds=gate_cmds,
            baseline_score=baseline_score,
        )

        return DiscoverResult(
            optimization_target=optimization_target,
            metric=metric,
            metric_direction=direction,
            benchmark_script=benchmark_script,
            benchmark_path=benchmark_path,
            gate_cmds=gate_cmds,
            baseline_score=baseline_score,
            run_id=run.id,
            notes=notes,
        )

    # ------------------------------------------------------------------
    # Helpers
    # ------------------------------------------------------------------

    def scan_repo(self, repo_root: Path) -> str:
        """Return a compact text summary of *repo_root* for LLM context.

        Skips ``.git``, ``.gradex``, ``__pycache__``, ``node_modules``,
        ``.venv``, and ``dist``. The tree lists at most 60 entries down to
        depth 3, followed by a single truncation marker if more exist.
        Output is capped at 3 000 characters.
        """
        entries: list[str] = []
        ext_counts: dict[str, int] = {}
        total_size = 0
        truncated = False

        def _walk(path: Path, depth: int = 0) -> None:
            nonlocal total_size, truncated
            if truncated or depth > self._MAX_TREE_DEPTH:
                return
            try:
                children = sorted(path.iterdir())
            except PermissionError:
                return
            for item in children:
                if item.name in self._SKIP_DIRS:
                    continue
                if truncated:
                    # A nested call already hit the cap; unwind quietly so
                    # the marker is emitted exactly once.
                    return
                if len(entries) >= self._MAX_TREE_ENTRIES:
                    entries.append(" ... (truncated)")
                    truncated = True
                    return
                indent = " " * depth
                if item.is_dir():
                    entries.append(f"{indent}{item.name}/")
                    _walk(item, depth + 1)
                else:
                    entries.append(f"{indent}{item.name}")
                    ext = item.suffix.lower() or "(no ext)"
                    ext_counts[ext] = ext_counts.get(ext, 0) + 1
                    try:
                        total_size += item.stat().st_size
                    except OSError:
                        # Unreadable file: keep it in the tree, skip its size.
                        pass

        _walk(repo_root)
        tree_str = "\n".join(entries)
        ext_str = ", ".join(f"{k}: {v}" for k, v in sorted(ext_counts.items()))
        result = (
            f"Directory tree:\n{tree_str}\n\n"
            f"File types: {ext_str}\n"
            f"Total size: {total_size} bytes"
        )
        return result[: self._MAX_CONTEXT_CHARS]

    def detect_test_files(self, repo_root: Path) -> list[str]:
        """Return relative paths to test files in *repo_root* (max 20).

        Matches ``test_*.py`` and ``*_test.py`` patterns anywhere under
        *repo_root*, except inside the directories that :meth:`scan_repo`
        also skips. Without that filter, tests shipped inside ``.venv`` or
        ``node_modules`` sort first and can take every slot.
        """
        seen: set[str] = set()
        results: list[str] = []
        for pattern in ("**/test_*.py", "**/*_test.py"):
            for path in sorted(repo_root.glob(pattern)):
                rel_path = path.relative_to(repo_root)
                # Only parent directories are checked, never the file name.
                if self._SKIP_DIRS.intersection(rel_path.parts[:-1]):
                    continue
                rel = str(rel_path)
                if rel not in seen:
                    seen.add(rel)
                    results.append(rel)
        return results[: self._MAX_TEST_FILES]

    def _parse_xml_tag(self, text: str, tag: str) -> str:
        """Extract the content of the first ``<tag>…</tag>`` pair in *text*.

        Strips surrounding whitespace. The tag name is matched literally.

        Raises:
            ValueError: If the tag is absent.
        """
        name = re.escape(tag)
        match = re.search(f"<{name}>(.*?)</{name}>", text, re.DOTALL)
        if not match:
            raise ValueError(f"Tag <{tag}> not found in LLM response")
        return match.group(1).strip()

    def _parse_gate_cmds(self, raw: str) -> list[str]:
        """Decode the ``<gate_cmds>`` payload into a list of command strings.

        A bare JSON string is accepted as a one-command list.

        Raises:
            json.JSONDecodeError: If *raw* is not valid JSON.
            ValueError: If the decoded value is not a list of strings.
        """
        parsed = json.loads(raw)
        if isinstance(parsed, str):
            parsed = [parsed]
        if not isinstance(parsed, list) or not all(isinstance(c, str) for c in parsed):
            raise ValueError("<gate_cmds> must be a JSON array of command strings")
        return parsed

    def _infer_direction(self, metric: str) -> Literal["higher", "lower"]:
        """Infer optimisation direction from a metric description string.

        Explicit wording wins: a metric that says "lower"/"minimize" and
        does not also say "higher"/"maximize" is ``"lower"`` even if it
        mentions ``%`` or ``score``. Otherwise "higher"/"maximize" or one
        of accuracy, score, %, throughput gives ``"higher"``.
        Everything else (latency, cost, ms, error, loss, or no keyword at
        all) defaults to ``"lower"``.
        """
        text = metric.lower()
        says_higher = any(kw in text for kw in ("higher", "maximize", "maximise"))
        says_lower = any(kw in text for kw in ("lower", "minimize", "minimise"))
        if says_lower and not says_higher:
            return "lower"
        if says_higher or any(
            kw in text for kw in ("accuracy", "score", "%", "throughput")
        ):
            return "higher"
        return "lower"

    # ------------------------------------------------------------------
    # Baseline execution
    # ------------------------------------------------------------------

    async def _run_baseline(self, repo_root: Path, benchmark_path: Path) -> float:
        """Execute *benchmark_path* once and parse the resulting score.

        Raises:
            ValueError: If the benchmark times out or yields no parseable score.
        """
        from gradex.runner.benchmark import BenchmarkRunner

        runner = BenchmarkRunner(self._backend, timeout=60)
        result = await runner.run(repo_root, ["python", str(benchmark_path)])
        if result.timed_out:
            raise ValueError("Baseline benchmark timed out")
        if result.score is None:
            raise ValueError(
                f"Baseline benchmark returned no parseable score: {result.parse_error!r}"
            )
        return result.score
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
You are an expert software engineer writing a Python benchmark script.
|
|
2
|
+
|
|
3
|
+
Target to optimize: {{ target }}
|
|
4
|
+
Metric: {{ metric }}
|
|
5
|
+
Repository context: {{ repo_context }}
|
|
6
|
+
|
|
7
|
+
Write a Python script that:
|
|
8
|
+
1. Imports and runs the target code
|
|
9
|
+
2. Measures the metric numerically (time it, count it, score it)
|
|
10
|
+
3. Prints ONLY a single float on the last line of stdout (e.g. `print(f"{value:.4f}")`)
|
|
11
|
+
4. Is deterministic enough to compare across runs (warm up if needed, average 3 runs)
|
|
12
|
+
5. Has no hardcoded paths — uses relative imports or Path(__file__).parent
|
|
13
|
+
6. Completes in under 30 seconds
|
|
14
|
+
|
|
15
|
+
Respond in this exact format:
|
|
16
|
+
<benchmark_script>
|
|
17
|
+
[complete Python script, no markdown fences]
|
|
18
|
+
</benchmark_script>
|
|
19
|
+
<notes>
|
|
20
|
+
[any setup steps needed before running, e.g. "requires sample_data/ directory"]
|
|
21
|
+
</notes>
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
You are an expert software engineer identifying regression tests for a code optimization task.
|
|
2
|
+
|
|
3
|
+
Target to optimize: {{ target }}
|
|
4
|
+
Repository test files found: {{ test_files }}
|
|
5
|
+
|
|
6
|
+
Identify the pytest command(s) that must pass after every experiment.
|
|
7
|
+
Prefer: tests directly covering the optimized module.
|
|
8
|
+
If no specific tests exist, use the full test suite.
|
|
9
|
+
|
|
10
|
+
Respond in this exact format:
|
|
11
|
+
<gate_cmds>
|
|
12
|
+
["pytest tests/test_parser.py", "pytest tests/test_integration.py"]
|
|
13
|
+
</gate_cmds>
|
|
14
|
+
<rationale>
|
|
15
|
+
[why these tests are the right gate]
|
|
16
|
+
</rationale>
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
You are a subagent in a parallel code optimization loop.
|
|
2
|
+
|
|
3
|
+
## Your task
|
|
4
|
+
{{ optimization_target }}
|
|
5
|
+
Metric: {{ metric }} ({{ direction }})
|
|
6
|
+
Current best score: {{ best_score }}
|
|
7
|
+
Baseline score: {{ baseline_score }}
|
|
8
|
+
|
|
9
|
+
## What has been tried (do NOT repeat these)
|
|
10
|
+
{% for exp in failed_experiments %}
|
|
11
|
+
- Hypothesis: {{ exp.hypothesis }}
|
|
12
|
+
Result: {{ exp.result }} ({{ exp.reason }})
|
|
13
|
+
{% endfor %}
|
|
14
|
+
|
|
15
|
+
## Shared knowledge from all agents
|
|
16
|
+
{{ shared_notes }}
|
|
17
|
+
|
|
18
|
+
## Your hypothesis
|
|
19
|
+
Based on the above, form ONE new hypothesis that has NOT been tried.
|
|
20
|
+
Be specific: name the function, file, and change you will make.
|
|
21
|
+
|
|
22
|
+
## Instructions
|
|
23
|
+
1. Read the relevant source files
|
|
24
|
+
2. Implement your hypothesis
|
|
25
|
+
3. Run the benchmark: `{{ benchmark_cmd }}`
|
|
26
|
+
The last line of stdout is your score. Lower/higher is better per the metric above.
|
|
27
|
+
4. Run the gate: `{{ gate_cmd }}`
|
|
28
|
+
5. Write your results to `.gradex/result.json`:
|
|
29
|
+
{"score": <float>, "hypothesis": "<one sentence>", "change_summary": "<what you changed>"}
|
|
30
|
+
6. Write gate result to `.gradex/gate.json`:
|
|
31
|
+
{"passed": <bool>, "failures": [<strings>]}
|
|
32
|
+
|
|
33
|
+
Do not stop until both files are written.
|