gradex 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
gradex/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ """gradex: manage and interact with coding host plugins."""
2
+
3
+ __version__ = "0.1.0"
gradex/ai/__init__.py ADDED
@@ -0,0 +1 @@
1
+ """gradex AI layer — LLM client, discover skill, and brief generator."""
gradex/ai/brief.py ADDED
@@ -0,0 +1,94 @@
1
+ """Brief generator: render the per-experiment optimisation prompt template."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass
6
+ from pathlib import Path
7
+
8
+ import jinja2
9
+
10
+ from gradex.ai.client import LLMClient
11
+
12
+ _PROMPTS_DIR = Path(__file__).parent / "prompts"
13
+
14
+
15
@dataclass
class ExperimentSummary:
    """Condensed record of one earlier experiment, as listed in a brief."""

    # The hypothesis that experiment tried.
    hypothesis: str
    # Outcome label: "improved" | "regressed" | "gate_failed" | "failed".
    result: str
    # Free-text explanation, e.g. "score went from 41.2 to 45.0 (regression)".
    reason: str
22
+
23
+
24
class BriefGenerator:
    """Produce the Markdown brief that is handed to an optimisation subagent.

    A brief is a plain Jinja2 render of ``optimize_brief.md``, so
    ``generate()`` is **synchronous** and performs no LLM call. The LLM in
    the picture is the subagent that later *receives* the brief.
    """

    def __init__(self, client: LLMClient) -> None:
        # Held for future extensions only; rendering never touches the client.
        self._client = client

    def generate(
        self,
        optimization_target: str,
        metric: str,
        metric_direction: str,
        baseline_score: float,
        best_score: float,
        benchmark_cmd: str,
        gate_cmd: str,
        past_experiments: list[ExperimentSummary],
        shared_notes: str = "",
        agent_index: int = 0,
    ) -> str:
        """Render the optimisation brief for a single parallel agent.

        Args:
            optimization_target: One-sentence description of the target.
            metric: Human-readable metric string.
            metric_direction: ``"higher"`` or ``"lower"``; passed to the
                template as ``direction``.
            baseline_score: Score at the start of the run.
            best_score: Best score seen so far.
            benchmark_cmd: Shell command to run the benchmark.
            gate_cmd: Shell command to run the gate tests.
            past_experiments: History of previous attempts; passed to the
                template as ``failed_experiments``. The caller's list is
                never mutated.
            shared_notes: Cross-agent knowledge to include.
            agent_index: Controls the order in which the history is shown.
                ``0`` keeps the order as supplied; any value ``> 0``
                reverses it so parallel agents see different orderings.

        Returns:
            The rendered Markdown string. The LLM is never called.
        """
        template = self._load_template()

        # Work on a copy so reversing never affects the caller's list.
        history = list(past_experiments)
        if agent_index > 0 and history:
            history.reverse()

        context = {
            "optimization_target": optimization_target,
            "metric": metric,
            "direction": metric_direction,
            "baseline_score": baseline_score,
            "best_score": best_score,
            "benchmark_cmd": benchmark_cmd,
            "gate_cmd": gate_cmd,
            "failed_experiments": history,
            "shared_notes": shared_notes,
        }
        return template.render(**context)

    def _load_template(self) -> jinja2.Template:
        """Return ``optimize_brief.md`` loaded from the prompts directory."""
        loader = jinja2.FileSystemLoader(str(_PROMPTS_DIR))
        environment = jinja2.Environment(
            loader=loader,
            autoescape=False,
            trim_blocks=True,
            lstrip_blocks=True,
        )
        return environment.get_template("optimize_brief.md")
gradex/ai/client.py ADDED
@@ -0,0 +1,232 @@
1
+ """Unified LLM client supporting Anthropic, OpenAI, and Ollama."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import asyncio
6
+ import time
7
+ from dataclasses import dataclass
8
+ from threading import Lock
9
+
10
+ from gradex.config import LLMConfig, load_llm_config
11
+
12
+ _rate_limiters: dict[str, "_TokenBucket"] = {}
13
+ _rl_lock = Lock()
14
+
15
+
16
+ class _TokenBucket:
17
+ """Token bucket limiter for max requests per 60 seconds."""
18
+
19
+ def __init__(self, max_requests: int = 50) -> None:
20
+ self._max = max_requests
21
+ self._tokens = float(max_requests)
22
+ self._last_refill = time.monotonic()
23
+ self._lock = Lock()
24
+
25
+ def _refill(self) -> None:
26
+ now = time.monotonic()
27
+ elapsed = now - self._last_refill
28
+ refill = elapsed * (self._max / 60.0)
29
+ self._tokens = min(self._max, self._tokens + refill)
30
+ self._last_refill = now
31
+
32
+ def consume(self) -> float:
33
+ """Consume one token; return wait seconds if throttled."""
34
+ with self._lock:
35
+ self._refill()
36
+ if self._tokens >= 1.0:
37
+ self._tokens -= 1.0
38
+ return 0.0
39
+ return (1.0 - self._tokens) / (self._max / 60.0)
40
+
41
+
42
def _get_bucket(provider: str) -> "_TokenBucket":
    """Return the shared rate limiter for *provider*, creating it on first use.

    Lookup and creation both happen under ``_rl_lock`` so concurrent callers
    always receive the same bucket for a given provider.
    """
    with _rl_lock:
        try:
            return _rate_limiters[provider]
        except KeyError:
            created = _TokenBucket(max_requests=50)
            _rate_limiters[provider] = created
            return created
47
+
48
+
49
@dataclass
class LLMResponse:
    """Outcome of one LLM completion call."""

    # Assistant text returned by the provider.
    text: str
    # Token counts as reported by the provider.
    input_tokens: int
    output_tokens: int
    # Provider name and the model that was requested.
    provider: str
    model: str
58
+
59
+
60
class LLMClient:
    """Unified LLM client supporting Anthropic, OpenAI, Ollama, and Groq.

    Every backend implements the same interface: a system prompt plus a
    user prompt produce a text response. Provider SDKs are imported lazily
    inside the provider methods so the package can be installed without
    requiring all of them.
    """

    def __init__(self, config: LLMConfig | None = None) -> None:
        self._config: LLMConfig = config if config is not None else load_llm_config()

    async def complete(
        self,
        system: str,
        user: str,
        max_tokens: int | None = None,
    ) -> LLMResponse:
        """Send a system+user prompt and return the assistant response.

        Args:
            system: System prompt text.
            user: User message text.
            max_tokens: Token budget; defaults to ``config.max_tokens``.

        Raises:
            ValueError: When the configured provider is not recognised.
        """
        cfg = self._config
        tokens = max_tokens if max_tokens is not None else cfg.max_tokens
        provider = cfg.provider

        handlers = {
            "anthropic": self._complete_anthropic,
            "openai": self._complete_openai,
            "ollama": self._complete_ollama,
            "groq": self._complete_groq,
        }
        handler = handlers.get(provider)
        if handler is None:
            # Reject a bad configuration before waiting on (or creating) a
            # rate limiter for a provider that does not exist.
            raise ValueError(
                f"Unknown provider: {provider!r}. "
                f"Choose: anthropic, openai, ollama, groq"
            )

        await self._acquire_rate_limit_slot(provider)
        return await handler(system, user, tokens)

    async def _acquire_rate_limit_slot(self, provider: str) -> None:
        """Wait until the shared bucket for *provider* grants one token.

        The loop only exits once a token has actually been consumed, so a
        request is never sent while the bucket is empty.
        """
        bucket = _get_bucket(provider)
        while True:
            wait = bucket.consume()
            if wait <= 0.0:
                return
            # ``wait`` is the bucket's estimate of the time until one token
            # is available; another caller may still win it, hence the loop.
            await asyncio.sleep(wait)

    # ------------------------------------------------------------------
    # Provider implementations
    # ------------------------------------------------------------------

    async def _complete_anthropic(
        self, system: str, user: str, max_tokens: int
    ) -> LLMResponse:
        """Call the Anthropic Messages API."""
        import anthropic as ant

        model = self._config.effective_model()
        # The context manager closes the SDK's HTTP client after the call.
        async with ant.AsyncAnthropic(api_key=self._config.api_key or None) as client:
            msg = await client.messages.create(
                model=model,
                max_tokens=max_tokens,
                system=system,
                messages=[{"role": "user", "content": user}],
            )
        # Use the first text block; fall back to "" when there is none
        # instead of failing on an empty or non-text first block.
        text = next(
            (
                block.text
                for block in msg.content
                if getattr(block, "type", None) == "text"
            ),
            "",
        )
        return LLMResponse(
            text=text,
            input_tokens=msg.usage.input_tokens,
            output_tokens=msg.usage.output_tokens,
            provider="anthropic",
            model=model,
        )

    async def _complete_openai(
        self, system: str, user: str, max_tokens: int
    ) -> LLMResponse:
        """Call the OpenAI Chat Completions API."""
        return await self._chat_via_openai_sdk(
            system, user, max_tokens, provider="openai"
        )

    async def _complete_groq(
        self, system: str, user: str, max_tokens: int
    ) -> LLMResponse:
        """Call Groq's OpenAI-compatible endpoint via the OpenAI SDK.

        Groq free tier: 14,400 requests/day.
        Get a key at https://console.groq.com.
        Best free model: ``llama-3.3-70b-versatile``.
        """
        return await self._chat_via_openai_sdk(
            system,
            user,
            max_tokens,
            provider="groq",
            client_options={"base_url": self._config.groq_base_url},
            request_options={"temperature": self._config.temperature},
        )

    async def _chat_via_openai_sdk(
        self,
        system: str,
        user: str,
        max_tokens: int,
        *,
        provider: str,
        client_options: dict[str, object] | None = None,
        request_options: dict[str, object] | None = None,
    ) -> LLMResponse:
        """Run one chat completion through the OpenAI SDK.

        Args:
            system: System prompt text.
            user: User message text.
            max_tokens: Token budget for the completion.
            provider: Label recorded on the returned :class:`LLMResponse`.
            client_options: Extra keyword arguments for ``AsyncOpenAI``
                (e.g. ``base_url``).
            request_options: Extra keyword arguments for
                ``chat.completions.create`` (e.g. ``temperature``).
        """
        from openai import AsyncOpenAI

        model = self._config.effective_model()
        # The context manager closes the SDK's HTTP client after the call.
        async with AsyncOpenAI(
            api_key=self._config.api_key or None, **(client_options or {})
        ) as client:
            resp = await client.chat.completions.create(
                model=model,
                max_tokens=max_tokens,
                messages=[
                    {"role": "system", "content": system},
                    {"role": "user", "content": user},
                ],
                **(request_options or {}),
            )
        choice = resp.choices[0]
        usage = resp.usage
        return LLMResponse(
            text=choice.message.content or "",
            input_tokens=usage.prompt_tokens if usage else 0,
            output_tokens=usage.completion_tokens if usage else 0,
            provider=provider,
            model=model,
        )

    async def _complete_ollama(
        self, system: str, user: str, max_tokens: int
    ) -> LLMResponse:
        """Call Ollama's OpenAI-compatible endpoint via httpx.

        Posts to ``{config.ollama_base_url}/chat/completions`` — no extra
        SDK required.

        Raises:
            httpx.HTTPStatusError: When the server answers with an error
                status (via ``raise_for_status``).
        """
        import httpx

        model = self._config.effective_model()
        payload = {
            "model": model,
            "messages": [
                {"role": "system", "content": system},
                {"role": "user", "content": user},
            ],
            "stream": False,
            # The OpenAI-compatible endpoint takes ``max_tokens``;
            # ``options.num_predict`` belongs to Ollama's native API.
            "max_tokens": max_tokens,
        }
        async with httpx.AsyncClient(timeout=120.0) as http:
            resp = await http.post(
                f"{self._config.ollama_base_url}/chat/completions",
                json=payload,
            )
            resp.raise_for_status()
            data: dict[str, object] = resp.json()
        choices = data["choices"]
        text: str = choices[0]["message"]["content"]  # type: ignore[index]
        # ``usage`` may be missing or null; treat both as "no counts".
        usage: dict[str, int] = data.get("usage") or {}  # type: ignore[assignment]  # resp.json() is untyped
        return LLMResponse(
            text=text,
            input_tokens=usage.get("prompt_tokens", 0),
            output_tokens=usage.get("completion_tokens", 0),
            provider="ollama",
            model=model,
        )
gradex/ai/discover.py ADDED
@@ -0,0 +1,280 @@
1
+ """Discover skill: analyse a repo and set up a baseline optimization run."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import re
7
+ from dataclasses import dataclass
8
+ from pathlib import Path
9
+ from typing import Literal
10
+
11
+ import jinja2
12
+
13
+ from gradex.ai.client import LLMClient
14
+ from gradex.backends.base import Backend
15
+
16
+ PROMPTS_DIR = Path(__file__).parent / "prompts"
17
+
18
+ # ---------------------------------------------------------------------------
19
+ # Result type
20
+ # ---------------------------------------------------------------------------
21
+
22
+
23
@dataclass
class DiscoverResult:
    """Bundle of everything the discover flow produced and set up."""

    # What is being optimised, and how it is measured.
    optimization_target: str
    metric: str
    metric_direction: Literal["higher", "lower"]
    # Benchmark source text and the file it was written to.
    benchmark_script: str
    benchmark_path: Path
    # Commands that gate each experiment.
    gate_cmds: list[str]
    # Score from the initial benchmark run.
    baseline_score: float
    # Identifier of the persisted run.
    run_id: str
    # Optional notes returned alongside the benchmark script.
    notes: str
36
+
37
+
38
+ # ---------------------------------------------------------------------------
39
+ # Skill
40
+ # ---------------------------------------------------------------------------
41
+
42
+
43
class DiscoverSkill:
    """Analyse a repository, design a benchmark, and establish a baseline score.

    Steps performed by :meth:`run`:

    1. Scan the repo to build a context string.
    2. Ask the LLM to pick an optimisation target and metric.
    3. Ask the LLM to write a Python benchmark script; save it.
    4. Detect existing test files.
    5. Ask the LLM to identify gate commands.
    6. Run the benchmark once to capture the baseline score.
    7. Persist a run record through ``RunRepository``.
    """

    # Directory names ignored by both the repo scan and test-file detection.
    _SKIP_DIRS = frozenset(
        {".git", ".gradex", "__pycache__", "node_modules", ".venv", "dist"}
    )

    def __init__(self, client: LLMClient, backend: Backend) -> None:
        self._client = client
        self._backend = backend

    # ------------------------------------------------------------------
    # Main entry point
    # ------------------------------------------------------------------

    async def run(
        self,
        repo_root: Path,
        hint: str = "",
    ) -> DiscoverResult:
        """Run the full discover flow for *repo_root*.

        Args:
            repo_root: Root directory of the repository to analyse.
            hint: Optional user hint, e.g. "make the parser faster".

        Returns:
            A :class:`DiscoverResult` with all artefacts populated.

        Raises:
            ValueError: If a required tag is missing from an LLM response,
                the gate commands are not a JSON list of strings, or the
                baseline benchmark times out or yields no score.
        """
        # 1 — Repo context
        repo_context = self.scan_repo(repo_root)
        test_files = self.detect_test_files(repo_root)

        # 2 — Target + metric
        analysis_system = (PROMPTS_DIR / "repo_analysis.md").read_text(encoding="utf-8")
        analysis_user = repo_context
        if hint:
            analysis_user += f"\n\nUser hint: {hint}"
        resp1 = await self._client.complete(analysis_system, analysis_user)
        optimization_target = self._parse_xml_tag(resp1.text, "optimization_target")
        metric = self._parse_xml_tag(resp1.text, "metric")
        direction = self._infer_direction(metric)

        # 3 — Benchmark script
        bench_system = jinja2.Template(
            (PROMPTS_DIR / "benchmark_design.md").read_text(encoding="utf-8")
        ).render(
            target=optimization_target,
            metric=metric,
            repo_context=repo_context,
        )
        resp2 = await self._client.complete(
            bench_system, "Write the benchmark script now."
        )
        benchmark_script = self._parse_xml_tag(resp2.text, "benchmark_script")
        # Notes are optional: a missing <notes> tag is not an error.
        notes = ""
        try:
            notes = self._parse_xml_tag(resp2.text, "notes")
        except ValueError:
            pass

        # Write benchmark to disk
        evo_dir = repo_root / ".gradex"
        evo_dir.mkdir(parents=True, exist_ok=True)
        benchmark_path = evo_dir / "benchmark.py"
        benchmark_path.write_text(benchmark_script, encoding="utf-8")

        # 4 + 5 — Gate commands
        gate_system = jinja2.Template(
            (PROMPTS_DIR / "gate_design.md").read_text(encoding="utf-8")
        ).render(
            target=optimization_target,
            test_files=test_files,
        )
        resp3 = await self._client.complete(gate_system, "Identify the gate commands.")
        gate_cmds = self._parse_gate_cmds(
            self._parse_xml_tag(resp3.text, "gate_cmds")
        )

        # 6 — Baseline
        baseline_score = await self._run_baseline(repo_root, benchmark_path)

        # 7 — Persist Run
        from gradex.repository import RunRepository

        # NOTE(review): the path is interpolated unquoted; if this command is
        # later run through a shell, a path with spaces would break — confirm.
        run = RunRepository().create(
            benchmark_cmd=f"python {benchmark_path}",
            metric_direction=direction,
            gate_cmds=gate_cmds,
            baseline_score=baseline_score,
        )

        return DiscoverResult(
            optimization_target=optimization_target,
            metric=metric,
            metric_direction=direction,
            benchmark_script=benchmark_script,
            benchmark_path=benchmark_path,
            gate_cmds=gate_cmds,
            baseline_score=baseline_score,
            run_id=run.id,
            notes=notes,
        )

    # ------------------------------------------------------------------
    # Helpers
    # ------------------------------------------------------------------

    def scan_repo(self, repo_root: Path) -> str:
        """Return a compact text summary of *repo_root* for LLM context.

        Skips ``.git``, ``.gradex``, ``__pycache__``, ``node_modules``,
        ``.venv``, and ``dist``. The tree listing descends at most three
        levels below the root and stops after 60 entries, adding a single
        truncation marker. Output is capped at 3 000 characters.
        """
        max_entries = 60
        max_depth = 3
        entries: list[str] = []
        ext_counts: dict[str, int] = {}
        total_size = 0
        truncated = False

        def _walk(path: Path, depth: int = 0) -> None:
            nonlocal total_size, truncated
            if depth > max_depth:
                return
            try:
                children = sorted(path.iterdir())
            except OSError:
                # Unreadable (or vanished) directory: list nothing below it.
                return
            for item in children:
                if item.name in self._SKIP_DIRS:
                    continue
                if len(entries) >= max_entries:
                    # Add the marker exactly once; the flag stops every
                    # enclosing level from adding another while unwinding.
                    entries.append(" ... (truncated)")
                    truncated = True
                    return
                indent = " " * depth
                if item.is_dir():
                    entries.append(f"{indent}{item.name}/")
                    _walk(item, depth + 1)
                    if truncated:
                        return
                else:
                    entries.append(f"{indent}{item.name}")
                    ext = item.suffix.lower() or "(no ext)"
                    ext_counts[ext] = ext_counts.get(ext, 0) + 1
                    try:
                        total_size += item.stat().st_size
                    except OSError:
                        # Size is best-effort; skip files that cannot be stat'ed.
                        pass

        _walk(repo_root)
        tree_str = "\n".join(entries)
        ext_str = ", ".join(f"{k}: {v}" for k, v in sorted(ext_counts.items()))
        result = (
            f"Directory tree:\n{tree_str}\n\n"
            f"File types: {ext_str}\n"
            f"Total size: {total_size} bytes"
        )
        return result[:3000]

    def detect_test_files(self, repo_root: Path) -> list[str]:
        """Return relative paths to test files in *repo_root* (max 20).

        Matches ``test_*.py`` and ``*_test.py`` patterns anywhere under
        *repo_root*, except inside the directories that :meth:`scan_repo`
        also skips (so vendored tests in ``.venv`` or ``node_modules`` do
        not crowd out the repository's own tests).
        """
        seen: set[str] = set()
        results: list[str] = []
        for pattern in ("**/test_*.py", "**/*_test.py"):
            for path in sorted(repo_root.glob(pattern)):
                rel_path = path.relative_to(repo_root)
                # Only parent directories are checked, not the file name.
                if self._SKIP_DIRS.intersection(rel_path.parts[:-1]):
                    continue
                rel = str(rel_path)
                if rel not in seen:
                    seen.add(rel)
                    results.append(rel)
        return results[:20]

    def _parse_xml_tag(self, text: str, tag: str) -> str:
        """Extract the content of the first ``<tag>…</tag>`` in *text*.

        Strips surrounding whitespace.

        Raises:
            ValueError: If the tag is absent.
        """
        # Escape the tag so it is always matched literally.
        name = re.escape(tag)
        match = re.search(f"<{name}>(.*?)</{name}>", text, re.DOTALL)
        if not match:
            raise ValueError(f"Tag <{tag}> not found in LLM response")
        return match.group(1).strip()

    def _parse_gate_cmds(self, raw: str) -> list[str]:
        """Decode the ``<gate_cmds>`` payload into a list of command strings.

        A single JSON string is accepted and treated as one command.

        Raises:
            ValueError: If *raw* is not valid JSON (``json.JSONDecodeError``
                is a ``ValueError``) or does not decode to a list of strings.
        """
        parsed = json.loads(raw)
        if isinstance(parsed, str):
            parsed = [parsed]
        if not isinstance(parsed, list) or not all(
            isinstance(cmd, str) for cmd in parsed
        ):
            raise ValueError(
                f"gate_cmds must be a JSON list of strings, got: {raw!r}"
            )
        return parsed

    def _infer_direction(self, metric: str) -> Literal["higher", "lower"]:
        """Infer optimisation direction from a metric description string.

        Matching is case-insensitive and applied in this order:

        1. Explicit wording wins: ``higher``/``maximize`` or
           ``lower``/``minimize`` (when only one kind is present).
        2. Higher-is-better nouns: ``accuracy``, ``throughput``.
        3. Lower-is-better nouns: ``latency``, ``cost``, ``error``,
           ``loss``, ``millisecond``, and ``ms`` as a whole word.
        4. Generic hints ``score`` and ``%`` imply ``"higher"``.

        Defaults to ``"lower"``.
        """
        text = metric.lower()

        def mentions(keywords: tuple[str, ...]) -> bool:
            return any(kw in text for kw in keywords)

        wants_higher = mentions(("higher", "maximize", "maximise"))
        wants_lower = mentions(("lower", "minimize", "minimise"))
        if wants_higher != wants_lower:
            return "higher" if wants_higher else "lower"

        if mentions(("accuracy", "throughput")):
            return "higher"
        # ``ms`` is matched as a whole word so "items" etc. do not count.
        if mentions(("latency", "cost", "error", "loss", "millisecond")) or re.search(
            r"\bms\b", text
        ):
            return "lower"
        if mentions(("score", "%")):
            return "higher"
        return "lower"

    # ------------------------------------------------------------------
    # Baseline execution
    # ------------------------------------------------------------------

    async def _run_baseline(self, repo_root: Path, benchmark_path: Path) -> float:
        """Execute *benchmark_path* once and parse the resulting score.

        Raises:
            ValueError: If the benchmark times out or yields no parseable score.
        """
        from gradex.runner.benchmark import BenchmarkRunner

        # timeout=60 is presumably in seconds — confirm against BenchmarkRunner.
        runner = BenchmarkRunner(self._backend, timeout=60)
        result = await runner.run(repo_root, ["python", str(benchmark_path)])
        if result.timed_out:
            raise ValueError("Baseline benchmark timed out")
        if result.score is None:
            raise ValueError(
                f"Baseline benchmark returned no parseable score: {result.parse_error!r}"
            )
        return result.score
@@ -0,0 +1,21 @@
1
+ You are an expert software engineer writing a Python benchmark script.
2
+
3
+ Target to optimize: {{ target }}
4
+ Metric: {{ metric }}
5
+ Repository context: {{ repo_context }}
6
+
7
+ Write a Python script that:
8
+ 1. Imports and runs the target code
9
+ 2. Measures the metric numerically (time it, count it, score it)
10
+ 3. Prints ONLY a single float on the last line of stdout (e.g. `print(f"{value:.4f}")`)
11
+ 4. Is deterministic enough to compare across runs (warm up if needed, average 3 runs)
12
+ 5. Has no hardcoded paths — uses relative imports or Path(__file__).parent
13
+ 6. Completes in under 30 seconds
14
+
15
+ Respond in this exact format:
16
+ <benchmark_script>
17
+ [complete Python script, no markdown fences]
18
+ </benchmark_script>
19
+ <notes>
20
+ [any setup steps needed before running, e.g. "requires sample_data/ directory"]
21
+ </notes>
@@ -0,0 +1,16 @@
1
+ You are an expert software engineer identifying regression tests for a code optimization task.
2
+
3
+ Target to optimize: {{ target }}
4
+ Repository test files found: {{ test_files }}
5
+
6
+ Identify the pytest command(s) that must pass after every experiment.
7
+ Prefer: tests directly covering the optimized module.
8
+ If no specific tests exist, use the full test suite.
9
+
10
+ Respond in this exact format:
11
+ <gate_cmds>
12
+ ["pytest tests/test_parser.py", "pytest tests/test_integration.py"]
13
+ </gate_cmds>
14
+ <rationale>
15
+ [why these tests are the right gate]
16
+ </rationale>
@@ -0,0 +1,33 @@
1
+ You are a subagent in a parallel code optimization loop.
2
+
3
+ ## Your task
4
+ {{ optimization_target }}
5
+ Metric: {{ metric }} ({{ direction }})
6
+ Current best score: {{ best_score }}
7
+ Baseline score: {{ baseline_score }}
8
+
9
+ ## What has been tried (do NOT repeat these)
10
+ {% for exp in failed_experiments %}
11
+ - Hypothesis: {{ exp.hypothesis }}
12
+ Result: {{ exp.result }} ({{ exp.reason }})
13
+ {% endfor %}
14
+
15
+ ## Shared knowledge from all agents
16
+ {{ shared_notes }}
17
+
18
+ ## Your hypothesis
19
+ Based on the above, form ONE new hypothesis that has NOT been tried.
20
+ Be specific: name the function, file, and change you will make.
21
+
22
+ ## Instructions
23
+ 1. Read the relevant source files
24
+ 2. Implement your hypothesis
25
+ 3. Run the benchmark: `{{ benchmark_cmd }}`
26
+ The last line of stdout is your score; whether lower or higher is better is given by the metric direction above.
27
+ 4. Run the gate: `{{ gate_cmd }}`
28
+ 5. Write your results to `.gradex/result.json`:
29
+ {"score": <float>, "hypothesis": "<one sentence>", "change_summary": "<what you changed>"}
30
+ 6. Write gate result to `.gradex/gate.json`:
31
+ {"passed": <bool>, "failures": [<strings>]}
32
+
33
+ Do not stop until both files are written.