specthis 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- specthis/__init__.py +3 -0
- specthis/audit.py +29 -0
- specthis/cache.py +26 -0
- specthis/cli.py +132 -0
- specthis/export.py +30 -0
- specthis/install.py +69 -0
- specthis/lock.py +29 -0
- specthis/refresh.py +28 -0
- specthis/serve.py +22 -0
- specthis/templates/__init__.py +0 -0
- specthis/templates/agents/__init__.py +0 -0
- specthis/templates/agents/experiment-runner.md +87 -0
- specthis/templates/agents/spec-auditor.md +126 -0
- specthis/templates/agents/spec-implementer.md +103 -0
- specthis/templates/specs/AGENTS.md +308 -0
- specthis/templates/specs/README.md +128 -0
- specthis/templates/specs/__init__.py +0 -0
- specthis-0.0.1.dist-info/METADATA +148 -0
- specthis-0.0.1.dist-info/RECORD +22 -0
- specthis-0.0.1.dist-info/WHEEL +4 -0
- specthis-0.0.1.dist-info/entry_points.txt +2 -0
- specthis-0.0.1.dist-info/licenses/LICENSE +21 -0
specthis/__init__.py
ADDED
specthis/audit.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
"""Spec consistency audit (operation 1 from specs/AGENTS.md).
|
|
2
|
+
|
|
3
|
+
Status: **stub**. The reference implementation is index-based: it
|
|
4
|
+
reads ``specs/_index.json`` and ``specs/_routing.json`` (produced by
|
|
5
|
+
:mod:`specthis.export`) and reports each entry's script existence,
|
|
6
|
+
contract-in-spirit, output schema, export routing, and freshness in a
|
|
7
|
+
single markdown table.
|
|
8
|
+
|
|
9
|
+
Port plan:
|
|
10
|
+
- ``walk_index(specs_dir) -> list[EntryReport]``
|
|
11
|
+
- ``check_compute_entry(entry, index) -> EntryReport``
|
|
12
|
+
- ``check_report_entry(entry, index, routing) -> EntryReport``
|
|
13
|
+
- ``format_table(reports) -> str``
|
|
14
|
+
|
|
15
|
+
Until ported, invoke the bundled ``spec-auditor`` subagent in Claude
|
|
16
|
+
Code instead — it implements the same checks by reading the index
|
|
17
|
+
files directly.
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
from __future__ import annotations
|
|
21
|
+
|
|
22
|
+
from pathlib import Path
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def run_audit(specs_dir: Path) -> str:  # pragma: no cover - stub
    """Placeholder for the spec consistency audit.

    ``specs_dir`` is accepted but not read yet: every call raises
    ``NotImplementedError`` with a message pointing the caller at the
    spec-auditor subagent.

    Raises:
        NotImplementedError: always.
    """
    hint = (
        "specthis.audit is not yet ported. Use the spec-auditor subagent "
        "(installed by `specthis install`) in the meantime."
    )
    raise NotImplementedError(hint)
|
specthis/cache.py
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
"""S3-backed compute cache for spec entries.
|
|
2
|
+
|
|
3
|
+
Status: **stub**. The reference implementation provides four
|
|
4
|
+
operations keyed by an entry's ``inputs_certified`` hash:
|
|
5
|
+
|
|
6
|
+
- ``push <entry>``: tar the entry's ``results/<entry>/`` directory and
|
|
7
|
+
upload to ``s3://<bucket>/cache/<input_sig>/<entry>.tar.gz``.
|
|
8
|
+
- ``fetch <entry>``: download and unpack into ``results/<entry>/``.
|
|
9
|
+
- ``has <entry>``: HEAD-check S3 for the artefact.
|
|
10
|
+
- ``list``: list cached entries with their input signatures.
|
|
11
|
+
|
|
12
|
+
Requires ``specthis[s3]`` extra (boto3) and AWS credentials available
|
|
13
|
+
in the standard chain (env, profile, instance role).
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
from pathlib import Path
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def push(entry: str, bucket: str, specs_dir: Path) -> None:  # pragma: no cover - stub
    """Placeholder for the cache ``push`` operation.

    None of the arguments are used yet; the function only signals that the
    cache module has not been ported.

    Raises:
        NotImplementedError: always.
    """
    message = "specthis.cache is not yet ported."
    raise NotImplementedError(message)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def fetch(entry: str, bucket: str, specs_dir: Path) -> None:  # pragma: no cover - stub
    """Placeholder for the cache ``fetch`` operation.

    None of the arguments are used yet; the function only signals that the
    cache module has not been ported.

    Raises:
        NotImplementedError: always.
    """
    message = "specthis.cache is not yet ported."
    raise NotImplementedError(message)
|
specthis/cli.py
ADDED
|
@@ -0,0 +1,132 @@
|
|
|
1
|
+
"""specthis command-line entry point."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import sys
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
import click
|
|
9
|
+
|
|
10
|
+
from . import __version__
|
|
11
|
+
from .install import install_agents, init_specs_dir
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
# Root command group. ``-h`` is accepted alongside ``--help`` and
# ``--version`` prints the package's ``__version__`` under the program name
# "specthis". Sub-commands attach themselves with ``@main.command(...)``.
@click.group(context_settings={"help_option_names": ["-h", "--help"]})
@click.version_option(__version__, prog_name="specthis")
def main() -> None:
    """Spec-driven research workflow: dashboard, agents, refresh pipeline."""
    # Intentionally empty: the docstring above doubles as the group's help
    # text, so it is user-facing and should be edited with that in mind.
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@main.command("install")
@click.option(
    "--path",
    "project_path",
    type=click.Path(file_okay=False, path_type=Path),
    # Hand Click the callable rather than its result: ``Path.cwd()`` would be
    # evaluated once at import time, so a later chdir in the same process
    # (e.g. inside a test runner) would be ignored. Click invokes a callable
    # default only when the option is actually omitted.
    default=Path.cwd,
    show_default="current directory",
    help="Project root in which to install .claude/agents/.",
)
@click.option(
    "--force",
    is_flag=True,
    help="Overwrite existing agent files.",
)
@click.option(
    "--agent",
    "selected",
    multiple=True,
    type=click.Choice(["spec-auditor", "spec-implementer", "experiment-runner"]),
    help="Install only the named agent(s). Repeatable. Default: all three.",
)
def install_cmd(project_path: Path, force: bool, selected: tuple[str, ...]) -> None:
    """Copy the specthis subagent templates into <project>/.claude/agents/."""
    # NOTE: the docstring above is the command's --help text.
    # An empty ``--agent`` selection is forwarded as None, which
    # install_agents treats as "install every bundled agent".
    installed, skipped = install_agents(
        project_path=project_path,
        force=force,
        agents=list(selected) if selected else None,
    )
    # Successes go to stdout, skips to stderr, so the two streams can be
    # separated by the caller.
    for name in installed:
        click.echo(f" installed {name}")
    for name, reason in skipped:
        click.echo(f" skipped {name} ({reason})", err=True)
    # Exit status 1 only when nothing at all was written but something was
    # skipped; a partial install still exits 0.
    if not installed and skipped:
        click.echo("\nNothing changed. Re-run with --force to overwrite.", err=True)
        sys.exit(1)
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
@main.command("init")
@click.option(
    "--path",
    "project_path",
    type=click.Path(file_okay=False, path_type=Path),
    # Hand Click the callable rather than its result: ``Path.cwd()`` would be
    # evaluated once at import time, so a later chdir in the same process
    # (e.g. inside a test runner) would be ignored. Click invokes a callable
    # default only when the option is actually omitted.
    default=Path.cwd,
    show_default="current directory",
    help="Project root in which to create specs/.",
)
@click.option(
    "--force",
    is_flag=True,
    help="Overwrite existing template files in specs/.",
)
def init_cmd(project_path: Path, force: bool) -> None:
    """Create specs/ with README.md and AGENTS.md spec-format templates."""
    # NOTE: the docstring above is the command's --help text.
    created, skipped = init_specs_dir(project_path=project_path, force=force)
    # Created paths go to stdout, skipped ones (with the reason) to stderr.
    for path in created:
        click.echo(f" created {path}")
    for path, reason in skipped:
        click.echo(f" skipped {path} ({reason})", err=True)
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
@main.command("audit")
@click.option(
    "--specs",
    "specs_dir",
    type=click.Path(file_okay=False, exists=True, path_type=Path),
    default=Path("specs"),
    show_default=True,
    help="specs/ directory to audit.",
)
def audit_cmd(specs_dir: Path) -> None:
    """Run the consistency audit over specs/. (stub — port pending)"""
    # Stub: report on stderr what would have been audited, then exit with
    # status 2 so callers can tell "not implemented" from a real run.
    notice = (
        f"specthis audit: not yet implemented. Would audit {specs_dir}.\n"
        "Until then, invoke the spec-auditor subagent in Claude Code."
    )
    click.echo(notice, err=True)
    sys.exit(2)
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
@main.command("refresh")
@click.option(
    "--specs",
    "specs_dir",
    type=click.Path(file_okay=False, exists=True, path_type=Path),
    default=Path("specs"),
    show_default=True,
)
def refresh_cmd(specs_dir: Path) -> None:
    """Re-run stale entries respecting the lock file. (stub — port pending)"""
    # Stub: ``specs_dir`` is validated by Click but not used yet. The notice
    # goes to stderr and the process exits with status 2.
    notice = "specthis refresh: not yet implemented."
    click.echo(notice, err=True)
    sys.exit(2)
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
@main.command("serve")
@click.option("--host", default="127.0.0.1", show_default=True)
@click.option("--port", type=int, default=8765, show_default=True)
def serve_cmd(host: str, port: int) -> None:
    """Serve the specs.html dashboard with live reload. (stub — port pending)"""
    # Stub: ``host`` and ``port`` are parsed but not used yet. The notice
    # goes to stderr and the process exits with status 2.
    notice = "specthis serve: not yet implemented."
    click.echo(notice, err=True)
    sys.exit(2)
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
@main.command("lock")
@click.argument("subcommand", type=click.Choice(["status", "record", "clear"]))
@click.argument("entry", required=False)
def lock_cmd(subcommand: str, entry: str | None) -> None:
    """Manage the spec inputs_certified content-hash lock. (stub — port pending)"""
    # Stub: only ``subcommand`` is echoed back; ``entry`` is accepted but
    # unused. The notice goes to stderr and the process exits with status 2.
    notice = f"specthis lock {subcommand}: not yet implemented."
    click.echo(notice, err=True)
    sys.exit(2)
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
if __name__ == "__main__":
|
|
132
|
+
main()
|
specthis/export.py
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
"""Dashboard renderer: builds specs/specs.html + _index.json + _routing.json.
|
|
2
|
+
|
|
3
|
+
Status: **stub**. The reference implementation walks ``specs/*.md``,
|
|
4
|
+
parses each frontmatter + entry block, joins against the working tree
|
|
5
|
+
(script existence, output existence + top-level keys, export
|
|
6
|
+
artefacts, host-doc routing), and emits three artefacts:
|
|
7
|
+
|
|
8
|
+
- ``specs/specs.html`` — a single-file browsable dashboard of every
|
|
9
|
+
spec, entry, and pairing.
|
|
10
|
+
- ``specs/_index.json`` — per-spec frontmatter + per-entry facts,
|
|
11
|
+
consumed by :mod:`specthis.audit` and the auditor subagent.
|
|
12
|
+
- ``specs/_routing.json`` — per host-doc, per label section, the
|
|
13
|
+
``\\input{}`` / ``\\includegraphics{}`` lines found inside, plus
|
|
14
|
+
``\\sectionversion`` proximity flags.
|
|
15
|
+
|
|
16
|
+
Port plan:
|
|
17
|
+
- ``parse_spec(path) -> SpecFile``
|
|
18
|
+
- ``join_against_worktree(specs, project_root) -> IndexData``
|
|
19
|
+
- ``walk_host_docs(reports_dir) -> RoutingData``
|
|
20
|
+
- ``render_html(index, routing) -> str``
|
|
21
|
+
- ``write_artefacts(specs_dir, index, routing, html) -> None``
|
|
22
|
+
"""
|
|
23
|
+
|
|
24
|
+
from __future__ import annotations
|
|
25
|
+
|
|
26
|
+
from pathlib import Path
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def render(specs_dir: Path, project_root: Path) -> None:  # pragma: no cover - stub
    """Placeholder for the dashboard/index export.

    Neither argument is used yet; the function only signals that the export
    module has not been ported.

    Raises:
        NotImplementedError: always.
    """
    message = "specthis.export is not yet ported."
    raise NotImplementedError(message)
|
specthis/install.py
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
"""Scaffolder: copy bundled templates into a project directory."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from importlib import resources
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
AGENT_NAMES = ("spec-auditor", "spec-implementer", "experiment-runner")
|
|
9
|
+
SPEC_TEMPLATE_NAMES = ("README.md", "AGENTS.md")
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def _read_template(subdir: str, filename: str) -> str:
|
|
13
|
+
"""Read a bundled template file from the installed package."""
|
|
14
|
+
package = f"specthis.templates.{subdir}"
|
|
15
|
+
return resources.files(package).joinpath(filename).read_text(encoding="utf-8")
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def install_agents(
    project_path: Path,
    force: bool = False,
    agents: list[str] | None = None,
) -> tuple[list[str], list[tuple[str, str]]]:
    """Copy agent templates into ``<project_path>/.claude/agents/``.

    Args:
        project_path: Project root; the ``.claude/agents`` directory is
            created beneath it if it does not exist.
        force: When true, overwrite agent files that already exist.
        agents: Agent names to install. ``None`` or an empty list selects
            every name in ``AGENT_NAMES``. Repeated names are processed once.

    Returns:
        ``(installed, skipped)`` where ``installed`` is a list of agent
        names written and ``skipped`` is a list of ``(name, reason)``.
    """
    # dict.fromkeys drops repeats while keeping first-seen order, so a name
    # passed twice is no longer reported as both installed and skipped (or
    # written twice under ``force``).
    selected = list(dict.fromkeys(agents or AGENT_NAMES))
    target_dir = project_path / ".claude" / "agents"
    target_dir.mkdir(parents=True, exist_ok=True)

    installed: list[str] = []
    skipped: list[tuple[str, str]] = []
    for name in selected:
        if name not in AGENT_NAMES:
            skipped.append((name, "unknown agent"))
            continue
        target = target_dir / f"{name}.md"
        if target.exists() and not force:
            skipped.append((name, "already exists; use --force"))
            continue
        body = _read_template("agents", f"{name}.md")
        target.write_text(body, encoding="utf-8")
        installed.append(name)
    return installed, skipped
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def init_specs_dir(
    project_path: Path,
    force: bool = False,
) -> tuple[list[Path], list[tuple[Path, str]]]:
    """Populate ``<project_path>/specs/`` with the bundled spec templates.

    The directory is created when missing. Each name in
    ``SPEC_TEMPLATE_NAMES`` is written unless the file already exists and
    ``force`` is false.

    Returns ``(created, skipped)``: the paths written, and
    ``(path, reason)`` pairs for the files left untouched.
    """
    specs_root = project_path / "specs"
    specs_root.mkdir(parents=True, exist_ok=True)

    written: list[Path] = []
    passed_over: list[tuple[Path, str]] = []
    for filename in SPEC_TEMPLATE_NAMES:
        destination = specs_root / filename
        if not destination.exists() or force:
            destination.write_text(_read_template("specs", filename), encoding="utf-8")
            written.append(destination)
        else:
            passed_over.append((destination, "already exists; use --force"))
    return written, passed_over
|
specthis/lock.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
"""Content-hash lock for spec entries.
|
|
2
|
+
|
|
3
|
+
Status: **stub**. The reference implementation maintains
|
|
4
|
+
``specs/_lock.json`` keyed by entry name, recording:
|
|
5
|
+
|
|
6
|
+
- ``inputs_certified``: SHA-256 of (spec body + script body + workflow
|
|
7
|
+
files) at the moment the entry was certified ``script ready``.
|
|
8
|
+
- ``depends_on``: the entry's resolved dependency list at certification
|
|
9
|
+
time.
|
|
10
|
+
- ``author``, ``ts``: who certified, when.
|
|
11
|
+
|
|
12
|
+
The refresh orchestrator (:mod:`specthis.refresh`) compares the live
|
|
13
|
+
hash against ``inputs_certified`` before re-running an entry; a
|
|
14
|
+
mismatch surfaces as ``spec audit needed`` and blocks the rerun.
|
|
15
|
+
|
|
16
|
+
Port plan:
|
|
17
|
+
- ``compute_inputs_hash(entry) -> str``
|
|
18
|
+
- ``record_inputs(entry, author) -> None``
|
|
19
|
+
- ``status(entry) -> Literal["certified", "drifted", "uncertified"]``
|
|
20
|
+
- ``clear(entry) -> None``
|
|
21
|
+
"""
|
|
22
|
+
|
|
23
|
+
from __future__ import annotations
|
|
24
|
+
|
|
25
|
+
from pathlib import Path
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def record_inputs(entry: str, author: str, specs_dir: Path) -> None:  # pragma: no cover - stub
    """Placeholder for recording an entry's certified inputs in the lock.

    None of the arguments are used yet; the function only signals that the
    lock module has not been ported.

    Raises:
        NotImplementedError: always.
    """
    message = "specthis.lock is not yet ported."
    raise NotImplementedError(message)
|
specthis/refresh.py
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
"""Refresh orchestrator: re-runs stale entries respecting the lock.
|
|
2
|
+
|
|
3
|
+
Status: **stub**. The reference implementation:
|
|
4
|
+
|
|
5
|
+
1. Reads ``specs/_index.json`` (built by :mod:`specthis.export`) and
|
|
6
|
+
``specs/_lock.json`` (managed by :mod:`specthis.lock`).
|
|
7
|
+
2. For each entry, classifies state as one of: ``fresh``, ``stale``,
|
|
8
|
+
``unbound`` (uncertified), or ``spec audit needed`` (live inputs
|
|
9
|
+
hash diverged from ``inputs_certified``).
|
|
10
|
+
3. For ``stale`` entries, optionally fetches from S3 cache
|
|
11
|
+
(:mod:`specthis.cache`) before falling back to a local rerun via
|
|
12
|
+
the project's Makefile or a per-entry ``run:`` command.
|
|
13
|
+
4. After a successful rerun, optionally pushes the new artefacts to
|
|
14
|
+
the S3 cache.
|
|
15
|
+
|
|
16
|
+
Port plan:
|
|
17
|
+
- ``classify(entry, index, lock) -> EntryState``
|
|
18
|
+
- ``plan(entries) -> RefreshPlan``
|
|
19
|
+
- ``execute(plan, dry_run) -> RefreshReport``
|
|
20
|
+
"""
|
|
21
|
+
|
|
22
|
+
from __future__ import annotations
|
|
23
|
+
|
|
24
|
+
from pathlib import Path
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def run(specs_dir: Path, dry_run: bool = False) -> None:  # pragma: no cover - stub
    """Placeholder for the refresh orchestrator entry point.

    Neither argument is used yet; the function only signals that the
    refresh module has not been ported.

    Raises:
        NotImplementedError: always.
    """
    message = "specthis.refresh is not yet ported."
    raise NotImplementedError(message)
|
specthis/serve.py
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
"""Local dev server for specs.html with file-watch reload.
|
|
2
|
+
|
|
3
|
+
Status: **stub**. The reference implementation serves
|
|
4
|
+
``specs/specs.html`` over HTTP, watches ``specs/*.md`` for changes,
|
|
5
|
+
and reruns the export pipeline (:func:`specthis.export.render`) on
|
|
6
|
+
each change before triggering a browser reload via SSE.
|
|
7
|
+
|
|
8
|
+
Port plan:
|
|
9
|
+
- ``serve(host, port, specs_dir, project_root) -> None``
|
|
10
|
+
- ``_watch(paths) -> Iterator[Event]``
|
|
11
|
+
- ``_sse_handler(...) -> AsgiApp``
|
|
12
|
+
|
|
13
|
+
Requires ``specthis[serve]`` extra (uvicorn + starlette).
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
from pathlib import Path
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def serve(host: str, port: int, specs_dir: Path, project_root: Path) -> None:  # pragma: no cover - stub
    """Placeholder for the local dashboard dev server.

    None of the arguments are used yet; the function only signals that the
    serve module has not been ported.

    Raises:
        NotImplementedError: always.
    """
    message = "specthis.serve is not yet ported."
    raise NotImplementedError(message)
|
|
File without changes
|
|
File without changes
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: experiment-runner
|
|
3
|
+
description: Kicks off a long-running experiment (a `make <target>` or a script under experiments/) in the background and monitors its log for milestones, NaN/error lines, and completion. Use whenever the user says "run experiment X", "kick off the fit", "rerun <target>", or "monitor the running job". Frees the main thread from training-step log noise — reports only milestone/completion/error lines. Does NOT edit code, does NOT touch specs, does NOT analyse the result JSON beyond confirming it landed.
|
|
4
|
+
tools: Read, Glob, Grep, Bash, ToolSearch
|
|
5
|
+
color: orange
|
|
6
|
+
---
|
|
7
|
+
|
|
8
|
+
You are the experiment-runner. You launch one experiment (a Makefile
|
|
9
|
+
target or a direct script invocation), monitor its log without
|
|
10
|
+
flooding the parent's context, and report milestones / errors /
|
|
11
|
+
completion.
|
|
12
|
+
|
|
13
|
+
## Inputs you need
|
|
14
|
+
|
|
15
|
+
- The experiment to run, given as either a `make <target>` name OR a
|
|
16
|
+
script path with any args.
|
|
17
|
+
- A log path the parent wants you to use (default:
|
|
18
|
+
`/tmp/<experiment-name>.log`).
|
|
19
|
+
|
|
20
|
+
## Procedure
|
|
21
|
+
|
|
22
|
+
1. **Inspect first**: Read the Makefile target (or the script) to
|
|
23
|
+
confirm the expected output path and any milestone-log markers
|
|
24
|
+
(search for `print(`, `logger.info`, `tqdm`). If output already
|
|
25
|
+
exists and is newer than the script, ask the parent whether to skip
|
|
26
|
+
or force-rerun before launching anything.
|
|
27
|
+
2. **Launch in the background.** The project's `CLAUDE.md` or
|
|
28
|
+
`README.md` should document any required env vars (e.g.
|
|
29
|
+
`LD_LIBRARY_PATH`, `PYTHONUNBUFFERED`). Wrap the launch with
|
|
30
|
+
`> /tmp/<name>.log 2>&1 &` and pass `run_in_background: true` on
|
|
31
|
+
the Bash call so you get the PID back and the call returns
|
|
32
|
+
immediately. NEVER tail in the foreground.
|
|
33
|
+
3. **Monitor for milestones, not raw training steps.** Load the
|
|
34
|
+
`Monitor` tool via `ToolSearch` (it is a deferred tool) and use it
|
|
35
|
+
with a regex that catches:
|
|
36
|
+
- error / traceback lines
|
|
37
|
+
(`Error|Traceback|RuntimeError|AssertionError|OOM|CUDA out of memory`)
|
|
38
|
+
- NaN / Inf indicators (`\b(nan|inf|NaN|Inf)\b` — the word boundaries keep `inf` from matching ordinary words such as `info` or `inference`)
|
|
39
|
+
- milestone markers (`epoch \d|iter \d+|step \d+|cell \d+|saved`)
|
|
40
|
+
- completion markers (the script's final-line pattern, e.g. `done`,
|
|
41
|
+
`Wrote `, `Saved JSON to `)
|
|
42
|
+
|
|
43
|
+
If `Monitor` cannot be loaded, fall back to periodic
|
|
44
|
+
`grep -E "<regex>" /tmp/<name>.log | tail -n 5` polls with the
|
|
45
|
+
`Bash` tool — but spaced out (≥ 270s between polls so the prompt
|
|
46
|
+
cache stays warm; see harness guidance).
|
|
47
|
+
4. **Report only the interesting lines** back to the parent. Do not
|
|
48
|
+
paste large slabs of training output. A heartbeat every ~10 minutes
|
|
49
|
+
("still running, last milestone: epoch 42, loss=...") is enough.
|
|
50
|
+
5. **On NaN or error**: report immediately, including the last ~20
|
|
51
|
+
lines of context. Do NOT auto-kill the job unless the parent told
|
|
52
|
+
you to.
|
|
53
|
+
6. **On completion**: confirm the expected output JSON exists at the
|
|
54
|
+
path the spec / Makefile declared. Report the path, the file's
|
|
55
|
+
mtime, and the file size. Do NOT open the JSON to inspect numbers
|
|
56
|
+
— that is the parent's job.
|
|
57
|
+
|
|
58
|
+
## Hard rules
|
|
59
|
+
|
|
60
|
+
- Do NOT edit any source file. You have no Edit/Write tools.
|
|
61
|
+
- Do NOT touch specs or `reports/`.
|
|
62
|
+
- Do NOT analyse the result JSON's contents — just confirm it landed.
|
|
63
|
+
- Do NOT poll faster than ~270 s between log checks. Faster polling
|
|
64
|
+
wastes the parent's prompt cache without making the experiment
|
|
65
|
+
finish sooner.
|
|
66
|
+
- Do NOT kill the job without being asked.
|
|
67
|
+
- Respect any hardware limits documented in the project's `CLAUDE.md`
|
|
68
|
+
/ `README.md`. If you hit one (e.g. `CUDA out of memory`), report it
|
|
69
|
+
and let the parent decide.
|
|
70
|
+
|
|
71
|
+
## Report-back shape
|
|
72
|
+
|
|
73
|
+
While running:
|
|
74
|
+
```
|
|
75
|
+
[experiment-runner] <name> PID=<pid> last milestone: <line> elapsed: <hh:mm:ss>
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
On completion:
|
|
79
|
+
```
|
|
80
|
+
[experiment-runner] <name> DONE output: <path> size: <bytes> elapsed: <hh:mm:ss>
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
On failure:
|
|
84
|
+
```
|
|
85
|
+
[experiment-runner] <name> FAILED reason: <one line> log tail:
|
|
86
|
+
<last ~20 lines>
|
|
87
|
+
```
|
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: spec-auditor
|
|
3
|
+
description: Read-only consistency audit of specs/ against the working tree. Use whenever the user says "audit the specs", "check the specs", "are the specs in sync", or after editing a spec file to verify scripts/outputs/exports/routing still line up. Returns a single markdown table per specs/AGENTS.md operation 1. Never runs scripts, never edits anything.
|
|
4
|
+
tools: Read, Glob, Grep
|
|
5
|
+
color: blue
|
|
6
|
+
---
|
|
7
|
+
|
|
8
|
+
You are the spec-auditor. Your one job is operation 1 ("Audit") from
|
|
9
|
+
`specs/AGENTS.md`. You are strictly read-only.
|
|
10
|
+
|
|
11
|
+
## Index-first workflow
|
|
12
|
+
|
|
13
|
+
**Always start by reading `specs/_index.json` and `specs/_routing.json`**
|
|
14
|
+
— these are precomputed by the specthis dashboard renderer and contain,
|
|
15
|
+
in queryable form, everything the audit needs from the spec files
|
|
16
|
+
themselves:
|
|
17
|
+
|
|
18
|
+
- `_index.json`: per spec file, frontmatter (`kind`, `depends_on`,
|
|
19
|
+
`host_doc`, `section_label`, `mtime`) and per entry: `name`, `kind`,
|
|
20
|
+
`status`, `export_status`, `script` + `script_exists`, `output` +
|
|
21
|
+
`output_exists`, `export_outputs` + `export_outputs_exist`,
|
|
22
|
+
`output_top_level_keys`, `workflows`.
|
|
23
|
+
- `_routing.json`: per host doc, per `\label{...}` found in that doc:
|
|
24
|
+
`label_line`, `section_line`, `inputs` (all `\input{}` files in that
|
|
25
|
+
section), `includegraphics`, `sectionversion_present_within_10_lines`.
|
|
26
|
+
|
|
27
|
+
Treat the index as authoritative for the cheap mechanical checks
|
|
28
|
+
(script existence, output existence, status, depends_on listing,
|
|
29
|
+
routing presence, sectionversion proximity, top-level JSON keys). Only
|
|
30
|
+
fall back to `Read` on the underlying spec/script/host-doc files when:
|
|
31
|
+
|
|
32
|
+
1. You need the contract docs themselves (`specs/README.md`,
|
|
33
|
+
`specs/AGENTS.md`) — read once per session to know what to check.
|
|
34
|
+
2. A flagged inconsistency where the index says something is wrong and
|
|
35
|
+
you need the spec/script body to characterise the failure (e.g. "the
|
|
36
|
+
JSON exists but a required key is missing" — go open the JSON to see
|
|
37
|
+
what's actually there).
|
|
38
|
+
3. The spec body needs to confirm an unindexed claim — e.g. verifying
|
|
39
|
+
`depends_on` targets actually appear in the prose, or the script
|
|
40
|
+
body matches the spec's "contract in spirit" (only for entries the
|
|
41
|
+
index flagged as suspicious).
|
|
42
|
+
4. The index is missing or out of date (`mtime` of `_index.json` older
|
|
43
|
+
than the youngest `specs/*.md`) — in that case, fall back to
|
|
44
|
+
globbing `specs/*.md` and reading each one. Note this in your output
|
|
45
|
+
as a "stale index" warning so the user can rerun
|
|
46
|
+
`specthis serve --index-only` (or the project's equivalent
|
|
47
|
+
index-export command).
|
|
48
|
+
|
|
49
|
+
The whole point of the index is to replace ~80% of the per-audit Reads
|
|
50
|
+
with two cheap JSON lookups. A well-run audit should average ≤10 Read
|
|
51
|
+
calls, not 60+.
|
|
52
|
+
|
|
53
|
+
## Procedure
|
|
54
|
+
|
|
55
|
+
Follow `specs/AGENTS.md` §1 verbatim. The condensed checklist (each
|
|
56
|
+
step now leans on the index — only Read when noted):
|
|
57
|
+
|
|
58
|
+
1. Read `specs/README.md` and `specs/AGENTS.md` first (the audit
|
|
59
|
+
contract).
|
|
60
|
+
2. Read `specs/_index.json` and `specs/_routing.json`. (If either is
|
|
61
|
+
missing or older than the youngest `specs/*.md`, flag stale-index
|
|
62
|
+
and fall back to spec walking.)
|
|
63
|
+
3. For each entry in `_index.json[spec][entries]` that declares
|
|
64
|
+
`Script:` / `Output:`:
|
|
65
|
+
- `script_exists` → ✓/✗ directly from the index.
|
|
66
|
+
- Contract-in-spirit: only Read the script body when the index
|
|
67
|
+
flags something off (e.g. `script ready` but
|
|
68
|
+
`script_exists=false`, or output exists but schema keys look
|
|
69
|
+
wrong).
|
|
70
|
+
- `output_exists` and `output_top_level_keys` from the index → check
|
|
71
|
+
against the schema declared in the entry contract without opening
|
|
72
|
+
the JSON.
|
|
73
|
+
- If the entry has `export_outputs`, `export_outputs_exist` is a
|
|
74
|
+
parallel list — ✓/✗ per artefact directly from the index.
|
|
75
|
+
4. For each entry that declares `host_doc` + `section_label`, look up
|
|
76
|
+
`_routing.json[host_doc][sections][section_label]`:
|
|
77
|
+
- Label presence: section is present iff the label key exists.
|
|
78
|
+
- For each artefact in the entry's `export_outputs`, check if it
|
|
79
|
+
appears in the section's `inputs` or `includegraphics` (basename
|
|
80
|
+
match acceptable for `inputs`, full-path or basename for
|
|
81
|
+
`includegraphics`). Mismatch on either side → **orphaned export**
|
|
82
|
+
(entry exports it but no `\input`) or **stale routing** (host doc
|
|
83
|
+
inputs something the entry doesn't export).
|
|
84
|
+
- `sectionversion_present_within_10_lines` from the routing index gives the
|
|
85
|
+
`\sectionversion` proximity check directly.
|
|
86
|
+
5. Frontmatter check: every spec in `_index.json` has `kind` ∈
|
|
87
|
+
{meta, definitions, templates, compute, report, figure}, and every
|
|
88
|
+
`depends_on` entry is a known spec filename (lookup against
|
|
89
|
+
`_index.json` keys). Verifying that `depends_on` entries appear in
|
|
90
|
+
the body is a Read only if an entry looks suspicious.
|
|
91
|
+
6. Document conventions: the project's report convention (declared in
|
|
92
|
+
`specs/README.md` / `specs/AGENTS.md`) — version file presence,
|
|
93
|
+
per-section `\sectionversion` proximity (already in the routing
|
|
94
|
+
index), and any other top-level `.tex` requirements. Use Grep, not
|
|
95
|
+
full Reads.
|
|
96
|
+
|
|
97
|
+
## Output format
|
|
98
|
+
|
|
99
|
+
Return exactly one markdown table, one row per entry:
|
|
100
|
+
|
|
101
|
+
```
|
|
102
|
+
| entry | spec status | script ✓ | contract ✓ | output ✓ | output schema ✓ | export status | export script ✓ | export output ✓ | report routing ✓ | notes |
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
Use `✓`, `✗`, or `n/a`. Keep notes short ("required key missing",
|
|
106
|
+
"schema mismatch on `<key>`", "spec lies: status says `script ready`
|
|
107
|
+
but script does not exist", "export script writes outside `reports/`").
|
|
108
|
+
|
|
109
|
+
After the table, append at most a 5-line **summary** section listing:
|
|
110
|
+
- count of entries with `script TBD`
|
|
111
|
+
- count of contract mismatches
|
|
112
|
+
- count of orphaned exports / stale routings
|
|
113
|
+
- any frontmatter or document-convention violations
|
|
114
|
+
|
|
115
|
+
Do not propose actions in the audit output unless the user explicitly
|
|
116
|
+
asked for "audit + next steps" — in that case append a short
|
|
117
|
+
**proposed next steps** block (operation 2 from AGENTS.md). Default is
|
|
118
|
+
audit-only.
|
|
119
|
+
|
|
120
|
+
## Hard rules
|
|
121
|
+
|
|
122
|
+
- Do NOT run scripts (no project run commands, no `make`, no Python).
|
|
123
|
+
- Do NOT edit any file. You have no Edit/Write tools.
|
|
124
|
+
- Do NOT open large result JSONs in full — key existence is enough.
|
|
125
|
+
- Do NOT compile any document under `reports/`.
|
|
126
|
+
- Do NOT transcribe result numbers into the report.
|