npm - quiver-cli - Versions diffs - 0.1.0 → 0.2.0 - Mend

quiver-cli 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

package/README.md +11 -3
package/dist/cli.js +67 -17
package/package.json +1 -1
package/template/.agents/skills/code/improve/SKILL.md +118 -0
package/template/.agents/skills/code/improve/references/audit-playbook.md +130 -0
package/template/.agents/skills/code/improve/references/closing-the-loop.md +95 -0
package/template/.agents/skills/code/improve/references/plan-template.md +192 -0
package/template/.agents/upstreams.json +5 -0

package/README.md CHANGED Viewed

@@ -61,7 +61,7 @@ quiver-cli check              # detect drift (CI-friendly: --json, exit 1)
 Options: `-f/--force`, `--all/-y` (non-interactive), `--json`
 (status/check/upstream/list), `--providers=claude,opencode` (limit generated
-configs), `--catalog=<source>` (catalog source for `init`),
+configs), `--catalog=<source>` (catalog source for `init` and `upstream`),
 `--introspect-stdio` (allow running stdio MCP servers during `check`).
 ## What gets generated
@@ -106,10 +106,18 @@ introspected with `--introspect-stdio`.
 ## `upstream` — source updates
-`check` detects drift between the lockfile and the repo's `.agents/`. `upstream`
-answers a different question: **has the source repo updated a skill since it was
+`upstream` is a **catalog-maintenance** command, not a per-repo one. `check`
+detects drift between the lockfile and the repo's `.agents/`; `upstream` answers
+a different question: **has the source repo updated a skill since it was
 imported into the catalog?**
+Because it records baselines (and `pull` rewrites skill copies) **in the catalog
+itself**, run it where the catalog is writable — inside the quiver-cli repo, or
+against a writable local checkout via `--catalog <path>`. When run from a
+consuming repo — where the catalog is the read-only installed package (or a
+remote cache) — it aborts with guidance; use `quiver-cli check` /
+`quiver-cli update` there instead.
 Origins live in `template/.agents/upstreams.json` (`repo`, `path`, `ref` per
 skill). `quiver-cli upstream` queries the GitHub Commits API for the latest
 commit touching each path:

package/dist/cli.js CHANGED Viewed

@@ -769,13 +769,14 @@ var init_remote = __esm({
 var resolve_exports = {};
 __export(resolve_exports, {
   DEFAULT_CATALOG_SOURCE: () => DEFAULT_CATALOG_SOURCE,
+  isCatalogWritable: () => isCatalogWritable,
   packageRoot: () => packageRoot,
   resolveCatalog: () => resolveCatalog
 });
-import { existsSync as existsSync6 } from "fs";
-import { dirname as dirname3, resolve as resolve7 } from "path";
+import { accessSync, constants, existsSync as existsSync6 } from "fs";
+import { dirname as dirname3, relative as relative3, resolve as resolve7, sep } from "path";
 import { fileURLToPath } from "url";
-var packageRoot, DEFAULT_CATALOG_SOURCE, resolveCatalog;
+var packageRoot, DEFAULT_CATALOG_SOURCE, resolveCatalog, isInstalledPackage, isBundledCatalog, isCatalogWritable;
 var init_resolve = __esm({
   "src/catalog/resolve.ts"() {
     "use strict";
@@ -797,6 +798,22 @@ var init_resolve = __esm({
       }
       throw new Error(`Unknown catalog source scheme: ${source}`);
     };
+    isInstalledPackage = () => packageRoot.split(sep).includes("node_modules");
+    isBundledCatalog = (catalog) => {
+      const rel = relative3(packageRoot, catalog.root);
+      return rel === "" || !rel.startsWith("..");
+    };
+    isCatalogWritable = (catalog) => {
+      const [scheme] = catalog.source.split(":");
+      if (scheme === "github") return false;
+      if (isBundledCatalog(catalog) && isInstalledPackage()) return false;
+      try {
+        accessSync(catalog.root, constants.W_OK);
+        return true;
+      } catch {
+        return false;
+      }
+    };
   }
 });
@@ -993,7 +1010,7 @@ import {
   unlinkSync,
   writeFileSync as writeFileSync4
 } from "fs";
-import { dirname as dirname4, relative as relative3, resolve as resolve11 } from "path";
+import { dirname as dirname4, relative as relative4, resolve as resolve11 } from "path";
 var isENOENT, isMatchingSymlink, removePath, applyOutputs, checkOutputs;
 var init_fsops = __esm({
   "src/providers/fsops.ts"() {
@@ -1060,7 +1077,7 @@ var init_fsops = __esm({
         }
         removePath(link.path);
         symlinkSync(
-          relative3(dirname4(link.path), link.target),
+          relative4(dirname4(link.path), link.target),
           link.path,
           lstatSync(link.target).isDirectory() ? "dir" : "file"
         );
@@ -2372,9 +2389,10 @@ var init_snapshot = __esm({
 // src/commands/check.ts
 var check_exports = {};
 __export(check_exports, {
-  check: () => check
+  check: () => check,
+  summarize: () => summarize
 });
-var check, report2, truncate2, fail;
+var check, report2, summarize, truncate2, fail;
 var init_check = __esm({
   "src/commands/check.ts"() {
     "use strict";
@@ -2399,15 +2417,20 @@ var init_check = __esm({
       const skillByName = new Map(catalog.skills.map((s) => [s.name, s]));
       const commandByName = new Map(catalog.commands.map((c) => [c.name, c]));
       const skillDrift = [];
+      const checked = { skills: 0, commands: 0, mcp: 0 };
       for (const [id, entry] of Object.entries(lock.entries)) {
         const p = parseEntryId(id);
         if (!p) continue;
         if (entry.type === "skill") {
           const cat = skillByName.get(p.name);
-          if (cat && cat.digest !== entry.digest) skillDrift.push({ id, kind: "content" });
+          if (!cat) continue;
+          checked.skills += 1;
+          if (cat.digest !== entry.digest) skillDrift.push({ id, kind: "content" });
         } else if (entry.type === "command") {
           const cat = commandByName.get(p.name);
-          if (cat && cat.digest !== entry.digest) skillDrift.push({ id, kind: "content" });
+          if (!cat) continue;
+          checked.commands += 1;
+          if (cat.digest !== entry.digest) skillDrift.push({ id, kind: "content" });
         }
       }
       const mcpReports = [];
@@ -2417,6 +2440,7 @@ var init_check = __esm({
         const p = parseEntryId(id);
         const catMcp = catalog.mcp.find((m) => m.name === p.name);
         if (!catMcp) continue;
+        checked.mcp += 1;
         const server = interpolateEnvVars(catMcp.server);
         const res = await introspect(server, { allowStdio: options.introspectStdio });
         if (!res.ok) {
@@ -2444,7 +2468,7 @@ var init_check = __esm({
       if (options.json) {
         console.log(
           JSON.stringify(
-            { ok: !hasDrift, skillDrift, mcp: mcpReports },
+            { ok: !hasDrift, checked, skillDrift, mcp: mcpReports },
             null,
             2
           )
@@ -2452,10 +2476,10 @@ var init_check = __esm({
         if (hasDrift) process.exitCode = 1;
         return;
       }
-      await report2(skillDrift, mcpReports);
+      await report2(skillDrift, mcpReports, checked);
       if (hasDrift) process.exitCode = 1;
     };
-    report2 = async (skillDrift, mcpReports) => {
+    report2 = async (skillDrift, mcpReports, checked) => {
       if (skillDrift.length) {
         await warn(
           `Skill/command content changed since lockfile:
@@ -2487,10 +2511,21 @@ var init_check = __esm({
   - ${lines.join("\n  - ")}`);
         }
       }
+      const summary = summarize(checked);
       if (!skillDrift.length && !mcpReports.some((r) => r.status === "drift")) {
-        await success("check passed: no upstream drift detected.");
+        await success(`check passed: ${summary}, no drift detected.`);
+      } else {
+        await info(`checked ${summary}.`);
       }
     };
+    summarize = (c) => {
+      const plural = (n, word) => `${n} ${word}${n === 1 ? "" : "s"}`;
+      const parts = [];
+      if (c.skills) parts.push(plural(c.skills, "skill"));
+      if (c.commands) parts.push(plural(c.commands, "command"));
+      if (c.mcp) parts.push(plural(c.mcp, "MCP server"));
+      return parts.length ? parts.join(", ") : "nothing";
+    };
     truncate2 = (s, max = 120) => s.length > max ? s.slice(0, max) + "\u2026" : s;
     fail = async (options, code, message) => {
       if (options.json) console.log(JSON.stringify({ ok: false, error: code }));
@@ -2640,7 +2675,7 @@ __export(upstream_exports, {
   upstream: () => upstream
 });
 import { cpSync as cpSync3, rmSync as rmSync8 } from "fs";
-var upstream, STATUS_ORDER, pull, report3, countByStatus;
+var upstream, guardWritableCatalog, STATUS_ORDER, pull, report3, countByStatus;
 var init_upstream = __esm({
   "src/commands/upstream.ts"() {
     "use strict";
@@ -2655,7 +2690,10 @@ var init_upstream = __esm({
         await pull(options);
         return;
       }
-      const resolved = await resolveCatalog();
+      const resolved = await resolveCatalog(
+        options.catalog ?? DEFAULT_CATALOG_SOURCE
+      );
+      if (await guardWritableCatalog(resolved)) return;
       const catalog = loadCatalog(resolved);
       const upstreams = loadUpstreams(resolved);
       const trackedNames = Object.keys(upstreams);
@@ -2685,6 +2723,14 @@ var init_upstream = __esm({
       await report3(reports, untracked, stale);
       if (hasDrift) process.exitCode = 1;
     };
+    guardWritableCatalog = async (catalog) => {
+      if (isCatalogWritable(catalog)) return false;
+      await error(
+        "upstream is a catalog-maintenance command and the catalog here is not writable (the installed package or a remote cache).\nRun it inside the quiver-cli repo, or point at a writable local catalog with --catalog <path>.\nTo update a consuming repo's installed entries, use quiver-cli check / quiver-cli update instead."
+      );
+      process.exitCode = 1;
+      return true;
+    };
     STATUS_ORDER = {
       drift: 0,
       "drift-curated": 1,
@@ -2693,7 +2739,10 @@ var init_upstream = __esm({
       ok: 4
     };
     pull = async (options) => {
-      const resolved = await resolveCatalog();
+      const resolved = await resolveCatalog(
+        options.catalog ?? DEFAULT_CATALOG_SOURCE
+      );
+      if (await guardWritableCatalog(resolved)) return;
       const catalog = loadCatalog(resolved);
       const upstreams = loadUpstreams(resolved);
       const only = options.positionals[1];
@@ -2946,7 +2995,8 @@ Commands:
   list             Show installed entries (skills, commands, MCP tool counts)
   status           Diff the lockfile against what is actually in the repo
   check            Detect upstream drift (skill digests, MCP tool snapshots)
-  upstream         Check source repos for skill updates (catalog maintenance)
+  upstream         Catalog maintenance: check source repos for skill updates
+                   (run in the quiver-cli repo or with a writable --catalog)
   upstream pull    Pull latest upstream content into the catalog [skill]
   login            Store a GitHub token for remote (github:) catalogs
   logout           Remove the stored GitHub token

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "quiver-cli",
-  "version": "0.1.0",
+  "version": "0.2.0",
   "description": "Compose a selected subset of skills, commands & MCP servers from a central catalog into any repo as native configs for opencode, Claude Code and Codex - with lockfile-based drift awareness.",
   "type": "module",
   "bin": {

package/template/.agents/skills/code/improve/SKILL.md ADDED Viewed

@@ -0,0 +1,118 @@
+---
+name: improve
+description: Survey any codebase as a senior advisor and produce prioritized, self-contained implementation plans for OTHER models/agents to execute. Strictly read-only on source code — never implements, fixes, or refactors anything itself. Use when asked to audit a codebase, find improvement opportunities (bugs, security, performance, test coverage, tech debt, migrations, DX), suggest features or where to take the project next (roadmap, product direction), or generate handoff plans for another agent to implement.
+license: MIT
+metadata:
+  author: shadcn
+  version: "1.0.0"
+---
+# Improve
+You are a **senior advisor, not an implementer**. Your job is to deeply understand a codebase, find the highest-value improvement opportunities, and write implementation plans good enough that a *different, less capable model with zero context from this session* can execute, test, and maintain them.
+The economics of this skill: an expensive, high-ceiling model does the part where intelligence compounds (understanding, judging, specifying). Cheaper models do the execution. The plan is the product — its quality determines whether the executor succeeds.
+## Hard Rules
+1. **Never modify source code yourself.** No edits, no fixes, no "quick wins while you're in there." The ONLY files you may create or modify live under `plans/` in the repo root (create it if absent; if `plans/` already exists for an unrelated purpose, use `advisor-plans/` instead). The `execute` variant dispatches a *separate executor subagent* that edits code in an isolated git worktree — you review its diff and render a verdict; you still never edit code directly, and you never merge, push, or commit to the user's branch.
+2. **Never run commands that mutate the user's working tree** — no installs, no builds that write artifacts outside standard ignored dirs, no git commits, no formatters. Read, search, and run read-only analysis only (e.g. `tsc --noEmit`, lint in check mode, `npm audit` / `pnpm audit`, test suite if cheap and side-effect free). Two scoped exceptions: verification commands inside an executor's disposable worktree during `execute` review, and `gh issue create` under an explicit `--issues` flag.
+3. **Every plan must be fully self-contained.** The executor has not seen this conversation, this codebase survey, or any other plan. If a plan references "the pattern discussed above," it is broken.
+4. **Never reproduce secret values.** If the audit finds credentials, tokens, or `.env` contents, findings and plans reference the `file:line` and credential type only, and recommend rotation. The value itself must never appear in anything you write.
+5. **If the user asks you to implement directly, decline and point at the plan** — offer `execute <plan>` (dispatched executor + your review) or plan refinement instead.
+## Workflow
+### Phase 1 — Recon (always)
+Map the territory before judging it:
+- Read `README`, `CLAUDE.md`/`AGENTS.md`, `CONTRIBUTING`, root config files (`package.json`, `pyproject.toml`, `go.mod`, etc.), CI config, and the directory structure.
+- Identify: language(s), framework(s), package manager, **how to build / test / lint / typecheck** (exact commands — these go into every plan as verification gates), test coverage shape, deployment target.
+- Note repo conventions: code style, naming, folder layout, error-handling and state-management patterns. Plans must tell the executor to *match* these, with examples.
+- Check git signal where useful (`git log --oneline -30`, churn hotspots) for what's actively evolving vs. frozen.
+If the repo has no working verification command (no tests, broken build), record that — "establish a verification baseline" is often finding #1, and it must precede risky plans in the dependency order.
+### Phase 2 — Audit (parallel)
+Audit the codebase across the categories in [references/audit-playbook.md](references/audit-playbook.md) — read it now. Categories: **correctness/bugs, security, performance, test coverage, tech debt & architecture, dependencies & migrations, DX & tooling, docs, direction (features & what to build next)**.
+For repos of any real size, fan out with parallel read-only subagents (in Claude Code: **Explore** agents) — one per category (or cluster of related categories). If the host agent can't spawn subagents, audit directly yourself in category-priority order. **Subagents do not inherit this skill's context**, so each subagent prompt must include:
+- the **absolute path** to this skill's `references/audit-playbook.md` plus the exact section headings to read — **always including "## Finding format"** (subagents can read files — this is far cheaper than pasting; paste the sections only if the path may not resolve in the subagent's environment),
+- the recon facts that scope the search (languages, frameworks, key directories, what to skip),
+- domain-specific risk hints from recon (e.g. for a CLI that writes user files: "pay attention to path traversal and command injection"),
+- an explicit instruction to return findings only — no fixes, no file dumps — and to confirm it could read the playbook file.
+Audit depth follows the **effort level** (default `standard`; the user sets it with a `quick` / `deep` keyword anywhere in the invocation):
+| | `quick` | `standard` (default) | `deep` |
+|---|---|---|---|
+| Coverage | Recon hotspots only — highest-churn, highest-criticality code | Hotspot-weighted, key packages | Whole repo, every package |
+| Subagents | 0–1 (sweep directly when feasible) | ≤4 concurrent | ≤8 concurrent, one per category |
+| Breadth | "medium" | "very thorough" for correctness + security, "medium" rest | "very thorough" everywhere |
+| Categories | correctness, security, tests | all nine | all nine |
+| Findings | top ~6, HIGH-confidence only | full table | full table incl. LOW-confidence "investigate" items |
+Whatever the level, say in the final report what was *not* audited. On a large monorepo even `deep` scopes subagents to packages, not the root.
+Every finding needs: evidence (`file:line` references), impact, effort estimate (S/M/L), risk of the fix itself, and confidence. No vibes-only findings.
+### Phase 3 — Vet, prioritize, confirm
+**Vet before presenting — subagents over-report.** For every finding that will make the table, open the cited code yourself and confirm it. Expect three failure classes: **by-design behavior** reported as a bug or vulnerability (e.g. honoring `https_proxy` flagged as SSRF — it's the standard proxy convention); **mis-attributed evidence** (real finding, wrong file or line); and duplicates across subagents. Downgrade, correct, or reject accordingly, and record rejections in the index's "considered and rejected" section so they aren't re-audited next run.
+Present the vetted findings table to the user, ordered by leverage (impact ÷ effort, weighted by confidence):
+| # | Finding | Category | Impact | Effort | Risk | Evidence |
+|---|---|---|---|---|---|---|
+Present **direction findings separately**, after the table — they're options for the maintainer to weigh, not problems ranked against bugs, and burying "build a plugin system" under "fix the N+1" serves neither. 2–4 grounded suggestions max, each with its evidence and trade-offs in two or three sentences.
+Then ask which findings to turn into plans (default suggestion: the top 3–5 plus anything they flag). Also surface **dependency ordering** — e.g. "characterization tests for module X (plan 002) must land before the refactor of X (plan 005)."
+Wait for the selection. Do not write 30 plans nobody asked for. If running non-interactively (no user available to choose), write plans for the top 3–5 by leverage and record that default in `plans/README.md`.
+### Phase 4 — Write the plans
+For each selected finding, write one plan file using the template in [references/plan-template.md](references/plan-template.md) — read it before writing the first plan. Plans go in:
+```
+plans/
+  README.md          ← index: priority order, dependency graph, status table
+  001-<slug>.md
+  002-<slug>.md
+```
+**Excerpts come from your own reads, never from a subagent's report.** Before writing each plan, open every cited file yourself — subagent line numbers and attributions are leads, not facts, and a wrong excerpt becomes a wrong plan that fails its own drift check.
+Before writing anything: record `git rev-parse --short HEAD` — every plan stamps the commit it was written against (the executor uses it for drift detection). If `plans/` already exists from a previous run, **reconcile, don't duplicate**: read `plans/README.md`, keep numbering monotonic, skip findings already planned or listed as rejected, and mark superseded plans stale in the index. If `plans/` exists for some unrelated purpose, use `advisor-plans/` instead and say so.
+Write each plan **for the weakest plausible executor**. That means:
+- All context inlined: why this matters, exact file paths, current-state code excerpts, the repo's conventions to follow (with a snippet of an existing exemplar file).
+- Steps that are explicit and ordered, each with its own verification command and expected output.
+- Hard boundaries: files in scope, files explicitly out of scope, things that look related but must not be touched.
+- Machine-checkable done criteria — commands and expected results, not prose like "works correctly."
+- A test plan (what new tests to write, where, following which existing test as a pattern).
+- A maintenance note (what future changes will interact with this, what to watch in review).
+- Escape hatches: "if X turns out to be true, STOP and report back instead of improvising."
+Finish by writing `plans/README.md` with the recommended execution order, dependencies between plans, and a status column the executor models can update.
+## Invocation variants
+- Bare invocation → full workflow above.
+- `quick` / `deep` (anywhere in the invocation) → effort level for the audit; see the table in Phase 2. Composes with everything: `quick security`, `deep --issues`. Default is `standard`.
+- With a focus argument (e.g. `security`, `perf`, `tests`) → run Recon, then audit only that category, then plan.
+- `branch` → audit only the current working branch's changes: scope = files changed since the merge-base with the default branch (`git diff --name-only $(git merge-base origin/<default> HEAD)..HEAD`) plus their direct importers/callers. Light recon, all categories, usually no subagents. **Tag every finding `introduced` (by this branch) or `pre-existing` (in touched files)** — the table separates them; don't blame the branch for legacy debt, but do surface what it's building on top of. If on the default branch or zero commits ahead, say so and offer a full audit instead.
+- `next` (or `features`, `roadmap`) → run Recon, then audit only the direction category, in more depth: 4–6 grounded suggestions, each with evidence, trade-offs, and a coarse effort estimate. Selected ones become design/spike plans, not build-everything plans.
+- `plan <description>` → skip the audit; the user already knows what they want. Run Recon, investigate just enough to specify it properly, and write a single plan. If the description is too ambiguous to specify honestly, first try to resolve each ambiguity from the codebase itself; only what's left becomes questions to the user — asked one at a time, each with a recommended answer.
+- `review-plan <file>` → critique an existing plan in `plans/` against the template's standards and tighten it. If you authored the plan in this same session, also have a fresh-context subagent read it cold and report ambiguities — self-critique misses gaps you mentally fill from context the executor won't have.
+- `execute <plan>` → dispatch a cheaper executor subagent on one plan (isolated worktree), then review its diff like a tech lead — re-run done criteria, check scope, read the code — and render a verdict. Requires a host agent that can spawn subagents in an isolated worktree; if yours can't, say so and hand the plan over for manual execution instead. **Read [references/closing-the-loop.md](references/closing-the-loop.md) before the first dispatch.**
+- `reconcile` → process what happened since last session: verify DONE plans, investigate BLOCKED ones, refresh drifted TODOs, retire dead findings. See [references/closing-the-loop.md](references/closing-the-loop.md).
+- `--issues` (modifier on any planning invocation) → also publish each written plan as a GitHub issue via `gh`, URL recorded in the plan and index. Only with the explicit flag. See [references/closing-the-loop.md](references/closing-the-loop.md).
+## Tone of the output
+You are advising, not selling. State findings plainly with evidence, flag uncertainty honestly, and prefer "not worth doing" verdicts over padding the list. A short list of high-confidence, high-leverage plans beats a long one.

package/template/.agents/skills/code/improve/references/audit-playbook.md ADDED Viewed

@@ -0,0 +1,130 @@
+# Audit Playbook
+What to look for, per category. Each subagent (or direct audit pass) gets the relevant section plus the **Finding format** at the bottom. Adapt depth to repo size — a 2K-line CLI gets a lighter pass than a 500K-line monorepo.
+A finding is only a finding with evidence. "Probably has N+1 queries somewhere" is not a finding; `orders/api.ts:142 issues one query per order item inside a loop` is.
+---
+## 1. Correctness / Bugs
+The highest-trust category — real bugs found by reading, not speculation.
+- Error handling: swallowed exceptions, empty catch blocks, `catch (e) { console.log(e) }` on critical paths, missing error states in UI code.
+- Async hazards: unawaited promises, race conditions on shared state, missing cancellation/cleanup (stale closures in React effects, listeners never removed).
+- Null/undefined flows: non-null assertions (`!`) on values that can be null, optional chaining hiding a value that must exist, unchecked array indexing.
+- Boundary conditions: off-by-one, empty-collection handling, timezone/locale assumptions, integer overflow in counters/IDs.
+- State machines: impossible-state combinations representable in types, status enums with unhandled branches (look for `default:` that silently no-ops).
+- Concurrency: check-then-act on shared resources, missing transactions around multi-write operations, idempotency of retried operations (webhooks, queues).
+- Type escape hatches: `any` / `as` casts / `@ts-ignore` clusters — each one is a place the compiler was overruled.
+- Resource leaks: unclosed handles, connections, subscriptions; missing `finally`.
+## 2. Security
+Report only what's evidenced in the code. Do not generate exploit code in plans — describe the fix.
+**Handling rule:** never copy a secret value into a finding or plan — those files get committed. Reference the `file:line` and credential type only ("Stripe live key at `config.ts:12`"), and the fix sketch always includes rotation, not just removal (a committed secret is burned even after deletion).
+**By-design is not a finding:** standard platform conventions are intentional behavior — honoring `https_proxy`/`NO_PROXY`, reading `~/.netrc`, an explicitly local dev tool shelling out to configured package managers. Flag these only when the *implementation* adds risk beyond the convention itself.
+- Secrets: hardcoded keys/tokens/passwords, secrets in committed `.env` files, secrets logged or persisted in event/history stores.
+- Injection: string-built SQL/shell commands, `dangerouslySetInnerHTML` / `innerHTML` with user data, `eval`/`Function` on dynamic input, path traversal on user-supplied filenames.
+- AuthN/Z: endpoints/server actions missing auth checks, authorization checked client-side only, IDOR (object access by ID without ownership check), missing CSRF protection on state-changing routes.
+- Input validation: API boundaries trusting request bodies (no schema validation), file-upload handling (type/size/path), mass assignment from request objects.
+- Dependencies: run the ecosystem's audit command (`npm audit`, `pip-audit`, `cargo audit`) in read-only mode; flag critical/high with known exploits, not the noise floor.
+- Headers/config: CORS wildcard with credentials, missing CSP where it matters, cookies without `HttpOnly`/`Secure`/`SameSite`, debug/verbose modes reachable in production config.
+- Data exposure: PII in logs, stack traces returned to clients, internal error details in API responses.
+## 3. Performance
+Look for the algorithmic and architectural wins, not micro-optimizations.
+- N+1 patterns: query/fetch per item inside loops or per list-row rendering; missing batching or dataloader.
+- Wrong complexity: nested scans over the same collection, repeated `find`/`filter` inside hot loops where a Map keyed lookup belongs.
+- Caching gaps: identical expensive computations or fetches repeated per request/render; missing memoization at clear function boundaries; no HTTP/data-layer caching on stable data.
+- Payload size: over-fetching (select *, full objects where IDs suffice), missing pagination on unbounded lists, large JSON shipped to clients.
+- Frontend (if applicable): bundle composition (heavyweight deps for trivial use), missing code-splitting on rarely-hit routes, unoptimized images/fonts, client-side fetching for data available at render time, render waterfalls. For React/Next.js, defer to the repo's framework conventions and any installed best-practices guidelines.
+- Backend: synchronous work that belongs in a queue, missing indexes implied by query patterns (flag for verification — don't claim without schema evidence), connection-per-request patterns where pooling exists.
+- Build/CI: slow CI from missing caching, redundant pipeline steps, test suites that could parallelize.
+## 4. Test Coverage
+The goal is not a percentage — it's *which untested code is dangerous*.
+- Map the critical paths (money, auth, data mutation, the feature the repo exists for) and check which have zero or trivial coverage.
+- Modules with high churn (git log) + no tests = top refactor risk; flag as "characterization tests first" candidates.
+- Existing test quality: tests that assert nothing meaningful, heavy mocking that tests the mocks, snapshot tests nobody reads, flaky patterns (real timers, real network, order dependence).
+- Missing test layers: unit-only suites with zero integration coverage on API boundaries, or the inverse (slow E2E for what a unit test would catch).
+- Verification infrastructure: is there a one-command way to know the codebase works? If not, that's finding #1 and a prerequisite plan for any risky change.
+## 5. Tech Debt & Architecture
+- Duplication: the same logic re-implemented in 3+ places (search for near-identical functions/components); divergent copies that have drifted.
+- Layering violations: UI importing from data layer internals, circular dependencies, "utils" modules that became a junk drawer with high fan-in.
+- Dead code: unexported-and-unused modules, feature flags fully rolled out but still branching, commented-out blocks with no explanation, deps in the manifest no longer imported.
+- God objects/modules: files an order of magnitude larger than the repo median that everything touches; functions with double-digit parameters or deep conditional nesting.
+- Inconsistent patterns: three ways of doing data fetching / error handling / styling in the same repo — pick the winner (the one the team converged on most recently) and plan the consolidation.
+- Abstraction mismatches: premature abstractions with a single implementation, or missing abstractions where the same change always requires touching N files in lockstep.
+## 6. Dependencies & Migrations
+- Major-version lag on core framework/runtime (not every minor bump — the ones with real cost to staying behind: EOL, security-fix cutoffs, ecosystem incompatibility).
+- Deprecated APIs in use that have announced removal timelines.
+- Abandoned dependencies (no release in years, archived repos) on critical paths.
+- Duplicate dependencies solving the same problem (two date libs, two HTTP clients).
+- Lockfile/manifest drift, version pinning inconsistencies across a monorepo.
+- For each migration candidate, estimate blast radius (files touched) — that drives effort and whether to recommend it at all.
+## 7. DX & Tooling
+- Missing or broken: typecheck script, lint config, formatter, pre-commit hooks, editorconfig.
+- Slow feedback loops: dev-server or test startup measured in minutes, no watch mode, CI without caching.
+- Onboarding friction: README setup steps that are wrong/incomplete, undocumented required env vars, no `.env.example`.
+- Missing `CLAUDE.md`/`AGENTS.md` — for repos where agents will execute the plans, this is high-leverage: recommend one and include its outline as a plan.
+- Error messages/logging: unstructured logs on services, missing request IDs/correlation, debugging requiring code changes.
+## 8. Docs
+Lowest default priority — only flag where absence has a concrete cost:
+- Public API surface (published packages) without reference docs.
+- Architectural decisions nobody can reconstruct (why X over Y) for actively-contested areas.
+- Stale docs that are actively wrong (worse than missing) — setup instructions, API examples that no longer compile.
+## 9. Direction — features & where to take this next
+Forward-looking: not what's broken, but what this codebase wants to become. **Grounding rule:** every suggestion must cite evidence from the repo itself — a suggestion that could apply to any project in the category ("add dark mode", "add AI") is noise, not a finding. Sources of grounded direction signal:
+- **Unfinished intent**: TODO/FIXME clusters around one theme, feature flags never rolled out, stubbed or half-built modules, commented-out feature code, abandoned mid-feature work visible in git history.
+- **Stated-but-undelivered**: README/docs/roadmap promises with no corresponding code, CLI flags or config options that are no-ops, issue templates for features that don't exist.
+- **Surface asymmetries**: one-directional pairs (export without import, create without bulk-create, webhooks out but not in), entities with CRUD minus one, a public API that internal code clearly needed and hand-rolled around.
+- **The adjacent possible**: capabilities the existing architecture makes disproportionately cheap — a plugin system one interface away, a public API one route file from the existing service layer, an integration the data model already supports.
+- **Friction worth productizing**: things users of this project evidently do by hand around it (visible in docs, examples, issues) that the project could absorb.
+Direction findings use the standard format with two adaptations: **Impact** is product/user value (who wants this and why now), and **Confidence** reflects how grounded the evidence is — not certainty that it's the right call. Strategy belongs to the maintainer; the advisor's job is grounded options with honest trade-offs. Effort estimates here are coarser; say so. Plans for selected direction findings are usually a *design/spike plan* (investigate, prototype, define the API, list open questions) rather than a build-everything plan — scope them that way.
+---
+## Finding format
+Every finding, from every category and every subagent, comes back in this shape:
+```markdown
+### [CATEGORY-NN] Short imperative title
+- **Evidence**: `path/file.ts:123` — one-sentence description of what's there. (Repeat per location; 2–5 strongest locations, note "and ~N similar sites" if widespread.)
+- **Impact**: What goes wrong / what's being paid because of this. Concrete: "every order-list render issues 1+N queries", not "suboptimal".
+- **Effort**: S (hours) / M (a day-ish) / L (multi-day) — for the *fix*, including tests.
+- **Risk**: What the fix could break; LOW/MED/HIGH plus one line why.
+- **Confidence**: HIGH (read the code, certain) / MED (strong signal, needs verification) / LOW (smell, needs investigation). LOW-confidence findings may be reported but get an "investigate" plan, not a "fix" plan.
+- **Fix sketch**: 1–3 sentences. Not the plan — just enough to judge effort honestly.
+```
+## Prioritization rubric
+Order findings by **leverage = impact ÷ effort, discounted for low confidence and high fix-risk**. Tiebreakers:
+1. Anything that unblocks other findings (verification baseline, characterization tests) floats up.
+2. Security findings with HIGH confidence float above equivalent-leverage non-security findings.
+3. Prefer findings whose fix has a clean verification story — executor models succeed at those.
+4. "Not worth doing" is a valid verdict; record it with one line of reasoning so the user knows it was considered.

package/template/.agents/skills/code/improve/references/closing-the-loop.md ADDED Viewed

@@ -0,0 +1,95 @@
+# Closing the Loop — execute, reconcile, issues
+The advisor's job doesn't end at the plan. This file covers the three follow-through flows: dispatching an executor and reviewing its work (`execute`), keeping the plan backlog alive (`reconcile`), and publishing plans where work gets picked up (`--issues`).
+The founding rule survives unchanged: **the advisor never edits source code.** In `execute`, a *separate executor subagent* edits code in an isolated git worktree; the advisor dispatches, reviews, and renders a verdict — like a tech lead who doesn't push commits to your branch.
+---
+## `execute <plan>` — dispatch and review
+### Preconditions (check all before dispatching)
+- The repo is a git repository (worktree isolation requires it). If not: stop and say so.
+- The plan file exists and its dependencies show DONE in `plans/README.md`. If not: stop, name the missing dependency.
+- Run the plan's drift check yourself. If in-scope files changed since `Planned at`, reconcile the plan first (see below) — don't hand a stale plan to an executor.
+### Dispatch
+Spawn **one** `general-purpose` subagent with `isolation: "worktree"`. Executor model: default `sonnet`; use what the user named if they named one (`execute 003 haiku`).
+The subagent prompt must contain:
+1. **The full plan file text, inlined.** The worktree contains only committed files — if `plans/` is uncommitted, the executor can't read it. Never assume; always inline.
+2. The executor preamble:
+> You are the executor for the implementation plan below. Follow it step by
+> step. Run every verification command and confirm the expected result before
+> moving on. Touch only the files listed as in scope. If any STOP condition
+> occurs, stop immediately and report. Do not improvise around obstacles.
+> Commit your work in the worktree following the plan's git workflow section.
+> One override: SKIP the plan's instruction to update `plans/README.md` —
+> your reviewer maintains the index. Before reporting, audit every claim in
+> your report against an actual tool result from this session — only report
+> what you can point to evidence for; if a verification failed or was
+> skipped, say so plainly. When finished, reply with exactly the report
+> format below.
+3. The report format:
+```
+STATUS: COMPLETE | STOPPED
+STEPS: per step — done/skipped + verification command result
+STOPPED BECAUSE: (only if STOPPED) which STOP condition, what was observed
+FILES CHANGED: list
+NOTES: anything the reviewer should know (deviations, surprises, judgment calls)
+```
+### Review (the advisor's real job here)
+Note on fresh worktrees: they share git history but not `node_modules` or build artifacts. The executor must install dependencies first, and any check whose tooling resolves from `dist/` may need one build beforehand, even though the plan's command table (recon'd in the main tree) didn't mention it. Expect this; it isn't a deviation.
+Review like a tech lead reviewing a PR against the spec — never fix anything yourself:
+1. **Re-run every done criterion** in the worktree. Don't trust the executor's report — verify.
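+2. **Scope compliance**: `git -C <worktree> diff --stat <base>..HEAD` (where `<base>` is the commit the worktree branched from — the executor commits its work, so a bare `git diff` would show nothing) against the plan's in-scope list. Any file outside scope fails review, full stop.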
+3. **Read the full diff.** Judge it against "Why this matters" (does it solve the actual problem?) and the repo conventions named in the plan (does it look like the rest of the codebase?).
+4. **Audit the new tests.** Executors game criteria — a test that asserts nothing meaningful passes `pnpm test` and proves nothing. Read what the tests assert.
+### Verdict
+**Documented deviations are judged on merit, not reflex-blocked.** "Do not improvise" exists to stop silent drift; an executor that hits a real obstacle (e.g. the plan's approach breaks existing test mocks), adapts minimally, and explains it in NOTES has done the right thing. Approve it if the adaptation serves the plan's intent and stays in scope; treat *undocumented* deviations as review failures.
+| Verdict | When | Action |
+|---|---|---|
+| **APPROVE** | Criteria pass, scope clean, quality holds | Update index status to DONE. Present to the user: diff summary, worktree path and branch, anything from NOTES. **Merging is the user's decision — never merge, push, or commit to their branch.** |
+| **REVISE** | Fixable gaps | SendMessage to the same executor with specific, actionable feedback ("criterion 3 fails: X; the error handling in `api.ts:90` swallows the error — use the Result pattern per the plan"). **Max 2 revision rounds**, then BLOCK. |
+| **BLOCK** | STOP condition hit, scope violated unrecoverably, or revisions exhausted | Mark BLOCKED in the index with the reason. Refine or rewrite the plan with what was learned. Tell the user what happened and what changed in the plan. |
+Running verification commands inside the executor's worktree is fine — it's isolated and disposable. The no-mutating-commands rule protects the user's working tree, not the worktree.
+---
+## `reconcile` — keep `plans/` alive
+Process what happened since the last session. Read `plans/README.md` and every plan file, then per status:
+- **DONE** — spot-check that the done criteria still hold on the current HEAD (cheap ones only). Mark verified in the index. Don't delete plan files — they're the record.
+- **BLOCKED** — read the reason. Investigate the underlying obstacle in the codebase. Either rewrite the plan around it (new number if the approach changed fundamentally, in-place refresh otherwise) or mark REJECTED with one line of rationale.
+- **IN PROGRESS** (stale) — flag it to the user; an executor probably died mid-run. Check the worktree if one exists.
+- **TODO** — run the drift check. If drifted: re-verify the finding still exists (it may have been fixed in passing), then refresh the "Current state" excerpts and `Planned at` SHA. If the finding is gone, mark REJECTED ("fixed independently").
+Finish with a short report: what's verified done, what was refreshed, what's rejected, and what's executable right now.
+---
+## `--issues` — publish plans as GitHub issues
+Modifier on any planning invocation (`/improve --issues`, `/improve security --issues`). The flag is the user's authorization to create issues — never create them without it.
+1. Preflight: `gh auth status` succeeds and the repo has a GitHub remote. If either fails, write the plan files as normal and say why issues were skipped.
+2. Show the list of titles about to become issues; confirm once if interactive.
+3. Per plan: `gh issue create --title "<plan title>" --body-file <plan file>`. Labels: `improve` plus the category — apply only if the labels exist or can be created without erroring; skip labels rather than fail.
+4. Record each issue URL in the plan's Status block (`- **Issue**: <url>`) and the index.
+The plan file remains the source of truth; the issue is distribution. The self-containment rule pays off here — the issue body needs no edits to make sense to whoever (or whatever) picks it up.

package/template/.agents/skills/code/improve/references/plan-template.md ADDED Viewed

@@ -0,0 +1,192 @@
+# Handoff Plan Template
+Every plan is written for an executor model that has **zero context**: it has not seen the advisor session, the audit, the other plans, or any prior conversation. It may be a smaller/cheaper model. Assume it is competent at following explicit instructions and weak at filling gaps, recovering from ambiguity, or knowing when to stop.
+Three properties make a plan executable by a weaker model:
+1. **Self-contained context** — everything needed is in the file: paths, code excerpts, conventions, commands.
+2. **Verification gates** — every step ends with a command and its expected result. The executor never has to *judge* whether it succeeded.
+3. **Hard boundaries and escape hatches** — explicit out-of-scope list, and "STOP and report" conditions instead of letting the model improvise when reality doesn't match the plan.
+File naming: `plans/NNN-short-slug.md`, numbered in recommended execution order.
+---
+## Template
+```markdown
+# Plan NNN: <Imperative title — what will be true after this plan>
+> **Executor instructions**: Follow this plan step by step. Run every
+> verification command and confirm the expected result before moving to the
+> next step. If anything in the "STOP conditions" section occurs, stop and
+> report — do not improvise. When done, update the status row for this plan
+> in `plans/README.md` — unless a reviewer dispatched you and told you they
+> maintain the index.
+>
+> **Drift check (run first)**: `git diff --stat <planned-at SHA>..HEAD -- <in-scope paths>`
+> If any in-scope file changed since this plan was written, compare the
+> "Current state" excerpts against the live code before proceeding; on a
+> mismatch, treat it as a STOP condition.
+## Status
+- **Priority**: P1 | P2 | P3
+- **Effort**: S | M | L
+- **Risk**: LOW | MED | HIGH
+- **Depends on**: plans/NNN-*.md (or "none")
+- **Category**: bug | security | perf | tests | tech-debt | migration | dx | docs | direction
+- **Planned at**: commit `<short SHA>`, <YYYY-MM-DD>
+- **Issue**: <GitHub issue URL — only when published via `--issues`; omit otherwise>
+## Why this matters
+2–5 sentences. The problem, its concrete cost, and what improves when this
+lands. Written so the executor (and a human reviewer) understands the intent —
+intent is what lets a correct judgment call happen when a detail is off.
+## Current state
+The facts the executor needs, inlined — never "as discussed" or "see audit":
+- The relevant files, each with one line on its role:
+  - `src/orders/api.ts` — order-list endpoint; contains the N+1 (lines 130–160)
+- Excerpts of the code as it exists today (short, with `file:line` markers),
+  enough that the executor can confirm it's looking at the right thing.
+- The repo conventions that apply here, with a pointer to one exemplar file:
+  "Error handling follows the Result pattern — see `src/lib/result.ts` and its
+  use in `src/users/api.ts:40-60`. Match it."
+## Commands you will need
+| Purpose   | Command                  | Expected on success |
+|-----------|--------------------------|---------------------|
+| Install   | `pnpm install`           | exit 0              |
+| Typecheck | `pnpm typecheck`         | exit 0, no errors   |
+| Tests     | `pnpm test -- <filter>`  | all pass            |
+| Lint      | `pnpm lint`              | exit 0              |
+(Exact commands from this repo — verified during recon, not guessed.)
+## Suggested executor toolkit
+(Optional — include only when relevant skills/tools plausibly exist in the
+executor's environment. Skip the section otherwise.)
+- Skills the executor should invoke if available, and for what:
+  "use `vercel-react-best-practices` when writing the memoization in step 3".
+- Reference docs worth reading before starting, by path or URL.
+## Scope
+**In scope** (the only files you should modify):
+- `src/orders/api.ts`
+- `src/orders/api.test.ts` (create)
+**Out of scope** (do NOT touch, even though they look related):
+- `src/orders/legacy-api.ts` — deprecated path, scheduled for deletion;
+  changing it wastes effort and risks the v1 clients still pinned to it.
+- Any change to the public response shape — clients depend on it.
+## Git workflow
+(Filled from recon — match the repo's observed conventions.)
+- Branch: `advisor/NNN-<slug>` (or the repo's branch-naming convention if one is evident)
+- Commit per step or per logical unit; message style: <match repo, e.g. conventional commits — include an example from `git log`>
+- Do NOT push or open a PR unless the operator instructed it.
+## Steps
+### Step 1: <imperative title>
+What to do, precisely. Reference exact files/symbols. Include the target code
+shape when it's load-bearing (the pattern to produce, not necessarily every
+line).
+**Verify**: `<command>` → <expected output>
+### Step 2: ...
+(Each step small enough to verify independently. Order steps so the codebase
+is never broken between steps when possible — e.g. add new path, switch
+callers, then remove old path.)
+## Test plan
+- New tests to write, in which file, covering which cases (list them:
+  happy path, the specific bug/regression this plan fixes, named edge cases).
+- Which existing test to use as the structural pattern:
+  "model after `src/users/api.test.ts`".
+- Verification: `<test command>` → all pass, including N new tests.
+## Done criteria
+Machine-checkable. ALL must hold:
+- [ ] `pnpm typecheck` exits 0
+- [ ] `pnpm test` exits 0; new tests for <X> exist and pass
+- [ ] `grep -rn "<old pattern>" src/` returns no matches
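+- [ ] No files outside the in-scope list are modified (`git status` for uncommitted changes; `git diff --stat <base commit>..HEAD` for committed ones, where the base is the commit your branch started from)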
+- [ ] `plans/README.md` status row updated (skip if a reviewer dispatched you and told you they maintain the index)
+## STOP conditions
+Stop and report back (do not improvise) if:
+- The code at the locations in "Current state" doesn't match the excerpts
+  (the codebase has drifted since this plan was written).
+- A step's verification fails twice after a reasonable fix attempt.
+- The fix appears to require touching an out-of-scope file.
+- You discover the assumption "<key assumption>" is false.
+## Maintenance notes
+For the human/agent who owns this code after the change lands:
+- What future changes will interact with this (e.g. "if pagination is added
+  to this endpoint, the batching in step 2 must be revisited").
+- What a reviewer should scrutinize in the PR.
+- Any follow-up explicitly deferred out of this plan (and why).
+```
+---
+## Index file: `plans/README.md`
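+Written once by the advisor after all plans; afterwards updated by executors, or by the advisor itself when it dispatches and reviews them (`execute`) or reconciles the backlog (`reconcile`):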
+```markdown
+# Implementation Plans
+Generated by the improve skill on <date>. Execute in the order below unless
+dependencies say otherwise. Each executor: read the plan fully before starting,
+honor its STOP conditions, and update your row when done.
+## Execution order & status
+| Plan | Title | Priority | Effort | Depends on | Status |
+|------|-------|----------|--------|------------|--------|
+| 001  | ...   | P1       | S      | —          | TODO   |
+| 002  | ...   | P1       | M      | 001        | TODO   |
+Status values: TODO | IN PROGRESS | DONE | BLOCKED (with one-line reason) | REJECTED (with one-line rationale — finding fixed independently or approach abandoned)
+## Dependency notes
+- 002 requires 001 because <reason>.
+## Findings considered and rejected
+- <finding>: not worth doing because <one line>. (So nobody re-audits it.)
+```
+## Quality bar — check before finishing each plan
+- Could a model that has never seen this repo execute this with only the plan file and the repo? If any step requires knowledge from the advisor session, inline that knowledge.
+- Is every verification a command with an expected result, not a judgment ("make sure it works")?
+- Does every step name exact files and symbols, not "the relevant module"?
+- Are the STOP conditions specific to this plan's actual risks, not boilerplate?
+- Would a reviewer reading only "Why this matters" + "Done criteria" understand what they're approving?
+- No secret values anywhere in the file — locations and credential types only.
+- "Planned at" SHA is filled in and the in-scope paths in the drift check match the Scope section.

package/template/.agents/upstreams.json CHANGED Viewed

@@ -76,5 +76,10 @@
     "ref": "main",
     "commit": "05eb2b968bdc769ad78df9628dc2260e1dec903c",
     "fetchedAt": "2026-06-10T21:53:29.207Z"
+  },
+  "improve": {
+    "repo": "shadcn-ui/ui",
+    "path": "skills/improve",
+    "ref": "main"
   }
 }