npm - contextdevkit - Versions diffs - 1.8.0 - Mend

contextdevkit 1.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (345) hide show

package/templates/contextkit/runtime/state/state-io.mjs ADDED Viewed

@@ -0,0 +1,172 @@
+/**
+ * Canonical state.json substrate ([ADR-0015](../../memory/decisions/0015-pipeline-dsl-working-stage-and-multi-session-work-claims.md) Part C).
+ *
+ * One schema, two kinds of in-flight items:
+ *   - `kind: 'task'`          — a DevPipeline task currently in `working/`
+ *   - `kind: 'pipeline-run'`  — a single execution of a squad's `pipeline.yaml`
+ *
+ * Storage: one file per item under `contextkit/pipeline/<id>/state.json` (the
+ * pipeline directory is single-sourced via `paths.mjs`). Reads are defensive —
+ * corrupt or missing JSON returns `null` instead of throwing; only `writeState`
+ * throws, and only on malformed input.
+ *
+ * Schema (canonical):
+ *   {
+ *     kind: 'task' | 'pipeline-run',
+ *     id: string,
+ *     status: 'backlog'|'working'|'testing'|'done'|'running'|'blocked-on-checkpoint'|'failed',
+ *     ownerSessionId: string|null,
+ *     ownerUser: string|null,
+ *     branch: string|null,
+ *     step: { current: string, total: number } | null,        // pipeline-run only
+ *     startedAt: number,
+ *     lastHeartbeat: number,
+ *     endedAt: number|null,
+ *     cycles: Record<string, number>                          // pipeline-run only
+ *   }
+ *
+ * Zero-dep, pure ESM over `node:*`. The hot path may import this — it never
+ * pulls in the optional `yaml` dep.
+ */
+import { existsSync, mkdirSync, readdirSync, readFileSync, statSync, unlinkSync } from 'node:fs';
+import { dirname, resolve } from 'node:path';
+import { writeFileAtomicSync } from '../hooks/safe-io.mjs';
+// Accepted values for a record's `kind`: checked by writeState, and used by
+// normalize to fall back to 'task' when a stored value is not in the set.
+const VALID_KINDS = new Set(['task', 'pipeline-run']);
+// Accepted values for a record's `status`: checked by writeState, and used by
+// normalize to fall back to 'backlog' when a stored value is not in the set.
+const VALID_STATUSES = new Set(['backlog', 'working', 'testing', 'done', 'running', 'blocked-on-checkpoint', 'failed']);
+// Flipped to true after readState reports its first corrupt/unreadable file,
+// so the stderr warning is emitted at most once per process.
+let warnedOnce = false;
+/**
+ * @param {string} pipeDir — repo-relative pipeline root (single-sourced from paths.mjs)
+ * @param {string} id
+ */
+function fileFor(pipeDir, id) {
+  return resolve(pipeDir, String(id), 'state.json');
+}
+/**
+ * Reads a state file. Returns `null` when missing OR corrupt — never throws.
+ * On the FIRST corruption per process, logs a single line so the maintainer
+ * sees it; subsequent corruptions are silent.
+ *
+ * @param {string} pipeDir
+ * @param {string} id
+ * @returns {object | null}
+ */
+export function readState(pipeDir, id) {
+  const file = fileFor(pipeDir, id);
+  if (!existsSync(file)) return null;
+  try {
+    const raw = readFileSync(file, 'utf-8').replace(/^/, '');
+    const parsed = JSON.parse(raw);
+    return normalize(parsed);
+  } catch {
+    if (!warnedOnce) {
+      process.stderr.write(`[state-io] corrupt or unreadable: ${file}\n`);
+      warnedOnce = true;
+    }
+    return null;
+  }
+}
+/**
+ * Writes/updates a state file with a partial payload — fields not present in
+ * `patch` are preserved from the existing file. Validates `kind` + `status`;
+ * malformed inputs throw (state.json is owned by the kit, garbage in is a bug).
+ *
+ * Atomic via `writeFileAtomicSync` (tmp + rename).
+ *
+ * @param {string} pipeDir
+ * @param {string} id
+ * @param {object} patch
+ * @returns {object} the merged record
+ */
+export function writeState(pipeDir, id, patch) {
+  if (!patch || typeof patch !== 'object') throw new Error('writeState: patch must be an object');
+  const previous = readState(pipeDir, id) || {};
+  const merged = { ...previous, ...patch, id: String(id) };
+  if (merged.kind != null && !VALID_KINDS.has(merged.kind)) throw new Error(`writeState: invalid kind "${merged.kind}"`);
+  if (merged.status != null && !VALID_STATUSES.has(merged.status)) throw new Error(`writeState: invalid status "${merged.status}"`);
+  if (typeof merged.startedAt !== 'number') merged.startedAt = Date.now();
+  merged.lastHeartbeat = typeof patch.lastHeartbeat === 'number' ? patch.lastHeartbeat : Date.now();
+  const file = fileFor(pipeDir, id);
+  mkdirSync(dirname(file), { recursive: true });
+  writeFileAtomicSync(file, JSON.stringify(merged, null, 2));
+  return merged;
+}
+/**
+ * Lists every state file under `pipeDir/<id>/state.json`. Optional `{ kind }`
+ * filter and `{ sinceMs }` cutoff (returns only states whose `startedAt` >=
+ * the cutoff). Sorted by `startedAt` descending (newest first).
+ *
+ * @param {string} pipeDir
+ * @param {{ kind?: 'task'|'pipeline-run', sinceMs?: number }} [opts]
+ * @returns {object[]}
+ */
+export function listStates(pipeDir, opts = {}) {
+  if (!existsSync(pipeDir)) return [];
+  const states = [];
+  let entries;
+  try {
+    entries = readdirSync(pipeDir, { withFileTypes: true });
+  } catch {
+    return [];
+  }
+  for (const ent of entries) {
+    if (!ent.isDirectory()) continue;
+    const state = readState(pipeDir, ent.name);
+    if (!state) continue;
+    if (opts.kind && state.kind !== opts.kind) continue;
+    if (opts.sinceMs && (state.startedAt || 0) < opts.sinceMs) continue;
+    states.push(state);
+  }
+  states.sort((a, b) => (b.startedAt || 0) - (a.startedAt || 0));
+  return states;
+}
+/**
+ * Removes state files whose `endedAt` is older than `olderThanDays` ago. Live
+ * states (no `endedAt`) are never pruned. Returns the count removed.
+ *
+ * @param {string} pipeDir
+ * @param {{ olderThanDays: number }} opts
+ * @returns {number}
+ */
+export function prune(pipeDir, { olderThanDays }) {
+  if (!Number.isFinite(olderThanDays) || olderThanDays <= 0) return 0;
+  const cutoff = Date.now() - olderThanDays * 24 * 60 * 60 * 1000;
+  let removed = 0;
+  for (const state of listStates(pipeDir)) {
+    if (typeof state.endedAt !== 'number' || state.endedAt > cutoff) continue;
+    const file = fileFor(pipeDir, state.id);
+    try {
+      if (existsSync(file)) { unlinkSync(file); removed += 1; }
+    } catch { /* best-effort */ }
+  }
+  return removed;
+}
+/**
+ * Coerces a parsed record into the canonical shape with safe defaults. Unknown
+ * fields are dropped; missing required fields get sensible defaults so a
+ * partially-written record still parses.
+ *
+ * @param {object} obj
+ * @returns {object}
+ */
+function normalize(obj) {
+  const safe = obj && typeof obj === 'object' ? obj : {};
+  return {
+    kind: VALID_KINDS.has(safe.kind) ? safe.kind : 'task',
+    id: typeof safe.id === 'string' ? safe.id : String(safe.id ?? ''),
+    status: VALID_STATUSES.has(safe.status) ? safe.status : 'backlog',
+    ownerSessionId: typeof safe.ownerSessionId === 'string' ? safe.ownerSessionId : null,
+    ownerUser: typeof safe.ownerUser === 'string' ? safe.ownerUser : null,
+    branch: typeof safe.branch === 'string' ? safe.branch : null,
+    step: safe.step && typeof safe.step === 'object' ? safe.step : null,
+    startedAt: typeof safe.startedAt === 'number' ? safe.startedAt : 0,
+    lastHeartbeat: typeof safe.lastHeartbeat === 'number' ? safe.lastHeartbeat : 0,
+    endedAt: typeof safe.endedAt === 'number' ? safe.endedAt : null,
+    cycles: safe.cycles && typeof safe.cycles === 'object' ? safe.cycles : {},
+  };
+}

package/templates/contextkit/runtime/statusline.mjs ADDED Viewed

@@ -0,0 +1,51 @@
+#!/usr/bin/env node
+/**
+ * Status-line widget for Claude Code (wired as `settings.statusLine` at level >= 1).
+ *
+ * Prints ONE compact line about the ContextDevKit state of the current project:
+ *   🌀 L6 · 11 sess · 5 ADR · 2 bklog
+ *
+ * It runs on every prompt, so it stays cheap (a few directory counts + one config
+ * read) and zero-dependency. It NEVER throws — on any error it prints a minimal
+ * fallback so the status line can't break the session. Claude Code pipes session
+ * JSON on stdin; we don't need it (we read the project at `process.cwd()`).
+ */
+import { existsSync, readdirSync } from 'node:fs';
+import { resolve } from 'node:path';
+import { pathsFor } from './config/paths.mjs';
+import { readJsonSafe } from './hooks/safe-io.mjs';
+// Project root: the working directory the status line was launched from.
+const ROOT = process.cwd();
+// Path table for this project from `config/paths.mjs`; `P.config` and
+// `P.platform` are the entries read below.
+const P = pathsFor(ROOT);
+function count(dir, re) {
+  try {
+    return readdirSync(resolve(ROOT, dir)).filter((f) => re.test(f)).length;
+  } catch {
+    return 0;
+  }
+}
+function level() {
+  const lvl = Number(readJsonSafe(P.config, {}).level);
+  return Number.isInteger(lvl) ? lvl : null;
+}
+function main() {
+  try {
+    if (!existsSync(P.platform)) {
+      process.stdout.write('🌀 contextdevkit');
+      return;
+    }
+    const lvl = level();
+    const sess = count('contextkit/memory/sessions', /^\d{4}-\d{2}-\d{2}-\d{2,}-.+\.md$/);
+    const adrs = count('contextkit/memory/decisions', /^\d{4}-.+\.md$/);
+    const bklog = count('contextkit/pipeline/backlog', /\.md$/);
+    const parts = [lvl ? `L${lvl}` : null, `${sess} sess`, `${adrs} ADR`, `${bklog} bklog`].filter(Boolean);
+    process.stdout.write(`🌀 ${parts.join(' · ')}`);
+  } catch {
+    process.stdout.write('🌀 contextdevkit');
+  }
+}
+main();

package/templates/contextkit/squads/README.md ADDED Viewed

@@ -0,0 +1,115 @@
+# Squads — how the sub-agents are organized
+> The sub-agents in `.claude/agents/` (installed at **Level 4**) aren't a loose
+> pile — they're organized into **squads** with distinct jobs, and a coordination
+> rule for when they disagree. This is the roster + the rules. Manage it with
+> `/squad`. Enable the squads relevant to your project; ignore the rest.
+## The squads
+### 🛠️ devteam — constructive (builds + reviews)
+Ships code and guards the constitution. Members:
+| Agent | When to use |
+| --- | --- |
+| `architect` | Cross-cutting design, choosing a pattern, planning a migration — *before* code |
+| `code-reviewer` | Pre-merge audit against `CLAUDE.md` (style, structure, SRP, immutable rules) |
+| `context-keeper` | The platform itself: ADRs, sessions, glossary, hooks, commands, memory |
+| `security` | Auth, secrets, crypto, trust boundaries, dependency risk, security review |
+| `test-engineer` | Test strategy + writing tests, raising coverage, regression for a bug |
+| _(add yours)_ | Domain agents you scaffold: `backend`, `frontend`, `db`, … (from `_TEMPLATE.md`) |
+### 🧪 qa-team — adversarial (verifies behaviour under stress)
+A red team with a different epistemic axis: it audits *behaviour*, not style.
+Single entry point is the orchestrator.
+| Agent | Tier | When to use |
+| --- | --- | --- |
+| `qa-orchestrator` | 1 | The router + sign-off. `/test-plan`, `/scaffold-tests`, `/qa-signoff` |
+| `qa-unit` | 1 | Fast isolated unit tests (mocked deps) |
+| `qa-integration` | 1 | Cross-boundary tests (HTTP/DB/queue/fs) |
+| `qa-fuzzer` | 1 | Property-based / adversarial input on parsers, validators, auth |
+| `qa-perf` | 2 | Benchmarks/profiling once a hot path is identified |
+| `qa-e2e` | 2 | Critical user journeys through the real UI |
+### 🔐 security-team — security & infra (DevSecOps)
+The system's safety net: application security **and** the infrastructure and
+supply chain it runs on. Enable on any project that ships to real users (i.e.
+almost all). Owns the security best-practices bar and the dependency policy.
+| Agent | When to use |
+| --- | --- |
+| `security` | AppSec **lead**: auth, secrets, crypto, input/trust boundaries, threat modeling, security review; owns the security bar + dependency policy |
+| `code-security` | The **code's** external surface: third-party integration code (API clients/SDKs, webhooks, (de)serialization), dependency provenance/SBOM + **license policy**, SAST/CodeQL triage |
+| `infra-security` | The **platform's** threat model: IaC/cloud misconfig, IAM least-privilege, network exposure, secrets management, container/runtime + CI/CD supply-chain hardening |
+| `devops` | Infra & **delivery**: CI/CD, build/deploy, environments, secrets plumbing, observability, release safety — *builds what `infra-security` audits* |
+| _(add yours)_ | `sre` (reliability/incident), `iac` (policy-as-code) as the system grows |
+`security` and `devops` are **shared** with devteam/ops-team — security is a
+cross-cutting concern, not a silo. On a Critical/High finding, security-team can
+**block a release** (veto on the L5/L6 gates for high-risk paths).
+### ⚖️ compliance-team — data protection & law
+Enable when the product handles personal data (especially of Brazilian residents).
+| Agent | When to use |
+| --- | --- |
+| `privacy-lgpd` | LGPD (Lei 13.709/2018): legal basis, consent, data-subject rights (Art. 18), retention/deletion, DPO, incidents/ANPD, processors. Standardized Brazilian-law skills. |
+| _(add yours)_ | `gdpr`, `soc2`, `hipaa`, `accessibility-law` … per your jurisdiction/market |
+### 🎨 design-team — UI/UX
+Enable when the product has a user interface.
+| Agent | When to use |
+| --- | --- |
+| `ux-designer` | Flows, information architecture, interaction, usability (incl. empty/error states) |
+| `ui-designer` | Visual design + design system/tokens, layout, responsive behaviour |
+| `accessibility` | WCAG 2.1 AA: semantics, keyboard, screen readers, contrast, focus |
+### 📈 growth-team — acquisition → activation → retention
+Enable when the product needs users to find it, reach value, and come back. Owns
+the `/advise` **growth** lane (ADR-0028).
+| Agent | When to use |
+| --- | --- |
+| `growth` | **Lead.** Activation, funnels, growth loops, referral/revenue, experimentation, instrumentation. Audit-first; refuses dark patterns |
+| `retention` | Cohort retention, churn (voluntary + involuntary), engagement loops, habit formation, lifecycle messaging, resurrection |
+| `seo-specialist` | Acquisition / discoverability (Google + LLM answer engines) — **shared** with design-team |
+`seo-specialist` is **shared** (it owns the page being indexable); `growth` and
+`retention` own what happens after the click — first value, then staying.
+### 📋 product-team & ⚙️ ops-team (starters included)
+| Agent | Squad | When to use |
+| --- | --- | --- |
+| `product-owner` | product-team | Roadmap shaping, prioritization, requirements (stories + acceptance criteria) |
+| `devops` | ops-team | CI/CD, build/deploy, environments, secrets, observability, release safety |
+## Sovereignty (who decides when they conflict)
+- **`code-reviewer` (devteam)** owns **style + the constitution** (ADR-0008-style).
+- **`qa-orchestrator` (qa-team)** owns **behaviour + test sign-off**.
+- **`security` (security-team)** owns **the security bar** — it can block a
+  release on a Critical/High finding, regardless of the other squads.
+- On conflict, **devteam decides** until the project reaches a maturity milestone
+  you define — then quality gates can harden (see `/context-level`, the L5/L6 gates).
+## Growing a squad
+- **Add an agent** → copy `.claude/agents/_TEMPLATE.md`, give it a sharp
+  `description` (that's how routing works), and list it here under its squad.
+- **Rich briefing (two-tier, optional)** → a lean agent in `.claude/agents/` +
+  a deep briefing in `contextkit/squads/<squad>/<agent>.md` (from `_BRIEFING.md.tpl`)
+  for full anti-patterns and recipes. Use `/squad brief <agent>`.
+- **New squad** → add a section here under the same convention.
+  Use `/squad new-squad <name>`.
+### More squads worth adding (templates / suggestions)
+Scaffold any of these with `/squad new-squad <name>` when the project needs it:
+- **docs-team** — `tech-writer` (READMEs, API docs, ADR prose, changelog clarity).
+- **data-team** — `data-engineer` / `analytics` (schemas, pipelines, event tracking).
+- **support-team** — `support-engineer` (triage, repro, runbooks from incidents).
+- jurisdiction add-ons for compliance-team — `gdpr`, `hipaa`, `soc2`.
+The orchestration commands (`/ship`, `/test-plan`, `/scaffold-tests`,
+`/qa-signoff`) fan work out to these squads.

package/templates/contextkit/squads/_BRIEFING.md.tpl ADDED Viewed

@@ -0,0 +1,27 @@
+# {{AGENT}} — rich briefing ({{SQUAD}} squad)
+> Optional **tier-2** briefing for the lean agent in `.claude/agents/{{AGENT}}.md`.
+> The lean file is the executable router (frontmatter + a tight prompt); THIS is
+> the deep reference — full anti-patterns, end-to-end recipes, edge cases — that
+> the agent reads when it needs more than the headline rules.
+> Created via `/squad brief {{AGENT}}`.
+## Mandate
+<!-- One paragraph: exactly what this agent owns and what it refuses. -->
+## Mental model
+<!-- The diagram / invariants the agent treats as hard rules. -->
+## Anti-patterns (full catalogue)
+| Symptom | Why it's wrong | Fix |
+| --- | --- | --- |
+|  |  |  |
+## End-to-end recipes
+<!-- The canonical step-by-step for the 1–3 tasks this agent does most. -->
+## Edge cases & traps
+<!-- The stack-/domain-specific gotchas worth writing down. -->
+## Hand-offs
+<!-- When to delegate to another squad member, and to whom. -->

package/templates/contextkit/squads/agent-forge/README.md ADDED Viewed

@@ -0,0 +1,69 @@
+# agent-forge — the agent factory squad
+> A **factory** squad: unlike the internal squads (devteam, qa-team, …) whose
+> client is *the developer inside Claude Code*, agent-forge produces an artifact
+> that ships **out** — a portable, multi-provider **Agent Package** consumed by a
+> client project's production runtime. Approved + scoped by
+> [ADR-0012](../../memory/decisions/0012-agent-forge-squad-for-portable-agent-packages.md).
+> Read [`best-practices.md`](best-practices.md) before forging.
+## What it produces
+The **Agent Package (APF)** — a versioned, self-contained folder under the client
+project's `agent-packages/<name>@<semver>/` with: a single source-of-truth
+`manifest.yaml`, per-provider prompts, canonical tool schemas + provider adapters,
+optional RAG config, an eval harness (golden + red-team), three governance policies
+(cost · compliance · quality), and optional runtime adapters (Node/Python/…).
+**The output has zero dependency on ContextDevKit, Node, or any runtime at consume
+time.** The forge runs here; the package runs anywhere. (ADR-0012, constraint 1–2.)
+## The boundary (why this squad is different)
+| Internal squad (devteam/qa-team/…) | agent-forge (factory) |
+| --- | --- |
+| Client = the developer, in Claude Code | Client = the client's product, in production |
+| Output = edits / reviews / tests in this repo | Output = a portable Agent Package |
+| Provider = Claude | Provider = Claude · OpenAI · Gemini · DeepSeek · self-hosted |
+## Roster (delivered across phases — see backlog 030–035)
+> Membership follows the kit convention — **no `squad.manifest.json`**. Each agent is
+> a lean file in `.claude/agents/<name>.md` tagged `(agent-forge squad)` (detected by
+> `squadOf`), with an optional tier-2 briefing here in `squads/agent-forge/`.
+| Agent | Role | Phase |
+| --- | --- | --- |
+| `forge-orchestrator` | Runs the pipeline (architect → router → prompt+tool+rag → eval → governance → packager) | Fase 1 |
+| `agent-architect` | Interviews the dev → produces the Agent Blueprint (YAML) | Fase 1 |
+| `model-router` | Picks provider/model from the capability matrix + decision rules; writes the rationale | Fase 1 |
+| `prompt-engineer` | System prompt per provider (XML for Claude, few-shot for Gemini, CoT for DeepSeek…) | Fase 1–2 |
+| `tool-designer` | Canonical JSON Schema → per-provider tool/function adapters | Fase 1–2 |
+| `eval-designer` | Golden dataset + red-team cases + rubric + thresholds | Fase 3 |
+| `governance-officer` | Attaches the three policies + fallback chain + kill switch + audit schema | Fase 3 |
+| `rag-designer` | *(opt)* chunking, embeddings, index, reranker — only if the blueprint needs retrieval | Fase 5 |
+| `packager` | Assembles the APF, versions it (semver + provenance), generates runtime adapters | Fase 1 |
+## The five principles (full text in `best-practices.md`)
+1. **Portability absolute** — the APF depends on nothing of ours at runtime.
+2. **Provider-agnostic manifest, provider-specific adapter** — switch provider = switch adapter.
+3. **Economic choice is structured, not intuitive** — the router decides by matrix + rules; the LLM is only a tie-breaker.
+4. **Best practices are the default, not a suggestion** — caching, fallback, retry, audit, kill switch, eval ship by default; removing one needs a reason.
+5. **Eval before embarkation** — no package leaves the forge without passing a minimum golden + red-team gate.
+## Where it sits in the levels
+- **L4** — an optional squad; enable per project (e.g. `project_type: ai-product`).
+- **L5** — edits under `agent-packages/` are a high-risk path → `/simulate-impact` applies (changing a primary model has wide blast radius).
+- **L6** — `/context-stats` gains a Forge Stats section (agents in prod, aggregate cost, fallback rate, eval drift).
+- **L7** — `/fleet` manages agent packages cross-repo (one agent serving many projects).
+## Status
+Fase 0 (foundations) — **complete**. This README, [`best-practices.md`](best-practices.md),
+the [`router/capability-matrix.json`](router/capability-matrix.json) seed, and the
+[`templates/agent-package/`](templates/agent-package/) APF v1 skeleton are in place. The
+agents and the `/forge-*` commands land across Fases 1–5 — track them on the DevPipeline
+(031–035). For the **section-by-section blueprint-coverage map** (the single artifact a
+future session opens to know where we stand), read [`ROADMAP.md`](ROADMAP.md).

package/templates/contextkit/squads/agent-forge/ROADMAP.md ADDED Viewed

@@ -0,0 +1,108 @@
+# agent-forge — blueprint → status
+> The single map between the original `agent-forge` blueprint and what is actually
+> shipped here. Read this **first** when working on the squad — it spares you a
+> spelunk through the ADRs, the backlog, and the source. Kept current as work moves.
+>
+> **Status key** (same as [`docs/ROADMAP.md`](../../../../docs/ROADMAP.md)):
+> ✅ done · ⏳ in progress · 🟡 partial · 📋 planned · ➖ dropped/superseded ·
+> 🆕 added by ADR (not in the original blueprint).
+## Anchors
+- **Approved by** [ADR-0012](../../memory/decisions/0012-agent-forge-squad-for-portable-agent-packages.md) — 7 binding constraints reshape the blueprint where it collided with the kit.
+- **YAML strategy** [ADR-0013](../../memory/decisions/0013-agent-forge-yaml-via-optional-dynamic-import.md) — optional `yaml` behind dynamic import (the `zod` precedent).
+- **Declarative pipeline DSL** [ADR-0015](../../memory/decisions/0015-pipeline-dsl-working-stage-and-multi-session-work-claims.md) — `pipeline.yaml` per squad; engine is opt-in, dry-runnable, simulate-impact-mappable. First consumer is this squad (Fase 6).
+- **Phased delivery** on the DevPipeline as tasks **030–035** (Fases 0–5 — all ✅) + **Fase 6** (declarative pipeline DSL — ✅, shipped on `feat/agent-forge-fase6-pipeline-dsl`).
+## Coverage map (blueprint section → here)
+| § | Blueprint | Status | Where / next |
+|---|---|---|---|
+| 0–1 | Exec summary + 5 principles | ✅ | [`README.md`](README.md), [`best-practices.md`](best-practices.md) |
+| 2 | `squad.manifest.json` | ➖ | Dropped by ADR-0012 §3 — squads detected by `squadOf` (the `(agent-forge squad)` tag) |
+| 2 | Squad folder + roster table | ✅ | [`README.md`](README.md) — agents listed by phase |
+| 2 | The 9 lean agent files (`.claude/agents/forge-*.md`) | ✅ | Fase 1: `forge-orchestrator` / `agent-architect` / `model-router` / `prompt-engineer` / `tool-designer` / `packager`. Fase 3: `eval-designer` + `governance-officer`. Fase 5: `rag-designer`. |
+| 2 | `templates/providers/<provider>/` reusable snippets | 🟡 | Per-provider stubs currently live **inside** the APF (`prompts/system.<provider>.md` + `tools/adapters/<provider>.tools.json`). Split out only if Fase 1–2 generators need shared snippets above APF scope. |
+| 2 | `policies/*.template.yaml` (squad scope) | 🟡 | The canonical policy templates ship **inside** the APF (`agent-package/governance/*.policy.yaml`). Equivalent for now; split if Fase 3's governance-officer needs squad-level partials. |
+| 3 | APF v1 — full tree (45 files) | ✅ | [`templates/agent-package/`](templates/agent-package/) (commit `d5efcd2`) |
+| 4.1 | Router inputs | ✅ | Documented (best-practices §4 / blueprint §4.1) — consumed in Fase 1 |
+| 4.2 | `capability-matrix.json` | ✅ | [`router/capability-matrix.json`](router/capability-matrix.json) (5 providers / 11 models, dated, ADR-gated, parse/id guard — commit `3ad928a`) |
+| 4.3 | `decision-rules.json` | ✅ | [`router/decision-rules.json`](router/decision-rules.json) — 13 rules (cap 15), shortlists only, no quality opinions (ADR-0012 §5). Engine in [`lib/router.mjs`](lib/router.mjs). |
+| 4.4 | Rationale section in package README | ✅ slot + generator | The model-router emits the canonical `## Model Selection Rationale` block (rule trace + cross-provider fallback warning + eval-as-authority disclaimer) — `lib/router.mjs` `buildRationale`. |
+| 5 | Per-provider behaviour notes | ✅ | `best-practices.md` §4 (condensed table) |
+| 5 | `prompt-engineer` per-provider generators | ✅ | All 5 providers: Anthropic (XML, `cache=ephemeral`), OpenAI (Markdown), Google (`systemInstruction` body + safetySettings note), DeepSeek (OpenAI-compat + explicit CoT cue), Ollama (Markdown, chat_template applied by runtime). [`lib/prompt-gen.mjs`](lib/prompt-gen.mjs) |
+| 5 | `tool-designer` per-provider generators | ✅ | All 5 providers: Anthropic (`name`/`description`/`input_schema`), OpenAI (`type:function`), Google (`functionDeclarations` SUBSET — `additionalProperties` + `$schema` stripped), DeepSeek + Ollama (OpenAI-compat shapes). [`lib/tool-gen.mjs`](lib/tool-gen.mjs) |
+| 5 | Runtime `AgentRuntime` contract | ✅ | Documented in APF adapter READMEs + Node/Python/Go stubs |
+| 6.1–6.3 | Cost / compliance / quality policy templates | ✅ | [`templates/agent-package/governance/`](templates/agent-package/governance/) |
+| 6.x | `fallback-chain.yaml` + `audit.schema.json` | ✅ | Same dir |
+| 6 | `governance-officer` ENFORCER ("refuse if any pillar under-configured") | ✅ | [`lib/governance-officer.mjs`](lib/governance-officer.mjs) — `attachGovernance` populates the 3 pillars from the blueprint + builds fallback chain from the router decision; `validateGovernance` refuses on missing sections or unresolved `{{TOKEN}}` placeholders. Briefing in [`.claude/agents/governance-officer.md`](../../../claude/agents/governance-officer.md). |
+| 6.4 | Three-pillar equal-weight rationale | ✅ | `best-practices.md` §5 |
+| 7.1–7.3 | Golden / red-team / rubric / thresholds | ✅ templates | [`templates/agent-package/evals/`](templates/agent-package/evals/) |
+| 7.4 | Eval lifecycle (3 moments) | ✅ | `best-practices.md` §6 (docs); [`lib/eval-runner.mjs`](lib/eval-runner.mjs) `runEvalSuite` (golden + red-team aggregated against thresholds; provider-agnostic — mock for CI, real adapter for prod). |
+| 7 | Eval gate in orchestrator (refuse to ship on fail) | ✅ | `forgeNew` supports `opts.runEval = { provider, semantic }`; `packageAgent` stamps `provenance.eval_passed_at` only when `evalResult.verdict === 'pass'`. The (≤3 retries → abort) refinement loop is the AGENT's job — driven by `.claude/agents/eval-designer.md`. |
+| 8 | `/forge-new` | ✅ | [`templates/claude/commands/forge-new.md`](../../../claude/commands/forge-new.md) + CLI [`cli/forge-new.mjs`](cli/forge-new.mjs) (`forgeNew()` exported for the integration test) |
+| 8 | 13 maintenance `/forge-*` commands | ✅ | `cli/forge-ops.mjs` (list/show/doctor/policy/budget/audit) + `cli/forge-eval-cli.mjs` (eval/redteam/route/fallback-test) + `cli/forge-admin.mjs` (refresh-matrix/killswitch/deprecate, dry-run by default). 13 thin briefings under `templates/claude/commands/forge-*.md`. |
+| 9 | Full lifecycle (forge → review → install → prod → maintain) | ✅ | Fase 1 engine + Fase 3 eval gate + Fase 4 maintenance commands all wired. The runtime adapter ships a `createShadowEval` scaffold (sample rate from `quality.policy.yaml.eval_gates.drift_monitoring.sample_pct`). |
+| 10 | L4 enablement | ✅ | `README.md` "Where it sits in the levels" |
+| 10 | L5 `simulate-impact` for `agent-packages/` edits | ✅ | `defaults.l5.highRiskPaths` includes `agent-packages/**` — the simulate-gate triggers on any forged-agent edit. Guarded by `checkL5ForgePath`. |
+| 10 | L6 `/context-stats` Forge Stats section | ✅ | `stats.mjs` `collectForge()` walks `agent-packages/`; surfaces package count, eval-stamp ratio, aggregate monthly target + hard cap, distribution by primary provider. |
+| 10 | L7 `/fleet` cross-repo agent-package registry | ✅ | `fleet.mjs cmdStats` aggregates per-repo Forge Stats — packages, eval-stamp ratio, monthly target + hard cap, surfaced both per-repo and as a fleet total. |
+| 11 | Implementation roadmap (5 fases) | ✅ | Mapped 1:1 to backlog 030–035 with sequenced SLAs |
+| 12 | Risks — matrix freshness | ✅ | ADR-0012 §6 + `checkCapabilityMatrix` |
+| 12 | Risks — decision-rules Frankenstein | ✅ | Router enforces the 15-rule cap at runtime; currently 13/15. Split by intent category when outgrown. |
+| 12 | Risks — golden eval staleness | 📋 | Shadow eval feeding golden (Fase 4) |
+| 12 | Risks — cross-project package divergence | ✅ | `/fleet stats` surfaces packages-per-repo + aggregate budget so divergence becomes visible across the fleet. |
+| 12 | Risks — compliance vertical templates (HIPAA/PCI) | ➖ v1 | Future jurisdiction add-ons via `compliance-team` squad |
+| 12 | Risks — forge self-cost | ✅ planned | Orchestrator defaults to Haiku (set in agent files, Fase 1) |
+| Ap A | forge vs classic squad table | ✅ | `README.md` "The boundary (why this squad is different)" |
+| Ap B | Why a separate factory squad | ✅ | Same section |
+| Ap C | Glossary | 📋 low priority | Inline in best-practices for now; consolidate if it grows |
+| — | **Fase 6 — declarative `pipeline.yaml` + dry-run engine** (ADR-0015 §A) | ✅ | [`pipeline.yaml`](pipeline.yaml) (9 steps) + engine [`templates/contextkit/tools/scripts/squad-pipeline.mjs`](../../tools/scripts/squad-pipeline.mjs) + whitelisted condition parser [`squad-pipeline-condition.mjs`](../../tools/scripts/squad-pipeline-condition.mjs). Parses via `lib/yaml.mjs` (ADR-0013); refuses on missing `yaml` with **exit 0 + informative** message (pipelines are opt-in, not hot-path). `--dry-run` walks the graph with empty ctx (markers `✓ / ⊘ / ↺`). `max_review_cycles` is a hard cap; vendor model names are refused (router stays the single resolver). 8 new selfchecks (`checkConditionParser` + `checkSquadPipeline`) + 4 new integration asserts (ships, validates, yaml-absent path). Spec: [`docs/SQUAD-PIPELINE-FORMAT.md`](../../../../docs/SQUAD-PIPELINE-FORMAT.md). `state.json` per run is deferred to task 040 (ADR-0015 §C). |
+## Net additions (ADR-driven, not in the original blueprint)
+🆕 **No `squad.manifest.json`** (ADR-0012 §3) — reuse the kit's `squadOf` detection.
+🆕 **No phantom `AI-AGENT-PRACTICES.md`** (ADR-0012 §4) — authored inline as `best-practices.md`.
+🆕 **Eval-as-authority** (ADR-0012 §5) — router rules are deterministic shortlists; the eval harness measured on the user's golden set decides.
+🆕 **Matrix-freshness guard** (ADR-0012 §6) — `checkCapabilityMatrix` rejects malformed / duplicate / disallowed model ids.
+🆕 **Hot-path zero-yaml** (ADR-0013) — `checkHotPathNoYaml` enforces rule 1.
+🆕 **`lib/yaml.mjs` loader** (ADR-0013) — the single touchpoint for the optional `yaml` dep.
+🆕 **`checkRouterEngine` selfcheck** (Fase 1) — behavioural guard: typical blueprint + no-cloud constraint both honored; rationale carries the eval-as-authority disclaimer.
+🆕 **Installer copies the squad at L>=4** (Fase 1 fix) — without this, agent-forge code lived only in source; selfcheck `checkSourceInvariants` guards the copy.
+🆕 **forge-new no-yaml fallback** (Fase 1) — integration test exercises the pure half of the pipeline (validate → route → assembleManifest → gens) so CI without the optional `yaml` dep still proves correctness end-to-end.
+🆕 **`runtime_adapters` is a first-class blueprint field** (Fase 2) — `enum-multi` over `[node, python, go]` with default `[node]`; `validateBlueprint` rejects unknown entries; `assembleManifest` reads it straight from the blueprint, so the manifest is no longer a packager-opts artifact.
+🆕 **Gemini subset enforced at generation time** (Fase 2) — `downConvertForGemini` strips `additionalProperties` / `$schema` / `$id` / `$ref` recursively so `functionDeclarations` is valid the moment it's written; the runtime adapter never has to remediate.
+🆕 **DeepSeek CoT cue baked into the prompt, not the runtime** (Fase 2) — `renderDeepSeek` prepends "Think step by step…" to Rules; the runtime contract stays uniform across providers.
+🆕 **governance-officer THROWS, not warns** (Fase 3) — `attachGovernance` runs at the top of `packageAgent`, before any I/O, so under-configured pillars never waste a `copyTree`. Refusal is a feature.
+🆕 **`semantic_similarity:>=N` skips without a callback** (Fase 3) — when `opts.semantic` is absent the field is *uncounted*, not failed; CI gates don't false-negative on missing embeddings.
+🆕 **One seed golden per package** (Fase 3) — `eval-designer` ships ONE category-shaped case; the eval-designer agent then drives the expansion to 10–50 domain-specific cases with the dev. Templates carry no made-up domain content.
+🆕 **`eval_passed_at` defaults to `null`** (Fase 3) — the default is a *refused* gate, not an assumed pass. Only `evalResult.verdict === 'pass'` stamps the timestamp.
+🆕 **Mutator CLIs are dry-run by default** (Fase 4) — `/forge-refresh-matrix` / `/forge-killswitch` / `/forge-deprecate` only print the proposed diff; `--write` triggers an atomic tmp+rename. Refuse-over-rubber-stamp posture.
+🆕 **`discoverPackages` survives without `yaml`** (Fase 4) — directory walk + regex on `<name>@<semver>`; listing always works, columns from `manifest.yaml` degrade to `?` / `NEVER` gracefully.
+🆕 **Three CLI modules grouped by intent**, not 13 files (Fase 4) — `cli/forge-ops.mjs` (read) / `cli/forge-eval-cli.mjs` (re-run) / `cli/forge-admin.mjs` (mutate) share `lib/package-ops.mjs` and stay under 180 lines each.
+🆕 **Shadow-eval is a scaffold, not a runner** (Fase 4) — Node adapter ships `createShadowEval` with the sample-rate gate + a `runOne` delegation point; scoring stays single-sourced in the package's `evals/` + the kit's `eval-runner`.
+🆕 **`/forge-refresh-matrix` only stamps `updated`** (Fase 4) — model adds/removes/price changes stay ADR-gated (ADR-0012 §6). The command surfaces the ADR expectation in its output.
+🆕 **L5 simulate-impact gate is default-ON for `agent-packages/**`** (Fase 5) — `defaults.l5.highRiskPaths` ships with the glob; swapping a forged agent's primary model is too high-blast-radius to require opt-in.
+🆕 **Selfcheck split by responsibility, not phase** (Fase 5) — `selfcheck-agent-forge.mjs` (build pipeline: matrix + hot-path + router + Fase-3 gate) vs `selfcheck-agent-forge-ops.mjs` (operations: package-ops + rag-designer + L5 gate). Build engine vs running fleet.
+🆕 **Pinecone-under-no-cloud is refused, not silently downgraded** (Fase 5) — `rag-designer` makes the compliance contradiction explicit rather than quietly switching to pgvector and hiding the residency intent.
+🆕 **`/fleet stats` Forge fleet aggregation** (Fase 5) — `fleet.mjs cmdStats` surfaces packages, eval-stamp ratio, monthly target + hard cap both per-repo and as a fleet total — cross-project divergence becomes visible at the registry level.
+🆕 **.NET / Rust adapters deferred to demand** (Fase 5) — `stampRuntimeAdapters` seam is clean (a one-line `if (runtimes.includes('dotnet'))` branch when a real project asks). The blueprint said "per demand", and the polish is honest about it.
+🆕 **`pipeline.yaml` is opt-in per squad** (Fase 6, ADR-0015) — agent-forge is the first consumer; squads without a `pipeline.yaml` keep working as today. The engine refuses (with a clear message) when `yaml` is absent — pipelines are opt-in, not hot-path, so the zero-dep rule still holds.
+🆕 **Whitelisted `condition` grammar** (Fase 6, ADR-0015 §A.2) — only `<id>(.<id>)* <op> <literal>` and `…length <op> <int>` in v1. No arbitrary expression evaluation; bigger grammar needs a new ADR with a real use case.
+🆕 **Vendor model names stay out of YAML** (Fase 6, ADR-0015 §A.3) — `model_tier: fast|powerful|reasoning` only; the router (ADR-0012 §4) is the single resolver.
+🆕 **`max_review_cycles` is a hard cap** (Fase 6, ADR-0015 §A.4) — the engine refuses to loop past the cap and exits with "manual escalation required" instead of silently retrying forever.
+🆕 **Engine refuses with exit 0 when yaml is absent** (Fase 6, shipped) — pipelines are opt-in; missing yaml is a "feature not enabled" informative message, not an error. Selfcheck + integration test both exercise this path. Matches rule 2 ("hooks never break real work") — the squad keeps running fine if the optional dep is missing.
+🆕 **`condition` parser + engine split** (Fase 6, shipped) — `squad-pipeline-condition.mjs` (parser+eval, 192 lines) is a sibling of `squad-pipeline.mjs` (engine + dry-run, 250 lines). One file per responsibility — parser stays pure & testable without the yaml dep; engine handles I/O + dual-layout (source/installed) discovery.
+## How this stays current
+A session that touches agent-forge **updates the markers here** as work moves (✅⏳🟡📋➖) — same convention as `docs/ROADMAP.md`. New architectural decisions → a new ADR, then a row update here cross-referencing it. The DevPipeline tasks 030–035 are the *executable* counterpart; this is the **map** that ties them to the original spec + the ADRs + the source files.
+## Quick refs
+- Approval + constraints: [ADR-0012](../../memory/decisions/0012-agent-forge-squad-for-portable-agent-packages.md)
+- YAML strategy: [ADR-0013](../../memory/decisions/0013-agent-forge-yaml-via-optional-dynamic-import.md)
+- Backlog: `contextkit/pipeline/backlog/032..035-*.md` · concluded: `conclusion/030-*.md` + `conclusion/031-*.md`
+- Sessions: 17 (Fase 0 + ADRs) · 18 (Fase 1: router engine + libs + agents + /forge-new + integration round-trip)
+- Branches: `feat/agent-forge-fase0` (PR #18) · `feat/agent-forge-fase1` (current — Fase 1 complete)

package/templates/contextkit/squads/agent-forge/best-practices.md ADDED Viewed

@@ -0,0 +1,89 @@
+# Forge best-practices — the bar every forged agent clears
+> The normative reference for `agent-forge`. This is what the blueprint called the
+> "Constituição / AI-AGENT-PRACTICES" — authored **here**, inline, rather than as a
+> phantom prerequisite ([ADR-0012](../../memory/decisions/0012-agent-forge-squad-for-portable-agent-packages.md),
+> constraint 4). It governs the **forged agents** (the output), not this repo's own
+> hot path. Conflicts resolve top-down by the five principles.
+## 1. The five principles
+1. **Portability is absolute.** The Agent Package depends on no ContextDevKit, Node, or
+   language runtime *at consume time*. It is declarative files + optional adapters. A
+   client runs it via the provider SDK, LangChain, or their own runtime — with the kit
+   uninstalled.
+2. **Provider-agnostic manifest, provider-specific adapter.** The `manifest.yaml`
+   describes the agent abstractly (intent, capabilities, tools, RAG, policy). Adapters
+   translate to each provider's wire format. Switching provider swaps an adapter; it
+   never rewrites the agent.
+3. **Economic choice is structured, not intuitive.** The `model-router` picks
+   provider+model from a capability matrix + deterministic rules. An LLM enters only as
+   a tie-breaker on genuinely ambiguous cases. Predictable cost is a design goal.
+4. **Best practices are the default, not a suggestion.** Everything in §3 ships in the
+   base template. To *remove* one, the dev records a reason; to *keep* it, they do
+   nothing. Default-safe beats flexible.
+5. **Eval before embarkation.** No package leaves the forge without passing a minimum
+   golden + red-team gate. Failed → refinement loop. An unmeasured agent never reaches
+   the client.
+## 2. Router authority comes from eval, not frozen opinion
+The decision rules may deterministically *shortlist and rank* providers — but the final
+"best model for this agent" verdict is the **eval harness measured on the user's golden
+set**, not a preference frozen in JSON the day it was written (ADR-0012, constraint 5).
+Shipped rules carry no hardcoded quality claims (no "model X is 8pp better at PT-BR").
+The capability matrix is dated, versioned, and changed only via ADR + `/forge-refresh-matrix`.
+## 3. The default catalogue (every forged agent ships with these)
+| Default | Why it's not optional |
+| --- | --- |
+| **Prompt caching** (where the provider supports it) | Largest single cost lever on long, stable prompts (glossaries, rules). |
+| **Fallback chain** (≥1 provider *different* from primary) | Survives a provider outage; a single-provider agent is a single point of failure. |
+| **Retry with backoff** (exponential; on 5xx/timeout/rate-limit; **never** on 4xx/safety-block) | Transient errors recover; client errors and safety decisions must not be retried blindly. |
+| **Rate limiting** (per-user + global) | Caps blast radius of a bug or abuse before it becomes a bill. |
+| **Audit log** (inputs after redaction, output, model, cost, fallback, PII redactions) | Without it there is no compliance story and no drift forensics. |
+| **Kill switch** (cost breach, eval-below-threshold, red-team regression) | The agent must be able to refuse *itself* when a guardrail trips. |
+| **Eval golden + red-team** | The only objective evidence the agent works and is safe. |
+| **Structured-output validation** | A malformed payload is a failure, not a "best effort" — validate, retry once, then fail. |
+## 4. Provider best-practices (condensed)
+| Provider | System prompt | Tools | Notes that bite |
+| --- | --- | --- | --- |
+| **Anthropic** | Separate param; XML-structured sections | `tools[]` + `input_schema`; `tool_choice` to force | No native JSON mode — use a single-tool schema for structured output; mark static blocks `cache_control`. |
+| **OpenAI** | First `system` message | `tools[]` type `function` | Native `response_format: json_schema` strict mode; caching is automatic for a >1024-token prefix. o-series: no system message; uses `reasoning_effort`. |
+| **Google (Gemini)** | `systemInstruction` param | `functionDeclarations[]` (JSON-Schema **subset**) | **Set `safetySettings` explicitly** or hit surprise blocks; up to 2M ctx (RAG win); caching needs >32k tokens. |
+| **DeepSeek** | OpenAI-compatible | OpenAI-compatible | Prefers explicit CoT; reasoner models split `reasoning_content`; an order of magnitude cheaper; PT-BR/vision weaker than Claude/Gemini. |
+| **Self-hosted (Ollama/vLLM)** | Per-model `chat_template` | Native but less reliable → robust eval is critical | Data never leaves the client's infra — the only viable path for heavy PII + strict residency. Perf depends on their hardware. |
+The runtime adapters expose **one interface** regardless of provider — `invoke`,
+`invokeStream`, `preflight`, `estimate`, `onEvent`. The client switches provider by
+editing `manifest.yaml → spec.model_selection.primary`; their code does not change.
+## 5. Governance — three pillars, equal weight
+The forge refuses to package an agent if any of the three pillars is under-configured.
+- **Cost** — per-call + monthly budgets, alert tiers, caching required, rate limits,
+  kill switch on hard-cap breach. *Without it the agent dies of budget politics.*
+- **Compliance** — PII detection + redaction/tokenization, LGPD basis + data-subject
+  rights, data residency (allow/deny providers), retention, audit. *Without it the
+  agent is fined, sued, or banned in regulated work.*
+- **Quality** — eval gates (golden + red-team thresholds), fallback chain, retry policy,
+  drift monitoring, kill switch on quality regression. *Without it the agent is cheap
+  and legal but hallucinates and burns the product.*
+## 6. Eval lifecycle
+1. **Pre-release** *(mandatory)* — the package does not ship unless golden + red-team
+   clear thresholds (e.g. golden accuracy ≥ 0.85; PII-leak block rate = 1.00).
+2. **Pre-version-bump** *(mandatory on minor/major)* — a significant change reruns eval.
+3. **Shadow in production** *(recommended)* — the client evaluates a sample (~5%) of real
+   traffic to catch upstream model drift; feeds candidates back into the golden set.
+## 7. Red-team minimum
+Every package carries at least: **prompt-injection**, **jailbreak**, and **PII-leak**
+cases. PII-leak tolerance is zero (block rate 1.00). These run before each release and
+on every version bump; bias tests are recommended where the domain warrants.