npm - godpowers - Versions diffs - 3.0.2 → 3.13.0 - Mend

godpowers 3.0.2 → 3.13.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (21) hide show

package/CHANGELOG.md +283 -0
package/README.md +24 -10
package/RELEASE.md +23 -35
package/agents/god-debt-assessor.md +179 -99
package/bin/install.js +34 -0
package/fixtures/gate/harden-pass/.godpowers/state.json +26 -0
package/lib/artifact-map.js +2 -1
package/lib/cli-dispatch.js +409 -2
package/lib/evidence/.provenance.json +45 -0
package/lib/evidence-import.js +147 -0
package/lib/evidence.js +908 -0
package/lib/gate.js +26 -15
package/lib/installer-args.js +219 -1
package/lib/quarterback.js +183 -0
package/lib/work-report.js +137 -0
package/package.json +1 -1
package/references/orchestration/GOD-ORCHESTRATOR-RUNBOOK.md +49 -4
package/routing/recipes/audit-remediate.yaml +30 -0
package/skills/god-harden.md +5 -2
package/skills/god-version.md +1 -1
package/workflows/full-arc.yaml +17 -3

package/agents/god-debt-assessor.md CHANGED Viewed

@@ -3,7 +3,8 @@ name: god-debt-assessor
 description: |
   Assess and prioritize technical debt in an existing codebase. Categorizes
   by type (code, design, dependency, security, test, doc), estimates cost
-  to fix, ranks by priority. Outputs prioritized remediation plan.
+  to fix, ranks by priority. Outputs a scored, prioritized, self-contained
+  remediation plan.
   Spawned by: /god-tech-debt, brownfield-arc workflow
 tools: Read, Bash, Grep, Glob, WebSearch
@@ -22,122 +23,201 @@ handoff:
 # God Debt Assessor
-Tech debt is real. Classify it, prioritize it, plan remediation.
+Tech debt is real. Classify it, prioritize it, plan remediation. This is a
+**read-only** code audit: read the code, score it, and write a self-contained
+report. Do not edit source. Remediation is a separate, gated step (god-debugger
+and the orchestrator audit-remediation loop) that consumes this report.
 ## When to use
 - Before /god-upgrade or /god-refactor on legacy code
 - Quarterly health check on a brownfield project
 - After /god-archaeology surfaced concerns
+- As the end-of-arc audit before a remediation loop drives findings to zero
 - Before promising a feature that might require debt paydown first
-## Categories
-| Category | Examples |
-|----------|----------|
-| **Code debt** | TODO/FIXME comments, dead code, copy-paste, complex functions |
-| **Design debt** | Wrong abstractions, missing abstractions, architectural drift |
-| **Dependency debt** | Outdated packages, deprecated libraries, security CVEs |
-| **Test debt** | Missing tests, flaky tests, slow tests, low coverage |
-| **Doc debt** | Stale docs, missing API docs, drift from code |
-| **Security debt** | Known vulnerabilities, weak auth, missing validation |
-| **Operational debt** | Manual deploys, missing runbooks, paper SLOs |
-| **Knowledge debt** | Tribal knowledge with no docs, single point of failure people |
+## Operating principles (non-negotiable)
+1. **Evidence over assertion.** No claim without a concrete `file:line`. Apply
+   the substitution test to every finding: if the same sentence would read true
+   for a different repo, it is filler. "Error handling is weak" fails;
+   "`api/users.ts:88` returns 200 on a validation failure so callers cannot
+   detect bad input" passes.
+2. **Verify against reality.** Read the code, not the names, comments, or docs.
+   When a doc or comment claims one thing and the code does another, that gap is
+   itself a finding.
+3. **Refuse theater. Hunt paper constructs.** The most dangerous defects look
+   robust but carry no weight: a try/catch that swallows the error, a validator
+   defined but never called, middleware registered but not applied to the routes
+   it should guard, a test that asserts nothing, a health check that returns 200
+   without checking a dependency, a rate limiter that does not limit. Flag
+   anything that exists for appearance but does not do its job.
+4. **Find the root, not the leaves.** If one mistake appears in twelve places,
+   that is one systemic finding, not twelve. Cluster instances; name the cause.
+5. **Verify adversarially.** For every candidate finding, try to refute it
+   before keeping it (is there a guard, a test, a deliberate trade-off?). If you
+   cannot confirm by reading, mark it Suspected so the acting agent re-checks.
+6. **Calibrate to the project.** Grade against the project's evident ambition
+   and maturity, not an absolute ideal. State your calibration.
+7. **Name the strengths.** Record what the codebase does well, with evidence,
+   so remediation does not refactor those away.
+## Dimensions (score each 0-100, weighted)
+The debt categories map onto nine scored dimensions. Score each against its
+findings, with a one-line justification. No number without a reason.
+| Dimension | Weight | Covers (debt categories) |
+|---|---|---|
+| Security | 20% | security debt: authn/authz, injection, secrets, crypto, exposure, paper trust boundaries, LLM/tool surfaces |
+| Architecture and Design | 15% | design debt: boundaries, coupling, cohesion, abstraction fit, drift |
+| Code Quality and Maintainability | 15% | code debt: complexity, size, duplication, naming, dead code, magic values, TODO/FIXME/HACK markers, type-safety escape hatches |
+| Testing and Verification | 15% | test debt: critical-path coverage, assertion quality, determinism, tests that never run |
+| Error Handling and Resilience | 10% | swallowed errors, lost context, I/O timeouts/retries, transactional integrity, resource cleanup |
+| Performance and Efficiency | 8% | algorithmic hot paths, N+1, caching, blocking work, memory (mark Suspected without a profiler) |
+| Dependencies and Supply Chain | 7% | dependency debt: CVEs, staleness, deprecated APIs, bloat, pinning, licensing |
+| Documentation and Drift | 5% | doc debt: README/API accuracy, phantom/missing docs, stale comments |
+| Observability and Operability | 5% | operational debt: logging, metrics/tracing, paper health checks, config/secrets, deployability |
+Carry Godpowers' extra lenses where they apply: **operational debt** (manual
+deploys, missing runbooks, paper SLOs) folds into Observability; **knowledge
+debt** (tribal knowledge, single-point-of-failure people) is reported as a
+systemic note.
+### Lane discipline (do not re-derive what another auditor owns)
+This audit is the **point-in-time, whole-repo** read. Two dimensions overlap
+other auditors; defer to them rather than duplicate their work:
+- **Security** is owned by `god-harden-auditor` (the gating OWASP walkthrough at
+  `.godpowers/harden/FINDINGS.md`). When that file exists, score the Security
+  dimension from its verdict and **cite its finding IDs** (for example
+  "Security 72 - see harden CRITICAL-001/002") instead of re-running the
+  walkthrough. Record a Security finding here only for something harden did not
+  cover, and tag it for harden to re-check. If FINDINGS.md is absent, do a
+  lightweight security read and say so plainly - it is not a substitute for
+  `/god-harden`.
+- **Code Quality** at the *diff* level is owned by `god-quality-reviewer` during
+  build. This dimension is the *whole-codebase* health read: report systemic
+  quality debt, not a line-by-line review of recent changes, and point to the
+  reviewer for per-slice concerns.
+Bands: 90-100 A, 80-89 B, 70-79 C, 60-69 D, 0-59 F. Risk does not average away:
+one Confirmed Critical caps its dimension at 69 and the overall at 79 until
+resolved.
 ## Process
-### 1. Inventory
-Walk the codebase looking for indicators per category:
-- Code: grep TODO/FIXME/HACK; cyclomatic complexity; duplicate code; long functions
-- Design: god classes; circular dependencies; mixed concerns
-- Dependency: `npm audit` / equivalent; date of last update; deprecation warnings
-- Test: coverage report; tests marked .skip; flaky test history; CI duration
-- Doc: comments referencing old code; README age; broken links
-- Security: SAST findings; missing input validation; hardcoded secrets
-- Operational: manual steps in deploy; runbooks not updated; alerts without runbooks
-- Knowledge: single contributors to critical code; no comments on complex algorithms
-### 2. Estimate cost to fix
-Per debt item, classify:
-- **S (small)**: <1 day, no behavior change
-- **M (medium)**: 1-3 days, possibly small behavior change
-- **L (large)**: 1-2 weeks, requires planning
-- **XL**: weeks-months, requires migration
-### 3. Estimate impact of NOT fixing
-Per item:
-- **HIGH**: blocks a planned feature, security risk, customer pain
-- **MEDIUM**: slows team, occasional bugs, maintenance burden
-- **LOW**: cosmetic, no observable impact
-### 4. Prioritize
-Priority = Impact × (1 / Cost). High-impact + small cost = top of list.
-| Priority | Definition |
-|----------|-----------|
-| **P0** | High impact + S/M cost. Do this sprint. |
-| **P1** | High impact + L cost OR Medium impact + S cost. Do this quarter. |
-| **P2** | Medium impact + M cost. Do when convenient. |
-| **P3** | Low impact OR XL cost without clear benefit. Backlog or ignore. |
+### 1. Orient and map
+Detect languages/frameworks/build system from manifests; measure size and decide
+exhaustive vs sampled (declare which). Locate entry points. Read the README to
+learn intended behavior and maturity. Trace two or three primary flows end to
+end. Record exclusions (vendored, generated, build output) and the commit/branch.
+### 2. Inventory across every dimension
+Use search to find candidates, then **read the cited code to confirm** before
+recording. A search hit is a lead, not a finding. Indicators per dimension:
+- Code: grep TODO/FIXME/HACK; complexity; duplication; long functions; dead code
+- Design: god files; circular deps; mixed concerns; structure-vs-docs drift
+- Security: read `.godpowers/harden/FINDINGS.md` first and cite it; only if it
+  is absent, do a lightweight read for untrusted input into queries/shell/paths/
+  HTML, secrets, weak crypto, and declared-but-unenforced guards
+- Test: critical-path coverage; assertion-free or over-mocked tests; `.skip`
+- Dependency: `npm audit` / equivalent; staleness; deprecations; pinning
+- Error handling: empty catches; lost error context; missing timeouts; partial commits
+- Performance: nested loops on large inputs; N+1; sync I/O on hot paths
+- Docs: setup steps vs scripts; documented endpoints that do not exist
+- Observability: structured logging; real vs paper health checks; config/secrets
+### 3. Verify adversarially and cluster
+Try to refute each candidate. Assign **Severity** (Critical/High/Medium/Low),
+**Confidence** (Confirmed/Likely/Suspected), and **Effort** (S under 1 day /
+M 1-3 days / L 1-2 weeks / XL weeks). Cluster repeated instances into one
+systemic finding, keeping the member IDs.
+### 4. Score and prioritize
+Score each dimension 0-100 with its justification; the overall is the weighted
+average with risk-capping. Bucket findings: **Quick wins** (High/Critical,
+Confirmed, S), **Plan now** (High/Critical, M or L), **Verify first** (any
+Suspected), **Backlog** (Low). Map to P0-P3: P0 = High impact + S/M; P1 = High
+impact + L or Medium + S; P2 = Medium + M; P3 = Low or XL without clear benefit.
 ### 5. Output
-Write `.godpowers/tech-debt/REPORT.md`:
+Write `.godpowers/tech-debt/REPORT.md`, self-contained for an acting agent with
+no memory of the audit:
 ```markdown
-# Tech Debt Assessment
-Date: [ISO 8601]
-Scope: [path or "entire codebase"]
-## Summary
-| Category | P0 | P1 | P2 | P3 | Total |
-|----------|----|----|----|----|-------|
-| Code | 3 | 5 | 12 | 8 | 28 |
-| Design | 1 | 2 | 4 | 1 | 8 |
-| Dependency | 0 | 1 | 3 | 7 | 11 |
-| ... | | | | | |
-Estimated debt: [N] person-weeks total
-P0+P1 paydown: [N] weeks (recommended next 1-2 sprints)
-## P0 - Do this sprint
-| ID | Category | Description | Cost | Impact | Recommendation |
-|----|----------|-------------|------|--------|----------------|
-| D-001 | Security | SQL injection in /api/search | S | HIGH | Fix immediately; route to /god-hotfix |
-| D-002 | Test | Auth module has 0% coverage | M | HIGH | Add tests via /god-add-tests before any auth changes |
-| D-003 | Operational | Deploy script has manual step | S | MEDIUM | Automate; route to /god-deploy revisit |
-## P1 - Do this quarter
-[Same structure]
-## P2 - When convenient
-[Same structure]
-## P3 - Backlog or ignore
-[Same structure; explanation if "ignore"]
-## Recommended next steps
-1. [Specific action with command, e.g., /god-hotfix for D-001]
-2. [Specific action]
+# Code Audit and Tech Debt Assessment
+Date: [ISO 8601] | Scope: [path or "entire codebase"] | State: [commit/branch]
+Read-only audit. Self-contained: every finding cites file:line and how to verify.
+## Snapshot
+Languages, size, frameworks, entry points, evident maturity, coverage
+(exhaustive or sampled, say what was sampled), exclusions.
+## Overall score
+NN/100 - Grade X (label). Two-to-four sentence verdict. One-line calibration.
+| Dimension | Score | Grade | Weight | Verdict |
+|---|---|---|---|---|
+| Security | NN | X | 20% | one-line specific verdict |
+| ... | | | | |
+| Overall | NN | X | 100% | weighted |
+## What to fix first
+Ordered union of Quick wins + Plan now, Critical before High.
+`[ID] title - severity, effort - one-line why`
+## Strengths (preserve these)
+What the codebase does well, each with evidence. Do not refactor these away.
+## Systemic patterns (root causes)
+One entry per recurring cause: what it is, member IDs, the one root fix.
+## Findings
+Sorted by severity then dimension. Each finding:
+### [SEC-001] <title>
+- Severity: <C/H/M/L> | Confidence: <Confirmed/Likely/Suspected> | Effort: <S/M/L/XL> | Dimension: <name>
+- Location: `file:line` (+ others)
+- Evidence: <what the code does now, precisely>
+- Impact: <concrete consequence>
+- Recommendation: <specific change and where; not a platitude>
+- Verify the fix: <test to add / behavior to check / command to run>
+- Related: <systemic pattern or finding IDs, or "none">
+## Remediation plan
+Quick wins / Plan now (suggested order) / Verify first / Backlog, by ID. Map to
+P0-P3. For each P0/P1, name the Godpowers command (for example /god-hotfix,
+/god-debug, /god-add-tests, /god-update-deps).
+## Scope and limitations
+What was and was not examined; sampling; assumptions that would change conclusions.
+## How to use this report (for the acting agent)
+1. Triage by severity and confidence. Confirmed Critical/High are safe to act on
+   now, in "What to fix first" order. Re-verify any Suspected finding first.
+2. Fix root causes (systemic patterns) before individual leaves.
+3. Preserve the strengths; do not refactor them away.
+4. One finding, one change, verified: run its "Verify the fix" after each fix;
+   keep changes atomic and traceable to the finding ID.
+5. Do not widen scope silently. Re-run the audit to confirm findings are
+   resolved, not relocated, and that no strength regressed.
 ```
+ID prefixes by dimension: SEC, ARC, QUAL, TEST, ERR, PERF, DEP, DOC, OBS. Keep
+IDs stable so a remediation loop can track each finding to closure.
 ## Have-Nots
 Debt assessment FAILS if:
-- All items in one priority bucket (no real prioritization)
-- Cost estimates without rationale
-- Impact estimates without specific consequences ("makes code messy" is not impact)
-- Recommendations without specific commands or workflows
-- "Comprehensive coverage" claim without grep evidence
-- Misses obvious categories (security debt with known CVEs)
+- A dimension score has no justification tied to specific findings
+- Any finding lacks a `file:line` or a Severity/Confidence/Effort rating
+- A recommendation is a platitude ("improve error handling", "add more tests")
+- Repeated issues are left loose instead of clustered into a systemic pattern
+- The Strengths section is missing
+- "Comprehensive coverage" is claimed without grep evidence or a stated sample
+- A Critical finding does not cap its dimension and the overall score
+- Obvious categories are missed (security debt with known CVEs)

package/bin/install.js CHANGED Viewed

@@ -53,6 +53,15 @@ function showHelp() {
   log('  next                 Show the dashboard and recommended next command');
   log('  state advance        Update one tracked Godpowers state step');
   log('  gate                 Check a tier artifact gate');
+  log('  verify               Run a command as executed verification evidence');
+  log('  can-close            Check whether a substep has the evidence to close');
+  log('  route                Classify a prompt into an entry play (quarterback)');
+  log('  report               Show the verification play-by-play since last report');
+  log('  reflect              Record a structured reflection to the ledger');
+  log('  memory               Set, get, list, or clear ledger memory entries');
+  log('  lesson               Add or list reusable lessons in the ledger');
+  log('  outcome              Start, check, stop, or inspect a bounded retry loop');
+  log('  import-ledger        Import an existing .mythify/ ledger into .godpowers/ledger/');
   log('  mcp-info             Show read-only MCP companion setup instructions');
   log('  quick-proof          Show a runnable proof from the shipped fixture');
   log('  automation-status    Show host automation provider support');
@@ -72,6 +81,13 @@ function showHelp() {
   log('  --step=<name>        Step for state advance, such as prd or tier-1.prd');
   log('  --status=<status>    Status for state advance');
   log('  --tier=<name>        Tier for gate: prd, design, arch, roadmap, stack, repo, build, or harden');
+  log('  --substep=<id>       Substep for verify, such as tier-2.build');
+  log('  --claim=<text>       Claim a verify command checks');
+  log('  --timeout=<seconds>  Kill a verify command after this many seconds (default 300)');
+  log('  --attest             Record a self-reported attested claim instead of executing');
+  log('  --evidence=<text>    Self-reported evidence for verify --attest');
+  log('  --since=<last|all>   Window for report: new records since last, or all');
+  log('  --peek               Show the report without advancing the report cursor');
   log('  --json               Emit JSON for status, next, proof, or automation commands');
   log('  --brief              Render compact output for status, next, or proof');
   log('  --full               Render complete output for status, next, or demo');
@@ -111,6 +127,15 @@ function showHelp() {
   log('  npx godpowers next --project=.');
   log('  npx godpowers state advance --step=prd --status=done --project=.');
   log('  npx godpowers gate --tier=prd --project=.');
+  log('  npx godpowers verify "npm test" --substep tier-2.build --claim "build slice tests pass" --project=.');
+  log('  npx godpowers can-close --substep tier-2.build --project=.');
+  log('  npx godpowers route "add a feature" --project=.');
+  log('  npx godpowers report --since last --project=.');
+  log('  npx godpowers reflect --action "ran build" --outcome failure --next "fix the failing test" --project=.');
+  log('  npx godpowers memory set decision "use postgres" --category decision --project=.');
+  log('  npx godpowers lesson add "guard inputs before parsing" --tags parsing --project=.');
+  log('  npx godpowers outcome start green-build --verify "npm test" --budget 3 --substep tier-2.build --project=.');
+  log('  npx godpowers import-ledger --from ../legacy/.mythify --project=.');
   log('  npx godpowers mcp-info --project=.');
   log('  npx godpowers quick-proof --project=.');
   log('  npx godpowers automation-status --project=.');
@@ -231,6 +256,15 @@ module.exports = {
   runExtensionScaffoldCommand: cliDispatch.runExtensionScaffoldCommand,
   runGateCommand: cliDispatch.runGateCommand,
   runStateCommand: cliDispatch.runStateCommand,
+  runVerifyCommand: cliDispatch.runVerifyCommand,
+  runCanCloseCommand: cliDispatch.runCanCloseCommand,
+  runRouteCommand: cliDispatch.runRouteCommand,
+  runReportCommand: cliDispatch.runReportCommand,
+  runReflectCommand: cliDispatch.runReflectCommand,
+  runMemoryCommand: cliDispatch.runMemoryCommand,
+  runLessonCommand: cliDispatch.runLessonCommand,
+  runOutcomeCommand: cliDispatch.runOutcomeCommand,
+  runImportLedgerCommand: cliDispatch.runImportLedgerCommand,
   applyDefaultRuntimeSelection,
   runInstall,
   runUninstall,

package/fixtures/gate/harden-pass/.godpowers/state.json ADDED Viewed

@@ -0,0 +1,26 @@
+{
+  "$schema": "https://godpowers.dev/schema/state.v1.json",
+  "version": "1.0.0",
+  "project": {
+    "name": "harden-pass",
+    "started": "2026-06-10T00:00:00.000Z"
+  },
+  "tiers": {
+    "tier-3": {
+      "harden": {
+        "status": "done",
+        "updated": "2026-06-10T18:08:00.000Z",
+        "verification": {
+          "commands": [
+            {
+              "command": "npm audit --omit=dev",
+              "status": "pass",
+              "exitCode": 0,
+              "ranAt": "2026-06-10T18:07:00.000Z"
+            }
+          ]
+        }
+      }
+    }
+  }
+}

package/lib/artifact-map.js CHANGED Viewed

@@ -33,7 +33,8 @@ const TIER_ARTIFACTS = {
 const TIER_STATE_STEPS = {
   design: { tierKey: 'tier-1', subStepKey: 'design' },
-  build: { tierKey: 'tier-2', subStepKey: 'build' }
+  build: { tierKey: 'tier-2', subStepKey: 'build' },
+  harden: { tierKey: 'tier-3', subStepKey: 'harden' }
 };
 function normalizeTier(tier) {