npm - nubos-pilot - Versions diffs - 1.2.2 → 1.2.3 - Mend

nubos-pilot 1.2.2 → 1.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (63)

package/CHANGELOG.md +10 -0
package/agents/np-architect.md +2 -0
package/agents/np-executor.md +1 -1
package/agents/np-learnings-extractor.md +54 -0
package/agents/np-planner.md +1 -1
package/agents/np-security-reviewer.md +9 -0
package/bin/np-tools/_commands.cjs +4 -0
package/bin/np-tools/derive-tier.cjs +86 -0
package/bin/np-tools/derive-tier.test.cjs +83 -0
package/bin/np-tools/learnings.cjs +105 -0
package/bin/np-tools/learnings.test.cjs +66 -0
package/bin/np-tools/loop-run-round.cjs +7 -1
package/bin/np-tools/skill-audit.cjs +79 -0
package/bin/np-tools/skill-audit.test.cjs +86 -0
package/bin/np-tools/verify-reliability.cjs +65 -0
package/bin/np-tools/verify-reliability.test.cjs +69 -0
package/lib/agents.test.cjs +1 -0
package/lib/config-defaults.cjs +13 -0
package/lib/config-schema.cjs +11 -0
package/lib/eval-reliability.cjs +63 -0
package/lib/eval-reliability.test.cjs +56 -0
package/lib/install/claude-hooks-learnings.test.cjs +82 -0
package/lib/install/claude-hooks.cjs +65 -4
package/lib/install/claude-hooks.test.cjs +5 -2
package/lib/learnings/capture-ledger.cjs +80 -0
package/lib/learnings/capture-ledger.test.cjs +54 -0
package/lib/learnings/extract.cjs +191 -0
package/lib/learnings/extract.test.cjs +115 -0
package/lib/nubosloop-audit.cjs +104 -0
package/lib/nubosloop-skill-audit.test.cjs +98 -0
package/lib/nubosloop.cjs +9 -0
package/lib/tier-classify.cjs +67 -0
package/lib/tier-classify.test.cjs +67 -0
package/np-tools.cjs +4 -0
package/package.json +1 -1
package/skills/np-access-control/SKILL.md +42 -0
package/skills/np-accessibility-audit/SKILL.md +41 -0
package/skills/np-adr/SKILL.md +37 -0
package/skills/np-api-design/SKILL.md +34 -0
package/skills/np-caching-strategy/SKILL.md +38 -0
package/skills/np-data-modeling/SKILL.md +37 -0
package/skills/np-data-privacy/SKILL.md +39 -0
package/skills/np-dependency-audit/SKILL.md +47 -0
package/skills/np-encryption/SKILL.md +47 -0
package/skills/np-error-handling/SKILL.md +37 -0
package/skills/np-incident-response/SKILL.md +38 -0
package/skills/np-llm-app-architecture/SKILL.md +50 -0
package/skills/np-observability/SKILL.md +39 -0
package/skills/np-performance/SKILL.md +38 -0
package/skills/np-queue-design/SKILL.md +32 -0
package/skills/np-rag-design/SKILL.md +43 -0
package/skills/np-refactoring/SKILL.md +35 -0
package/skills/np-resilience-patterns/SKILL.md +39 -0
package/skills/np-secure-code-review/SKILL.md +46 -0
package/skills/np-secure-design/SKILL.md +44 -0
package/skills/np-service-boundary/SKILL.md +35 -0
package/skills/np-system-design/SKILL.md +40 -0
package/skills/np-test-strategy/SKILL.md +46 -0
package/skills/np-threat-model/SKILL.md +42 -0
package/templates/claude/payload/hooks/np-learnings-hook.cjs +55 -0
package/workflows/architect-phase.md +21 -1
package/workflows/execute-phase.md +66 -4
package/workflows/verify-work.md +17 -4

package/templates/claude/payload/hooks/np-learnings-hook.cjs ADDED

@@ -0,0 +1,55 @@
+#!/usr/bin/env node
+'use strict';
+const fs = require('node:fs');
+const path = require('node:path');
+const cp = require('node:child_process');
+// ADR-0010 / ECC continuous-learning: thin Stop-hook shim. On session Stop it
+// asks np-tools to (rate-limited) auto-capture reusable learnings from the
+// turn's diff; on UserPromptSubmit it resets the consecutive-stop streak. All
+// heavy logic lives in lib/learnings/. A learning hook must NEVER break the
+// session — every failure path exits 0 silently.
+const ALLOWED_VERBS = new Set(['capture', 'reset']); // Only these argv[2] verbs are forwarded to np-tools; any other value makes the hook exit 0 without doing anything.
+function resolveNpTools() { // Returns the path of the first candidate np-tools.cjs that exists as a regular file, or null if none does.
+  const candidates = [
+    path.join(process.cwd(), '.nubos-pilot', 'bin', 'np-tools.cjs'), // first choice: under the current working directory
+    path.join(__dirname, '..', '..', '..', '.nubos-pilot', 'bin', 'np-tools.cjs'), // fallback: three directories above this script's own directory
+  ];
+  for (const c of candidates) {
+    try { if (fs.statSync(c).isFile()) return c; } catch {} // statSync throws when the path cannot be stat'ed; ignore and try the next candidate
+  }
+  return null;
+}
+function readStdin() { // Returns a Promise that resolves with the text read from stdin ('' when stdin is a TTY).
+  return new Promise((resolve) => {
+    if (process.stdin.isTTY) return resolve(''); // stdin is a terminal: resolve immediately with an empty string instead of waiting
+    let buf = '';
+    process.stdin.setEncoding('utf-8');
+    const timer = setTimeout(() => { try { process.stdin.removeAllListeners(); } catch {} resolve(buf); }, 800); // safety net: after 800 ms stop listening and resolve with whatever has arrived so far
+    process.stdin.on('data', (c) => { buf += c; });
+    process.stdin.on('end', () => { clearTimeout(timer); resolve(buf); }); // normal completion: cancel the timer and resolve with the full input
+    process.stdin.on('error', () => { clearTimeout(timer); resolve(buf); }); // read error: resolve with the partial input rather than rejecting
+  });
+}
+(async () => { // Entry point: validate the verb, locate np-tools, forward stdin to `np-tools learnings <verb> --stdin`, then exit 0.
+  const verb = process.argv[2]; // first CLI argument after the script path
+  if (!ALLOWED_VERBS.has(verb)) { process.exit(0); return; }
+  const npTools = resolveNpTools();
+  if (!npTools) { process.exit(0); return; } // np-tools.cjs was not found: exit 0 without doing anything
+  const input = await readStdin();
+  try {
+    cp.spawnSync(process.execPath, [npTools, 'learnings', verb, '--stdin'], { // run np-tools with the current Node binary; the return value is not inspected
+      input, // the stdin text read above becomes the child's stdin
+      encoding: 'utf-8',
+      timeout: 15000, // milliseconds
+      maxBuffer: 4 * 1024 * 1024, // 4 MiB
+      cwd: process.cwd(),
+    });
+  } catch { /* never let a learning hook break the session */ }
+  process.exit(0); // exit code is 0 whatever happened to the child process
+})().catch(() => { process.exit(0); }); // a rejection from the async body also ends with exit code 0

package/workflows/architect-phase.md CHANGED

@@ -74,9 +74,24 @@ The architect then consumes the consensus-merged `RESEARCH.md` instead of a sing
 After the architect emits `M<NNN>-ARCHITECTURE.md`, the orchestrator spawns ONE `np-critic` instance with the architecture file + `M<NNN>-CONTEXT.md` as inputs. The critic verifies that every locked decision in CONTEXT has a corresponding architecture entry and that no `Deferred` items leaked into the architecture. Findings of category `unmet-criterion`, `locked-decision-violation`, or `information-missing` route per `lib/nubosloop.cjs::routeFindings`. A single Build-Fixer-style round on the architect closes the loop. Beyond one round the workflow exits with `stuck` and the user resolves manually — architecture decisions don't merit unbounded looping.
+## Skills (Nubos library)
+Nubos ships a design-time skill library under `.claude/skills/np-*/` (present only on Claude Code). These are the **quality bar for the architecture decisions you are about to commit** — each skill's "Verification bar" is the standard each ADR-style decision is held to. Before spawning `np-architect`, classify the milestone (read `M<NNN>-CONTEXT.md` + `M<NNN>-RESEARCH.md`) and inject the matching skill triggers into the architect's spawn prompt. Skills **stack** — include every row the milestone matches (cap at the most relevant ~4 if more match; always keep the security row when it applies).
+| Milestone signal | Skills to trigger |
+|---|---|
+| Designs a new system, module, or significant feature | `np-system-design` (with `np-adr` for an architecturally significant, hard-to-reverse choice) |
+| Introduces or moves a module/service boundary, splits a service, or chooses sync vs async | `np-service-boundary` |
+| Any structural decision that is costly to reverse — datastore, sync/async, new dependency, auth model, public contract | `np-adr` |
+| New external surface, new trust boundary, new privilege model, or handling of sensitive assets | `np-secure-design` (with `np-threat-model`) |
+| Authorization model — roles, permissions, policies, resource ownership | `np-access-control` |
+| Design depends on an unreliable dependency, async/queue processing, or a cache | `np-resilience-patterns`, `np-queue-design`, `np-caching-strategy` (each as it applies) |
+| Persisted data shape, or personal/sensitive data | `np-data-modeling`, `np-data-privacy` (as they apply) |
+| Purely additive milestone with no structural/security decision | None — skip the skill block (and reconsider whether the architect pass is needed at all) |
 ## Spawn np-architect
-Spawn `agents/np-architect.md` mit dem folgenden Files-to-Read-Block:
+Spawn `agents/np-architect.md` mit dem folgenden Files-to-Read-Block und (sofern Skills matchen) der angehängten Skill-Direktive:
 ```
 <files_to_read>
@@ -88,8 +103,13 @@ Spawn `agents/np-architect.md` mit dem folgenden Files-to-Read-Block:
 Milestone: M<NNN>
 Task: Emit M<NNN>-ARCHITECTURE.md per the agent's Output Contract.
+Use the following Nubos skills as the quality bar for your decisions: <skill-1>, <skill-2>, ...
+Each is installed at .claude/skills/<skill>/SKILL.md; every architecture decision must satisfy the matching skill's "Verification bar".
 ```
+If zero skills match, omit the skill-directive line — do not invent skills.
 Der Agent ist read-only auf Source — er schreibt EINE Datei:
 `.nubos-pilot/milestones/M<NNN>/M<NNN>-ARCHITECTURE.md`.

package/workflows/execute-phase.md CHANGED

@@ -33,6 +33,8 @@ if [[ "$INIT" == @file:* ]]; then INIT=$(cat "${INIT#@file:}"); fi
 AGENT_SKILLS_EXECUTOR=$(node .nubos-pilot/bin/np-tools.cjs agent-skills executor 2>/dev/null)
 RUNTIME=$(node .nubos-pilot/bin/np-tools.cjs detect-runtime)
 WORKTREE_ISOLATION=$(node .nubos-pilot/bin/np-tools.cjs config-get workflow.worktree_isolation 2>/dev/null || echo "false")
+TIER_ROUTING=$(node .nubos-pilot/bin/np-tools.cjs config-get workflow.tier_routing 2>/dev/null || echo "false")
+VERIFY_RUNS=$(node .nubos-pilot/bin/np-tools.cjs config-get loop.verify_runs 2>/dev/null || echo "1")
 ```
 When `--verify-work` is passed, the init payload's `auto_verify: true` flag tells this workflow to chain into `/np:verify-work $PHASE` after every slice committed and `finalize-milestone` ran. Without the flag the workflow stops after finalize as before — verify-work then remains a separate manual step.
@@ -57,18 +59,44 @@ Parse JSON for: `milestone`, `milestone_id`, `milestone_dir`, `waves[]` (each wi
 Nubos ships a skill library under `.claude/skills/np-*/` (auto-installed by `npx nubos-pilot`, present only on Claude Code). For each task in a wave, before spawning `np-executor`, classify the task by reading its `T<NNNN>-PLAN.md` and inject the matching skill triggers into the executor's spawn prompt as a "Use these skills" directive. The executor then loads each skill's `SKILL.md` via the runtime's skill mechanism and follows its rules during implementation.
-Mapping (match the dominant signal in `files_modified` + task description):
+Match the task against **both** tables below — a task can match rows in each (e.g. a new authenticated endpoint backed by a migration is UI-free but matches `np-api-design` + `np-secure-code-review` + `np-data-modeling`). Skills **stack**: trigger every row whose signal the task matches. The only exception is the UI style anchor (pick exactly one). If more than ~4 rows match, keep the most task-critical and always retain any security row (`np-secure-code-review` / `np-threat-model`) and `np-test-strategy` for behaviour changes.
+**UI / frontend** (match the dominant signal in `files_modified` + task description):
 | Task signal | Skills to trigger |
 |---|---|
-| Any UI/component edit (`.tsx`, `.jsx`, `.vue`, `.svelte`, `views/**`, `components/**`, `pages/**`, `app/**`) | `np-impeccable` (polish/audit), `np-frontend-design` (build), `np-design` (review), `np-web-design-guidelines` (a11y/UX) |
+| Any UI/component edit (`.tsx`, `.jsx`, `.vue`, `.svelte`, `views/**`, `components/**`, `pages/**`, `app/**`) | `np-impeccable` (polish/audit), `np-frontend-design` (build), `np-design` (review), `np-web-design-guidelines` (a11y/UX), `np-accessibility-audit` (WCAG AA bar) |
 | `components.json` present in repo OR shadcn/ui imports in modified files | `np-shadcn` (in addition to UI skills above) |
 | React/Next.js component or hook edit | `np-react-best-practices`, `np-composition-patterns` |
 | Page/route transitions, `<ViewTransition>`, `startViewTransition` | `np-react-view-transitions` |
 | React Native / Expo source (`*.tsx` under `app/`, `screens/`, `mobile/**`) | `np-react-native-skills` |
 | Restyling an existing surface (no greenfield) | `np-redesign-existing-projects` |
 | New surface needing visual direction | Pick exactly **one** style anchor: `np-high-end-visual-design` (default agency premium), `np-minimalist-ui`, `np-industrial-brutalist-ui`, or `np-stitch-design-taste` |
-| Non-UI task (backend, infra, tooling, docs) | None — skip the skill block entirely |
+**Engineering / non-UI** (these stack — include each row the task matches):
+| Task signal | Skills to trigger |
+|---|---|
+| Adds/changes a consumed contract — HTTP route, RPC/GraphQL handler, controller, resolver, public SDK/library function, CLI flag | `np-api-design` |
+| Touches auth, authz, session, secrets, crypto, SQL/query construction, file upload, deserialization, or any untrusted input reaching a sink | `np-secure-code-review` |
+| Introduces or alters a trust boundary — new ingress, webhook/callback, queue consumer, third-party integration, or a new store for credentials/PII | `np-threat-model` (with `np-secure-code-review`) |
+| DB schema, migration, ORM model/entity, or any backfill/transform of persisted data | `np-data-modeling` |
+| Backend/service/integration/IO path that can fail — external calls, retries, timeouts, batch work | `np-error-handling` |
+| Calls an external/unreliable dependency (other service, third-party API, DB under load) | `np-resilience-patterns` (with `np-error-handling`) |
+| New service/handler/job/integration path, or a new failure path that must be diagnosable | `np-observability` |
+| Data access, queries, loops over collections, hot paths — anything that scales with input size | `np-performance` |
+| Adds or changes a cache / memoization layer (in-memory, distributed, HTTP/CDN) | `np-caching-strategy` |
+| Message queue, background job, worker, async consumer, or event handler | `np-queue-design` |
+| Introduces or changes a module/service boundary, splits a service, or makes a cross-module change | `np-service-boundary` |
+| Roles, permissions, policies, resource ownership, or access-rule changes (RBAC/ABAC, authz checks) | `np-access-control` (with `np-secure-code-review`) |
+| Encryption, hashing, password storage, TLS, tokens, signing/HMAC, or key/secret management | `np-encryption` |
+| Adds or upgrades a third-party dependency, or edits a manifest/lockfile | `np-dependency-audit` |
+| Collects, stores, processes, exports, or logs personal/sensitive data (PII) | `np-data-privacy` |
+| Refactor / cleanup / restructure where behaviour must be preserved | `np-refactoring` |
+| Risky / hard-to-reverse / high-blast-radius change — feature flags, migration coupled to code, change to an external integration | `np-incident-response` |
+| LLM / agent / prompt / tool-use / structured-output / AI feature | `np-llm-app-architecture` (add `np-rag-design` if it retrieves from a corpus) |
+| Any change to logic or behaviour (almost all non-trivial tasks) | `np-test-strategy` |
+| Pure docs/config with no behaviour change | None — skip the skill block |
 **Spawn-prompt injection format.** Append to the executor prompt verbatim (one line per matched skill):
@@ -78,6 +106,14 @@ Each skill is installed at .claude/skills/<skill>/SKILL.md and encodes a
 quality bar you must satisfy before invoking commit-task.
 ```
+**Consultation audit (counterpart to Rule 9).** Whenever you inject a non-empty skill block, BEFORE spawning the executor record the expected set so the post-critics gate can verify the executor actually consulted them:
+```bash
+node .nubos-pilot/bin/np-tools.cjs skill-audit expect --task "$TASK_ID" --skills "<skill-1>,<skill-2>,..."
+```
+The executor stamps each skill it reads via `skill-audit ack`. At post-critics, any injected-but-unconsulted skill becomes a `skill-bar-unconsulted` finding that routes the task back to the executor (once per round, bounded by `loop.maxRounds`) — exactly like a Rule-9 search miss. Skip the `expect` call only when zero skills were injected.
 If zero skills match, omit the block — do **not** invent skills. Adding new skills under `skills/np-*/` in the source repo is sufficient: the next `npx nubos-pilot update` rolls them out and you extend this mapping in one PR.
 ## Pre-Flight — orphan-checkpoint guard
@@ -262,6 +298,7 @@ for WAVE_INDEX in 0 1 2 ...; do
     TASK_JSON=$(node .nubos-pilot/bin/np-tools.cjs init execute-milestone execute-task "$PHASE" "$TASK_ID")
     if [[ "$TASK_JSON" == @file:* ]]; then TASK_JSON=$(cat "${TASK_JSON#@file:}"); fi
     TASK_QUERY=$(echo "$TASK_JSON" | node -e "process.stdin.on('data', d => { const j=JSON.parse(d); console.log(j.query || j.name || ''); })")
+    TASK_TIER=$(echo "$TASK_JSON" | node -e "process.stdin.on('data', d => { const j=JSON.parse(d); console.log(j.tier || 'sonnet'); })")
     EXECUTOR_START=$(node .nubos-pilot/bin/np-tools.cjs metrics start-timestamp)
     CONSENSUS_PATTERN=""
@@ -371,7 +408,18 @@ for WAVE_INDEX in 0 1 2 ...; do
       else
         EXECUTOR_AGENT="np-build-fixer"
       fi
-      EXECUTOR_MODEL=$(node .nubos-pilot/bin/np-tools.cjs resolve-model "$EXECUTOR_AGENT" --profile frontier)
+      # Model resolution. Default (tier_routing off): the executor always runs at
+      # the `frontier` profile — every task gets the strongest model. Opt-in
+      # tier-routing (config `workflow.tier_routing: true`) instead honours the
+      # planner's per-task `tier` under the project's configured `model_profile`
+      # (default `balanced`), so trivial→haiku / standard→sonnet / large→opus —
+      # ECC-style cost-aware routing. Round-2+ build-fixer always stays frontier:
+      # fixing a failing task wants the strongest model regardless of routing.
+      if [[ "$TIER_ROUTING" == "true" && "$ROUND" -eq 1 ]]; then
+        EXECUTOR_MODEL=$(node .nubos-pilot/bin/np-tools.cjs resolve-model "$TASK_TIER")
+      else
+        EXECUTOR_MODEL=$(node .nubos-pilot/bin/np-tools.cjs resolve-model "$EXECUTOR_AGENT" --profile frontier)
+      fi
       # → execute group (1) per ACTION CONTRACT above, then:
       node .nubos-pilot/bin/np-tools.cjs checkpoint transition "$TASK_ID" verifying
@@ -380,6 +428,20 @@ for WAVE_INDEX in 0 1 2 ...; do
       VERIFY_LOG="${TMPDIR:-/tmp}/np-verify-${TASK_ID}-r${ROUND}.log"
       # Orchestrator (NOT the agent) runs the task's <verify> command + stack
       # linters; redirect stdout+stderr to $VERIFY_LOG.
+      #
+      # pass@k reliability (opt-in, $VERIFY_RUNS, default 1): run the SAME verify
+      # command $VERIFY_RUNS times, collecting one exit code per run into a
+      # comma-separated list ($VERIFY_CODES, e.g. "0,1,0"). With the default of 1
+      # this is a single run, identical to before. Then fold the runs:
+      #   VERIFY_EXIT=$?                                   # when $VERIFY_RUNS == 1
+      #   # when $VERIFY_RUNS > 1:
+      #   REL=$(node .nubos-pilot/bin/np-tools.cjs verify-reliability --codes "$VERIFY_CODES")
+      #   VERIFY_EXIT=$(echo "$REL" | node -e 'process.stdin.on("data",d=>console.log(JSON.parse(d).aggregate_exit_code))')
+      #   # append the human verdict so a FLAKY task tells the build-fixer why it is red:
+      #   echo "$REL" | node -e 'process.stdin.on("data",d=>console.log(JSON.parse(d).description))' >> "$VERIFY_LOG"
+      # aggregate_exit_code is 0 only when EVERY run passed (pass^k); a flaky task
+      # (some pass, some fail) aggregates to red and flows through the normal
+      # spawn-build-fixer path below — no new critic category, no spurious stuck.
       VERIFY_EXIT=$?
       # Stamp executor spawn-evidence into the audit log. EXECUTOR_TOOL_LOG is
       # the tool-name JSON array harvested from the spawn's tool_use stream

package/workflows/verify-work.md CHANGED

@@ -38,15 +38,28 @@ Parse: `milestone`, `milestone_id`, `milestone_dir`, `milestone_name`, `success_
 ## Skills (Nubos library)
-For UI/UX-flavoured success criteria, instruct the verifier (in its spawn prompt) to load the matching Nubos skill before classifying:
+Instruct the verifier (in its spawn prompt) to load the matching Nubos skill before classifying — the skill's "Verification bar" is the standard the SC is judged against, not just the SC's own wording:
 | SC type | Skill to use |
 |---|---|
 | Visual polish, layout, hierarchy, motion | `np-impeccable` (`.claude/skills/np-impeccable/SKILL.md`) |
-| Accessibility, semantic HTML, UX heuristics | `np-web-design-guidelines` |
+| Accessibility, semantic HTML, keyboard/contrast | `np-web-design-guidelines`, `np-accessibility-audit` |
 | Component architecture, design-system fit | `np-design` |
-For borderline Pass/Fail calls in Pass 2 (deterministic evidence inconclusive **and** the SC carries real consequences), pressure-test with **`np-council`** before flipping `needs_user_confirm` → `Pass`/`Fail`. Backend/infra/data SCs do not need any skill — verdict on evidence alone.
+| API / endpoint / contract behaviour | `np-api-design` |
+| Security, auth, input handling, secrets, crypto | `np-secure-code-review` (and `np-threat-model` if a new trust boundary) |
+| Authorization — roles, permissions, ownership, access rules | `np-access-control` |
+| Encryption, hashing, TLS, key/secret management | `np-encryption` |
+| Personal/sensitive data handling, retention, logging | `np-data-privacy` |
+| Schema / migration / data correctness | `np-data-modeling` |
+| Error handling, retries, failure modes | `np-error-handling` |
+| Resilience under dependency failure — timeout, circuit-breaker, fallback | `np-resilience-patterns` |
+| Caching correctness / invalidation | `np-caching-strategy` |
+| Async job / queue / worker behaviour — idempotency, ordering, DLQ | `np-queue-design` |
+| Module/service boundary, coupling, contract integrity | `np-service-boundary` |
+| Performance, latency, query/loop cost | `np-performance` |
+| LLM / agent / retrieval behaviour | `np-llm-app-architecture`, `np-rag-design` |
+For borderline Pass/Fail calls in Pass 2 (deterministic evidence inconclusive **and** the SC carries real consequences), pressure-test with **`np-council`** before flipping `needs_user_confirm` → `Pass`/`Fail`. An SC with no matching skill is judged on evidence alone.
 ## Output-Schema (pre-spawn injection)