npm - @openwop/openwop-conformance - Versions diffs - 1.12.0 → 1.14.0 - Mend

@openwop/openwop-conformance 1.12.0 → 1.14.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (45) hide show

package/CHANGELOG.md +23 -0
package/README.md +2 -2
package/api/openapi.yaml +60 -0
package/coverage.md +18 -6
package/fixtures/wasm-sandbox/isolation-global.wasm +0 -0
package/fixtures/wasm-sandbox/isolation-global.wat +6 -0
package/fixtures/wasm-sandbox/misbehaving-capability-gate.wasm +0 -0
package/fixtures/wasm-sandbox/misbehaving-capability-gate.wat +4 -0
package/fixtures/wasm-sandbox/misbehaving-env.wasm +0 -0
package/fixtures/wasm-sandbox/misbehaving-env.wat +4 -0
package/fixtures/wasm-sandbox/misbehaving-fs.wasm +0 -0
package/fixtures/wasm-sandbox/misbehaving-fs.wat +4 -0
package/fixtures/wasm-sandbox/misbehaving-memory.wasm +0 -0
package/fixtures/wasm-sandbox/misbehaving-memory.wat +5 -0
package/fixtures/wasm-sandbox/misbehaving-network.wasm +0 -0
package/fixtures/wasm-sandbox/misbehaving-network.wat +4 -0
package/fixtures/wasm-sandbox/misbehaving-process.wasm +0 -0
package/fixtures/wasm-sandbox/misbehaving-process.wat +4 -0
package/fixtures/wasm-sandbox/misbehaving-timeout.wasm +0 -0
package/fixtures/wasm-sandbox/misbehaving-timeout.wat +4 -0
package/fixtures/wasm-sandbox/well-behaved-echo.wasm +0 -0
package/fixtures/wasm-sandbox/well-behaved-echo.wat +2 -0
package/fixtures/wasm-sandbox/well-behaved-host-fetch.wasm +0 -0
package/fixtures/wasm-sandbox/well-behaved-host-fetch.wat +3 -0
package/package.json +1 -1
package/src/lib/agentDeployment.ts +117 -0
package/src/lib/agentEval.ts +83 -0
package/src/lib/discovery-capabilities.ts +18 -19
package/src/lib/egressPolicy.ts +76 -0
package/src/lib/profiles.ts +15 -0
package/src/lib/sandbox-timeout-worker.mjs +31 -0
package/src/lib/toolCatalog.ts +81 -0
package/src/lib/wasm-sandbox-probe.ts +168 -0
package/src/scenarios/agent-deployment-lifecycle.test.ts +147 -0
package/src/scenarios/agent-eval-run.test.ts +145 -0
package/src/scenarios/core-standard-profile.test.ts +75 -0
package/src/scenarios/egress-audience-binding.test.ts +81 -0
package/src/scenarios/egress-decision-content-free.test.ts +57 -0
package/src/scenarios/multi-agent-confidence-escalation.test.ts +12 -7
package/src/scenarios/prompt-resolution-chain-event.test.ts +113 -0
package/src/scenarios/sandbox-wasm-isolation.test.ts +98 -0
package/src/scenarios/sandbox-wasm-timeout.test.ts +40 -0
package/src/scenarios/tool-catalog-projection.test.ts +120 -0
package/src/scenarios/tool-session-lifecycle.test.ts +105 -0
package/src/scenarios/workspace-cross-tenant-isolation-blackbox.test.ts +89 -0

package/src/scenarios/agent-deployment-lifecycle.test.ts ADDED Viewed

@@ -0,0 +1,147 @@
+/**
+ * Agent deployment lifecycle — the §E promotion contract + §B channel pin
+ * (RFC 0082) — behavioral.
+ *
+ * Capability-gated on `agents.deployment.supported` (root-first per RFC 0073).
+ * Soft-skips when unadvertised (default) / hard-fails under
+ * `OPENWOP_REQUIRE_BEHAVIOR=true`. The always-on wire-shape coverage lives in
+ * `agent-deployment-shape.test.ts`; this asserts host BEHAVIOR via the
+ * `POST /v1/host/sample/agents/deployment-transition` seam + the test event-log
+ * seam + the NORMATIVE `GET /v1/agents/{agentId}/deployments` read:
+ *
+ *   1. PROMOTE (§E) — authorize → approvalGate → eval-verify → a content-free
+ *      `deployment.promoted` with `toState` in the seven-state vocabulary; the
+ *      returned record validates against `agent-deployment.schema.json`.
+ *   2. FAIL-CLOSED (§E-1, `deployment-promotion-fail-closed`) — a principal
+ *      lacking `deploy:promote` is denied (`allowed !== true`) and emits NO
+ *      `deployment.promoted`.
+ *   3. EVAL-GATE-UNMET (§E-3) — a promote whose `evalRunId` has `passed:false`
+ *      is denied with `eval_gate_unmet` and emits NO `deployment.promoted`.
+ *   4. CHANNEL PIN (§B) — a `@channel`-bound run records the resolved version as
+ *      `resolvedAgentVersion` on `agent.invocation.started` (the recorded fact a
+ *      replay re-reads rather than re-resolving).
+ *
+ * Each leg soft-skips independently (seam absent / event-log seam absent).
+ *
+ * Spec references:
+ *   - https://github.com/openwop/openwop/blob/main/spec/v1/agent-deployment.md (§B/§E)
+ *   - https://github.com/openwop/openwop/blob/main/RFCS/0082-agent-deployment-lifecycle.md
+ */
+import { describe, it, expect } from 'vitest';
+import { readFileSync } from 'node:fs';
+import { join } from 'node:path';
+import Ajv2020 from 'ajv/dist/2020.js';
+import addFormats from 'ajv-formats';
+import { driver } from '../lib/driver.js';
+import { behaviorGate } from '../lib/behavior-gate.js';
+import { SCHEMAS_DIR } from '../lib/paths.js';
+import {
+  readDeploymentCap,
+  driveDeploymentTransition,
+  DEPLOYMENT_STATES,
+  DEPLOYMENT_CONTENT_FORBIDDEN,
+} from '../lib/agentDeployment.js';
+import { queryTestEvents, isEventLogSeamAvailable, resetTestSeam } from '../lib/event-log-query.js';
+/** Reads the file `name` under SCHEMAS_DIR as UTF-8 and returns it parsed as JSON. */
+function loadSchema(name: string): Record<string, unknown> {
+  const schemaPath = join(SCHEMAS_DIR, name);
+  const schemaText = readFileSync(schemaPath, 'utf8');
+  return JSON.parse(schemaText) as Record<string, unknown>;
+}
+/**
+ * Fails the current test if `payload` has any key named in
+ * DEPLOYMENT_CONTENT_FORBIDDEN; `where` labels the payload in the message.
+ */
+function expectContentFree(payload: Record<string, unknown>, where: string): void {
+  for (const forbiddenKey of DEPLOYMENT_CONTENT_FORBIDDEN) {
+    const keyAbsent = !(forbiddenKey in payload);
+    const failureMessage = driver.describe(
+      'RFC 0082 §D (deployment-event-no-content-leak)',
+      `${where} MUST be content-free (no ${forbiddenKey})`,
+    );
+    expect(keyAbsent, failureMessage).toBe(true);
+  }
+}
+/**
+ * Behavioral scenario for RFC 0082 §B/§E, run as one vitest case with four legs:
+ * promote, unauthorized, eval-gate-unmet and channel-pin.
+ *
+ * Returns early (soft-skip) when the behavior gate declines, when the event-log
+ * seam is unavailable, or when the deployment seam returns null. Once the seam
+ * has been driven, `resetTestSeam()` runs in a `finally` so a failing assertion
+ * cannot skip the reset.
+ */
+describe('agent-deployment-lifecycle (RFC 0082 §B/§E)', () => {
+  it('promotes via the eval+RBAC+approval gate, fails closed without scope/eval, and pins the channel version', async () => {
+    const cap = await readDeploymentCap();
+    if (!behaviorGate('openwop-deployment-lifecycle', cap?.supported === true)) return;
+    if (!(await isEventLogSeamAvailable())) return; // event-log seam absent — soft-skip
+    const ajv = new Ajv2020({ strict: false, allErrors: true });
+    addFormats(ajv);
+    const validateRecord = ajv.compile(loadSchema('agent-deployment.schema.json'));
+    // ---- Leg 1: eval+RBAC+approval-gated promotion (§E) ------------------
+    const promote = await driveDeploymentTransition({ scenario: 'promote' });
+    if (promote === null) return; // deployment seam unwired — soft-skip the whole behavioral suite
+    try {
+      if (promote.record) {
+        expect(
+          validateRecord(promote.record),
+          driver.describe(
+            'agent-deployment.schema.json',
+            `a promoted deployment record MUST validate (${ajv.errorsText(validateRecord.errors)})`,
+          ),
+        ).toBe(true);
+      }
+      if (promote.runId) {
+        const pq = await queryTestEvents(promote.runId, { type: 'deployment.promoted' });
+        if (pq.ok) {
+          for (const e of pq.events) {
+            expectContentFree(e.payload, 'deployment.promoted');
+            expect(
+              typeof e.payload.toState === 'string' && DEPLOYMENT_STATES.includes(e.payload.toState as string),
+              driver.describe('run-event-payloads.schema.json#/$defs/deploymentPromoted', 'toState MUST be in the seven-state vocabulary'),
+            ).toBe(true);
+            expect(
+              typeof e.payload.toVersion === 'string' && (e.payload.toVersion as string).length > 0,
+              driver.describe('agent-deployment.md §D', 'deployment.promoted MUST carry the promoted toVersion'),
+            ).toBe(true);
+          }
+        }
+      }
+      // ---- Leg 2: fail-closed authz (§E-1; deployment-promotion-fail-closed) -
+      const unauth = await driveDeploymentTransition({ scenario: 'unauthorized' });
+      if (unauth && unauth.runId) {
+        expect(
+          unauth.allowed !== true,
+          driver.describe('agent-deployment.md §E-1', 'a principal without deploy:promote MUST be denied (fail-closed)'),
+        ).toBe(true);
+        const uq = await queryTestEvents(unauth.runId, { type: 'deployment.promoted' });
+        if (uq.ok) {
+          expect(
+            uq.events.length === 0,
+            driver.describe('SECURITY invariant deployment-promotion-fail-closed', 'a denied transition MUST emit NO deployment.promoted'),
+          ).toBe(true);
+        }
+      }
+      // ---- Leg 3: eval-gate-unmet denial (§E-3) ----------------------------
+      const evalUnmet = await driveDeploymentTransition({ scenario: 'eval-gate-unmet' });
+      if (evalUnmet && evalUnmet.runId) {
+        expect(
+          evalUnmet.error === 'eval_gate_unmet' || evalUnmet.allowed !== true,
+          driver.describe('agent-deployment.md §E-3', 'a promote whose eval evidence has passed:false MUST be denied (eval_gate_unmet)'),
+        ).toBe(true);
+        const eq = await queryTestEvents(evalUnmet.runId, { type: 'deployment.promoted' });
+        if (eq.ok) {
+          expect(
+            eq.events.length === 0,
+            driver.describe('agent-deployment.md §E-3', 'an unmet eval gate MUST emit NO deployment.promoted'),
+          ).toBe(true);
+        }
+      }
+      // ---- Leg 4: channel-resolution pin (§B) ------------------------------
+      const pin = await driveDeploymentTransition({ scenario: 'channel-pin', channel: 'stable' });
+      if (pin && pin.runId) {
+        const iq = await queryTestEvents(pin.runId, { type: 'agent.invocation.started' });
+        if (iq.ok && iq.events.length > 0) {
+          // Sort a copy: Array.prototype.sort reorders in place, which would mutate the query result.
+          const started = [...iq.events].sort((a, b) => a.sequence - b.sequence)[0]!;
+          expect(
+            typeof started.payload.resolvedAgentVersion === 'string' && (started.payload.resolvedAgentVersion as string).length > 0,
+            driver.describe('agent-deployment.md §B', 'a @channel-bound run MUST record resolvedAgentVersion on agent.invocation.started (the recorded fact a replay re-reads)'),
+          ).toBe(true);
+        }
+      }
+    } finally {
+      // Runs on success and on assertion failure alike.
+      await resetTestSeam();
+    }
+  });
+});

package/src/scenarios/agent-eval-run.test.ts ADDED Viewed

@@ -0,0 +1,145 @@
+/**
+ * Agent eval-run — the `mode:"eval"` projection (RFC 0081 §B/§C) — behavioral.
+ *
+ * Capability-gated on `agents.evalSuite.supported` (root-first per RFC 0073).
+ * Soft-skips when unadvertised (default) / hard-fails under
+ * `OPENWOP_REQUIRE_BEHAVIOR=true`. The always-on wire-shape coverage lives in
+ * `agent-eval-suite-shape.test.ts`; this asserts host BEHAVIOR via the
+ * `POST /v1/host/sample/agents/eval-run` seam + the test event-log seam + the
+ * NORMATIVE `GET /v1/runs/{runId}/eval-summary` read:
+ *
+ *   1. ORDERING (§C) — an eval run emits `eval.started` FIRST, one `eval.scored`
+ *      per task, then `eval.completed` once (count == eval.completed.taskCount).
+ *   2. CONTENT-FREE (SR-1 / `eval-summary-no-content-leak`) — every `eval.scored`
+ *      carries scores / ids / scalars ONLY (never task output / rubric / prose);
+ *      `score` ∈ 0..1; `passed` is a boolean.
+ *   3. NORMATIVE SUMMARY (§C) — `GET /v1/runs/{runId}/eval-summary` returns a
+ *      schema-valid `EvalSummary` whose `passedCount <= taskCount` and whose
+ *      task entries carry no output body.
+ *
+ * Each leg soft-skips independently (seam absent / event-log seam absent).
+ *
+ * Spec references:
+ *   - https://github.com/openwop/openwop/blob/main/spec/v1/agent-evaluation.md (§B/§C)
+ *   - https://github.com/openwop/openwop/blob/main/RFCS/0081-agent-evaluation-and-scorecards.md
+ */
+import { describe, it, expect } from 'vitest';
+import { readFileSync } from 'node:fs';
+import { join } from 'node:path';
+import Ajv2020 from 'ajv/dist/2020.js';
+import addFormats from 'ajv-formats';
+import { driver } from '../lib/driver.js';
+import { behaviorGate } from '../lib/behavior-gate.js';
+import { SCHEMAS_DIR } from '../lib/paths.js';
+import {
+  readEvalSuiteCap,
+  driveEvalRun,
+  getEvalSummary,
+  EVAL_CONTENT_FORBIDDEN,
+} from '../lib/agentEval.js';
+import { queryTestEvents, isEventLogSeamAvailable, resetTestSeam } from '../lib/event-log-query.js';
+/** Loads `name` from SCHEMAS_DIR (UTF-8) and returns the JSON-parsed document. */
+function loadSchema(name: string): Record<string, unknown> {
+  const fullPath = join(SCHEMAS_DIR, name);
+  const contents = readFileSync(fullPath, 'utf8');
+  return JSON.parse(contents) as Record<string, unknown>;
+}
+/**
+ * Asserts that `payload` has no key listed in EVAL_CONTENT_FORBIDDEN.
+ * `where` names the payload in the assertion message.
+ */
+function expectContentFree(payload: Record<string, unknown>, where: string): void {
+  for (const bannedKey of EVAL_CONTENT_FORBIDDEN) {
+    const isAbsent = !(bannedKey in payload);
+    const message = driver.describe(
+      'RFC 0081 §C (eval-summary-no-content-leak)',
+      `${where} MUST be content-free (no ${bannedKey})`,
+    );
+    expect(isAbsent, message).toBe(true);
+  }
+}
+/**
+ * Behavioral scenario for RFC 0081 §B/§C, run as one vitest case: checks eval.*
+ * event ordering and content-freedom, then the eval-summary read.
+ *
+ * Returns early (soft-skip) when the behavior gate declines, when the event-log
+ * seam is unavailable, or when the eval-run seam returns null / no runId. Once a
+ * run exists, `resetTestSeam()` runs in a `finally` so a failing assertion
+ * cannot skip the reset.
+ */
+describe('agent-eval-run (RFC 0081 §B/§C)', () => {
+  it('emits eval.started → per-task eval.scored → eval.completed and serves a content-free EvalSummary', async () => {
+    const cap = await readEvalSuiteCap();
+    if (!behaviorGate('openwop-eval-run', cap?.supported === true)) return;
+    if (!(await isEventLogSeamAvailable())) return; // event-log seam absent — soft-skip
+    const run = await driveEvalRun({ modes: ['golden'] });
+    if (run === null) return; // eval-run seam unwired — soft-skip the whole behavioral suite
+    if (!run.runId) return;
+    try {
+      // ---- Legs 1+2: eval.* ordering + content-free (§C) -------------------
+      const startedQ = await queryTestEvents(run.runId, { type: 'eval.started' });
+      const scoredQ = await queryTestEvents(run.runId, { type: 'eval.scored' });
+      const completedQ = await queryTestEvents(run.runId, { type: 'eval.completed' });
+      if (startedQ.ok && scoredQ.ok && startedQ.events.length > 0) {
+        // Sort copies: Array.prototype.sort reorders in place, which would mutate the query results.
+        const started = [...startedQ.events].sort((a, b) => a.sequence - b.sequence)[0]!;
+        // eval.started precedes every eval.scored (§C ordering).
+        for (const s of scoredQ.events) {
+          expect(
+            started.sequence < s.sequence,
+            driver.describe('agent-evaluation.md §C', 'eval.started MUST precede every eval.scored'),
+          ).toBe(true);
+        }
+        if (completedQ.ok && completedQ.events.length > 0) {
+          // Highest-sequence eval.completed event.
+          const completed = [...completedQ.events].sort((a, b) => a.sequence - b.sequence)[completedQ.events.length - 1]!;
+          for (const s of scoredQ.events) {
+            expect(
+              s.sequence < completed.sequence,
+              driver.describe('agent-evaluation.md §C', 'every eval.scored MUST precede eval.completed'),
+            ).toBe(true);
+          }
+          // eval.scored is emitted once per task (count == eval.completed.taskCount).
+          if (typeof completed.payload.taskCount === 'number') {
+            expect(
+              scoredQ.events.length === completed.payload.taskCount,
+              driver.describe('agent-evaluation.md §C', 'one eval.scored per task (count == eval.completed.taskCount)'),
+            ).toBe(true);
+          }
+          expectContentFree(completed.payload, 'eval.completed');
+        }
+        // each eval.scored content-free + score ∈ 0..1, passed boolean.
+        for (const s of scoredQ.events) {
+          expectContentFree(s.payload, 'eval.scored');
+          expect(
+            typeof s.payload.score === 'number' && (s.payload.score as number) >= 0 && (s.payload.score as number) <= 1,
+            driver.describe('run-event-payloads.schema.json#/$defs/evalScored', 'eval.scored.score MUST be in 0..1'),
+          ).toBe(true);
+          expect(
+            typeof s.payload.passed === 'boolean',
+            driver.describe('run-event-payloads.schema.json#/$defs/evalScored', 'eval.scored.passed MUST be a boolean'),
+          ).toBe(true);
+        }
+        expectContentFree(started.payload, 'eval.started');
+      }
+      // ---- Leg 3: NORMATIVE EvalSummary read (§C) --------------------------
+      const { status, summary } = await getEvalSummary(run.runId);
+      if (status === 200 && summary) {
+        const ajv = new Ajv2020({ strict: false, allErrors: true });
+        addFormats(ajv);
+        const validate = ajv.compile(loadSchema('eval-summary.schema.json'));
+        expect(
+          validate(summary),
+          driver.describe(
+            'eval-summary.schema.json',
+            `GET /v1/runs/{runId}/eval-summary MUST return a schema-valid EvalSummary (${ajv.errorsText(validate.errors)})`,
+          ),
+        ).toBe(true);
+        const tasks = (summary.tasks as Array<Record<string, unknown>> | undefined) ?? [];
+        const passedCount = summary.passedCount as number | undefined;
+        const taskCount = summary.taskCount as number | undefined;
+        if (typeof passedCount === 'number' && typeof taskCount === 'number') {
+          expect(
+            passedCount <= taskCount,
+            driver.describe('agent-evaluation.md §C', 'EvalSummary.passedCount MUST NOT exceed taskCount'),
+          ).toBe(true);
+        }
+        for (const t of tasks) {
+          expectContentFree(t, 'EvalSummary.tasks[]');
+        }
+      }
+    } finally {
+      // Runs on success and on assertion failure alike.
+      await resetTestSeam();
+    }
+  });
+});

package/src/scenarios/core-standard-profile.test.ts ADDED Viewed

@@ -0,0 +1,75 @@
+/**
+ * openwop-core-standard — operational-annex predicate derivation (RFC 0088).
+ *
+ * Always-on, server-free derivation probe. Verifies that `isCoreStandard`
+ * derives the Core Standard Profile floor correctly from representative
+ * discovery payloads (RFC 0088 §B / core-standard-profile.md §B):
+ *   - a host meeting openwop-core + openwop-interrupts + a transport is core-standard;
+ *   - a bare openwop-core host (no interrupts) is NOT core-standard — the floor is
+ *     deliberately stricter than the v1 minimum;
+ *   - a host with no event transport (supportedTransports: []) fails the floor;
+ *   - the floor is the AND of three existing closed-catalog predicates (it composes,
+ *     it does not redefine — so it is absent from deriveProfiles()).
+ *
+ * The LIVE aggregate-evidence assertion (does every §C floor scenario actually
+ * pass against a host claiming the profile?) is the `Active → Accepted` step per
+ * RFC 0088 §C — already satisfied by MyndHyve + all reference hosts, asserted via
+ * each constituent scenario, and deferred here. This scenario asserts the
+ * discovery-predicate derivation only.
+ *
+ * Spec references:
+ *   - https://github.com/openwop/openwop/blob/main/spec/v1/core-standard-profile.md
+ *   - https://github.com/openwop/openwop/blob/main/RFCS/0088-core-standard-profile.md
+ */
+import { describe, it, expect } from 'vitest';
+import { isCoreStandard, isCore, deriveProfiles } from '../lib/profiles.js';
+const why = (specRef: string, requirement: string): string => `${specRef} — ${requirement}`; // formats "<specRef> — <requirement>"; passed as the second argument to expect() in the cases below
+const CORE = { // baseline discovery payload; each case below spreads it and overrides individual fields
+  protocolVersion: '1.0',
+  supportedEnvelopes: ['clarification.request'],
+  schemaVersions: {},
+  limits: { clarificationRounds: 1, schemaRounds: 1, envelopesPerTurn: 1 },
+};
+// Floor-predicate cases: each builds a discovery payload from CORE and checks isCoreStandard.
+describe('core-standard-profile: floor predicate (RFC 0088 §B, server-free)', () => {
+  it('a host meeting core + interrupts + a default transport is core-standard', () => {
+    // No supportedTransports ⇒ both stream predicates default-true (profiles.md).
+    const discovery = { ...CORE };
+    const meetsFloor = isCoreStandard(discovery);
+    expect(meetsFloor, why('core-standard-profile.md §B', 'core + interrupts + transport ⇒ core-standard')).toBe(true);
+  });
+  it('a bare openwop-core host without interrupts is NOT core-standard', () => {
+    // openwop-core minimum, but no clarification.request ⇒ fails openwop-interrupts.
+    const discovery = { ...CORE, supportedEnvelopes: ['schema.request'] };
+    const stillCore = isCore(discovery);
+    expect(stillCore, why('profiles.md §openwop-core', 'still a valid openwop-core host')).toBe(true);
+    const meetsFloor = isCoreStandard(discovery);
+    expect(meetsFloor, why('core-standard-profile.md §B', 'the floor is stricter than the v1 minimum')).toBe(false);
+  });
+  it('a host advertising no event transport fails the floor', () => {
+    const discovery = { ...CORE, supportedTransports: [] as string[] };
+    const meetsFloor = isCoreStandard(discovery);
+    expect(meetsFloor, why('core-standard-profile.md §B', 'at least one event transport is required')).toBe(false);
+  });
+  it('a host advertising the rest transport satisfies the transport term', () => {
+    const discovery = { ...CORE, supportedTransports: ['rest'] };
+    const meetsFloor = isCoreStandard(discovery);
+    expect(meetsFloor, why('core-standard-profile.md §B', 'rest transport ⇒ stream term satisfied')).toBe(true);
+  });
+  it('a non-1.x host is not core-standard', () => {
+    const discovery = { ...CORE, protocolVersion: '2.0' };
+    const meetsFloor = isCoreStandard(discovery);
+    expect(meetsFloor, why('profiles.md §openwop-core', 'core-standard implies openwop-core (1.x)')).toBe(false);
+  });
+});
+// Checks that deriveProfiles() does not list 'openwop-core-standard' for the baseline payload.
+describe('core-standard-profile: composes, does not redefine (RFC 0088 §A, server-free)', () => {
+  it('openwop-core-standard is an annex, NOT a closed-catalog profile (absent from deriveProfiles)', () => {
+    const discovery = { ...CORE };
+    const derived = deriveProfiles(discovery) as readonly string[];
+    const listsAnnex = derived.includes('openwop-core-standard');
+    expect(listsAnnex, why('core-standard-profile.md §A', 'the annex is not a closed-catalog predicate')).toBe(false);
+  });
+});

package/src/scenarios/egress-audience-binding.test.ts ADDED Viewed

@@ -0,0 +1,81 @@
+/**
+ * Credential-audience-bound egress (RFC 0079 §C) — behavioral KEYSTONE.
+ *
+ * Gated on `httpClient.egressPolicy.supported` (root-first per RFC 0073).
+ * Soft-skips when unadvertised (default) / hard-fails under
+ * `OPENWOP_REQUIRE_BEHAVIOR=true`. The always-on wire-shape coverage lives in
+ * `egress-provenance-shape.test.ts`; this asserts host BEHAVIOR — the §C
+ * confused-deputy MUST that backs the `egress-credential-audience-bound`
+ * SECURITY invariant:
+ *
+ *   1. OUT-OF-AUDIENCE — a host-issued credential bound to audience A, used for
+ *      an egress to destination B (B ∉ A), MUST be `denied` or `downgraded`
+ *      with `reason: "out-of-audience"`, and the credential MUST NOT be attached
+ *      to the egress (`credentialAttached !== true`).
+ *   2. PROVENANCE-UNEVALUABLE — an egress whose credential provenance cannot be
+ *      evaluated MUST be `denied` with `reason: "provenance-unevaluable"`
+ *      (fail-closed, not fail-open).
+ *
+ * The decision is driven through the OPTIONAL host-sample egress seam
+ * (`POST /v1/host/sample/egress/decide`) — soft-skip on 404/405. The decision
+ * reason is a CLOSED enum so a host cannot spill a blocked URL/host into a
+ * free-form string (SR-1, asserted in `egress-decision-content-free.test.ts`).
+ *
+ * Spec references:
+ *   - https://github.com/openwop/openwop/blob/main/spec/v1/host-capabilities.md (§"Credential provenance + egress policy")
+ *   - https://github.com/openwop/openwop/blob/main/RFCS/0079-credential-provenance-and-egress-policy.md
+ *   - https://github.com/openwop/openwop/blob/main/SECURITY/invariants.yaml (egress-credential-audience-bound)
+ */
+import { describe, it, expect } from 'vitest';
+import { driver } from '../lib/driver.js';
+import { behaviorGate } from '../lib/behavior-gate.js';
+import { readEgressPolicyCap, driveEgress, EGRESS_DECISIONS, EGRESS_REASONS } from '../lib/egressPolicy.js';
+// Two legs driven through driveEgress(): 'out-of-audience', then 'provenance-unevaluable'.
+// A null result from either call ends the test without further assertions.
+describe('egress-audience-binding (RFC 0079 §C)', () => {
+  it('denies/downgrades an out-of-audience egress without attaching the credential, and fails closed on unevaluable provenance', async () => {
+    const policyCap = await readEgressPolicyCap();
+    const advertised = policyCap?.supported === true;
+    if (!behaviorGate('openwop-egress-audience-binding', advertised)) return;
+    // ---- Leg 1: out-of-audience — deny|downgrade + credential NOT attached --
+    const outOfAudience = await driveEgress({ scenario: 'out-of-audience' });
+    if (outOfAudience === null) return; // egress seam absent — soft-skip the whole behavior
+    const deniedOrDowngraded = outOfAudience.decision === 'denied' || outOfAudience.decision === 'downgraded';
+    expect(
+      deniedOrDowngraded,
+      driver.describe('host-capabilities.md §"Credential provenance + egress policy"', 'an out-of-audience egress MUST be denied or downgraded'),
+    ).toBe(true);
+    const decisionInEnum = typeof outOfAudience.decision === 'string' && EGRESS_DECISIONS.includes(outOfAudience.decision);
+    expect(
+      decisionInEnum,
+      driver.describe('run-event-payloads.schema.json#egressDecided', 'decision MUST be in the closed enum'),
+    ).toBe(true);
+    expect(
+      outOfAudience.reason === 'out-of-audience',
+      driver.describe('RFC 0079 §C', 'an out-of-audience denial MUST carry reason "out-of-audience"'),
+    ).toBe(true);
+    expect(
+      outOfAudience.credentialAttached !== true,
+      driver.describe('SECURITY/invariants.yaml egress-credential-audience-bound', 'the host MUST NOT attach a credential whose audience excludes the destination (confused-deputy)'),
+    ).toBe(true);
+    // ---- Leg 2: provenance-unevaluable — fail closed (deny) ----------------
+    const unevaluable = await driveEgress({ scenario: 'provenance-unevaluable' });
+    if (unevaluable === null) return; // nothing further to assert for this leg
+    expect(
+      unevaluable.decision === 'denied',
+      driver.describe('RFC 0079 §C', 'an egress with unevaluable provenance MUST fail closed (denied)'),
+    ).toBe(true);
+    expect(
+      unevaluable.reason === 'provenance-unevaluable',
+      driver.describe('RFC 0079 §C', 'a provenance-unevaluable denial MUST carry reason "provenance-unevaluable"'),
+    ).toBe(true);
+    const reasonInEnum = typeof unevaluable.reason === 'string' && EGRESS_REASONS.includes(unevaluable.reason);
+    expect(
+      reasonInEnum,
+      driver.describe('run-event-payloads.schema.json#egressDecided', 'reason MUST be in the closed enum'),
+    ).toBe(true);
+    expect(
+      unevaluable.credentialAttached !== true,
+      driver.describe('SECURITY/invariants.yaml egress-credential-audience-bound', 'a fail-closed egress MUST NOT attach the credential'),
+    ).toBe(true);
+  });
+});

package/src/scenarios/egress-decision-content-free.test.ts ADDED Viewed

@@ -0,0 +1,57 @@
+/**
+ * Egress-decision secret non-leak (RFC 0079 §F / SR-1) — behavioral.
+ *
+ * Gated on `httpClient.egressPolicy.supported` (root-first per RFC 0073).
+ * Soft-skips when unadvertised (default) / hard-fails under
+ * `OPENWOP_REQUIRE_BEHAVIOR=true`. Backs the `egress-decision-no-secret-leak`
+ * guarantee: an `egress.decided` payload is metadata-only — it MUST NOT carry
+ * the credential value, nor spill the blocked URL/host/header/body into a
+ * free-form field, and its `reason` MUST be drawn from the CLOSED vocabulary
+ * (so a host cannot smuggle a blocked destination into the reason string).
+ *
+ * Drives the host-sample seam with a `canary` credential whose value is a known
+ * sentinel and asserts the sentinel never surfaces in the decision
+ * (`canaryLeaked !== true`) and that the payload carries none of the forbidden
+ * content keys. Soft-skips on 404/405.
+ *
+ * Spec references:
+ *   - https://github.com/openwop/openwop/blob/main/spec/v1/host-capabilities.md (§"Credential provenance + egress policy")
+ *   - https://github.com/openwop/openwop/blob/main/RFCS/0079-credential-provenance-and-egress-policy.md
+ */
+import { describe, it, expect } from 'vitest';
+import { driver } from '../lib/driver.js';
+import { behaviorGate } from '../lib/behavior-gate.js';
+import { readEgressPolicyCap, driveEgress, EGRESS_REASONS, EGRESS_CONTENT_FORBIDDEN } from '../lib/egressPolicy.js';
+// Drives the 'canary' egress scenario and checks the returned decision object:
+// canaryLeaked flag, forbidden keys, and (when present) the reason value.
+describe('egress-decision-content-free (RFC 0079 §F / SR-1)', () => {
+  it('never leaks the credential value or the blocked destination into the egress.decided payload', async () => {
+    const policyCap = await readEgressPolicyCap();
+    const advertised = policyCap?.supported === true;
+    if (!behaviorGate('openwop-egress-decision-content-free', advertised)) return;
+    const decision = await driveEgress({ scenario: 'canary' });
+    if (decision === null) return; // seam absent — soft-skip
+    // The canary sentinel MUST NOT appear anywhere observable.
+    const canaryContained = decision.canaryLeaked !== true;
+    expect(
+      canaryContained,
+      driver.describe('RFC 0079 §F (SR-1)', 'the credential value (canary) MUST NOT leak into any observable surface'),
+    ).toBe(true);
+    // No forbidden content keys on the decision payload.
+    for (const bannedKey of EGRESS_CONTENT_FORBIDDEN) {
+      const keyAbsent = !(bannedKey in decision);
+      expect(
+        keyAbsent,
+        driver.describe('RFC 0079 §F (SR-1)', `egress.decided MUST be content-free (no ${bannedKey})`),
+      ).toBe(true);
+    }
+    // The reason stays in the closed vocabulary — no free-form destination spill.
+    if (decision.reason === undefined) return;
+    const reasonInEnum = typeof decision.reason === 'string' && EGRESS_REASONS.includes(decision.reason);
+    expect(
+      reasonInEnum,
+      driver.describe('run-event-payloads.schema.json#egressDecided', 'reason MUST be in the closed enum (no free-form spill)'),
+    ).toBe(true);
+  });
+});

package/src/scenarios/multi-agent-confidence-escalation.test.ts CHANGED Viewed

@@ -49,7 +49,7 @@
 import { describe, it, expect } from 'vitest';
 import { driver } from '../lib/driver.js';
 import { isFixtureAdvertised } from '../lib/fixtures.js';
-import { pollUntilTerminal } from '../lib/polling.js';
+import { pollUntil } from '../lib/polling.js';
 import { capabilityFamily } from '../lib/discovery-capabilities.js';
 const HTTP_SKIP = !process.env.OPENWOP_BASE_URL;
@@ -111,12 +111,17 @@ describe.skipIf(BEHAVIORAL_SKIP)('multi-agent-confidence-escalation: behavioral
     expect(create.status).toBe(201);
     const runId = (create.json as { runId: string }).runId;
-    const terminal = await pollUntilTerminal(runId);
-    // RFC 0039 escalation suspends the parent — NOT a terminal `completed`.
-    // The conformance pollUntilTerminal returns when the run reaches any
-    // settled status. RFC 0039 §A gives hosts a choice: clarify-kind
-    // escalation (→ waiting-clarification) OR escalate-kind approval
-    // (→ waiting-approval).
+    // RFC 0039 confidence escalation SUSPENDS the parent (a `waiting-*` status)
+    // — it is NOT a terminal `completed`/`failed`/`cancelled`. So poll until the
+    // run either suspends or settles; polling only for terminal statuses
+    // (`pollUntilTerminal`, whose set is {completed,failed,cancelled}) would time
+    // out before the suspension is ever observed — the cause of the prior flake.
+    const terminal = await pollUntil(runId, (s) => {
+      const st = s.status as string;
+      return st.startsWith('waiting-') || st === 'completed' || st === 'failed' || st === 'cancelled';
+    });
+    // RFC 0039 §A gives hosts a choice: clarify-kind escalation
+    // (→ waiting-clarification) OR escalate-kind approval (→ waiting-approval).
     //
     // RFC 0044 routing: when the host advertises
     // `capabilities.multiAgent.executionModel.confidenceEscalationInterruptKind`

package/src/scenarios/prompt-resolution-chain-event.test.ts ADDED Viewed

@@ -0,0 +1,113 @@
+/**
+ * prompt-resolution-chain-event — RFC 0029 layer precedence on the PRODUCTION wire.
+ *
+ * The black-box, production-path counterpart to the three seam-driven
+ * `prompt-resolution-chain-{node-wins,agent-intrinsic,fallback-cascade}.test.ts`
+ * scenarios. Instead of the synchronous `POST /v1/host/sample/prompt/resolve`
+ * seam, this creates a real run from a prompt-exercising fixture, reads the
+ * run's DURABLE event log via the NORMATIVE `GET /v1/runs/{runId}/events/poll`
+ * endpoint, and asserts the `agent.promptResolved` event carries the full
+ * layer-by-layer precedence record (`spec/v1/prompts.md` §"Resolution chain") —
+ * no `/v1/host/sample/*` seam.
+ *
+ * The `agentPromptResolved` payload (`schemas/run-event-payloads.schema.json`)
+ * already REQUIRES `chain[]` with one `applied: true` entry + the full-traversal
+ * MUST, so the wire is already capable of conveying precedence without the seam.
+ * This is the "replace seam-gated proofs with black-box production-path
+ * conformance" step (independent-audit acceptance-bar item 3) for RFC 0029: once
+ * a host emits `agent.promptResolved`, prompt-chain precedence is proven on the
+ * production wire and the surface graduates INTO the `openwop-core-standard`
+ * floor (RFC 0088 §D Lever-2 → floor).
+ *
+ * Gating: soft-skips unless `capabilities.prompts.supported` AND the host
+ * actually emits `agent.promptResolved` for the run (emission is staged per
+ * RFC 0029 / RFC 0021 — a host advertising prompts might not yet emit the event).
+ *
+ * @see RFCS/0029-prompt-override-hierarchy.md
+ * @see spec/v1/prompts.md §"Resolution chain (normative)"
+ */
+import { describe, it, expect } from 'vitest';
+import { driver } from '../lib/driver.js';
+import { capabilityFamily } from '../lib/discovery-capabilities.js';
+import { pollUntilTerminal } from '../lib/polling.js';
+const PROMPT_FIXTURE_ID = 'conformance-prompt-end-to-end'; // workflowId sent to POST /v1/runs by the test below
+const VALID_LAYERS = new Set([ // layer names a chain entry is allowed to carry
+  'run-configurable', 'node', 'agent-intrinsic', 'agent-overrides',
+  'agent-library-default', 'workflow-defaults', 'host-defaults',
+]);
+interface ChainEntry { layer?: unknown; source?: unknown; applied?: unknown } // one chain[] record; fields stay `unknown` and are checked at assertion time
+interface PromptResolvedPayload { chain?: ChainEntry[]; resolved?: unknown } // the payload fields of agent.promptResolved that this test reads
+interface RawEvent { type?: string; payload?: PromptResolvedPayload } // minimal shape of an entry in the events/poll response
+async function promptsSupported(): Promise<boolean> { // resolves true only when the `prompts` capability family reports supported === true
+  const res = await driver.get('/.well-known/openwop'); // read the host's /.well-known/openwop document
+  return capabilityFamily(res.json as Record<string, unknown> | undefined, 'prompts')?.supported === true; // an absent family or any non-true flag yields false
+}
+describe('prompt-resolution-chain-event (black-box): agent.promptResolved carries the precedence record (RFC 0029)', () => {
+  it('the production agent.promptResolved event records the full four-layer resolution chain', async () => {
+    if (!(await promptsSupported())) return; // capability not advertised — return early, so the test passes without asserting
+    const create = await driver.post('/v1/runs', { workflowId: PROMPT_FIXTURE_ID });
+    if (create.status !== 201) {
+      // Any non-201 (e.g. fixture not seeded, run not accepted) is not treated
+      // as a prompt-chain failure: warn and soft-skip.
+      // eslint-disable-next-line no-console
+      console.warn(`[prompt-resolution-chain-event] POST /v1/runs for ${PROMPT_FIXTURE_ID} returned ${create.status}; skipping the production-path assertion`);
+      return;
+    }
+    const runId = (create.json as { runId?: string }).runId;
+    if (!runId) return; // 201 without a runId — nothing to poll; returns silently (no warning)
+    await pollUntilTerminal(runId); // wait for the run to reach a terminal status before reading its events
+    const poll = await driver.get(`/v1/runs/${encodeURIComponent(runId)}/events/poll`);
+    const events = (poll.json as { events?: RawEvent[] } | undefined)?.events ?? []; // a missing body or `events` field is treated as an empty log
+    const resolved = events.filter((e) => e.type === 'agent.promptResolved');
+    if (resolved.length === 0) {
+      // Host advertises prompts but does not yet emit agent.promptResolved
+      // (RFC 0029 emission is staged) — soft-skip the behavioral assertion.
+      // eslint-disable-next-line no-console
+      console.warn('[prompt-resolution-chain-event] host emitted no agent.promptResolved event; skipping (RFC 0029 emission staged)');
+      return;
+    }
+    for (const ev of resolved) { // every emitted agent.promptResolved event is checked, not just the first
+      const chain = ev.payload?.chain;
+      expect(
+        Array.isArray(chain) && chain.length > 0,
+        driver.describe('prompts.md §Resolution chain', 'agent.promptResolved MUST carry a non-empty chain[] of attempted layers'),
+      ).toBe(true);
+      const entries = chain as ChainEntry[]; // the assertion above has already checked that chain is a non-empty array
+      // Every entry must name a known layer and carry a boolean `applied`
+      // (the full-traversal shape).
+      for (const e of entries) {
+        expect(
+          typeof e.layer === 'string' && VALID_LAYERS.has(e.layer),
+          driver.describe('prompts.md §Resolution chain', `each chain entry MUST name a valid layer, got ${String(e.layer)}`),
+        ).toBe(true);
+        expect(typeof e.applied, driver.describe('prompts.md §Resolution chain', 'each chain entry MUST carry a boolean `applied`')).toBe('boolean');
+      }
+      // At most one layer wins (none when `resolved` is null).
+      const applied = entries.filter((e) => e.applied === true);
+      expect(
+        applied.length <= 1,
+        driver.describe('prompts.md §Resolution chain', 'AT MOST one chain entry MAY be applied: true (the winning layer)'),
+      ).toBe(true);
+      // resolved mirrors the applied entry's source (RFC 0029 §B).
+      if (applied.length === 1) {
+        expect(
+          ev.payload?.resolved,
+          driver.describe('run-event-payloads.schema.json agentPromptResolved', '`resolved` MUST mirror the applied chain entry\'s `source`'),
+        ).toBe(applied[0]?.source);
+      } else { // no applied entry: `resolved` must be null; an absent field is accepted too
+        expect(
+          ev.payload?.resolved === null || ev.payload?.resolved === undefined,
+          driver.describe('run-event-payloads.schema.json agentPromptResolved', 'with no applied layer, `resolved` MUST be null'),
+        ).toBe(true);
+      }
+    }
+  });
+});