npm - @openwop/openwop-conformance - Versions diffs - 1.5.0 → 1.6.1 - Mend

@openwop/openwop-conformance 1.5.0 → 1.6.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (72) hide show

package/CHANGELOG.md +27 -0
package/README.md +2 -2
package/api/asyncapi.yaml +25 -4
package/api/openapi.yaml +371 -0
package/coverage.md +31 -4
package/fixtures/conformance-phase4-nondet-tool.json +53 -0
package/fixtures/conformance-phase4-replay-divergence.json +40 -0
package/fixtures.md +5 -3
package/package.json +1 -1
package/schemas/README.md +4 -0
package/schemas/annotation-create.schema.json +37 -0
package/schemas/annotation.schema.json +56 -0
package/schemas/capabilities.schema.json +191 -3
package/schemas/credential-reference.schema.json +21 -0
package/schemas/node-pack-manifest.schema.json +112 -1
package/schemas/run-diff-response.schema.json +64 -0
package/schemas/run-event-payloads.schema.json +104 -2
package/schemas/run-event.schema.json +8 -1
package/schemas/run-snapshot.schema.json +11 -0
package/src/lib/behavior-gate.ts +51 -0
package/src/lib/driver.ts +13 -1
package/src/lib/feedback.ts +31 -0
package/src/lib/saml-idp.ts +179 -0
package/src/scenarios/approval-gate-events.test.ts +61 -0
package/src/scenarios/approval-gate-flow.test.ts +68 -0
package/src/scenarios/auth-saml-profile.test.ts +119 -0
package/src/scenarios/auth-scim-profile.test.ts +65 -0
package/src/scenarios/authorization-fail-closed.test.ts +80 -0
package/src/scenarios/authorization-roles-shape.test.ts +83 -0
package/src/scenarios/connector-manifest-validity.test.ts +142 -0
package/src/scenarios/credential-payload-redaction.test.ts +93 -0
package/src/scenarios/credentials-capability-shape.test.ts +90 -0
package/src/scenarios/cross-engine-append-behavior.test.ts +204 -0
package/src/scenarios/cross-host-traceparent-propagation.test.ts +13 -6
package/src/scenarios/cross-workspace-isolation.test.ts +72 -0
package/src/scenarios/deadletter-capability-shape.test.ts +59 -0
package/src/scenarios/deadletter-retry-exhaustion.test.ts +62 -0
package/src/scenarios/experimental-tier-shape.test.ts +192 -0
package/src/scenarios/feedback-capability-shape.test.ts +35 -0
package/src/scenarios/feedback-correction-redaction.test.ts +35 -0
package/src/scenarios/feedback-cross-tenant-isolation.test.ts +37 -0
package/src/scenarios/feedback-fork-not-copied.test.ts +40 -0
package/src/scenarios/feedback-on-terminal-run.test.ts +32 -0
package/src/scenarios/feedback-record-and-list.test.ts +32 -0
package/src/scenarios/feedback-unsupported-501.test.ts +32 -0
package/src/scenarios/identity-owner-shape.test.ts +64 -0
package/src/scenarios/multi-agent-confidence-escalation.test.ts +13 -12
package/src/scenarios/multi-agent-memory-lifecycle.test.ts +87 -12
package/src/scenarios/multi-region-idempotency-behavior.test.ts +203 -0
package/src/scenarios/oauth-capability-shape.test.ts +97 -0
package/src/scenarios/oauth-connector-redaction.test.ts +91 -0
package/src/scenarios/pack-registry-isolation.test.ts +108 -0
package/src/scenarios/pack-registry-publish.test.ts +1 -1
package/src/scenarios/prompt-mutation-workspace-membership-enforced.test.ts +126 -0
package/src/scenarios/prompt-read-workspace-membership-enforced.test.ts +183 -0
package/src/scenarios/redaction.test.ts +4 -1
package/src/scenarios/replay-divergence-at-refusal.test.ts +187 -7
package/src/scenarios/replay-observable-sequence-determinism.test.ts +20 -6
package/src/scenarios/run-diff.test.ts +143 -0
package/src/scenarios/sandbox-capability-gate-respected.test.ts +7 -1
package/src/scenarios/sandbox-memory-cap.test.ts +7 -5
package/src/scenarios/sandbox-mvp-behavior.test.ts +280 -0
package/src/scenarios/sandbox-no-cross-pack-mutation.test.ts +7 -1
package/src/scenarios/sandbox-no-host-env-leak.test.ts +5 -1
package/src/scenarios/sandbox-no-host-fs-escape.test.ts +9 -1
package/src/scenarios/sandbox-no-host-process-escape.test.ts +5 -1
package/src/scenarios/sandbox-no-network-escape.test.ts +5 -1
package/src/scenarios/sandbox-timeout-cap.test.ts +7 -5
package/src/scenarios/scheduling-capability-shape.test.ts +81 -0
package/src/scenarios/scheduling-cron-fires-once.test.ts +66 -0
package/src/scenarios/secret-leakage-otel-attribute.test.ts +241 -0
package/src/scenarios/spec-corpus-validity.test.ts +6 -3

package/src/scenarios/replay-divergence-at-refusal.test.ts CHANGED Viewed

@@ -27,9 +27,15 @@
  * mock provider returning a valid envelope on the original run and a
  * refusal on the replay (or vice-versa). Reference workflow-engine ships
  * a mock-AI provider (`OPENWOP_MULTI_AGENT_EXECUTION_MODEL=true`); the
- * Phase 4 wiring extends it to honor a "refusal on replay" mode. Until
- * that wiring lands, the assertion is surfaced as `it.todo` so test
- * reporters track the gap rather than reporting a vacuous PASS.
+ * Phase 4 wiring (landed 2026-05-23 via commits `1fce55a` + `bba3b4a`)
+ * extends it with `checkReplayDivergence()` in the executor catch-path
+ * + symmetric success-path detection of envelope-kind divergence; emits
+ * `replay.divergedAtRefusal` event and fails the run with
+ * `error.code: 'replay_diverged_at_refusal'` when source vs replay
+ * differ at the same nodeId. Behavioral coverage is now real: 3
+ * assertions PASS against workflow-engine when Phase 4 advertisement
+ * is enabled (cover both divergence directions: original=valid +
+ * replay=refusal AND original=refusal + replay=valid).
  *
  * @see RFCS/0041-multi-agent-replay-under-nondeterminism.md §B
  * @see spec/v1/replay.md §"Envelope-refusal recovery in replay (MAE-8 closure)"
@@ -113,6 +119,40 @@ describe.skipIf(HTTP_SKIP)('replay-divergence-at-refusal: advertisement shape (R
   });
 });
+/** Subset of the run snapshot body (`GET /v1/runs/{runId}`) that this scenario reads. */
+interface RunSnapshot {
+  /** Lifecycle state; the poller treats `completed`, `failed` and `cancelled` as terminal. */
+  status?: string;
+  /** Failure detail; the tests compare `code` against `replay_diverged_at_refusal`. */
+  error?: {
+    code?: string;
+    message?: string;
+  };
+}
+/** Subset of one entry of the `events` array returned by `GET /v1/runs/{runId}/events`. */
+interface RunEventDoc {
+  /** Event type discriminator, e.g. `replay.divergedAtRefusal`. */
+  type: string;
+  nodeId?: string;
+  sequence?: number;
+  /** Event-specific fields; left untyped and inspected key-by-key by the assertions. */
+  payload?: { [key: string]: unknown };
+}
+/**
+ * Polls `GET /v1/runs/{runId}` until the run reports a terminal status.
+ *
+ * Makes up to 50 attempts, 100ms apart (~5s in total).
+ *
+ * @param runId - Run to poll; URL-encoded before being placed in the path.
+ * @returns The first snapshot whose status is `completed`, `failed` or `cancelled`.
+ * @throws Error when no terminal status is observed within the polling budget; the
+ *   message carries the last status seen so a stuck run is easier to diagnose.
+ */
+async function pollUntilTerminal(runId: string): Promise<RunSnapshot> {
+  const terminalStatuses = ['completed', 'failed', 'cancelled'];
+  let lastStatus: string | undefined;
+  for (let i = 0; i < 50; i++) {
+    const r = await driver.get(`/v1/runs/${encodeURIComponent(runId)}`);
+    // `r.json` may be nullish (presumably when the body is empty or not JSON —
+    // confirm against the driver). Treat that as "not terminal yet" rather than
+    // crashing with a TypeError on property access.
+    const snap = r.json as RunSnapshot | null | undefined;
+    lastStatus = snap?.status;
+    if (snap != null && lastStatus !== undefined && terminalStatuses.includes(lastStatus)) {
+      return snap;
+    }
+    await new Promise((resolve) => setTimeout(resolve, 100));
+  }
+  throw new Error(`run ${runId} did not reach terminal within 5s (last status: ${lastStatus ?? 'unknown'})`);
+}
+/**
+ * Fetches the event log of a run via `GET /v1/runs/{runId}/events`.
+ *
+ * @param runId - Run whose events are read; URL-encoded before use in the path.
+ * @returns The `events` array of the response body, or an empty array when the field is absent.
+ */
+async function readEvents(runId: string): Promise<RunEventDoc[]> {
+  const path = `/v1/runs/${encodeURIComponent(runId)}/events`;
+  const { events } = (await driver.get(path)).json as { events?: RunEventDoc[] };
+  return events ?? [];
+}
+/**
+ * Programs the mock-AI provider for one node through the host's sample test
+ * seam (`POST /v1/host/sample/test/mock-ai/program`).
+ *
+ * @param nodeId - Node whose mock-AI dispatch is being programmed.
+ * @param program - Program entries, forwarded to the host unchanged.
+ * @returns The HTTP status of the seam call; callers in this file treat 404 as "seam not exposed".
+ */
+async function programMock(nodeId: string, program: Array<Record<string, unknown>>): Promise<number> {
+  const { status } = await driver.post('/v1/host/sample/test/mock-ai/program', { nodeId, program });
+  return status;
+}
 describe.skipIf(HTTP_SKIP)('replay-divergence-at-refusal: behavioral (RFC 0041 §B MAE-8)', () => {
   // Behavioral assertion drives a workflow whose mock-AI provider returns a
   // valid envelope on the original run + a refusal on the replay (or
@@ -127,8 +167,148 @@ describe.skipIf(HTTP_SKIP)('replay-divergence-at-refusal: behavioral (RFC 0041
   //        originalEnvelopeKind === 'valid' AND replayEnvelopeKind === 'refusal'.
   //   7. Assert NO silent substitution: the replay's continuation past the
   //      diverging node MUST NOT execute (run terminates at the divergence).
-  // Until the reference host wires the staged-refusal seam, surfaced as
-  // `todo` so test reporters track the gap.
-  it.todo('Phase 4 host MUST emit replay.divergedAtRefusal + fail with replay_diverged_at_refusal when original=valid + replay=refusal');
-  it.todo('Phase 4 host MUST emit replay.divergedAtRefusal + fail with replay_diverged_at_refusal when original=refusal + replay=valid (symmetric case)');
+  /**
+   * Skips the calling test unless discovery advertises both
+   * `replayDeterminism.supported` and `replayDeterminism.refusalDivergenceEmission` as `true`.
+   *
+   * @param ctx - Test context; `ctx.skip()` is invoked when the capability is missing.
+   * @returns `true` when the host advertises the capability, `false` after requesting a skip.
+   */
+  async function gateOnPhase4(ctx: { skip: () => void }): Promise<boolean> {
+    const discovery = await readDiscovery();
+    const determinism = discovery?.capabilities?.multiAgent?.executionModel?.replayDeterminism;
+    const advertised = determinism?.supported === true && determinism?.refusalDivergenceEmission === true;
+    if (!advertised) {
+      ctx.skip();
+    }
+    return advertised;
+  }
+  // Scenario: the source run sees a valid envelope, the replay sees a refusal.
+  // Flow: program mock (valid) → create + await source run → re-program mock
+  // (refusal) → fork in `replay` mode → assert the replay fails with
+  // `replay_diverged_at_refusal` and logs a `replay.divergedAtRefusal` event.
+  it('Phase 4 host MUST emit replay.divergedAtRefusal + fail with replay_diverged_at_refusal when original=valid + replay=refusal', async (ctx) => {
+    if (!(await gateOnPhase4(ctx))) return;
+    const NODE_ID = 'structured-call';
+    // Original program: valid envelope. Replay program (set after the
+    // original completes): refusal. Programming twice is the spec-canonical
+    // pattern — see spec/v1/host-sample-test-seams.md §5.
+    const validEnv = '{"valid":true}';
+    const programStatus = await programMock(NODE_ID, [
+      { content: validEnv, stopReason: 'end_turn' as const },
+    ]);
+    if (programStatus === 404) {
+      ctx.skip(); // mock-AI program seam not exposed — soft-skip
+      return;
+    }
+    expect(programStatus).toBe(200);
+    const createRes = await driver.post('/v1/runs', {
+      workflowId: 'conformance-phase4-replay-divergence',
+    });
+    if (createRes.status === 404 || createRes.status === 422) {
+      ctx.skip(); // fixture not advertised
+      return;
+    }
+    expect(createRes.status).toBe(201);
+    const sourceRunId = (createRes.json as { runId: string }).runId;
+    const sourceTerminal = await pollUntilTerminal(sourceRunId);
+    expect(sourceTerminal.status).toBe('completed');
+    // Stage refusal for the replay's mock-AI dispatch. The seam answered the
+    // first programming call, so a non-200 here means the refusal was NOT
+    // staged and every assertion below would fail for the wrong reason —
+    // fail fast with an explicit message instead.
+    const stageStatus = await programMock(NODE_ID, [
+      { content: 'safety-refused-for-conformance', stopReason: 'safety' as const, refusalText: 'safety-refused-for-conformance' },
+    ]);
+    expect(stageStatus, 'staging the replay-side refusal program MUST succeed before forking').toBe(200);
+    const forkRes = await driver.post(`/v1/runs/${encodeURIComponent(sourceRunId)}:fork`, {
+      fromSeq: 0,
+      mode: 'replay',
+    });
+    expect(forkRes.status).toBe(201);
+    const replayRunId = (forkRes.json as { runId: string }).runId;
+    const replayTerminal = await pollUntilTerminal(replayRunId);
+    expect(
+      replayTerminal.status,
+      driver.describe(
+        'RFCS/0041-multi-agent-replay-under-nondeterminism.md §B + spec/v1/rest-endpoints.md §"Common error codes"',
+        'replay MUST terminate `failed` when refusal-divergence is detected (silent substitution is non-conformant)',
+      ),
+    ).toBe('failed');
+    expect(
+      replayTerminal.error?.code,
+      driver.describe(
+        'spec/v1/rest-endpoints.md §"Common error codes" — replay_diverged_at_refusal',
+        'error.code MUST be `replay_diverged_at_refusal` per the canonical catalog',
+      ),
+    ).toBe('replay_diverged_at_refusal');
+    const replayEvents = await readEvents(replayRunId);
+    // NOTE(review): the message below says "exactly one", but `find` +
+    // `toBeDefined` only proves "at least one". Confirm against the schema
+    // whether duplicates must be rejected before tightening this check.
+    const divergenceEvent = replayEvents.find((e) => e.type === 'replay.divergedAtRefusal');
+    expect(
+      divergenceEvent,
+      driver.describe(
+        'schemas/run-event-payloads.schema.json §replayDivergedAtRefusal',
+        'replay event log MUST contain exactly one `replay.divergedAtRefusal` event identifying the divergence',
+      ),
+    ).toBeDefined();
+    expect(divergenceEvent?.payload?.sourceRunId).toBe(sourceRunId);
+    expect(divergenceEvent?.payload?.nodeId).toBe(NODE_ID);
+    expect(
+      divergenceEvent?.payload?.originalEnvelopeKind,
+      driver.describe(
+        'schemas/run-event-payloads.schema.json §replayDivergedAtRefusal.originalEnvelopeKind',
+        'originalEnvelopeKind MUST be `valid` (source run completed normally)',
+      ),
+    ).toBe('valid');
+    expect(
+      divergenceEvent?.payload?.replayEnvelopeKind,
+      driver.describe(
+        'schemas/run-event-payloads.schema.json §replayDivergedAtRefusal.replayEnvelopeKind',
+        'replayEnvelopeKind MUST be `refusal` (replay hit the refusal entry of the mock program)',
+      ),
+    ).toBe('refusal');
+  });
+  // Mirror scenario: the source run is refused (and therefore fails), the
+  // replay is handed a valid envelope. The host must still flag the
+  // divergence rather than let the replay complete.
+  it('Phase 4 host MUST emit replay.divergedAtRefusal + fail with replay_diverged_at_refusal when original=refusal + replay=valid (symmetric case)', async (ctx) => {
+    if (!(await gateOnPhase4(ctx))) return;
+    const NODE_ID = 'structured-call';
+    // Symmetric: original=refusal, replay=valid.
+    const programStatus = await programMock(NODE_ID, [
+      { content: 'safety-refused-for-conformance', stopReason: 'safety' as const, refusalText: 'safety-refused-for-conformance' },
+    ]);
+    if (programStatus === 404) {
+      ctx.skip(); // mock-AI program seam not exposed — soft-skip
+      return;
+    }
+    expect(programStatus).toBe(200);
+    const createRes = await driver.post('/v1/runs', {
+      workflowId: 'conformance-phase4-replay-divergence',
+    });
+    if (createRes.status === 404 || createRes.status === 422) {
+      ctx.skip(); // fixture not advertised
+      return;
+    }
+    expect(createRes.status).toBe(201);
+    const sourceRunId = (createRes.json as { runId: string }).runId;
+    const sourceTerminal = await pollUntilTerminal(sourceRunId);
+    // Source run fails because the LLM refused.
+    expect(sourceTerminal.status).toBe('failed');
+    // Stage valid envelope for the replay's mock-AI dispatch. The seam answered
+    // the first programming call, so a non-200 here means the valid envelope
+    // was NOT staged; the replay would then simply repeat the refusal and the
+    // divergence assertions would fail for the wrong reason — fail fast.
+    const stageStatus = await programMock(NODE_ID, [
+      { content: '{"valid":true}', stopReason: 'end_turn' as const },
+    ]);
+    expect(stageStatus, 'staging the replay-side valid program MUST succeed before forking').toBe(200);
+    const forkRes = await driver.post(`/v1/runs/${encodeURIComponent(sourceRunId)}:fork`, {
+      fromSeq: 0,
+      mode: 'replay',
+    });
+    expect(forkRes.status).toBe(201);
+    const replayRunId = (forkRes.json as { runId: string }).runId;
+    const replayTerminal = await pollUntilTerminal(replayRunId);
+    expect(replayTerminal.status).toBe('failed');
+    expect(replayTerminal.error?.code).toBe('replay_diverged_at_refusal');
+    const replayEvents = await readEvents(replayRunId);
+    const divergenceEvent = replayEvents.find((e) => e.type === 'replay.divergedAtRefusal');
+    expect(divergenceEvent).toBeDefined();
+    expect(divergenceEvent?.payload?.originalEnvelopeKind).toBe('refusal');
+    expect(divergenceEvent?.payload?.replayEnvelopeKind).toBe('valid');
+  });
 });

package/src/scenarios/replay-observable-sequence-determinism.test.ts CHANGED Viewed

@@ -5,7 +5,7 @@
  * `capabilities.multiAgent.executionModel.version >= 4` AND
  * `capabilities.multiAgent.executionModel.replayDeterminism.supported: true`.
  *
- * Asserts (behavioral, when a Phase 4 host advertises the contract):
+ * Asserts (behavioral, when a host advertises `version: 4` + the contract):
  *
  *   1. A `mode: replay` fork from event-log index `fromSeq` produces an
  *      event-log prefix `[0, fromSeq]` that is byte-equivalent to the
@@ -26,14 +26,14 @@
  * Driving the assertion requires a workflow fixture whose tool call is
  * pure-nondeterministic (different bytes on each call) but whose
  * observable result is what gets cached. Reference workflow-engine ships
- * `core.noop` + deterministic fixtures; Phase 4 wiring needs a
+ * `core.noop` + deterministic fixtures; the `version: 4` wiring needs a
  * nondeterministic-tool fixture (e.g., `conformance-phase4-nondet-tool`).
  * The cross-boundary assertions are currently surfaced as `it.skip` (out of
  * the stable profile via RFC 0042) so test reporters still track the gap.
  *
  * @see RFCS/0041-multi-agent-replay-under-nondeterminism.md §C
  * @see spec/v1/replay.md §"Observable-output-sequence determinism vs bit-equivalent execution (MAE-9 closure)"
- * @see spec/v1/multi-agent-execution.md §"Phase 4 replay determinism"
+ * @see spec/v1/multi-agent-execution.md §"Replay determinism under nondeterminism (RFC 0041)"
  */
 import { describe, it } from 'vitest';
@@ -62,10 +62,19 @@ describe('replay-observable-sequence-determinism: prefix byte-equivalence (RFC 0
   //   6. Read original + replay RunSnapshot at index N; assert
   //      variables + channels + status byte-equivalent.
   // Surfaced as `todo` until the `conformance-phase4-nondet-tool`
-  // fixture ships in the suite — consistent with the sibling Phase 4
+  // fixture ships in the suite — consistent with the sibling RFC 0041
   // scenarios (`replay-divergence-at-refusal.test.ts`,
   // `replay-llm-cache-key-portable.test.ts`).
-  it.todo('original and replay event-log prefixes [0, fromSeq] MUST be byte-equivalent (modulo per-region clock + ULID-T entropy)');
+  // Marked out of stable profile via RFC 0042 §B (experimental tier):
+  // RFC 0041 §C remains Active, so its wire shape MAY shift compatibly
+  // within v1.x. Hosts that wire this assertion before RFC 0041 graduates
+  // to Accepted SHOULD advertise `multiAgent.executionModel.tier:
+  // 'experimental'` + `experimentalUntil` per RFC 0042 §A. Path-to-runnable
+  // requires: (a) host pure-replay observable-cache emission via the
+  // `:fork mode: replay` re-dispatch path and (b) the test seam endpoint
+  // contract for cache-hit-vs-fresh-call distinction (see
+  // `spec/v1/host-sample-test-seams.md` for the established seam pattern).
+  it.skip('original and replay event-log prefixes [0, fromSeq] MUST be byte-equivalent (modulo per-region clock + ULID-T entropy) — out of stable profile via RFC 0042');
 });
 describe('replay-observable-sequence-determinism: observable-result caching (RFC 0041 §C)', () => {
@@ -76,5 +85,10 @@ describe('replay-observable-sequence-determinism: observable-result caching (RFC
   // this a valid determinism contract — bit-equivalent execution would
   // require unbounded caching (rejected per RFC 0041 §"Alternatives
   // considered" #2).
-  it.todo('replay of a workflow containing a nondeterministic tool call reproduces the original observable result, NOT a fresh call');
+  // Marked out of stable profile via RFC 0042 §B (experimental tier):
+  // see the prefix-byte-equivalence comment above for the same routing.
+  // This is RFC 0041 §C's load-bearing assertion; it lands as a runnable
+  // `it()` when RFC 0041 graduates to Accepted on first non-steward host
+  // adoption.
+  it.skip('replay of a workflow containing a nondeterministic tool call reproduces the original observable result, NOT a fresh call — out of stable profile via RFC 0042');
 });

package/src/scenarios/run-diff.test.ts ADDED Viewed

@@ -0,0 +1,143 @@
+/**
+ * RFC 0054 — run diff & execution comparison.
+ *
+ * Exercises `GET /v1/runs/{runId}:diff?against={otherRunId}` per
+ * `spec/v1/rest-endpoints.md` §"GET /v1/runs/{runId}:diff" and
+ * `schemas/run-diff-response.schema.json`. The endpoint is OPTIONAL —
+ * hosts that don't implement it return 404 and these scenarios soft-skip.
+ *
+ * Coverage:
+ *   - identical:  diffing a run against itself ⇒ divergedAtSeq null,
+ *                 empty eventDiffs (the determinism floor).
+ *   - divergence: diffing two structurally-different runs ⇒ a non-null
+ *                 integer divergedAtSeq; eventDiffs begin at that seq.
+ *   - state-shape: response conforms to run-diff-response.schema.json and
+ *                 stateDiff is redaction-safe (no credential-shaped keys).
+ *   - error-surface: missing `against` ⇒ 400; nonexistent `against` ⇒ 404
+ *                 (the access boundary; full cross-principal authz needs a
+ *                 multi-principal harness — host-specific).
+ *
+ * @see RFCS/0054-run-diff-and-execution-comparison.md
+ * @see api/openapi.yaml §diffRun
+ */
+import { describe, it, expect } from 'vitest';
+import { driver } from '../lib/driver.js';
+const HTTP_SKIP = !process.env.OPENWOP_BASE_URL;
+// Two standard conformance fixtures with structurally-different event
+// logs — diffing one against the other is a deterministic divergence.
+const FIXTURE_A = 'conformance-agent-reasoning';
+const FIXTURE_B = 'conformance-dispatch-loop';
+/** Shape of the `GET /v1/runs/{runId}:diff` response body as consumed by these scenarios. */
+interface DiffResponse {
+  a: string;
+  b: string;
+  /** `null` when the two event logs are identical; otherwise the seq of the first difference. */
+  divergedAtSeq: number | null;
+  /** Per-event differences; the scenarios expect `op` to be `added`, `removed` or `changed`. */
+  eventDiffs: {
+    seq: number;
+    op: string;
+    aEvent?: unknown;
+    bEvent?: unknown;
+  }[];
+  stateDiff: { [key: string]: unknown };
+  truncated?: boolean;
+}
+/**
+ * Starts a run of `workflowId` via `POST /v1/runs`.
+ *
+ * @param workflowId - Workflow (fixture) to execute.
+ * @returns The new run's id, or `null` when the host answers with anything other than 201.
+ */
+async function createRun(workflowId: string): Promise<string | null> {
+  const response = await driver.post('/v1/runs', { workflowId });
+  if (response.status === 201) {
+    const { runId } = response.json as { runId: string };
+    return runId;
+  }
+  return null;
+}
+/**
+ * Best-effort wait for a run to finish: polls up to 20 times, 100ms apart,
+ * and returns as soon as a terminal status is seen. Returns silently (does
+ * not throw) when the polling budget runs out.
+ */
+async function settle(runId: string): Promise<void> {
+  const terminal = new Set(['completed', 'failed', 'cancelled']);
+  const path = `/v1/runs/${encodeURIComponent(runId)}`;
+  let attemptsLeft = 20;
+  while (attemptsLeft-- > 0) {
+    const { json } = await driver.get(path);
+    const status = (json as { status?: string })?.status;
+    if (status !== undefined && terminal.has(status)) return;
+    await new Promise((done) => setTimeout(done, 100));
+  }
+}
+describe.skipIf(HTTP_SKIP)('run-diff: GET /v1/runs/{runId}:diff (RFC 0054)', () => {
+  // Determinism floor: a run compared with itself must show no divergence.
+  it('diffing a run against itself ⇒ divergedAtSeq null + empty eventDiffs', async (ctx) => {
+    const selfId = await createRun(FIXTURE_A);
+    // Run could not be created on this host — soft-skip.
+    if (!selfId) {
+      ctx.skip();
+      return;
+    }
+    await settle(selfId);
+    const encoded = encodeURIComponent(selfId);
+    const diff = await driver.get(`/v1/runs/${encoded}:diff?against=${encoded}`);
+    // The endpoint is OPTIONAL; a 404 means the host does not implement it.
+    if (diff.status === 404) {
+      ctx.skip();
+      return;
+    }
+    expect(diff.status, driver.describe('spec/v1/rest-endpoints.md §:diff', 'self-diff MUST return 200')).toBe(200);
+    const payload = diff.json as DiffResponse;
+    expect(payload.a).toBe(selfId);
+    expect(payload.b).toBe(selfId);
+    const nullSeqMessage = driver.describe('RFCS/0054 §C', 'identical logs MUST yield divergedAtSeq: null');
+    expect(payload.divergedAtSeq, nullSeqMessage).toBeNull();
+    expect(payload.eventDiffs, 'identical logs MUST yield an empty eventDiffs array').toEqual([]);
+  });
+  // Two fixtures with structurally different event logs must diverge, and the
+  // first reported eventDiff must sit exactly at the reported divergence seq.
+  it('diffing two structurally-different runs ⇒ non-null divergedAtSeq aligned to eventDiffs[0]', async (ctx) => {
+    const [ra, rb] = await Promise.all([createRun(FIXTURE_A), createRun(FIXTURE_B)]);
+    if (!ra || !rb) { ctx.skip(); return; } // a run could not be created — soft-skip
+    await Promise.all([settle(ra), settle(rb)]);
+    const res = await driver.get(`/v1/runs/${encodeURIComponent(ra)}:diff?against=${encodeURIComponent(rb)}`);
+    if (res.status === 404) { ctx.skip(); return; } // endpoint not implemented
+    expect(res.status).toBe(200);
+    const body = res.json as DiffResponse;
+    // The contract quoted in the message is "non-null integer": reject
+    // fractional values (and NaN/Infinity) as well as null and negatives.
+    expect(
+      typeof body.divergedAtSeq === 'number' && Number.isInteger(body.divergedAtSeq) && body.divergedAtSeq >= 0,
+      driver.describe('RFCS/0054 §C', 'structurally-different runs MUST report a non-null integer divergedAtSeq'),
+    ).toBe(true);
+    expect(body.eventDiffs.length, 'divergent runs MUST report at least one eventDiff').toBeGreaterThan(0);
+    expect(
+      body.eventDiffs[0]?.seq,
+      driver.describe('RFCS/0054 §C', 'eventDiffs MUST begin at divergedAtSeq'),
+    ).toBe(body.divergedAtSeq);
+    for (const d of body.eventDiffs) {
+      expect(['added', 'removed', 'changed']).toContain(d.op);
+    }
+  });
+  // Structural checks on the response plus a heuristic scan of `stateDiff`
+  // for credential-shaped text.
+  it('response conforms to run-diff-response.schema.json and stateDiff is redaction-safe', async (ctx) => {
+    const [ra, rb] = await Promise.all([createRun(FIXTURE_A), createRun(FIXTURE_B)]);
+    if (!ra || !rb) { ctx.skip(); return; } // a run could not be created — soft-skip
+    await Promise.all([settle(ra), settle(rb)]);
+    const res = await driver.get(`/v1/runs/${encodeURIComponent(ra)}:diff?against=${encodeURIComponent(rb)}`);
+    if (res.status === 404) { ctx.skip(); return; } // endpoint not implemented
+    expect(res.status).toBe(200);
+    const body = res.json as DiffResponse;
+    expect(typeof body.a === 'string' && typeof body.b === 'string', 'a + b MUST be strings').toBe(true);
+    expect(Array.isArray(body.eventDiffs), 'eventDiffs MUST be an array').toBe(true);
+    expect(body.stateDiff !== null && typeof body.stateDiff === 'object', 'stateDiff MUST be an object').toBe(true);
+    // Redaction-safe: no credential-shaped material leaks into the diff.
+    // `sk-` is anchored with `\b` so it only matches a token that STARTS with
+    // `sk-`; unanchored it also fires on harmless text such as `task-`,
+    // `risk-` or `disk-`, failing conformant hosts.
+    const serialized = JSON.stringify(body.stateDiff);
+    expect(
+      /\bsk-|api[_-]?key|secret|bearer\s/i.test(serialized),
+      driver.describe('RFCS/0054 §B', 'stateDiff MUST be redaction-safe — no credential material'),
+    ).toBe(false);
+  });
+  // Error surface: a request without `against` is malformed (400); an
+  // `against` id that does not resolve is reported as not found (404).
+  it('missing `against` ⇒ 400; nonexistent `against` ⇒ 404 (access boundary)', async (ctx) => {
+    const runId = await createRun(FIXTURE_A);
+    if (!runId) { ctx.skip(); return; } // run could not be created — soft-skip
+    // Probe with a well-formed self-diff first, so that "endpoint absent" (404
+    // here) is not mistaken for the 404 asserted further down.
+    const probe = await driver.get(`/v1/runs/${encodeURIComponent(runId)}:diff?against=${encodeURIComponent(runId)}`);
+    if (probe.status === 404) { ctx.skip(); return; } // endpoint not implemented at all
+    const missing = await driver.get(`/v1/runs/${encodeURIComponent(runId)}:diff`);
+    expect(
+      missing.status,
+      driver.describe('api/openapi.yaml §diffRun', 'missing required `against` query param MUST return 400'),
+    ).toBe(400);
+    const nonexistent = await driver.get(`/v1/runs/${encodeURIComponent(runId)}:diff?against=does-not-exist-${Date.now()}`);
+    // The message states the status actually asserted (404), so that a host
+    // answering e.g. 403 is told what is expected instead of only "not 200".
+    expect(
+      nonexistent.status,
+      driver.describe('RFCS/0054 §A', 'diffing against a run the caller cannot read/that does not exist MUST return 404 (never 200)'),
+    ).toBe(404);
+  });
+});

package/src/scenarios/sandbox-capability-gate-respected.test.ts CHANGED Viewed

@@ -23,5 +23,11 @@ import { describe, it } from 'vitest';
 // reporting a vacuous PASS.
 describe('sandbox-capability-gate-respected: behavioral (RFC 0035 §B)', () => {
-  it.todo('a misbehaving pack calling an undeclared host capability fails closed with sandbox_capability_denied');
+  // Behavioral coverage in `sandbox-mvp-behavior.test.ts` §"capability-gate-respected"
+  // (drives `POST /v1/host/sample/test/sandbox-invoke` against the
+  // workflow-engine's node:vm MVP and asserts `error.code:
+  // 'sandbox_capability_denied'` + `details.requestedCapability` per
+  // `host-capabilities.md` §"Error codes"). `it.skip` preserves the
+  // per-invariant file structure without inflating the `it.todo` count.
+  it.skip('behavioral coverage in sandbox-mvp-behavior.test.ts §"capability-gate-respected"');
 });

package/src/scenarios/sandbox-memory-cap.test.ts CHANGED Viewed

@@ -50,9 +50,11 @@ describe.skipIf(HTTP_SKIP)('sandbox-memory-cap: capability shape + behavioral (R
     ).toBe(true);
   });
-  // Behavioral assertion lands when the misbehaving-memory-cap typeId is
-  // available. Expected: error.code === 'sandbox_memory_exceeded';
-  // details.requestedBytes > memoryLimitBytes. Surfaced as `todo` so
-  // test reporters track the gap rather than reporting a vacuous PASS.
-  it.todo('a misbehaving pack allocating beyond memoryLimitBytes fails with sandbox_memory_exceeded');
+  // Behavioral coverage in `sandbox-mvp-behavior.test.ts` §"memory-exceeded"
+  // (drives `POST /v1/host/sample/test/sandbox-invoke` against the
+  // workflow-engine's node:vm MVP and asserts `error.code:
+  // 'sandbox_memory_exceeded'` per `host-capabilities.md` §"Error codes").
+  // `it.skip` preserves the per-invariant file structure without inflating
+  // the `it.todo` count external auditors track.
+  it.skip('behavioral coverage in sandbox-mvp-behavior.test.ts §"memory-exceeded"');
 });