npm - @openwop/openwop-conformance - Versions diffs - 1.5.0 → 1.6.0 - Mend

@openwop/openwop-conformance 1.5.0 → 1.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (61) hide show

package/CHANGELOG.md +19 -0
package/README.md +2 -2
package/api/asyncapi.yaml +8 -3
package/api/openapi.yaml +305 -0
package/coverage.md +29 -4
package/fixtures/conformance-phase4-nondet-tool.json +53 -0
package/fixtures/conformance-phase4-replay-divergence.json +40 -0
package/fixtures.md +5 -3
package/package.json +1 -1
package/schemas/README.md +2 -0
package/schemas/capabilities.schema.json +167 -3
package/schemas/credential-reference.schema.json +21 -0
package/schemas/node-pack-manifest.schema.json +112 -1
package/schemas/run-diff-response.schema.json +64 -0
package/schemas/run-event-payloads.schema.json +104 -2
package/schemas/run-event.schema.json +8 -1
package/schemas/run-snapshot.schema.json +11 -0
package/src/lib/behavior-gate.ts +51 -0
package/src/lib/driver.ts +13 -1
package/src/lib/saml-idp.ts +179 -0
package/src/scenarios/approval-gate-events.test.ts +61 -0
package/src/scenarios/approval-gate-flow.test.ts +68 -0
package/src/scenarios/auth-saml-profile.test.ts +119 -0
package/src/scenarios/auth-scim-profile.test.ts +65 -0
package/src/scenarios/authorization-fail-closed.test.ts +80 -0
package/src/scenarios/authorization-roles-shape.test.ts +83 -0
package/src/scenarios/connector-manifest-validity.test.ts +142 -0
package/src/scenarios/credential-payload-redaction.test.ts +93 -0
package/src/scenarios/credentials-capability-shape.test.ts +90 -0
package/src/scenarios/cross-engine-append-behavior.test.ts +204 -0
package/src/scenarios/cross-host-traceparent-propagation.test.ts +13 -6
package/src/scenarios/cross-workspace-isolation.test.ts +72 -0
package/src/scenarios/deadletter-capability-shape.test.ts +59 -0
package/src/scenarios/deadletter-retry-exhaustion.test.ts +62 -0
package/src/scenarios/experimental-tier-shape.test.ts +192 -0
package/src/scenarios/identity-owner-shape.test.ts +64 -0
package/src/scenarios/multi-agent-confidence-escalation.test.ts +13 -12
package/src/scenarios/multi-agent-memory-lifecycle.test.ts +87 -12
package/src/scenarios/multi-region-idempotency-behavior.test.ts +203 -0
package/src/scenarios/oauth-capability-shape.test.ts +97 -0
package/src/scenarios/oauth-connector-redaction.test.ts +91 -0
package/src/scenarios/pack-registry-isolation.test.ts +108 -0
package/src/scenarios/pack-registry-publish.test.ts +1 -1
package/src/scenarios/prompt-mutation-workspace-membership-enforced.test.ts +126 -0
package/src/scenarios/prompt-read-workspace-membership-enforced.test.ts +183 -0
package/src/scenarios/replay-divergence-at-refusal.test.ts +187 -7
package/src/scenarios/replay-observable-sequence-determinism.test.ts +20 -6
package/src/scenarios/run-diff.test.ts +143 -0
package/src/scenarios/sandbox-capability-gate-respected.test.ts +7 -1
package/src/scenarios/sandbox-memory-cap.test.ts +7 -5
package/src/scenarios/sandbox-mvp-behavior.test.ts +280 -0
package/src/scenarios/sandbox-no-cross-pack-mutation.test.ts +7 -1
package/src/scenarios/sandbox-no-host-env-leak.test.ts +5 -1
package/src/scenarios/sandbox-no-host-fs-escape.test.ts +9 -1
package/src/scenarios/sandbox-no-host-process-escape.test.ts +5 -1
package/src/scenarios/sandbox-no-network-escape.test.ts +5 -1
package/src/scenarios/sandbox-timeout-cap.test.ts +7 -5
package/src/scenarios/scheduling-capability-shape.test.ts +81 -0
package/src/scenarios/scheduling-cron-fires-once.test.ts +66 -0
package/src/scenarios/secret-leakage-otel-attribute.test.ts +241 -0
package/src/scenarios/spec-corpus-validity.test.ts +2 -2

package/src/scenarios/pack-registry-publish.test.ts CHANGED Viewed

@@ -1,7 +1,7 @@
 /**
  * Pack-registry publish scenarios — `node-packs.md` §"PUT /v1/packs/{name}/-/{version}.tgz".
  *
- * Status: BEHAVIORAL (soft-skip). Per RFC 0025 (`Draft` 2026-05-19),
+ * Status: BEHAVIORAL (soft-skip). Per RFC 0025 (`Active` 2026-05-19),
  * the conformance suite drives the documented 19-code error catalog
  * via the test-mode mirror namespace `/v1/packs-test/*`, gated on
  * `capabilities.packs.testMode.supported: true`. Each scenario soft-

package/src/scenarios/prompt-mutation-workspace-membership-enforced.test.ts ADDED Viewed

@@ -0,0 +1,126 @@
+/**
+ * prompt-mutation-workspace-membership-enforced — RFC 0028 Tier-2 §"Workspace
+ * membership on workspace-scoped reads and writes" verification (WRITE path).
+ *
+ * Status: ACTIVE (capability-gated; behavioral when the host advertises
+ * `capabilities.prompts.mutableLibrary: true`). Hosts that don't advertise
+ * mutableLibrary soft-skip cleanly.
+ *
+ * The contract (spec/v1/prompts.md §"Discovery & distribution" §"REST
+ * endpoints" §"Workspace membership on workspace-scoped reads and writes"):
+ *
+ *   Hosts MUST verify that the authenticated principal is a member of the
+ *   target workspace BEFORE honoring any POST / PUT / DELETE to a
+ *   workspace-scoped /v1/prompts* resource. A workspaceId supplied by the
+ *   caller (request body, URL, or query string) MUST NOT be trusted as
+ *   authorization on its own. Non-members MUST be rejected fail-closed
+ *   (typically 403) before any persistence occurs.
+ *
+ * The probe drives `POST /v1/prompts` with a `workspaceId` the conformance
+ * principal cannot be a member of (a cryptographically-unique random value
+ * by default; operator-overridable via `OPENWOP_TEST_NONMEMBER_WORKSPACE_ID`
+ * for hosts that need a specific synthetic workspace shape). The behavioral
+ * MUST is that the host refuses — NOT a 2xx. Any 4xx/5xx is acceptable
+ * (401 = auth not configured for this surface; 403 = membership check;
+ * 404 = endpoint absent; 422 = body validation; 501 = capability not
+ * provided). The failure mode this invariant guards against is a SILENT
+ * 2xx with a write to a workspace the caller doesn't belong to — that's the
+ * RFC 0028 Tier-2 vulnerability self-disclosed by an adopter on 2026-05-25.
+ *
+ * Why a random workspaceId is sufficient: a non-member workspace check is
+ * negative-space — the host MUST refuse for ANY workspace the principal
+ * isn't a member of, and a random UUID has astronomically-low collision
+ * probability with any real workspace membership grant.
+ *
+ * @see RFCS/0028-prompt-library-endpoints.md §"Post-promotion notes"
+ * @see spec/v1/prompts.md §"Security invariants" §prompt-mutation-workspace-membership-enforced
+ * @see spec/v1/auth.md §"Identity claims — tenant · workspace · principal"
+ * @see RFCS/0048-tenant-workspace-principal-identity-model.md §D
+ */
+import { describe, it, expect } from 'vitest';
+import { randomUUID } from 'node:crypto';
+import { driver } from '../lib/driver.js';
+// True when OPENWOP_BASE_URL is unset or empty; passed to `describe.skipIf`
+// so the whole suite is skipped in that case.
+const HTTP_SKIP = !process.env.OPENWOP_BASE_URL;
+/**
+ * Minimal view of the `/.well-known/openwop` discovery document: only the
+ * capability flag this scenario gates on. `mutableLibrary` is typed
+ * `unknown` so the gate has to compare it strictly against `true`.
+ */
+interface DiscoveryDoc {
+  capabilities?: {
+    prompts?: {
+      mutableLibrary?: unknown;
+    };
+  };
+}
+/**
+ * Fetch the host discovery document from `/.well-known/openwop`.
+ *
+ * @returns the parsed document on a 200 response; `null` for any other
+ *   status or when the request (or reading its body) throws.
+ */
+async function readDiscovery(): Promise<DiscoveryDoc | null> {
+  try {
+    const response = await driver.get('/.well-known/openwop');
+    return response.status === 200 ? (response.json as DiscoveryDoc) : null;
+  } catch {
+    return null;
+  }
+}
+describe.skipIf(HTTP_SKIP)(
+  'prompt-mutation-workspace-membership-enforced: writes to non-member workspaces MUST be refused (RFC 0028 Tier-2)',
+  () => {
+    it('POST /v1/prompts with a workspaceId the principal is not a member of MUST NOT succeed with 2xx', async (ctx) => {
+      // Capability gate: an unreadable discovery document and a host that
+      // does not advertise `mutableLibrary: true` both soft-skip.
+      const discovery = await readDiscovery();
+      if (discovery?.capabilities?.prompts?.mutableLibrary !== true) {
+        ctx.skip();
+        return;
+      }
+      // Target workspace: the operator override when set, otherwise a fresh
+      // random identifier the conformance principal cannot be a member of.
+      const targetWorkspaceId =
+        process.env.OPENWOP_TEST_NONMEMBER_WORKSPACE_ID ??
+        `openwop-conformance-nonmember-${randomUUID()}`;
+      const probeTemplate = {
+        workspaceId: targetWorkspaceId,
+        templateId: `conformance-membership-probe-${randomUUID()}`,
+        version: '1.0.0',
+        kind: 'system',
+        text: 'conformance probe — SHOULD NOT persist',
+      };
+      const response = await driver.post('/v1/prompts', probeTemplate);
+      // The conformance MUST: a write to a workspace the caller cannot prove
+      // membership of is never honored. Every refusal status (4xx/5xx)
+      // passes; a silent 2xx is the failure mode this scenario exists for.
+      const refusalMessage = driver.describe(
+        'spec/v1/prompts.md §Workspace membership on workspace-scoped reads and writes',
+        `mutating /v1/prompts MUST refuse a write to a non-member workspace; ` +
+          `got ${response.status} ${response.text.slice(0, 200)}`,
+      );
+      expect(response.status, refusalMessage).toBeGreaterThanOrEqual(400);
+      // The canonical error envelope is only pinned when the host chose 403.
+      // Other refusal codes (401, 404, 5xx, ...) were accepted above and
+      // their envelope shape is not constrained by this scenario.
+      if (response.status !== 403) return;
+      const envelope = response.json as { error?: unknown } | null;
+      const envelopeMessage = driver.describe(
+        'spec/v1/rest-endpoints.md §Common error codes — workspace_membership_required',
+        `403 refusal of a workspace-scoped mutation MUST carry error: "workspace_membership_required"; got error: ${JSON.stringify(envelope?.error)}`,
+      );
+      expect(envelope?.error, envelopeMessage).toBe('workspace_membership_required');
+    });
+  },
+);

package/src/scenarios/prompt-read-workspace-membership-enforced.test.ts ADDED Viewed

@@ -0,0 +1,183 @@
+/**
+ * prompt-read-workspace-membership-enforced — RFC 0028 Tier-2 §"Workspace
+ * membership on workspace-scoped reads and writes" verification (READ path).
+ *
+ * Status: ACTIVE (capability-gated; behavioral when the host advertises
+ * `capabilities.prompts.supported: true` AND accepts `?workspaceId=` on
+ * `GET /v1/prompts`). Hosts that don't expose workspace-scoped reads
+ * (host-only template libraries with no workspace dimension) self-skip
+ * via response-shape detection.
+ *
+ * The contract (spec/v1/prompts.md §"Discovery & distribution" §"REST
+ * endpoints" §"Workspace membership on workspace-scoped reads and writes"):
+ *
+ *   Read paths are NOT exempt from the workspace-membership invariant
+ *   just because they don't write. A GET /v1/prompts?workspaceId=<not-mine>
+ *   that returns another workspace's templates is a cross-tenant data leak
+ *   with the same blast radius as a cross-tenant write. Hosts MUST verify
+ *   the authenticated principal's workspace membership BEFORE returning
+ *   workspace-scoped content.
+ *
+ * Gate per MyndHyve relay 2026-05-25 ("Option B"): probe ALL hosts that
+ * advertise `capabilities.prompts.supported: true` regardless of
+ * `mutableLibrary`; read-only hosts that expose `?workspaceId=` reads are
+ * NOT exempt from the symmetric authz invariant. Hosts that don't expose
+ * workspace-scoped reads at all self-skip via the response interpretation
+ * below (the suite avoids inventing a new capability field just for this
+ * gating concern).
+ *
+ * The probe drives `GET /v1/prompts?workspaceId=<random-uuid>` and
+ * interprets the response:
+ *
+ *   - 4xx (any code) — PASS (refused). If 403 specifically, additionally
+ *     pin `error === "workspace_membership_required"` per the canonical
+ *     envelope in rest-endpoints.md §"Common error codes".
+ *   - 200 with `templates: []` — PASS. The host correctly returned no
+ *     content for a workspace the principal isn't a member of. A random
+ *     UUID workspace also definitionally has no real content, so an empty
+ *     result is the correct null answer.
+ *   - 200 with `templates: [non-empty]` — FAIL. The host returned content
+ *     for an unauthorized workspace. This is the cross-tenant data leak
+ *     failure mode. (Note: this scenario uses a random workspaceId so any
+ *     non-empty result is a leak — there can't legitimately be templates
+ *     in a freshly-generated nonexistent workspace.)
+ *   - 200 without a `templates[]` field, or a response shape that doesn't
+ *     resemble the documented `/v1/prompts` list shape — SKIP with a
+ *     diagnostic log. Indicates the host doesn't recognize `?workspaceId=`
+ *     on this endpoint (e.g., host-only template library with no
+ *     workspace dimension).
+ *   - 5xx — PASS (refused; envelope shape unconstrained).
+ *
+ * Why a random workspaceId is sufficient: the assertion is negative-space.
+ * A host that correctly enforces membership MUST refuse for ANY workspace
+ * the principal isn't a member of, and a random UUID has astronomically-low
+ * collision probability with any real workspace membership grant. A host
+ * that returns templates from a random UUID workspace is leaking content
+ * from somewhere (host-built-in misclassified as workspace, or a silent
+ * fall-through to another workspace's content, or a query bug returning
+ * everything).
+ *
+ * @see RFCS/0028-prompt-library-endpoints.md §"Post-promotion notes"
+ * @see spec/v1/prompts.md §"Security invariants" §prompt-read-workspace-membership-enforced
+ * @see spec/v1/rest-endpoints.md §"Common error codes" §workspace_membership_required
+ * @see spec/v1/auth.md §"Identity claims — tenant · workspace · principal"
+ * @see RFCS/0048-tenant-workspace-principal-identity-model.md §D
+ */
+import { describe, it, expect } from 'vitest';
+import { randomUUID } from 'node:crypto';
+import { driver } from '../lib/driver.js';
+// True when OPENWOP_BASE_URL is unset or empty; passed to `describe.skipIf`
+// so the whole suite is skipped in that case.
+const HTTP_SKIP = !process.env.OPENWOP_BASE_URL;
+/**
+ * Minimal view of the `/.well-known/openwop` discovery document: only the
+ * `capabilities.prompts.supported` flag this scenario gates on. Typed
+ * `unknown` so the gate has to compare it strictly against `true`.
+ */
+interface DiscoveryDoc {
+  capabilities?: {
+    prompts?: {
+      supported?: unknown;
+    };
+  };
+}
+/**
+ * Body of a 2xx `GET /v1/prompts` response as far as this scenario reads it.
+ * `templates` stays `unknown` because the probe checks at runtime whether
+ * it is an array before looking at its length.
+ */
+interface PromptListResponse {
+  templates?: unknown;
+}
+/**
+ * Fetch the host discovery document from `/.well-known/openwop`.
+ *
+ * @returns the parsed document on a 200 response; `null` for any other
+ *   status or when the request (or reading its body) throws.
+ */
+async function readDiscovery(): Promise<DiscoveryDoc | null> {
+  try {
+    const response = await driver.get('/.well-known/openwop');
+    return response.status === 200 ? (response.json as DiscoveryDoc) : null;
+  } catch {
+    return null;
+  }
+}
+describe.skipIf(HTTP_SKIP)(
+  'prompt-read-workspace-membership-enforced: workspace-scoped reads MUST NOT leak templates from another workspace (RFC 0028 Tier-2)',
+  () => {
+    it('GET /v1/prompts?workspaceId=<non-member> MUST refuse OR return empty templates[] — never another workspace\'s content', async (ctx) => {
+      // Response-interpretation skips are documented as "SKIP with a
+      // diagnostic log"; emit that log before skipping so a skipped run
+      // explains itself in the test output.
+      const skipWithNote = (reason: string): void => {
+        console.warn(`[prompt-read-workspace-membership-enforced] skipped: ${reason}`);
+        ctx.skip();
+      };
+      // Capability gate: no discovery document, or prompts not advertised.
+      const d = await readDiscovery();
+      if (d === null) {
+        ctx.skip();
+        return;
+      }
+      const promptsSupported = d.capabilities?.prompts?.supported;
+      if (promptsSupported !== true) {
+        ctx.skip();
+        return;
+      }
+      // Operator override when set, otherwise a fresh random workspace id.
+      const nonMemberWorkspaceId =
+        process.env.OPENWOP_TEST_NONMEMBER_WORKSPACE_ID ??
+        `openwop-conformance-nonmember-${randomUUID()}`;
+      const res = await driver.get(
+        `/v1/prompts?workspaceId=${encodeURIComponent(nonMemberWorkspaceId)}`,
+      );
+      // 4xx — refused. Acceptable shape for the membership-required failure
+      // (and any other refusal mode the host chooses: 401, 404 for
+      // existence-disclosure avoidance, etc).
+      if (res.status >= 400 && res.status < 500) {
+        // Canonical envelope on 403 per rest-endpoints.md §"Common error codes".
+        if (res.status === 403) {
+          const body = res.json as { error?: unknown } | null;
+          expect(
+            body?.error,
+            driver.describe(
+              'spec/v1/rest-endpoints.md §Common error codes — workspace_membership_required',
+              `403 refusal of a workspace-scoped read MUST carry error: "workspace_membership_required"; got error: ${JSON.stringify(body?.error)}`,
+            ),
+          ).toBe('workspace_membership_required');
+        }
+        return;
+      }
+      // 5xx — refused (infrastructure failure is acceptable; envelope shape
+      // unconstrained).
+      if (res.status >= 500) return;
+      // 2xx — must inspect the response body. The failure mode this
+      // invariant guards against is a 200 response that LEAKS templates
+      // from a workspace the principal isn't a member of.
+      if (res.status >= 200 && res.status < 300) {
+        // A null / non-object body, a body without `templates`, and a
+        // non-array `templates` all mean the same thing here: the response
+        // is not the documented list shape, so the host is out of scope.
+        const templates = (res.json as PromptListResponse | null)?.templates;
+        if (!Array.isArray(templates)) {
+          skipWithNote(
+            `status ${res.status} response carries no templates[] array; host does not appear to expose workspace-scoped reads`,
+          );
+          return;
+        }
+        // A random non-member workspaceId can never legitimately contain
+        // templates the caller is authorized to see. Any non-empty result
+        // is a cross-tenant data leak.
+        expect(
+          templates.length,
+          driver.describe(
+            'spec/v1/prompts.md §Workspace membership on workspace-scoped reads and writes',
+            `GET /v1/prompts?workspaceId=<random-non-member> MUST NOT return any templates; got ${templates.length} templates which is a cross-tenant data leak (the random workspaceId is freshly generated per probe and cannot legitimately contain authorized content)`,
+          ),
+        ).toBe(0);
+        return;
+      }
+      // Other status codes (1xx, 3xx) — not a clear signal either way.
+      skipWithNote(`inconclusive status ${res.status}`);
+    });
+  },
+);

package/src/scenarios/replay-divergence-at-refusal.test.ts CHANGED Viewed

@@ -27,9 +27,15 @@
  * mock provider returning a valid envelope on the original run and a
  * refusal on the replay (or vice-versa). Reference workflow-engine ships
  * a mock-AI provider (`OPENWOP_MULTI_AGENT_EXECUTION_MODEL=true`); the
- * Phase 4 wiring extends it to honor a "refusal on replay" mode. Until
- * that wiring lands, the assertion is surfaced as `it.todo` so test
- * reporters track the gap rather than reporting a vacuous PASS.
+ * Phase 4 wiring (landed 2026-05-23 via commits `1fce55a` + `bba3b4a`)
+ * extends it with `checkReplayDivergence()` in the executor catch-path
+ * + symmetric success-path detection of envelope-kind divergence; emits
+ * `replay.divergedAtRefusal` event and fails the run with
+ * `error.code: 'replay_diverged_at_refusal'` when source vs replay
+ * differ at the same nodeId. Behavioral coverage is now real: 3
+ * assertions PASS against workflow-engine when Phase 4 advertisement
+ * is enabled (cover both divergence directions: original=valid +
+ * replay=refusal AND original=refusal + replay=valid).
  *
  * @see RFCS/0041-multi-agent-replay-under-nondeterminism.md §B
  * @see spec/v1/replay.md §"Envelope-refusal recovery in replay (MAE-8 closure)"
@@ -113,6 +119,40 @@ describe.skipIf(HTTP_SKIP)('replay-divergence-at-refusal: advertisement shape (R
   });
 });
+/**
+ * Fields of the `GET /v1/runs/{runId}` response body that these scenarios
+ * read: the run `status` (polled until terminal) and the `error` envelope
+ * whose `code` is asserted on failed replays.
+ */
+interface RunSnapshot {
+  status?: string;
+  error?: { code?: string; message?: string };
+}
+/**
+ * One entry of the `events` array returned by
+ * `GET /v1/runs/{runId}/events`. The scenarios match on `type` and assert
+ * on fields inside `payload`.
+ */
+interface RunEventDoc {
+  type: string;
+  nodeId?: string;
+  sequence?: number;
+  payload?: Record<string, unknown>;
+}
+/**
+ * Poll `GET /v1/runs/{runId}` until the snapshot reports a terminal status
+ * (`completed`, `failed` or `cancelled`) and return that snapshot.
+ *
+ * Makes up to 50 reads, sleeping 100 ms after each non-terminal one. A
+ * missing (`null` / `undefined`) JSON body counts as "not terminal yet"
+ * instead of crashing the poll on a property access.
+ *
+ * @param runId - run to poll; URL-encoded before it is placed in the path.
+ * @returns the first snapshot observed with a terminal status.
+ * @throws Error when no terminal status is observed within the read budget.
+ */
+async function pollUntilTerminal(runId: string): Promise<RunSnapshot> {
+  for (let i = 0; i < 50; i++) {
+    const r = await driver.get(`/v1/runs/${encodeURIComponent(runId)}`);
+    const snap = r.json as RunSnapshot | null | undefined;
+    if (
+      snap != null &&
+      (snap.status === 'completed' || snap.status === 'failed' || snap.status === 'cancelled')
+    ) {
+      return snap;
+    }
+    await new Promise((resolve) => setTimeout(resolve, 100));
+  }
+  throw new Error(`run ${runId} did not reach terminal within 5s`);
+}
+/**
+ * Read the event log of a run from `GET /v1/runs/{runId}/events`.
+ *
+ * @param runId - run whose events are read; URL-encoded before use.
+ * @returns the `events` array of the response body, or an empty array when
+ *   the body is missing or `events` is absent / not an array, so callers
+ *   can always use array methods on the result.
+ */
+async function readEvents(runId: string): Promise<RunEventDoc[]> {
+  const r = await driver.get(`/v1/runs/${encodeURIComponent(runId)}/events`);
+  const body = r.json as { events?: unknown } | null | undefined;
+  const events = body?.events;
+  return Array.isArray(events) ? (events as RunEventDoc[]) : [];
+}
+/**
+ * Program the mock-AI test seam for one node by POSTing `{ nodeId, program }`
+ * to `/v1/host/sample/test/mock-ai/program`.
+ *
+ * @param nodeId - node whose mock responses are being staged.
+ * @param program - response entries, forwarded as-is in the request body.
+ * @returns the HTTP status of the seam response.
+ */
+async function programMock(nodeId: string, program: Array<Record<string, unknown>>): Promise<number> {
+  const { status } = await driver.post('/v1/host/sample/test/mock-ai/program', { nodeId, program });
+  return status;
+}
 describe.skipIf(HTTP_SKIP)('replay-divergence-at-refusal: behavioral (RFC 0041 §B MAE-8)', () => {
   // Behavioral assertion drives a workflow whose mock-AI provider returns a
   // valid envelope on the original run + a refusal on the replay (or
@@ -127,8 +167,148 @@ describe.skipIf(HTTP_SKIP)('replay-divergence-at-refusal: behavioral (RFC 0041
   //        originalEnvelopeKind === 'valid' AND replayEnvelopeKind === 'refusal'.
   //   7. Assert NO silent substitution: the replay's continuation past the
   //      diverging node MUST NOT execute (run terminates at the divergence).
-  // Until the reference host wires the staged-refusal seam, surfaced as
-  // `todo` so test reporters track the gap.
-  it.todo('Phase 4 host MUST emit replay.divergedAtRefusal + fail with replay_diverged_at_refusal when original=valid + replay=refusal');
-  it.todo('Phase 4 host MUST emit replay.divergedAtRefusal + fail with replay_diverged_at_refusal when original=refusal + replay=valid (symmetric case)');
+  /**
+   * Capability gate shared by the behavioral tests: the test proceeds only
+   * when discovery advertises both `replayDeterminism.supported: true` and
+   * `replayDeterminism.refusalDivergenceEmission: true`.
+   *
+   * @param ctx - test context; `ctx.skip()` is called when the gate fails.
+   * @returns `true` when the test should run, `false` after skipping.
+   */
+  async function gateOnPhase4(ctx: { skip: () => void }): Promise<boolean> {
+    const discovery = await readDiscovery();
+    const determinism = discovery?.capabilities?.multiAgent?.executionModel?.replayDeterminism;
+    const advertised =
+      determinism?.supported === true && determinism?.refusalDivergenceEmission === true;
+    if (!advertised) {
+      ctx.skip();
+    }
+    return advertised;
+  }
+  it('Phase 4 host MUST emit replay.divergedAtRefusal + fail with replay_diverged_at_refusal when original=valid + replay=refusal', async (ctx) => {
+    if (!(await gateOnPhase4(ctx))) return;
+    const NODE_ID = 'structured-call';
+    // Original program: valid envelope. Replay program (set after the
+    // original completes): refusal. Programming twice is the spec-canonical
+    // pattern — see spec/v1/host-sample-test-seams.md §5.
+    const validEnv = '{"valid":true}';
+    const programStatus = await programMock(NODE_ID, [
+      { content: validEnv, stopReason: 'end_turn' as const },
+    ]);
+    if (programStatus === 404) {
+      ctx.skip(); // mock-AI program seam not exposed — soft-skip
+      return;
+    }
+    expect(programStatus).toBe(200);
+    const createRes = await driver.post('/v1/runs', {
+      workflowId: 'conformance-phase4-replay-divergence',
+    });
+    if (createRes.status === 404 || createRes.status === 422) {
+      ctx.skip(); // fixture not advertised
+      return;
+    }
+    expect(createRes.status).toBe(201);
+    const sourceRunId = (createRes.json as { runId: string }).runId;
+    const sourceTerminal = await pollUntilTerminal(sourceRunId);
+    expect(sourceTerminal.status).toBe('completed');
+    // Stage refusal for the replay's mock-AI dispatch. The staging status is
+    // asserted so a failed staging is reported here rather than as a
+    // misleading "no divergence detected" failure further down.
+    const replayProgramStatus = await programMock(NODE_ID, [
+      { content: 'safety-refused-for-conformance', stopReason: 'safety' as const, refusalText: 'safety-refused-for-conformance' },
+    ]);
+    expect(
+      replayProgramStatus,
+      'staging the refusal program for the replay dispatch MUST succeed',
+    ).toBe(200);
+    const forkRes = await driver.post(`/v1/runs/${encodeURIComponent(sourceRunId)}:fork`, {
+      fromSeq: 0,
+      mode: 'replay',
+    });
+    expect(forkRes.status).toBe(201);
+    const replayRunId = (forkRes.json as { runId: string }).runId;
+    const replayTerminal = await pollUntilTerminal(replayRunId);
+    expect(
+      replayTerminal.status,
+      driver.describe(
+        'RFCS/0041-multi-agent-replay-under-nondeterminism.md §B + spec/v1/rest-endpoints.md §"Common error codes"',
+        'replay MUST terminate `failed` when refusal-divergence is detected (silent substitution is non-conformant)',
+      ),
+    ).toBe('failed');
+    expect(
+      replayTerminal.error?.code,
+      driver.describe(
+        'spec/v1/rest-endpoints.md §"Common error codes" — replay_diverged_at_refusal',
+        'error.code MUST be `replay_diverged_at_refusal` per the canonical catalog',
+      ),
+    ).toBe('replay_diverged_at_refusal');
+    const replayEvents = await readEvents(replayRunId);
+    // The requirement is "exactly one" divergence event, so collect all
+    // matches and assert on the count instead of only the first match.
+    const divergenceEvents = replayEvents.filter((e) => e.type === 'replay.divergedAtRefusal');
+    expect(
+      divergenceEvents,
+      driver.describe(
+        'schemas/run-event-payloads.schema.json §replayDivergedAtRefusal',
+        'replay event log MUST contain exactly one `replay.divergedAtRefusal` event identifying the divergence',
+      ),
+    ).toHaveLength(1);
+    const divergenceEvent = divergenceEvents[0];
+    expect(divergenceEvent?.payload?.sourceRunId).toBe(sourceRunId);
+    expect(divergenceEvent?.payload?.nodeId).toBe(NODE_ID);
+    expect(
+      divergenceEvent?.payload?.originalEnvelopeKind,
+      driver.describe(
+        'schemas/run-event-payloads.schema.json §replayDivergedAtRefusal.originalEnvelopeKind',
+        'originalEnvelopeKind MUST be `valid` (source run completed normally)',
+      ),
+    ).toBe('valid');
+    expect(
+      divergenceEvent?.payload?.replayEnvelopeKind,
+      driver.describe(
+        'schemas/run-event-payloads.schema.json §replayDivergedAtRefusal.replayEnvelopeKind',
+        'replayEnvelopeKind MUST be `refusal` (replay hit the refusal entry of the mock program)',
+      ),
+    ).toBe('refusal');
+  });
+  it('Phase 4 host MUST emit replay.divergedAtRefusal + fail with replay_diverged_at_refusal when original=refusal + replay=valid (symmetric case)', async (ctx) => {
+    if (!(await gateOnPhase4(ctx))) return;
+    const NODE_ID = 'structured-call';
+    // Symmetric: original=refusal, replay=valid.
+    const programStatus = await programMock(NODE_ID, [
+      { content: 'safety-refused-for-conformance', stopReason: 'safety' as const, refusalText: 'safety-refused-for-conformance' },
+    ]);
+    if (programStatus === 404) {
+      ctx.skip(); // mock-AI program seam not exposed — soft-skip
+      return;
+    }
+    expect(programStatus).toBe(200);
+    const createRes = await driver.post('/v1/runs', {
+      workflowId: 'conformance-phase4-replay-divergence',
+    });
+    if (createRes.status === 404 || createRes.status === 422) {
+      ctx.skip(); // fixture not advertised
+      return;
+    }
+    expect(createRes.status).toBe(201);
+    const sourceRunId = (createRes.json as { runId: string }).runId;
+    const sourceTerminal = await pollUntilTerminal(sourceRunId);
+    // Source run fails because the LLM refused.
+    expect(sourceTerminal.status).toBe('failed');
+    // Stage valid envelope for the replay's mock-AI dispatch. The staging
+    // status is asserted: if staging silently failed, the replay would
+    // still see the refusal program and the later failure would point at
+    // the host's divergence detection instead of the seam.
+    const replayProgramStatus = await programMock(NODE_ID, [
+      { content: '{"valid":true}', stopReason: 'end_turn' as const },
+    ]);
+    expect(
+      replayProgramStatus,
+      'staging the valid-envelope program for the replay dispatch MUST succeed',
+    ).toBe(200);
+    const forkRes = await driver.post(`/v1/runs/${encodeURIComponent(sourceRunId)}:fork`, {
+      fromSeq: 0,
+      mode: 'replay',
+    });
+    expect(forkRes.status).toBe(201);
+    const replayRunId = (forkRes.json as { runId: string }).runId;
+    const replayTerminal = await pollUntilTerminal(replayRunId);
+    expect(replayTerminal.status).toBe('failed');
+    expect(replayTerminal.error?.code).toBe('replay_diverged_at_refusal');
+    const replayEvents = await readEvents(replayRunId);
+    const divergenceEvent = replayEvents.find((e) => e.type === 'replay.divergedAtRefusal');
+    expect(divergenceEvent).toBeDefined();
+    expect(divergenceEvent?.payload?.originalEnvelopeKind).toBe('refusal');
+    expect(divergenceEvent?.payload?.replayEnvelopeKind).toBe('valid');
+  });
 });

package/src/scenarios/replay-observable-sequence-determinism.test.ts CHANGED Viewed

@@ -5,7 +5,7 @@
  * `capabilities.multiAgent.executionModel.version >= 4` AND
  * `capabilities.multiAgent.executionModel.replayDeterminism.supported: true`.
  *
- * Asserts (behavioral, when a Phase 4 host advertises the contract):
+ * Asserts (behavioral, when a host advertises `version: 4` + the contract):
  *
  *   1. A `mode: replay` fork from event-log index `fromSeq` produces an
  *      event-log prefix `[0, fromSeq]` that is byte-equivalent to the
@@ -26,14 +26,14 @@
  * Driving the assertion requires a workflow fixture whose tool call is
  * pure-nondeterministic (different bytes on each call) but whose
  * observable result is what gets cached. Reference workflow-engine ships
- * `core.noop` + deterministic fixtures; Phase 4 wiring needs a
+ * `core.noop` + deterministic fixtures; the `version: 4` wiring needs a
  * nondeterministic-tool fixture (e.g., `conformance-phase4-nondet-tool`).
  * The fixture now ships with the suite, but the cross-boundary assertion
  * stays `it.skip` (see the notes on the skipped tests below) so test
  * reporters still track the gap.
  *
  * @see RFCS/0041-multi-agent-replay-under-nondeterminism.md §C
  * @see spec/v1/replay.md §"Observable-output-sequence determinism vs bit-equivalent execution (MAE-9 closure)"
- * @see spec/v1/multi-agent-execution.md §"Phase 4 replay determinism"
+ * @see spec/v1/multi-agent-execution.md §"Replay determinism under nondeterminism (RFC 0041)"
  */
 import { describe, it } from 'vitest';
@@ -62,10 +62,19 @@ describe('replay-observable-sequence-determinism: prefix byte-equivalence (RFC 0
   //   6. Read original + replay RunSnapshot at index N; assert
   //      variables + channels + status byte-equivalent.
   // Surfaced as `todo` until the `conformance-phase4-nondet-tool`
-  // fixture ships in the suite — consistent with the sibling Phase 4
+  // fixture ships in the suite — consistent with the sibling RFC 0041
   // scenarios (`replay-divergence-at-refusal.test.ts`,
   // `replay-llm-cache-key-portable.test.ts`).
-  it.todo('original and replay event-log prefixes [0, fromSeq] MUST be byte-equivalent (modulo per-region clock + ULID-T entropy)');
+  // Marked out of stable profile via RFC 0042 §B (experimental tier):
+  // RFC 0041 §C remains Active, so its wire shape MAY shift compatibly
+  // within v1.x. Hosts that wire this assertion before RFC 0041 graduates
+  // to Accepted SHOULD advertise `multiAgent.executionModel.tier:
+  // 'experimental'` + `experimentalUntil` per RFC 0042 §A. Path-to-runnable
+  // requires: (a) host pure-replay observable-cache emission via the
+  // `:fork mode: replay` re-dispatch path and (b) the test seam endpoint
+  // contract for cache-hit-vs-fresh-call distinction (see
+  // `spec/v1/host-sample-test-seams.md` for the established seam pattern).
+  it.skip('original and replay event-log prefixes [0, fromSeq] MUST be byte-equivalent (modulo per-region clock + ULID-T entropy) — out of stable profile via RFC 0042');
 });
 describe('replay-observable-sequence-determinism: observable-result caching (RFC 0041 §C)', () => {
@@ -76,5 +85,10 @@ describe('replay-observable-sequence-determinism: observable-result caching (RFC
   // this a valid determinism contract — bit-equivalent execution would
   // require unbounded caching (rejected per RFC 0041 §"Alternatives
   // considered" #2).
-  it.todo('replay of a workflow containing a nondeterministic tool call reproduces the original observable result, NOT a fresh call');
+  // Marked out of stable profile via RFC 0042 §B (experimental tier):
+  // see the prefix-byte-equivalence comment above for the same routing.
+  // This is RFC 0041 §C's load-bearing assertion; it lands as a runnable
+  // `it()` when RFC 0041 graduates to Accepted on first non-steward host
+  // adoption.
+  it.skip('replay of a workflow containing a nondeterministic tool call reproduces the original observable result, NOT a fresh call — out of stable profile via RFC 0042');
 });