@openwop/openwop-conformance 1.4.0 → 1.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61) hide show
  1. package/CHANGELOG.md +60 -0
  2. package/README.md +2 -2
  3. package/api/asyncapi.yaml +8 -3
  4. package/api/openapi.yaml +305 -0
  5. package/coverage.md +35 -10
  6. package/fixtures/conformance-phase4-nondet-tool.json +53 -0
  7. package/fixtures/conformance-phase4-replay-divergence.json +40 -0
  8. package/fixtures.md +5 -3
  9. package/package.json +1 -1
  10. package/schemas/README.md +2 -0
  11. package/schemas/capabilities.schema.json +176 -3
  12. package/schemas/credential-reference.schema.json +21 -0
  13. package/schemas/node-pack-manifest.schema.json +112 -1
  14. package/schemas/run-diff-response.schema.json +64 -0
  15. package/schemas/run-event-payloads.schema.json +104 -2
  16. package/schemas/run-event.schema.json +8 -1
  17. package/schemas/run-snapshot.schema.json +11 -0
  18. package/src/lib/behavior-gate.ts +51 -0
  19. package/src/lib/driver.ts +13 -1
  20. package/src/lib/saml-idp.ts +179 -0
  21. package/src/scenarios/approval-gate-events.test.ts +61 -0
  22. package/src/scenarios/approval-gate-flow.test.ts +68 -0
  23. package/src/scenarios/auth-saml-profile.test.ts +119 -0
  24. package/src/scenarios/auth-scim-profile.test.ts +65 -0
  25. package/src/scenarios/authorization-fail-closed.test.ts +80 -0
  26. package/src/scenarios/authorization-roles-shape.test.ts +83 -0
  27. package/src/scenarios/connector-manifest-validity.test.ts +142 -0
  28. package/src/scenarios/credential-payload-redaction.test.ts +93 -0
  29. package/src/scenarios/credentials-capability-shape.test.ts +90 -0
  30. package/src/scenarios/cross-engine-append-behavior.test.ts +204 -0
  31. package/src/scenarios/cross-host-traceparent-propagation.test.ts +13 -6
  32. package/src/scenarios/cross-workspace-isolation.test.ts +72 -0
  33. package/src/scenarios/deadletter-capability-shape.test.ts +59 -0
  34. package/src/scenarios/deadletter-retry-exhaustion.test.ts +62 -0
  35. package/src/scenarios/experimental-tier-shape.test.ts +192 -0
  36. package/src/scenarios/identity-owner-shape.test.ts +64 -0
  37. package/src/scenarios/multi-agent-confidence-escalation.test.ts +59 -21
  38. package/src/scenarios/multi-agent-memory-lifecycle.test.ts +87 -12
  39. package/src/scenarios/multi-region-idempotency-behavior.test.ts +203 -0
  40. package/src/scenarios/oauth-capability-shape.test.ts +97 -0
  41. package/src/scenarios/oauth-connector-redaction.test.ts +91 -0
  42. package/src/scenarios/pack-registry-isolation.test.ts +108 -0
  43. package/src/scenarios/pack-registry-publish.test.ts +1 -1
  44. package/src/scenarios/prompt-mutation-workspace-membership-enforced.test.ts +126 -0
  45. package/src/scenarios/prompt-read-workspace-membership-enforced.test.ts +183 -0
  46. package/src/scenarios/replay-divergence-at-refusal.test.ts +187 -7
  47. package/src/scenarios/replay-observable-sequence-determinism.test.ts +20 -6
  48. package/src/scenarios/run-diff.test.ts +143 -0
  49. package/src/scenarios/sandbox-capability-gate-respected.test.ts +15 -13
  50. package/src/scenarios/sandbox-memory-cap.test.ts +7 -8
  51. package/src/scenarios/sandbox-mvp-behavior.test.ts +280 -0
  52. package/src/scenarios/sandbox-no-cross-pack-mutation.test.ts +14 -13
  53. package/src/scenarios/sandbox-no-host-env-leak.test.ts +14 -21
  54. package/src/scenarios/sandbox-no-host-fs-escape.test.ts +20 -15
  55. package/src/scenarios/sandbox-no-host-process-escape.test.ts +18 -13
  56. package/src/scenarios/sandbox-no-network-escape.test.ts +14 -31
  57. package/src/scenarios/sandbox-timeout-cap.test.ts +7 -8
  58. package/src/scenarios/scheduling-capability-shape.test.ts +81 -0
  59. package/src/scenarios/scheduling-cron-fires-once.test.ts +66 -0
  60. package/src/scenarios/secret-leakage-otel-attribute.test.ts +241 -0
  61. package/src/scenarios/spec-corpus-validity.test.ts +2 -2
@@ -0,0 +1,183 @@
1
+ /**
2
+ * prompt-read-workspace-membership-enforced — RFC 0028 Tier-2 §"Workspace
3
+ * membership on workspace-scoped reads and writes" verification (READ path).
4
+ *
5
+ * Status: ACTIVE (capability-gated; behavioral when the host advertises
6
+ * `capabilities.prompts.supported: true` AND accepts `?workspaceId=` on
7
+ * `GET /v1/prompts`). Hosts that don't expose workspace-scoped reads
8
+ * (host-only template libraries with no workspace dimension) self-skip
9
+ * via response-shape detection.
10
+ *
11
+ * The contract (spec/v1/prompts.md §"Discovery & distribution" §"REST
12
+ * endpoints" §"Workspace membership on workspace-scoped reads and writes"):
13
+ *
14
+ * Read paths are NOT exempt from the workspace-membership invariant
15
+ * just because they don't write. A GET /v1/prompts?workspaceId=<not-mine>
16
+ * that returns another workspace's templates is a cross-tenant data leak
17
+ * with the same blast radius as a cross-tenant write. Hosts MUST verify
18
+ * the authenticated principal's workspace membership BEFORE returning
19
+ * workspace-scoped content.
20
+ *
21
+ * Gate per MyndHyve relay 2026-05-25 ("Option B"): probe ALL hosts that
22
+ * advertise `capabilities.prompts.supported: true` regardless of
23
+ * `mutableLibrary`; read-only hosts that expose `?workspaceId=` reads are
24
+ * NOT exempt from the symmetric authz invariant. Hosts that don't expose
25
+ * workspace-scoped reads at all self-skip via the response interpretation
26
+ * below (the suite avoids inventing a new capability field just for this
27
+ * gating concern).
28
+ *
29
+ * The probe drives `GET /v1/prompts?workspaceId=<random-uuid>` and
30
+ * interprets the response:
31
+ *
32
+ * - 4xx (any code) — PASS (refused). If 403 specifically, additionally
33
+ * pin `error === "workspace_membership_required"` per the canonical
34
+ * envelope in rest-endpoints.md §"Common error codes".
35
+ * - 200 with `templates: []` — PASS. The host correctly returned no
36
+ * content for a workspace the principal isn't a member of. A random
37
+ * UUID workspace also definitionally has no real content, so an empty
38
+ * result is the correct null answer.
39
+ * - 200 with `templates: [non-empty]` — FAIL. The host returned content
40
+ * for an unauthorized workspace. This is the cross-tenant data leak
41
+ * failure mode. (Note: this scenario uses a random workspaceId so any
42
+ * non-empty result is a leak — there can't legitimately be templates
43
+ * in a freshly-generated nonexistent workspace.)
44
+ * - 200 without a `templates[]` field, or a response shape that doesn't
45
+ * resemble the documented `/v1/prompts` list shape — SKIP with a
46
+ * diagnostic log. Indicates the host doesn't recognize `?workspaceId=`
47
+ * on this endpoint (e.g., host-only template library with no
48
+ * workspace dimension).
49
+ * - 5xx — PASS (refused; envelope shape unconstrained).
50
+ *
51
+ * Why a random workspaceId is sufficient: the assertion is negative-space.
52
+ * A host that correctly enforces membership MUST refuse for ANY workspace
53
+ * the principal isn't a member of, and a random UUID has astronomically-low
54
+ * collision probability with any real workspace membership grant. A host
55
+ * that returns templates from a random UUID workspace is leaking content
56
+ * from somewhere (host-built-in misclassified as workspace, or a silent
57
+ * fall-through to another workspace's content, or a query bug returning
58
+ * everything).
59
+ *
60
+ * @see RFCS/0028-prompt-library-endpoints.md §"Post-promotion notes"
61
+ * @see spec/v1/prompts.md §"Security invariants" §prompt-read-workspace-membership-enforced
62
+ * @see spec/v1/rest-endpoints.md §"Common error codes" §workspace_membership_required
63
+ * @see spec/v1/auth.md §"Identity claims — tenant · workspace · principal"
64
+ * @see RFCS/0048-tenant-workspace-principal-identity-model.md §D
65
+ */
66
+
67
+ import { describe, it, expect } from 'vitest';
68
+ import { randomUUID } from 'node:crypto';
69
+ import { driver } from '../lib/driver.js';
70
+
71
+ const HTTP_SKIP = !process.env.OPENWOP_BASE_URL;
72
+
73
+ interface DiscoveryDoc {
74
+ capabilities?: {
75
+ prompts?: {
76
+ supported?: unknown;
77
+ };
78
+ };
79
+ }
80
+
81
+ interface PromptListResponse {
82
+ templates?: unknown;
83
+ }
84
+
85
+ async function readDiscovery(): Promise<DiscoveryDoc | null> {
86
+ try {
87
+ const res = await driver.get('/.well-known/openwop');
88
+ if (res.status !== 200) return null;
89
+ return res.json as DiscoveryDoc;
90
+ } catch {
91
+ return null;
92
+ }
93
+ }
94
+
95
+ describe.skipIf(HTTP_SKIP)(
96
+ 'prompt-read-workspace-membership-enforced: workspace-scoped reads MUST NOT leak templates from another workspace (RFC 0028 Tier-2)',
97
+ () => {
98
+ it('GET /v1/prompts?workspaceId=<non-member> MUST refuse OR return empty templates[] — never another workspace\'s content', async (ctx) => {
99
+ const d = await readDiscovery();
100
+ if (d === null) {
101
+ ctx.skip();
102
+ return;
103
+ }
104
+ const promptsSupported = d.capabilities?.prompts?.supported;
105
+ if (promptsSupported !== true) {
106
+ ctx.skip();
107
+ return;
108
+ }
109
+
110
+ const nonMemberWorkspaceId =
111
+ process.env.OPENWOP_TEST_NONMEMBER_WORKSPACE_ID ??
112
+ `openwop-conformance-nonmember-${randomUUID()}`;
113
+
114
+ const res = await driver.get(
115
+ `/v1/prompts?workspaceId=${encodeURIComponent(nonMemberWorkspaceId)}`,
116
+ );
117
+
118
+ // 4xx — refused. Acceptable shape for the membership-required failure
119
+ // (and any other refusal mode the host chooses: 401, 404 for
120
+ // existence-disclosure avoidance, etc).
121
+ if (res.status >= 400 && res.status < 500) {
122
+ // Canonical envelope on 403 per rest-endpoints.md §"Common error codes".
123
+ if (res.status === 403) {
124
+ const body = res.json as { error?: unknown } | null;
125
+ expect(
126
+ body?.error,
127
+ driver.describe(
128
+ 'spec/v1/rest-endpoints.md §Common error codes — workspace_membership_required',
129
+ `403 refusal of a workspace-scoped read MUST carry error: "workspace_membership_required"; got error: ${JSON.stringify(body?.error)}`,
130
+ ),
131
+ ).toBe('workspace_membership_required');
132
+ }
133
+ return;
134
+ }
135
+
136
+ // 5xx — refused (infrastructure failure is acceptable; envelope shape
137
+ // unconstrained).
138
+ if (res.status >= 500) return;
139
+
140
+ // 2xx — must inspect the response body. The failure mode this
141
+ // invariant guards against is a 200 response that LEAKS templates
142
+ // from a workspace the principal isn't a member of.
143
+ if (res.status >= 200 && res.status < 300) {
144
+ const body = res.json as PromptListResponse | null;
145
+ if (
146
+ body === null ||
147
+ typeof body !== 'object' ||
148
+ !('templates' in body)
149
+ ) {
150
+ // Host doesn't recognize `?workspaceId=` on this endpoint
151
+ // (response shape doesn't include the documented `templates[]`
152
+ // field). Soft-skip: this scenario probes hosts that expose
153
+ // workspace-scoped reads, and a host without that surface is
154
+ // simply out of scope.
155
+ ctx.skip();
156
+ return;
157
+ }
158
+ const templates = body.templates;
159
+ if (!Array.isArray(templates)) {
160
+ // Same: unrecognized shape, skip.
161
+ ctx.skip();
162
+ return;
163
+ }
164
+
165
+ // A random non-member workspaceId can never legitimately contain
166
+ // templates the caller is authorized to see. Any non-empty result
167
+ // is a cross-tenant data leak.
168
+ expect(
169
+ templates.length,
170
+ driver.describe(
171
+ 'spec/v1/prompts.md §Workspace membership on workspace-scoped reads and writes',
172
+ `GET /v1/prompts?workspaceId=<random-non-member> MUST NOT return any templates; got ${templates.length} templates which is a cross-tenant data leak (the random workspaceId is freshly generated per probe and cannot legitimately contain authorized content)`,
173
+ ),
174
+ ).toBe(0);
175
+ return;
176
+ }
177
+
178
+ // Other status codes (1xx, 3xx) — soft-skip with note. Not a clear
179
+ // signal either way.
180
+ ctx.skip();
181
+ });
182
+ },
183
+ );
@@ -27,9 +27,15 @@
27
27
  * mock provider returning a valid envelope on the original run and a
28
28
  * refusal on the replay (or vice-versa). Reference workflow-engine ships
29
29
  * a mock-AI provider (`OPENWOP_MULTI_AGENT_EXECUTION_MODEL=true`); the
30
- * Phase 4 wiring extends it to honor a "refusal on replay" mode. Until
31
- * that wiring lands, the assertion is surfaced as `it.todo` so test
32
- * reporters track the gap rather than reporting a vacuous PASS.
30
+ * Phase 4 wiring (landed 2026-05-23 via commits `1fce55a` + `bba3b4a`)
31
+ * extends it with `checkReplayDivergence()` in the executor catch-path
32
+ * + symmetric success-path detection of envelope-kind divergence; emits
33
+ * `replay.divergedAtRefusal` event and fails the run with
34
+ * `error.code: 'replay_diverged_at_refusal'` when source vs replay
35
+ * differ at the same nodeId. Behavioral coverage is now real: 3
36
+ * assertions PASS against workflow-engine when Phase 4 advertisement
37
+ * is enabled (cover both divergence directions: original=valid +
38
+ * replay=refusal AND original=refusal + replay=valid).
33
39
  *
34
40
  * @see RFCS/0041-multi-agent-replay-under-nondeterminism.md §B
35
41
  * @see spec/v1/replay.md §"Envelope-refusal recovery in replay (MAE-8 closure)"
@@ -113,6 +119,40 @@ describe.skipIf(HTTP_SKIP)('replay-divergence-at-refusal: advertisement shape (R
113
119
  });
114
120
  });
115
121
 
122
+ interface RunSnapshot {
123
+ status?: string;
124
+ error?: { code?: string; message?: string };
125
+ }
126
+ interface RunEventDoc {
127
+ type: string;
128
+ nodeId?: string;
129
+ sequence?: number;
130
+ payload?: Record<string, unknown>;
131
+ }
132
+
133
+ async function pollUntilTerminal(runId: string): Promise<RunSnapshot> {
134
+ for (let i = 0; i < 50; i++) {
135
+ const r = await driver.get(`/v1/runs/${encodeURIComponent(runId)}`);
136
+ const snap = r.json as RunSnapshot;
137
+ if (snap.status === 'completed' || snap.status === 'failed' || snap.status === 'cancelled') {
138
+ return snap;
139
+ }
140
+ await new Promise((resolve) => setTimeout(resolve, 100));
141
+ }
142
+ throw new Error(`run ${runId} did not reach terminal within 5s`);
143
+ }
144
+
145
+ async function readEvents(runId: string): Promise<RunEventDoc[]> {
146
+ const r = await driver.get(`/v1/runs/${encodeURIComponent(runId)}/events`);
147
+ const body = r.json as { events?: RunEventDoc[] };
148
+ return body.events ?? [];
149
+ }
150
+
151
+ async function programMock(nodeId: string, program: Array<Record<string, unknown>>): Promise<number> {
152
+ const r = await driver.post('/v1/host/sample/test/mock-ai/program', { nodeId, program });
153
+ return r.status;
154
+ }
155
+
116
156
  describe.skipIf(HTTP_SKIP)('replay-divergence-at-refusal: behavioral (RFC 0041 §B MAE-8)', () => {
117
157
  // Behavioral assertion drives a workflow whose mock-AI provider returns a
118
158
  // valid envelope on the original run + a refusal on the replay (or
@@ -127,8 +167,148 @@ describe.skipIf(HTTP_SKIP)('replay-divergence-at-refusal: behavioral (RFC 0041
127
167
  // originalEnvelopeKind === 'valid' AND replayEnvelopeKind === 'refusal'.
128
168
  // 7. Assert NO silent substitution: the replay's continuation past the
129
169
  // diverging node MUST NOT execute (run terminates at the divergence).
130
- // Until the reference host wires the staged-refusal seam, surfaced as
131
- // `todo` so test reporters track the gap.
132
- it.todo('Phase 4 host MUST emit replay.divergedAtRefusal + fail with replay_diverged_at_refusal when original=valid + replay=refusal');
133
- it.todo('Phase 4 host MUST emit replay.divergedAtRefusal + fail with replay_diverged_at_refusal when original=refusal + replay=valid (symmetric case)');
170
+
171
+ async function gateOnPhase4(ctx: { skip: () => void }): Promise<boolean> {
172
+ const d = await readDiscovery();
173
+ const rd = d?.capabilities?.multiAgent?.executionModel?.replayDeterminism;
174
+ if (rd?.supported !== true || rd?.refusalDivergenceEmission !== true) {
175
+ ctx.skip();
176
+ return false;
177
+ }
178
+ return true;
179
+ }
180
+
181
+ it('Phase 4 host MUST emit replay.divergedAtRefusal + fail with replay_diverged_at_refusal when original=valid + replay=refusal', async (ctx) => {
182
+ if (!(await gateOnPhase4(ctx))) return;
183
+
184
+ const NODE_ID = 'structured-call';
185
+ // Original program: valid envelope. Replay program (set after the
186
+ // original completes): refusal. Programming twice is the spec-canonical
187
+ // pattern — see spec/v1/host-sample-test-seams.md §5.
188
+ const validEnv = '{"valid":true}';
189
+ const programStatus = await programMock(NODE_ID, [
190
+ { content: validEnv, stopReason: 'end_turn' as const },
191
+ ]);
192
+ if (programStatus === 404) {
193
+ ctx.skip(); // mock-AI program seam not exposed — soft-skip
194
+ return;
195
+ }
196
+ expect(programStatus).toBe(200);
197
+
198
+ const createRes = await driver.post('/v1/runs', {
199
+ workflowId: 'conformance-phase4-replay-divergence',
200
+ });
201
+ if (createRes.status === 404 || createRes.status === 422) {
202
+ ctx.skip(); // fixture not advertised
203
+ return;
204
+ }
205
+ expect(createRes.status).toBe(201);
206
+ const sourceRunId = (createRes.json as { runId: string }).runId;
207
+ const sourceTerminal = await pollUntilTerminal(sourceRunId);
208
+ expect(sourceTerminal.status).toBe('completed');
209
+
210
+ // Stage refusal for the replay's mock-AI dispatch.
211
+ await programMock(NODE_ID, [
212
+ { content: 'safety-refused-for-conformance', stopReason: 'safety' as const, refusalText: 'safety-refused-for-conformance' },
213
+ ]);
214
+
215
+ const forkRes = await driver.post(`/v1/runs/${encodeURIComponent(sourceRunId)}:fork`, {
216
+ fromSeq: 0,
217
+ mode: 'replay',
218
+ });
219
+ expect(forkRes.status).toBe(201);
220
+ const replayRunId = (forkRes.json as { runId: string }).runId;
221
+ const replayTerminal = await pollUntilTerminal(replayRunId);
222
+
223
+ expect(
224
+ replayTerminal.status,
225
+ driver.describe(
226
+ 'RFCS/0041-multi-agent-replay-under-nondeterminism.md §B + spec/v1/rest-endpoints.md §"Common error codes"',
227
+ 'replay MUST terminate `failed` when refusal-divergence is detected (silent substitution is non-conformant)',
228
+ ),
229
+ ).toBe('failed');
230
+ expect(
231
+ replayTerminal.error?.code,
232
+ driver.describe(
233
+ 'spec/v1/rest-endpoints.md §"Common error codes" — replay_diverged_at_refusal',
234
+ 'error.code MUST be `replay_diverged_at_refusal` per the canonical catalog',
235
+ ),
236
+ ).toBe('replay_diverged_at_refusal');
237
+
238
+ const replayEvents = await readEvents(replayRunId);
239
+ const divergenceEvent = replayEvents.find((e) => e.type === 'replay.divergedAtRefusal');
240
+ expect(
241
+ divergenceEvent,
242
+ driver.describe(
243
+ 'schemas/run-event-payloads.schema.json §replayDivergedAtRefusal',
244
+ 'replay event log MUST contain exactly one `replay.divergedAtRefusal` event identifying the divergence',
245
+ ),
246
+ ).toBeDefined();
247
+ expect(divergenceEvent?.payload?.sourceRunId).toBe(sourceRunId);
248
+ expect(divergenceEvent?.payload?.nodeId).toBe(NODE_ID);
249
+ expect(
250
+ divergenceEvent?.payload?.originalEnvelopeKind,
251
+ driver.describe(
252
+ 'schemas/run-event-payloads.schema.json §replayDivergedAtRefusal.originalEnvelopeKind',
253
+ 'originalEnvelopeKind MUST be `valid` (source run completed normally)',
254
+ ),
255
+ ).toBe('valid');
256
+ expect(
257
+ divergenceEvent?.payload?.replayEnvelopeKind,
258
+ driver.describe(
259
+ 'schemas/run-event-payloads.schema.json §replayDivergedAtRefusal.replayEnvelopeKind',
260
+ 'replayEnvelopeKind MUST be `refusal` (replay hit the refusal entry of the mock program)',
261
+ ),
262
+ ).toBe('refusal');
263
+ });
264
+
265
+ it('Phase 4 host MUST emit replay.divergedAtRefusal + fail with replay_diverged_at_refusal when original=refusal + replay=valid (symmetric case)', async (ctx) => {
266
+ if (!(await gateOnPhase4(ctx))) return;
267
+
268
+ const NODE_ID = 'structured-call';
269
+ // Symmetric: original=refusal, replay=valid.
270
+ const programStatus = await programMock(NODE_ID, [
271
+ { content: 'safety-refused-for-conformance', stopReason: 'safety' as const, refusalText: 'safety-refused-for-conformance' },
272
+ ]);
273
+ if (programStatus === 404) {
274
+ ctx.skip();
275
+ return;
276
+ }
277
+ expect(programStatus).toBe(200);
278
+
279
+ const createRes = await driver.post('/v1/runs', {
280
+ workflowId: 'conformance-phase4-replay-divergence',
281
+ });
282
+ if (createRes.status === 404 || createRes.status === 422) {
283
+ ctx.skip();
284
+ return;
285
+ }
286
+ expect(createRes.status).toBe(201);
287
+ const sourceRunId = (createRes.json as { runId: string }).runId;
288
+ const sourceTerminal = await pollUntilTerminal(sourceRunId);
289
+ // Source run fails because the LLM refused.
290
+ expect(sourceTerminal.status).toBe('failed');
291
+
292
+ // Stage valid envelope for the replay's mock-AI dispatch.
293
+ await programMock(NODE_ID, [
294
+ { content: '{"valid":true}', stopReason: 'end_turn' as const },
295
+ ]);
296
+
297
+ const forkRes = await driver.post(`/v1/runs/${encodeURIComponent(sourceRunId)}:fork`, {
298
+ fromSeq: 0,
299
+ mode: 'replay',
300
+ });
301
+ expect(forkRes.status).toBe(201);
302
+ const replayRunId = (forkRes.json as { runId: string }).runId;
303
+ const replayTerminal = await pollUntilTerminal(replayRunId);
304
+
305
+ expect(replayTerminal.status).toBe('failed');
306
+ expect(replayTerminal.error?.code).toBe('replay_diverged_at_refusal');
307
+
308
+ const replayEvents = await readEvents(replayRunId);
309
+ const divergenceEvent = replayEvents.find((e) => e.type === 'replay.divergedAtRefusal');
310
+ expect(divergenceEvent).toBeDefined();
311
+ expect(divergenceEvent?.payload?.originalEnvelopeKind).toBe('refusal');
312
+ expect(divergenceEvent?.payload?.replayEnvelopeKind).toBe('valid');
313
+ });
134
314
  });
@@ -5,7 +5,7 @@
5
5
  * `capabilities.multiAgent.executionModel.version >= 4` AND
6
6
  * `capabilities.multiAgent.executionModel.replayDeterminism.supported: true`.
7
7
  *
8
- * Asserts (behavioral, when a Phase 4 host advertises the contract):
8
+ * Asserts (behavioral, when a host advertises `version: 4` + the contract):
9
9
  *
10
10
  * 1. A `mode: replay` fork from event-log index `fromSeq` produces an
11
11
  * event-log prefix `[0, fromSeq]` that is byte-equivalent to the
@@ -26,14 +26,14 @@
26
26
  * Driving the assertion requires a workflow fixture whose tool call is
27
27
  * pure-nondeterministic (different bytes on each call) but whose
28
28
  * observable result is what gets cached. Reference workflow-engine ships
29
- * `core.noop` + deterministic fixtures; Phase 4 wiring needs a
29
+ * `core.noop` + deterministic fixtures; the `version: 4` wiring needs a
30
30
  * nondeterministic-tool fixture (e.g., `conformance-phase4-nondet-tool`).
31
31
  * Until that lands, the cross-boundary assertion is surfaced as `it.todo`
32
32
  * so test reporters track the gap.
33
33
  *
34
34
  * @see RFCS/0041-multi-agent-replay-under-nondeterminism.md §C
35
35
  * @see spec/v1/replay.md §"Observable-output-sequence determinism vs bit-equivalent execution (MAE-9 closure)"
36
- * @see spec/v1/multi-agent-execution.md §"Phase 4 replay determinism"
36
+ * @see spec/v1/multi-agent-execution.md §"Replay determinism under nondeterminism (RFC 0041)"
37
37
  */
38
38
 
39
39
  import { describe, it } from 'vitest';
@@ -62,10 +62,19 @@ describe('replay-observable-sequence-determinism: prefix byte-equivalence (RFC 0
62
62
  // 6. Read original + replay RunSnapshot at index N; assert
63
63
  // variables + channels + status byte-equivalent.
64
64
  // Surfaced as `todo` until the `conformance-phase4-nondet-tool`
65
- // fixture ships in the suite — consistent with the sibling Phase 4
65
+ // fixture ships in the suite — consistent with the sibling RFC 0041
66
66
  // scenarios (`replay-divergence-at-refusal.test.ts`,
67
67
  // `replay-llm-cache-key-portable.test.ts`).
68
- it.todo('original and replay event-log prefixes [0, fromSeq] MUST be byte-equivalent (modulo per-region clock + ULID-T entropy)');
68
+ // Marked out of stable profile via RFC 0042 §B (experimental tier):
69
+ // RFC 0041 §C remains Active, so its wire shape MAY shift compatibly
70
+ // within v1.x. Hosts that wire this assertion before RFC 0041 graduates
71
+ // to Accepted SHOULD advertise `multiAgent.executionModel.tier:
72
+ // 'experimental'` + `experimentalUntil` per RFC 0042 §A. Path-to-runnable
73
+ // requires: (a) host pure-replay observable-cache emission via the
74
+ // `:fork mode: replay` re-dispatch path and (b) the test seam endpoint
75
+ // contract for cache-hit-vs-fresh-call distinction (see
76
+ // `spec/v1/host-sample-test-seams.md` for the established seam pattern).
77
+ it.skip('original and replay event-log prefixes [0, fromSeq] MUST be byte-equivalent (modulo per-region clock + ULID-T entropy) — out of stable profile via RFC 0042');
69
78
  });
70
79
 
71
80
  describe('replay-observable-sequence-determinism: observable-result caching (RFC 0041 §C)', () => {
@@ -76,5 +85,10 @@ describe('replay-observable-sequence-determinism: observable-result caching (RFC
76
85
  // this a valid determinism contract — bit-equivalent execution would
77
86
  // require unbounded caching (rejected per RFC 0041 §"Alternatives
78
87
  // considered" #2).
79
- it.todo('replay of a workflow containing a nondeterministic tool call reproduces the original observable result, NOT a fresh call');
88
+ // Marked out of stable profile via RFC 0042 §B (experimental tier):
89
+ // see the prefix-byte-equivalence comment above for the same routing.
90
+ // This is RFC 0041 §C's load-bearing assertion; it lands as a runnable
91
+ // `it()` when RFC 0041 graduates to Accepted on first non-steward host
92
+ // adoption.
93
+ it.skip('replay of a workflow containing a nondeterministic tool call reproduces the original observable result, NOT a fresh call — out of stable profile via RFC 0042');
80
94
  });
@@ -0,0 +1,143 @@
1
+ /**
2
+ * RFC 0054 — run diff & execution comparison.
3
+ *
4
+ * Exercises `GET /v1/runs/{runId}:diff?against={otherRunId}` per
5
+ * `spec/v1/rest-endpoints.md` §"GET /v1/runs/{runId}:diff" and
6
+ * `schemas/run-diff-response.schema.json`. The endpoint is OPTIONAL —
7
+ * hosts that don't implement it return 404 and these scenarios soft-skip.
8
+ *
9
+ * Coverage:
10
+ * - identical: diffing a run against itself ⇒ divergedAtSeq null,
11
+ * empty eventDiffs (the determinism floor).
12
+ * - divergence: diffing two structurally-different runs ⇒ a non-null
13
+ * integer divergedAtSeq; eventDiffs begin at that seq.
14
+ * - state-shape: response conforms to run-diff-response.schema.json and
15
+ * stateDiff is redaction-safe (no credential-shaped keys).
16
+ * - error-surface: missing `against` ⇒ 400; nonexistent `against` ⇒ 404
17
+ * (the access boundary; full cross-principal authz needs a
18
+ * multi-principal harness — host-specific).
19
+ *
20
+ * @see RFCS/0054-run-diff-and-execution-comparison.md
21
+ * @see api/openapi.yaml §diffRun
22
+ */
23
+
24
+ import { describe, it, expect } from 'vitest';
25
+ import { driver } from '../lib/driver.js';
26
+
27
+ const HTTP_SKIP = !process.env.OPENWOP_BASE_URL;
28
+
29
+ // Two standard conformance fixtures with structurally-different event
30
+ // logs — diffing one against the other is a deterministic divergence.
31
+ const FIXTURE_A = 'conformance-agent-reasoning';
32
+ const FIXTURE_B = 'conformance-dispatch-loop';
33
+
34
+ interface DiffResponse {
35
+ a: string;
36
+ b: string;
37
+ divergedAtSeq: number | null;
38
+ eventDiffs: Array<{ seq: number; op: string; aEvent?: unknown; bEvent?: unknown }>;
39
+ stateDiff: Record<string, unknown>;
40
+ truncated?: boolean;
41
+ }
42
+
43
+ async function createRun(workflowId: string): Promise<string | null> {
44
+ const res = await driver.post('/v1/runs', { workflowId });
45
+ if (res.status !== 201) return null;
46
+ return (res.json as { runId: string }).runId;
47
+ }
48
+
49
+ /** Poll until the run is terminal (best-effort; bounded). */
50
+ async function settle(runId: string): Promise<void> {
51
+ for (let i = 0; i < 20; i++) {
52
+ const r = await driver.get(`/v1/runs/${encodeURIComponent(runId)}`);
53
+ const status = (r.json as { status?: string })?.status;
54
+ if (status === 'completed' || status === 'failed' || status === 'cancelled') return;
55
+ await new Promise((res) => setTimeout(res, 100));
56
+ }
57
+ }
58
+
59
+ describe.skipIf(HTTP_SKIP)('run-diff: GET /v1/runs/{runId}:diff (RFC 0054)', () => {
60
+ it('diffing a run against itself ⇒ divergedAtSeq null + empty eventDiffs', async (ctx) => {
61
+ const runId = await createRun(FIXTURE_A);
62
+ if (!runId) { ctx.skip(); return; }
63
+ await settle(runId);
64
+
65
+ const res = await driver.get(`/v1/runs/${encodeURIComponent(runId)}:diff?against=${encodeURIComponent(runId)}`);
66
+ if (res.status === 404) { ctx.skip(); return; } // endpoint not implemented
67
+ expect(res.status, driver.describe('spec/v1/rest-endpoints.md §:diff', 'self-diff MUST return 200')).toBe(200);
68
+
69
+ const body = res.json as DiffResponse;
70
+ expect(body.a).toBe(runId);
71
+ expect(body.b).toBe(runId);
72
+ expect(
73
+ body.divergedAtSeq,
74
+ driver.describe('RFCS/0054 §C', 'identical logs MUST yield divergedAtSeq: null'),
75
+ ).toBeNull();
76
+ expect(body.eventDiffs, 'identical logs MUST yield an empty eventDiffs array').toEqual([]);
77
+ });
78
+
79
+ it('diffing two structurally-different runs ⇒ non-null divergedAtSeq aligned to eventDiffs[0]', async (ctx) => {
80
+ const [ra, rb] = await Promise.all([createRun(FIXTURE_A), createRun(FIXTURE_B)]);
81
+ if (!ra || !rb) { ctx.skip(); return; }
82
+ await Promise.all([settle(ra), settle(rb)]);
83
+
84
+ const res = await driver.get(`/v1/runs/${encodeURIComponent(ra)}:diff?against=${encodeURIComponent(rb)}`);
85
+ if (res.status === 404) { ctx.skip(); return; }
86
+ expect(res.status).toBe(200);
87
+
88
+ const body = res.json as DiffResponse;
89
+ expect(
90
+ typeof body.divergedAtSeq === 'number' && body.divergedAtSeq >= 0,
91
+ driver.describe('RFCS/0054 §C', 'structurally-different runs MUST report a non-null integer divergedAtSeq'),
92
+ ).toBe(true);
93
+ expect(body.eventDiffs.length, 'divergent runs MUST report at least one eventDiff').toBeGreaterThan(0);
94
+ expect(
95
+ body.eventDiffs[0]?.seq,
96
+ driver.describe('RFCS/0054 §C', 'eventDiffs MUST begin at divergedAtSeq'),
97
+ ).toBe(body.divergedAtSeq);
98
+ for (const d of body.eventDiffs) {
99
+ expect(['added', 'removed', 'changed']).toContain(d.op);
100
+ }
101
+ });
102
+
103
+ it('response conforms to run-diff-response.schema.json and stateDiff is redaction-safe', async (ctx) => {
104
+ const [ra, rb] = await Promise.all([createRun(FIXTURE_A), createRun(FIXTURE_B)]);
105
+ if (!ra || !rb) { ctx.skip(); return; }
106
+ await Promise.all([settle(ra), settle(rb)]);
107
+
108
+ const res = await driver.get(`/v1/runs/${encodeURIComponent(ra)}:diff?against=${encodeURIComponent(rb)}`);
109
+ if (res.status === 404) { ctx.skip(); return; }
110
+ expect(res.status).toBe(200);
111
+
112
+ const body = res.json as DiffResponse;
113
+ expect(typeof body.a === 'string' && typeof body.b === 'string', 'a + b MUST be strings').toBe(true);
114
+ expect(Array.isArray(body.eventDiffs), 'eventDiffs MUST be an array').toBe(true);
115
+ expect(body.stateDiff !== null && typeof body.stateDiff === 'object', 'stateDiff MUST be an object').toBe(true);
116
+ // Redaction-safe: no credential-shaped material leaks into the diff.
117
+ const serialized = JSON.stringify(body.stateDiff);
118
+ expect(
119
+ /sk-|api[_-]?key|secret|bearer\s/i.test(serialized),
120
+ driver.describe('RFCS/0054 §B', 'stateDiff MUST be redaction-safe — no credential material'),
121
+ ).toBe(false);
122
+ });
123
+
124
+ it('missing `against` ⇒ 400; nonexistent `against` ⇒ 404 (access boundary)', async (ctx) => {
125
+ const runId = await createRun(FIXTURE_A);
126
+ if (!runId) { ctx.skip(); return; }
127
+
128
+ const probe = await driver.get(`/v1/runs/${encodeURIComponent(runId)}:diff?against=${encodeURIComponent(runId)}`);
129
+ if (probe.status === 404) { ctx.skip(); return; } // endpoint not implemented at all
130
+
131
+ const missing = await driver.get(`/v1/runs/${encodeURIComponent(runId)}:diff`);
132
+ expect(
133
+ missing.status,
134
+ driver.describe('api/openapi.yaml §diffRun', 'missing required `against` query param MUST return 400'),
135
+ ).toBe(400);
136
+
137
+ const nonexistent = await driver.get(`/v1/runs/${encodeURIComponent(runId)}:diff?against=does-not-exist-${Date.now()}`);
138
+ expect(
139
+ nonexistent.status,
140
+ driver.describe('RFCS/0054 §A', 'diffing against a run the caller cannot read/that does not exist MUST NOT return 200'),
141
+ ).toBe(404);
142
+ });
143
+ });
@@ -13,19 +13,21 @@
13
13
  * @see SECURITY/invariants.yaml node-pack-sandbox-capability-gate-respected
14
14
  */
15
15
 
16
- import { describe, it, expect } from 'vitest';
17
- import { driver } from '../lib/driver.js';
16
+ import { describe, it } from 'vitest';
18
17
 
19
- const HTTP_SKIP = !process.env.OPENWOP_BASE_URL;
20
- interface D { capabilities?: { sandbox?: { supported?: unknown } } }
21
- async function ok(): Promise<boolean> { try { const r = await driver.get('/.well-known/openwop'); return r.status === 200 && (r.json as D).capabilities?.sandbox?.supported === true; } catch { return false; } }
18
+ // Behavioral assertion lands when the misbehaving-capability-gate typeId
19
+ // ships + a host advertises `capabilities.sandbox.supported: true`.
20
+ // Expected: error.code === 'sandbox_capability_denied';
21
+ // details.requestedCapability is set to the disallowed identifier.
22
+ // Surfaced as `todo` so test reporters track the gap rather than
23
+ // reporting a vacuous PASS.
22
24
 
23
- describe.skipIf(HTTP_SKIP)('sandbox-capability-gate-respected: behavioral (RFC 0035 §B)', () => {
24
- it('a misbehaving pack calling an undeclared host capability fails closed with sandbox_capability_denied', async () => {
25
- if (!(await ok())) return;
26
- // Behavioral assertion lands when the misbehaving-capability-gate typeId
27
- // is available. Expected: error.code === 'sandbox_capability_denied';
28
- // details.requestedCapability is set to the disallowed identifier.
29
- expect(true).toBe(true);
30
- });
25
+ describe('sandbox-capability-gate-respected: behavioral (RFC 0035 §B)', () => {
26
+ // Behavioral coverage in `sandbox-mvp-behavior.test.ts` §"capability-gate-respected"
27
+ // (drives `POST /v1/host/sample/test/sandbox-invoke` against the
28
+ // workflow-engine's node:vm MVP and asserts `error.code:
29
+ // 'sandbox_capability_denied'` + `details.requestedCapability` per
30
+ // `host-capabilities.md` §"Error codes"). `it.skip` preserves the
31
+ // per-invariant file structure without inflating the `it.todo` count.
32
+ it.skip('behavioral coverage in sandbox-mvp-behavior.test.ts §"capability-gate-respected"');
31
33
  });
@@ -50,12 +50,11 @@ describe.skipIf(HTTP_SKIP)('sandbox-memory-cap: capability shape + behavioral (R
50
50
  ).toBe(true);
51
51
  });
52
52
 
53
- it('a misbehaving pack allocating beyond memoryLimitBytes fails with sandbox_memory_exceeded', async () => {
54
- const sb = await readSandbox();
55
- if (!sb || sb.memoryLimitBytes === undefined) return; // soft-skip
56
- // Behavioral assertion lands when the misbehaving-memory-cap typeId is
57
- // available. Expected: error.code === 'sandbox_memory_exceeded';
58
- // details.requestedBytes > memoryLimitBytes.
59
- expect(true).toBe(true);
60
- });
53
+ // Behavioral coverage in `sandbox-mvp-behavior.test.ts` §"memory-exceeded"
54
+ // (drives `POST /v1/host/sample/test/sandbox-invoke` against the
55
+ // workflow-engine's node:vm MVP and asserts `error.code:
56
+ // 'sandbox_memory_exceeded'` per `host-capabilities.md` §"Error codes").
57
+ // `it.skip` preserves the per-invariant file structure without inflating
58
+ // the `it.todo` count external auditors track.
59
+ it.skip('behavioral coverage in sandbox-mvp-behavior.test.ts §"memory-exceeded"');
61
60
  });