@openwop/openwop-conformance 1.3.0 → 1.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (118)
  1. package/CHANGELOG.md +132 -1
  2. package/README.md +3 -2
  3. package/api/asyncapi.yaml +8 -0
  4. package/api/openapi.yaml +371 -1
  5. package/coverage.md +26 -6
  6. package/fixtures/conformance-envelope-nl-to-format-engaged.json +41 -0
  7. package/fixtures/conformance-envelope-recovery-applied.json +39 -0
  8. package/fixtures/conformance-envelope-refusal.json +38 -0
  9. package/fixtures/conformance-envelope-retry-attempted.json +39 -0
  10. package/fixtures/conformance-envelope-retry-exhausted.json +38 -0
  11. package/fixtures/conformance-envelope-truncated.json +39 -0
  12. package/fixtures/conformance-envelope-truncation-cap-exhaustion.json +39 -0
  13. package/fixtures/conformance-model-capability-insufficient.json +25 -0
  14. package/fixtures/conformance-multi-agent-confidence-escalation.json +49 -0
  15. package/fixtures/conformance-multi-agent-handoff-child.json +27 -0
  16. package/fixtures/conformance-multi-agent-handoff.json +49 -0
  17. package/fixtures/conformance-prompt-all-four-kinds.json +39 -0
  18. package/fixtures/conformance-prompt-end-to-end.json +33 -0
  19. package/fixtures/conformance-subworkflow-mid-run-mutation-child.json +31 -0
  20. package/fixtures/conformance-subworkflow-mid-run-mutation.json +33 -0
  21. package/fixtures/openwop-smoke-cost-emit.json +37 -0
  22. package/fixtures/prompt-templates/conformance-prompt-few-shot-2.json +14 -0
  23. package/fixtures/prompt-templates/conformance-prompt-few-shot.json +14 -0
  24. package/fixtures/prompt-templates/conformance-prompt-schema-hint.json +14 -0
  25. package/fixtures/prompt-templates/conformance-prompt-secret-redaction.json +23 -0
  26. package/fixtures/prompt-templates/conformance-prompt-trust-marker.json +23 -0
  27. package/fixtures/prompt-templates/conformance-prompt-writer-system.json +15 -0
  28. package/fixtures/prompt-templates/conformance-prompt-writer-user.json +15 -0
  29. package/fixtures.md +39 -0
  30. package/package.json +1 -1
  31. package/schemas/README.md +5 -0
  32. package/schemas/agent-manifest.schema.json +16 -0
  33. package/schemas/capabilities.schema.json +384 -1
  34. package/schemas/envelopes/clarification.request.schema.json +9 -0
  35. package/schemas/envelopes/error.schema.json +4 -0
  36. package/schemas/envelopes/schema.request.schema.json +4 -0
  37. package/schemas/envelopes/schema.response.schema.json +1 -1
  38. package/schemas/node-pack-manifest.schema.json +28 -0
  39. package/schemas/orchestrator-decision.schema.json +12 -0
  40. package/schemas/prompt-kind.schema.json +8 -0
  41. package/schemas/prompt-pack-manifest.schema.json +80 -0
  42. package/schemas/prompt-ref.schema.json +40 -0
  43. package/schemas/prompt-template.schema.json +149 -0
  44. package/schemas/registry-version-manifest.schema.json +5 -0
  45. package/schemas/run-ancestry-response.schema.json +54 -0
  46. package/schemas/run-event-payloads.schema.json +479 -11
  47. package/schemas/run-event.schema.json +15 -1
  48. package/schemas/run-snapshot.schema.json +3 -2
  49. package/schemas/workflow-definition.schema.json +19 -1
  50. package/src/lib/llm-cache-key-recipe.ts +68 -0
  51. package/src/scenarios/aiEnvelope.contractRefusal.test.ts +104 -13
  52. package/src/scenarios/aiEnvelope.correlationReplay.test.ts +32 -15
  53. package/src/scenarios/aiEnvelope.redaction.test.ts +6 -5
  54. package/src/scenarios/aiEnvelope.schemaDrift.test.ts +5 -5
  55. package/src/scenarios/aiEnvelope.trustBoundaryPropagation.test.ts +211 -12
  56. package/src/scenarios/aiEnvelope.universalKinds.test.ts +7 -7
  57. package/src/scenarios/blob-presign-expiry.test.ts +7 -7
  58. package/src/scenarios/cache-ttl-expiry.test.ts +6 -6
  59. package/src/scenarios/cost-attribution.test.ts +124 -11
  60. package/src/scenarios/cross-engine-append-ordering.test.ts +99 -0
  61. package/src/scenarios/cross-host-ancestry-endpoint.test.ts +136 -0
  62. package/src/scenarios/cross-host-causation-shape.test.ts +117 -0
  63. package/src/scenarios/cross-host-traceparent-propagation.test.ts +60 -0
  64. package/src/scenarios/envelope-completion-distinguishes-truncation.test.ts +223 -0
  65. package/src/scenarios/envelope-nl-to-format-engaged.test.ts +152 -0
  66. package/src/scenarios/envelope-reasoning-secret-redaction.test.ts +343 -0
  67. package/src/scenarios/envelope-reasoning-shape.test.ts +190 -0
  68. package/src/scenarios/envelope-recovery-applied.test.ts +229 -0
  69. package/src/scenarios/envelope-refusal-shape.test.ts +289 -0
  70. package/src/scenarios/envelope-retry-attempted.test.ts +258 -0
  71. package/src/scenarios/envelope-retry-exhausted.test.ts +168 -0
  72. package/src/scenarios/envelope-tier-one-subset-static.test.ts +229 -0
  73. package/src/scenarios/envelope-truncated.test.ts +136 -0
  74. package/src/scenarios/envelope-truncation-cap-exhaustion.test.ts +144 -0
  75. package/src/scenarios/envelope-variant-discriminator-static.test.ts +152 -0
  76. package/src/scenarios/fixtures-valid.test.ts +123 -15
  77. package/src/scenarios/kv-ttl-expiry.test.ts +7 -7
  78. package/src/scenarios/model-capability-insufficient.test.ts +221 -0
  79. package/src/scenarios/model-capability-substituted.test.ts +203 -0
  80. package/src/scenarios/multi-agent-confidence-escalation.test.ts +201 -0
  81. package/src/scenarios/multi-agent-handoff-state-machine.test.ts +167 -0
  82. package/src/scenarios/multi-agent-memory-lifecycle.test.ts +124 -0
  83. package/src/scenarios/multi-region-idempotency.test.ts +58 -0
  84. package/src/scenarios/node-module-required-capabilities-shape.test.ts +185 -0
  85. package/src/scenarios/prompt-all-four-kinds-events.test.ts +198 -0
  86. package/src/scenarios/prompt-composed-secret-redaction.test.ts +178 -0
  87. package/src/scenarios/prompt-composed-trust-marker.test.ts +165 -0
  88. package/src/scenarios/prompt-end-to-end-events.test.ts +202 -0
  89. package/src/scenarios/prompt-list-and-fetch.test.ts +207 -0
  90. package/src/scenarios/prompt-mutable-lifecycle.test.ts +216 -0
  91. package/src/scenarios/prompt-pack-install.test.ts +187 -0
  92. package/src/scenarios/prompt-render-deterministic.test.ts +240 -0
  93. package/src/scenarios/prompt-resolution-chain-agent-intrinsic.test.ts +140 -0
  94. package/src/scenarios/prompt-resolution-chain-fallback-cascade.test.ts +172 -0
  95. package/src/scenarios/prompt-resolution-chain-node-wins.test.ts +144 -0
  96. package/src/scenarios/prompt-template-shape.test.ts +359 -0
  97. package/src/scenarios/queue-ack-nack-dlq.test.ts +7 -7
  98. package/src/scenarios/queue-publish-consume-roundtrip.test.ts +7 -7
  99. package/src/scenarios/replay-divergence-at-refusal.test.ts +134 -0
  100. package/src/scenarios/replay-llm-cache-key-portable.test.ts +197 -0
  101. package/src/scenarios/replay-llm-cache-key.test.ts +1 -40
  102. package/src/scenarios/replay-observable-sequence-determinism.test.ts +80 -0
  103. package/src/scenarios/sandbox-capability-gate-respected.test.ts +27 -0
  104. package/src/scenarios/sandbox-memory-cap.test.ts +58 -0
  105. package/src/scenarios/sandbox-no-cross-pack-mutation.test.ts +30 -0
  106. package/src/scenarios/sandbox-no-host-env-leak.test.ts +27 -0
  107. package/src/scenarios/sandbox-no-host-fs-escape.test.ts +88 -0
  108. package/src/scenarios/sandbox-no-host-process-escape.test.ts +31 -0
  109. package/src/scenarios/sandbox-no-network-escape.test.ts +28 -0
  110. package/src/scenarios/sandbox-timeout-cap.test.ts +58 -0
  111. package/src/scenarios/search-bm25-roundtrip.test.ts +7 -7
  112. package/src/scenarios/spec-corpus-validity.test.ts +34 -6
  113. package/src/scenarios/sql-transaction-atomicity.test.ts +6 -6
  114. package/src/scenarios/stream-subscribe-from-beginning.test.ts +7 -7
  115. package/src/scenarios/subworkflow-input-mapping.test.ts +70 -4
  116. package/src/scenarios/table-cursor-pagination.test.ts +7 -7
  117. package/src/scenarios/table-schema-enforcement.test.ts +7 -7
  118. package/src/scenarios/vector-knn-roundtrip.test.ts +7 -7
@@ -1,12 +1,12 @@
1
1
  /**
2
- * queue-publish-consume-roundtrip — RFC 0017 advertisement-shape verification + behavioral placeholders.
2
+ * queue-publish-consume-roundtrip — RFC 0017 advertisement-shape verification + behavioral roundtrip.
3
3
  *
4
- * Status: ACTIVE (advertisement-shape). RFC 0017 promoted to `Active`
5
- * 2026-05-17. The matching `capabilities.queueBus` block has landed in
6
- * `schemas/capabilities.schema.json`. This scenario asserts the advertisement
7
- * shape against any host that boots the conformance suite, and keeps the
8
- * deeper behavioral assertions as `it.todo()` until a reference host wires
9
- * a test seam.
4
+ * Status: ACTIVE (advertisement-shape + behavioral). RFC 0017 promoted to
5
+ * `Active` 2026-05-17. The matching `capabilities.queueBus` block has
6
+ * landed in `schemas/capabilities.schema.json`. This scenario asserts the
7
+ * advertisement shape against any host that boots the conformance suite, and
8
+ * exercises the behavioral surface through the `/v1/host/sample/test/surface`
9
+ * seam (soft-skip with HTTP 404 on hosts that don't expose it).
10
10
  *
11
11
  * Summary: publish + consume + ack roundtrip.
12
12
  *
@@ -0,0 +1,134 @@
1
+ /**
2
+ * replay-divergence-at-refusal — RFC 0041 §B behavioral assertion.
3
+ *
4
+ * Status: ACTIVE (capability-gated behavioral; soft-skips when no Phase 4
5
+ * host advertises the contract). Gated on
6
+ * `capabilities.multiAgent.executionModel.version >= 4` AND
7
+ * `capabilities.multiAgent.executionModel.replayDeterminism.refusalDivergenceEmission: true`.
8
+ *
9
+ * Asserts (behavioral, when a Phase 4 host advertises both gates):
10
+ *
11
+ * 1. When the original run obtained a valid LLM envelope but the replay
12
+ * gets a refusal, the host MUST emit a `replay.divergedAtRefusal`
13
+ * event AND fail the replay with `error.code:
14
+ * "replay_diverged_at_refusal"`. Silent substitution is non-conformant.
15
+ *
16
+ * 2. The emitted `replay.divergedAtRefusal` payload MUST carry
17
+ * `originalEnvelopeKind: "valid"` + `replayEnvelopeKind: "refusal"`
18
+ * (or the inverse for the original-refused case). The two MUST
19
+ * differ — otherwise there is no divergence to report.
20
+ *
21
+ * 3. The error envelope MAY carry `details.atSequence`, `details.nodeId`,
22
+ * `details.originalEnvelopeKind`, `details.replayEnvelopeKind` per
23
+ * `spec/v1/rest-endpoints.md` §"Common error codes" — when present,
24
+ * the values MUST be consistent with the emitted event.
25
+ *
26
+ * Driving the assertion requires a host-side test seam that can stage a
27
+ * mock provider returning a valid envelope on the original run and a
28
+ * refusal on the replay (or vice-versa). Reference workflow-engine ships
29
+ * a mock-AI provider (`OPENWOP_MULTI_AGENT_EXECUTION_MODEL=true`); the
30
+ * Phase 4 wiring extends it to honor a "refusal on replay" mode. Until
31
+ * that wiring lands, the assertion is surfaced as `it.todo` so test
32
+ * reporters track the gap rather than reporting a vacuous PASS.
33
+ *
34
+ * @see RFCS/0041-multi-agent-replay-under-nondeterminism.md §B
35
+ * @see spec/v1/replay.md §"Envelope-refusal recovery in replay (MAE-8 closure)"
36
+ * @see spec/v1/multi-agent-execution.md §"Phase 4 replay determinism"
37
+ * @see spec/v1/rest-endpoints.md §"Common error codes" — replay_diverged_at_refusal
38
+ * @see schemas/run-event-payloads.schema.json §replayDivergedAtRefusal
39
+ */
40
+
41
+ import { describe, it, expect } from 'vitest';
42
+ import { driver } from '../lib/driver.js';
43
+
44
+ const HTTP_SKIP = !process.env.OPENWOP_BASE_URL;
45
+
46
/** Shape of the optional `replayDeterminism` advertisement; every field is unvalidated. */
type ReplayDeterminismAdvert = {
  supported?: unknown;
  refusalDivergenceEmission?: unknown;
};

/**
 * The slice of the `/.well-known/openwop` discovery document this scenario reads.
 * Every leaf is `unknown`; callers narrow with `typeof` / equality checks before use.
 */
interface DiscoveryDoc {
  capabilities?: {
    multiAgent?: {
      executionModel?: {
        supported?: unknown;
        version?: unknown;
        replayDeterminism?: ReplayDeterminismAdvert;
      };
    };
  };
}
60
+
61
/**
 * Fetches the host discovery document from `/.well-known/openwop`.
 *
 * @returns the response body cast to `DiscoveryDoc` when the status is 200;
 *   `null` for any other status or when anything inside the request throws.
 */
async function readDiscovery(): Promise<DiscoveryDoc | null> {
  try {
    const reply = await driver.get('/.well-known/openwop');
    return reply.status === 200 ? (reply.json as DiscoveryDoc) : null;
  } catch {
    // Unreachable host / transport failure is treated the same as "no discovery doc".
    return null;
  }
}
68
+
69
describe.skipIf(HTTP_SKIP)('replay-divergence-at-refusal: advertisement shape (RFC 0041 §D)', () => {
  // Spec reference shared by the two RFC 0041 §D assertions below.
  const rfcSectionD = 'RFCS/0041-multi-agent-replay-under-nondeterminism.md §D';

  /**
   * Validates the optional `replayDeterminism` advertisement:
   *   - `supported` must be a boolean;
   *   - when `supported` is true, `executionModel.version` must be a number >= 4
   *     and `refusalDivergenceEmission` must be exactly `true`.
   * Skips when discovery is unavailable or the host does not advertise the block.
   */
  it('replayDeterminism (when present) conforms to RFC 0041 §D', async (ctx) => {
    const discovery = await readDiscovery();
    if (discovery === null) {
      ctx.skip();
      return;
    }

    const executionModel = discovery.capabilities?.multiAgent?.executionModel;
    const replayDeterminism = executionModel?.replayDeterminism;
    if (replayDeterminism === undefined) {
      // Optional advertisement — the host hasn't opted in.
      ctx.skip();
      return;
    }

    const supportedType = typeof replayDeterminism.supported;
    expect(
      supportedType,
      driver.describe(rfcSectionD, 'replayDeterminism.supported MUST be boolean when present'),
    ).toBe('boolean');

    // The remaining requirements only bind hosts that claim support.
    if (replayDeterminism.supported !== true) return;

    const version = executionModel?.version as number | undefined;
    const versionIsPhase4 = typeof version === 'number' && version >= 4;
    expect(
      versionIsPhase4,
      driver.describe(
        rfcSectionD,
        'when replayDeterminism.supported: true, multiAgent.executionModel.version MUST be >= 4',
      ),
    ).toBe(true);

    // The MUST on refusalDivergenceEmission is normative prose in the schema
    // description; JSON Schema can't express the conditional, so this assertion
    // closes the conformance-enforcement gap.
    expect(
      replayDeterminism.refusalDivergenceEmission,
      driver.describe(
        'schemas/capabilities.schema.json §replayDeterminism.refusalDivergenceEmission',
        'hosts advertising version: 4 MUST set replayDeterminism.refusalDivergenceEmission to true',
      ),
    ).toBe(true);
  });
});
115
+
116
describe.skipIf(HTTP_SKIP)('replay-divergence-at-refusal: behavioral (RFC 0041 §B MAE-8)', () => {
  // Planned flow once a host exposes a staged-refusal seam. The mock-AI provider
  // returns a valid envelope on the original run and a refusal on the replay
  // (a second variant swaps the two):
  //   1. Stage mock provider: original returns valid envelope.
  //   2. Run workflow `conformance-phase4-replay-divergence` end-to-end.
  //   3. Re-stage mock provider: replay-of-this-runId returns refusal.
  //   4. POST /v1/runs/{runId}:fork { mode: 'replay' }.
  //   5. Assert the resulting run terminates with
  //      error.code === 'replay_diverged_at_refusal'.
  //   6. Assert the event log contains a `replay.divergedAtRefusal` event with
  //      originalEnvelopeKind === 'valid' AND replayEnvelopeKind === 'refusal'.
  //   7. Assert NO silent substitution: the replay's continuation past the
  //      diverging node MUST NOT execute (run terminates at the divergence).
  // Registered as `todo` until the reference host wires that seam, so test
  // reporters track the gap.
  const pendingScenarios = [
    'Phase 4 host MUST emit replay.divergedAtRefusal + fail with replay_diverged_at_refusal when original=valid + replay=refusal',
    'Phase 4 host MUST emit replay.divergedAtRefusal + fail with replay_diverged_at_refusal when original=refusal + replay=valid (symmetric case)',
  ];
  for (const title of pendingScenarios) {
    it.todo(title);
  }
});
@@ -0,0 +1,197 @@
1
+ /**
2
+ * replay-llm-cache-key-portable — RFC 0041 §E SECURITY-invariant probe.
3
+ *
4
+ * Status: ACTIVE (capability-gated behavioral). Gated on
5
+ * `capabilities.multiAgent.executionModel.version >= 4` AND
6
+ * `capabilities.multiAgent.executionModel.replayDeterminism.llmCacheKeyRecipe: "spec-rfc-0041"`.
7
+ *
8
+ * The CROSS-host parity assertion in `replay-llm-cache-key.test.ts §D`
9
+ * (gated on `OPENWOP_BASE_URL_B`) is the cross-instance probe. This file
10
+ * is the SECURITY-tier complement: it asserts that the SINGLE-host
11
+ * recipe is portable in the strict sense — given the recipe input, the
12
+ * host's emitted key is reproducible offline from the recipe alone
13
+ * (no host-internal secrets, sequence numbers, or trace context
14
+ * influence the key).
15
+ *
16
+ * Asserts:
17
+ *
18
+ * 1. Two probes with byte-identical recipe input MUST yield the same
19
+ * cache key (intra-host determinism; subsumes the SECURITY
20
+ * portability requirement at the single-host boundary).
21
+ *
22
+ * 2. The emitted key is reproducible offline: locally recomputed
23
+ * SHA-256-over-RFC-8785-JCS over the canonical recipe MUST equal
24
+ * the host's emission. This is the load-bearing claim — without
25
+ * it, the recipe is private host state masquerading as a content-
26
+ * addressable hash.
27
+ *
28
+ * 3. (Negative) Permuting any non-recipe field (`max_tokens`, `stop`,
29
+ * `stream`, `seed`, `metadata`, `user`, request IDs, trace context)
30
+ * MUST NOT shift the key. This is the security boundary: hosts
31
+ * that mix non-recipe state into the key leak that state across
32
+ * the cache boundary, defeating the portability claim and (via
33
+ * the SR-1 sibling invariant) potentially leaking BYOK plaintexts
34
+ * through the cache.
35
+ *
36
+ * 4. (Gated on Phase 4 advertisement.) The host's discovery doc MUST
37
+ * advertise `replayDeterminism.llmCacheKeyRecipe` matching the
38
+ * recipe it honors — `spec-rfc-0041` for the canonical recipe,
39
+ * `x-host-<host>-<recipe-name>` for vendor variants per
40
+ * `host-extensions.md` §"Canonical prefixes".
41
+ *
42
+ * The behavioral assertions reuse the existing test seam at
43
+ * `POST /v1/host/sample/test/llm-cache-key` (the same seam the sibling
44
+ * `replay-llm-cache-key.test.ts` drives). Hosts that don't expose the
45
+ * seam return 404 and the scenario soft-skips.
46
+ *
47
+ * @see RFCS/0041-multi-agent-replay-under-nondeterminism.md §E
48
+ * @see SECURITY/invariants.yaml §replay-llm-cache-key-portable
49
+ * @see spec/v1/replay.md §"LLM cache-key recipe" §A + §B + §D
50
+ * @see conformance/src/scenarios/replay-llm-cache-key.test.ts (the sibling behavioral suite)
51
+ */
52
+
53
+ import { describe, it, expect } from 'vitest';
54
+ import { driver } from '../lib/driver.js';
55
+ import { expectedCacheKey, callCacheKeySeam as callSeam } from '../lib/llm-cache-key-recipe.js';
56
+
57
+ const HTTP_SKIP = !process.env.OPENWOP_BASE_URL;
58
+
59
/** Shape of the optional `replayDeterminism` advertisement; every field is unvalidated. */
type ReplayDeterminismAdvert = {
  supported?: unknown;
  llmCacheKeyRecipe?: unknown;
};

/**
 * The slice of the `/.well-known/openwop` discovery document this scenario reads.
 * Every leaf is `unknown`; callers narrow with `typeof` checks before use.
 */
interface DiscoveryDoc {
  capabilities?: {
    multiAgent?: {
      executionModel?: {
        version?: unknown;
        replayDeterminism?: ReplayDeterminismAdvert;
      };
    };
  };
}
72
+
73
/**
 * Reads the host discovery document at `/.well-known/openwop`.
 *
 * @returns the body cast to `DiscoveryDoc` on HTTP 200, otherwise `null`
 *   (non-200 status, or any exception raised while fetching).
 */
async function readDiscovery(): Promise<DiscoveryDoc | null> {
  try {
    const response = await driver.get('/.well-known/openwop');
    if (response.status === 200) {
      return response.json as DiscoveryDoc;
    }
    return null;
  } catch {
    return null;
  }
}
80
+
81
describe.skipIf(HTTP_SKIP)('replay-llm-cache-key-portable: intra-host reproducibility (RFC 0041 §E)', () => {
  /**
   * Sends one fixed recipe to the cache-key seam and requires the host's key to
   * equal the key recomputed locally by `expectedCacheKey` for the same input.
   * Skips when the seam answers 404 (host does not expose it).
   */
  it('host cache key MUST equal locally-recomputed SHA-256 over canonical JSON (reproducible offline)', async (ctx) => {
    const input = {
      provider: 'anthropic',
      model: 'claude-3-5-sonnet-20240620',
      messages: [
        { role: 'system' as const, content: 'portability probe' },
        { role: 'user' as const, content: 'reproduce offline' },
      ],
      temperature: 0.3,
    };
    const result = await callSeam(input);
    if (result.status === 404) {
      ctx.skip(); // host doesn't expose the test seam
      return;
    }
    expect(result.status).toBe(200);
    expect(
      result.cacheKey,
      driver.describe(
        'SECURITY/invariants.yaml §replay-llm-cache-key-portable + replay.md §B',
        'host cache key MUST be reproducible offline from the recipe alone — no host-internal state',
      ),
    ).toBe(expectedCacheKey(input));
  });

  /**
   * Sends the same recipe twice and requires identical keys.
   * Skips when the seam answers 404 (host does not expose it).
   */
  it('two identical probes MUST yield byte-identical keys (intra-host determinism)', async (ctx) => {
    const input = {
      provider: 'openai',
      model: 'gpt-4',
      messages: [{ role: 'user' as const, content: 'idempotence probe' }],
      temperature: 0.0,
    };
    const a = await callSeam(input);
    if (a.status === 404) {
      ctx.skip(); // host doesn't expose the test seam
      return;
    }
    const b = await callSeam(input);

    // Both probes must succeed and actually carry a key. Without these checks a
    // seam that errors on both calls yields `undefined` twice and the equality
    // assertion below would pass vacuously.
    expect(a.status).toBe(200);
    expect(b.status).toBe(200);
    expect(
      typeof a.cacheKey,
      driver.describe(
        'SECURITY/invariants.yaml §replay-llm-cache-key-portable',
        'a successful cache-key probe MUST return a string cacheKey',
      ),
    ).toBe('string');

    expect(
      a.cacheKey,
      driver.describe(
        'SECURITY/invariants.yaml §replay-llm-cache-key-portable',
        'two byte-identical recipe inputs MUST yield byte-identical keys (no per-request entropy)',
      ),
    ).toBe(b.cacheKey);
  });
});
129
+
130
describe.skipIf(HTTP_SKIP)('replay-llm-cache-key-portable: non-recipe-field invariance (RFC 0041 §E security boundary)', () => {
  /**
   * Probes the seam with a base recipe, then with the same recipe plus a set of
   * non-recipe fields, and requires both keys to match.
   * Skips when the seam answers 404 (host does not expose it).
   */
  it('non-recipe fields (request ID, trace context, tenant ID) MUST NOT influence the cache key', async (ctx) => {
    const base = {
      provider: 'openai',
      model: 'gpt-4',
      messages: [{ role: 'user' as const, content: 'security-boundary probe' }],
      temperature: 0.5,
    };
    const baseResult = await callSeam(base);
    if (baseResult.status === 404) {
      ctx.skip(); // host doesn't expose the test seam
      return;
    }

    // The baseline must be a real key; otherwise the comparison at the end could
    // succeed as `undefined === undefined` when the seam fails on both probes.
    expect(baseResult.status).toBe(200);
    expect(
      typeof baseResult.cacheKey,
      driver.describe(
        'SECURITY/invariants.yaml §replay-llm-cache-key-portable + replay.md §A',
        'a successful cache-key probe MUST return a string cacheKey',
      ),
    ).toBe('string');

    // The security boundary: ANY of these fields leaking into the key
    // would expose tenant/request state through cache-collision behavior.
    const polluted = {
      ...base,
      max_tokens: 1000,
      stop: ['STOP'],
      stream: true,
      seed: 42,
      metadata: { tenantId: 'tenant-A', traceparent: '00-deadbeef-cafe-01' },
      user: 'user-42',
      'x-request-id': 'req-abc-123',
    };
    const pollutedResult = await callSeam(polluted);
    expect(pollutedResult.status).toBe(200);
    expect(
      pollutedResult.cacheKey,
      driver.describe(
        'SECURITY/invariants.yaml §replay-llm-cache-key-portable + replay.md §A',
        'non-recipe fields (request id, trace context, tenant id) MUST NOT influence the cache key — leaking them defeats the portability invariant',
      ),
    ).toBe(baseResult.cacheKey);
  });
});
166
+
167
describe.skipIf(HTTP_SKIP)('replay-llm-cache-key-portable: Phase 4 advertisement alignment (RFC 0041 §D)', () => {
  /**
   * For hosts advertising `executionModel.version >= 4`, requires
   * `replayDeterminism.llmCacheKeyRecipe` to be a string that is either
   * `spec-rfc-0041` or an `x-host-<host>-<recipe>` vendor name.
   * Skips when the version is missing, non-numeric, or below 4.
   */
  it('hosts advertising version: 4 MUST advertise replayDeterminism.llmCacheKeyRecipe', async (ctx) => {
    const discovery = await readDiscovery();
    const executionModel = discovery?.capabilities?.multiAgent?.executionModel;
    const advertisedVersion = executionModel?.version;

    const isPrePhase4 = typeof advertisedVersion !== 'number' || advertisedVersion < 4;
    if (isPrePhase4) {
      ctx.skip(); // pre-Phase-4 or no multiAgent advertisement
      return;
    }

    const recipe = executionModel?.replayDeterminism?.llmCacheKeyRecipe;
    const recipeIsString = typeof recipe === 'string';
    expect(
      recipeIsString,
      driver.describe(
        'RFCS/0041-multi-agent-replay-under-nondeterminism.md §D',
        'Phase 4 host MUST advertise replayDeterminism.llmCacheKeyRecipe (`spec-rfc-0041` or `x-host-<host>-<recipe>`)',
      ),
    ).toBe(true);

    // Safe to treat as a string here: the assertion above throws otherwise.
    const recipeName = recipe as string;
    const matchesAllowedForm =
      recipeName === 'spec-rfc-0041' ||
      /^x-host-[a-z][a-z0-9-]*-[a-z][a-z0-9-]*$/.test(recipeName);
    expect(
      matchesAllowedForm,
      driver.describe(
        'schemas/capabilities.schema.json §replayDeterminism.llmCacheKeyRecipe',
        'llmCacheKeyRecipe MUST be `spec-rfc-0041` OR match `^x-host-<host>-<recipe>$` per host-extensions.md',
      ),
    ).toBe(true);
  });
});
@@ -20,47 +20,8 @@
20
20
  */
21
21
 
22
22
  import { describe, it, expect } from 'vitest';
23
- import { createHash } from 'node:crypto';
24
23
  import { driver } from '../lib/driver.js';
25
-
26
- /** Mirror of the reference impl's `canonicalize` so the conformance
27
- * scenario can recompute the expected cache key locally and assert
28
- * equality with what the host returns. RFC 8785 JCS-style:
29
- * sorted-keys, no whitespace, preserve array order. */
30
- function canonicalize(value: unknown): string {
31
- if (value === null) return 'null';
32
- if (typeof value === 'boolean' || typeof value === 'number') return JSON.stringify(value);
33
- if (typeof value === 'string') return JSON.stringify(value);
34
- if (Array.isArray(value)) return '[' + value.map((v) => canonicalize(v)).join(',') + ']';
35
- if (typeof value === 'object') {
36
- const obj = value as Record<string, unknown>;
37
- const keys = Object.keys(obj).sort();
38
- return '{' + keys.map((k) => `${JSON.stringify(k)}:${canonicalize(obj[k])}`).join(',') + '}';
39
- }
40
- return JSON.stringify(value);
41
- }
42
-
43
- function projectRecipe(raw: Record<string, unknown>): Record<string, unknown> {
44
- const out: Record<string, unknown> = { provider: raw.provider, model: raw.model, messages: raw.messages };
45
- if (Array.isArray(raw.tools) && raw.tools.length > 0) {
46
- out.tools = [...(raw.tools as Array<{ name: string }>)].sort((a, b) => a.name.localeCompare(b.name));
47
- }
48
- if (typeof raw.temperature === 'number') out.temperature = raw.temperature;
49
- if (typeof raw.topP === 'number') out.topP = raw.topP;
50
- if (typeof raw.topK === 'number') out.topK = raw.topK;
51
- if (raw.responseFormat && typeof raw.responseFormat === 'object') out.responseFormat = raw.responseFormat;
52
- return out;
53
- }
54
-
55
- function expectedCacheKey(input: Record<string, unknown>): string {
56
- return createHash('sha256').update(canonicalize(projectRecipe(input)), 'utf8').digest('hex');
57
- }
58
-
59
- async function callSeam(input: Record<string, unknown>): Promise<{ status: number; cacheKey?: string }> {
60
- const res = await driver.post('/v1/host/sample/test/llm-cache-key', input);
61
- const cacheKey = (res.json as { cacheKey?: string }).cacheKey;
62
- return cacheKey !== undefined ? { status: res.status, cacheKey } : { status: res.status };
63
- }
24
+ import { expectedCacheKey, callCacheKeySeam as callSeam } from '../lib/llm-cache-key-recipe.js';
64
25
 
65
26
  describe('replay-llm-cache-key: SHA-256-over-JCS recipe (replay.md §B)', () => {
66
27
  it('host cache key MUST equal locally-recomputed SHA-256 over canonical JSON', async () => {
@@ -0,0 +1,80 @@
1
+ /**
2
+ * replay-observable-sequence-determinism — RFC 0041 §C behavioral.
3
+ *
4
+ * Status: ACTIVE (capability-gated behavioral). Gated on
5
+ * `capabilities.multiAgent.executionModel.version >= 4` AND
6
+ * `capabilities.multiAgent.executionModel.replayDeterminism.supported: true`.
7
+ *
8
+ * Asserts (behavioral, when a Phase 4 host advertises the contract):
9
+ *
10
+ * 1. A `mode: replay` fork from event-log index `fromSeq` produces an
11
+ * event-log prefix `[0, fromSeq]` that is byte-equivalent to the
12
+ * original run's prefix (modulo per-region clock fields per RFC 0036
13
+ * §E and ULID component-T entropy when ULIDs are minted fresh).
14
+ *
15
+ * 2. The replay's `RunSnapshot.variables`, `RunSnapshot.channels`, and
16
+ * `RunSnapshot.status` at the boundary index are byte-equivalent to
17
+ * the original.
18
+ *
19
+ * 3. (Crucially per §C.) The replay reproduces observable output EVEN
20
+ * WHEN the underlying tool call would have produced different bytes.
21
+ * The reference test uses a mock tool that returns a fresh random
22
+ * string on each call; the host MUST cache the original observable
23
+ * result so replay returns the SAME string the original got — not
24
+ * the bytes a fresh call would return now.
25
+ *
26
+ * Driving the assertion requires a workflow fixture whose tool call is
27
+ * pure-nondeterministic (different bytes on each call) but whose
28
+ * observable result is what gets cached. Reference workflow-engine ships
29
+ * `core.noop` + deterministic fixtures; Phase 4 wiring needs a
30
+ * nondeterministic-tool fixture (e.g., `conformance-phase4-nondet-tool`).
31
+ * Until that lands, the cross-boundary assertion is surfaced as `it.todo`
32
+ * so test reporters track the gap.
33
+ *
34
+ * @see RFCS/0041-multi-agent-replay-under-nondeterminism.md §C
35
+ * @see spec/v1/replay.md §"Observable-output-sequence determinism vs bit-equivalent execution (MAE-9 closure)"
36
+ * @see spec/v1/multi-agent-execution.md §"Phase 4 replay determinism"
37
+ */
38
+
39
+ import { describe, it } from 'vitest';
40
+
41
+ // Behavioral assertions in this file are currently `it.todo` placeholders;
42
+ // the `conformance-phase4-nondet-tool` fixture hasn't shipped yet. When
43
+ // it does, the `it.todo` calls flip back to runnable `it(...)` bodies
44
+ // that read discovery (via `driver.get('/.well-known/openwop')`), gate
45
+ // on `multiAgent.executionModel.version >= 4` AND
46
+ // `replayDeterminism.supported: true`, and drive the workflow through
47
+ // the fixture.
48
+
49
describe('replay-observable-sequence-determinism: prefix byte-equivalence (RFC 0041 §C)', () => {
  // Planned flow, driving a workflow with at least one node whose underlying
  // tool call is nondeterministic (different bytes on each call):
  //   1. POST /v1/runs { workflowId: 'conformance-phase4-nondet-tool' }
  //      → runs to completion, capturing the original event log.
  //   2. Capture the original event-log prefix [0, N], where N is the index
  //      after the nondeterministic-tool node fires.
  //   3. POST /v1/runs/{runId}:fork { mode: 'replay', fromSeq: N }
  //   4. Read the replay event-log prefix [0, N].
  //   5. Assert byte-equivalence modulo the carve-outs:
  //        - per-region observedAt timestamps (RFC 0036 §E)
  //        - ULID component-T entropy on newly-minted eventIds
  //   6. Read the original + replay RunSnapshot at index N; assert
  //      variables + channels + status are byte-equivalent.
  // Registered as `todo` until the `conformance-phase4-nondet-tool` fixture
  // ships in the suite — the same way `replay-divergence-at-refusal.test.ts`
  // surfaces its pending behavioral cases.
  const pendingTitle =
    'original and replay event-log prefixes [0, fromSeq] MUST be byte-equivalent (modulo per-region clock + ULID-T entropy)';
  it.todo(pendingTitle);
});
70
+
71
describe('replay-observable-sequence-determinism: observable-result caching (RFC 0041 §C)', () => {
  // The load-bearing assertion: what gets cached for a nondeterministic tool
  // call is its OBSERVABLE RESULT (return value + side-effects on workflow
  // state + emitted events), not the bytes-on-the-wire of the underlying call.
  // Reproducing that observable sequence on replay is what makes this a valid
  // determinism contract — bit-equivalent execution would require unbounded
  // caching (rejected per RFC 0041 §"Alternatives considered" #2).
  const pendingTitle =
    'replay of a workflow containing a nondeterministic tool call reproduces the original observable result, NOT a fresh call';
  it.todo(pendingTitle);
});
@@ -0,0 +1,27 @@
1
+ /**
2
+ * sandbox-capability-gate-respected — RFC 0035 §B invariant
3
+ * `node-pack-sandbox-capability-gate-respected`.
4
+ *
5
+ * Capability-gated on `capabilities.sandbox.supported: true`.
6
+ *
7
+ * Asserts (behavioral when host advertises): a pack invocation that calls
8
+ * a host capability NOT in `capabilities.sandbox.allowedHostCalls` fails
9
+ * closed with `error.code: "sandbox_capability_denied"` AND
10
+ * `details.requestedCapability` identifying the disallowed capability.
11
+ *
12
+ * @see RFCS/0035-sandbox-execution-contract.md §B + §C
13
+ * @see SECURITY/invariants.yaml node-pack-sandbox-capability-gate-respected
14
+ */
15
+
16
+ import { describe, it } from 'vitest';
17
+
18
+ // Behavioral assertion lands when the misbehaving-capability-gate typeId
19
+ // ships + a host advertises `capabilities.sandbox.supported: true`.
20
+ // Expected: error.code === 'sandbox_capability_denied';
21
+ // details.requestedCapability is set to the disallowed identifier.
22
+ // Surfaced as `todo` so test reporters track the gap rather than
23
+ // reporting a vacuous PASS.
24
+
25
describe('sandbox-capability-gate-respected: behavioral (RFC 0035 §B)', () => {
  // Placeholder only: registered as `todo` so reporters show the missing
  // behavioral coverage instead of a vacuous PASS.
  const pendingTitle =
    'a misbehaving pack calling an undeclared host capability fails closed with sandbox_capability_denied';
  it.todo(pendingTitle);
});
@@ -0,0 +1,58 @@
1
+ /**
2
+ * sandbox-memory-cap — RFC 0035 §B invariant `node-pack-sandbox-memory-cap`.
3
+ *
4
+ * Capability-gated on `capabilities.sandbox.supported: true` AND
5
+ * `capabilities.sandbox.memoryLimitBytes` advertised.
6
+ *
7
+ * Asserts (behavioral when host advertises): a pack invocation that
8
+ * allocates beyond `capabilities.sandbox.memoryLimitBytes` fails closed
9
+ * with `error.code: "sandbox_memory_exceeded"` per RFC 0035 §C. The host
10
+ * MUST advertise an integer ≥ 1 MiB per the schema.
11
+ *
12
+ * @see RFCS/0035-sandbox-execution-contract.md §B + §C
13
+ * @see SECURITY/invariants.yaml node-pack-sandbox-memory-cap
14
+ */
15
+
16
+ import { describe, it, expect } from 'vitest';
17
+ import { driver } from '../lib/driver.js';
18
+
19
+ const HTTP_SKIP = !process.env.OPENWOP_BASE_URL; // true when OPENWOP_BASE_URL is unset or empty; passed to describe.skipIf below
20
+
21
+ interface D { // subset of the /.well-known/openwop response body that readSandbox() reads
22
+ capabilities?: { sandbox?: { supported?: unknown; memoryLimitBytes?: unknown } }; // typed unknown: values are checked at runtime before use
23
+ }
24
+ /** Reads `capabilities.sandbox` from GET /.well-known/openwop. Resolves to null on a non-200 status, when `supported` is not exactly true, or on any thrown error. */
25
+ async function readSandbox(): Promise<{ supported: boolean; memoryLimitBytes?: number } | null> {
26
+   try {
27
+     const r = await driver.get('/.well-known/openwop');
28
+     if (r.status !== 200) return null;
29
+     const sb = (r.json as D).capabilities?.sandbox;
30
+     if (!sb || sb.supported !== true) return null;
31
+     const raw = sb.memoryLimitBytes;
32
+     if (raw === undefined) return { supported: true }; // optional field: absent stays absent
33
+     // Present but non-numeric (e.g. "1048576" or null) becomes NaN so the shape assertion fails instead of passing vacuously.
34
+     return { supported: true, memoryLimitBytes: typeof raw === 'number' ? raw : Number.NaN };
35
+   } catch { return null; } // best-effort probe: callers treat null as a soft-skip
36
+ }
37
+
38
+ describe.skipIf(HTTP_SKIP)('sandbox-memory-cap: capability shape + behavioral (RFC 0035 §B)', () => {
39
+   it('memoryLimitBytes MUST be integer ≥ 1 MiB when present (per schema)', async () => {
40
+     const sandbox = await readSandbox();
41
+     // Soft-skip: sandbox info unavailable, or the optional field is not advertised.
42
+     if (sandbox === null || sandbox.memoryLimitBytes === undefined) return;
43
+
44
+     const limit = sandbox.memoryLimitBytes;
45
+     const meetsSchema = Number.isInteger(limit) && limit >= 1048576;
46
+     const failureMessage = driver.describe(
47
+       'RFCS/0035-sandbox-execution-contract.md §A',
48
+       'memoryLimitBytes MUST be integer ≥ 1 MiB (1048576)',
49
+     );
50
+     expect(meetsSchema, failureMessage).toBe(true);
51
+   });
52
+
53
+   // Pending until the misbehaving-memory-cap typeId is available. Expected:
54
+   // error.code === 'sandbox_memory_exceeded' with details.requestedBytes >
55
+   // memoryLimitBytes. Registered via `todo` so test reporters track the gap
56
+   // rather than reporting a vacuous PASS.
57
+   it.todo('a misbehaving pack allocating beyond memoryLimitBytes fails with sandbox_memory_exceeded');
58
+ });
@@ -0,0 +1,30 @@
1
+ /**
2
+ * sandbox-no-cross-pack-mutation — RFC 0035 §B invariant
3
+ * `node-pack-sandbox-no-cross-pack-mutation`.
4
+ *
5
+ * Capability-gated on `capabilities.sandbox.supported: true`.
6
+ *
7
+ * Asserts (behavioral when host advertises): pack A's sandbox invocation
8
+ * cannot mutate state visible to pack B running in the same host process.
9
+ * Exercised via two synthetic packs from `vendor.openwop.misbehaving-sandbox`:
10
+ * - pack-a writes a sentinel to a shared address (e.g., a global object,
11
+ * a known process-singleton, an ambient module);
12
+ * - pack-b reads the same address;
13
+ * the test asserts pack-b does NOT see pack-a's write (sandbox isolation
14
+ * holds at the pack boundary, not just at the syscall boundary).
15
+ *
16
+ * @see RFCS/0035-sandbox-execution-contract.md §B
17
+ * @see SECURITY/invariants.yaml node-pack-sandbox-no-cross-pack-mutation
18
+ */
19
+
20
+ import { describe, it } from 'vitest';
21
+
22
+ // Pending until the misbehaving-cross-pack-mutation typeIds ship and a
23
+ // host advertises `capabilities.sandbox.supported: true`. Expected
24
+ // outcome: pack-b's read returns the absent sentinel value, i.e. pack-a's
25
+ // mutation did not cross the isolation boundary. Registered via `todo` so
26
+ // test reporters track the gap rather than reporting a vacuous PASS.
27
+
28
+ describe('sandbox-no-cross-pack-mutation: behavioral (RFC 0035 §B)', function () {
29
+   it.todo('pack A writing a sentinel is NOT visible to pack B in the same host process');
30
+ });
@@ -0,0 +1,27 @@
1
+ /**
2
+ * sandbox-no-host-env-leak — RFC 0035 §B invariant `node-pack-sandbox-no-host-env-leak`.
3
+ *
4
+ * Capability-gated on `capabilities.sandbox.supported: true`.
5
+ *
6
+ * Asserts (behavioral when host advertises): a pack invocation that reads
7
+ * `process.env` (or the platform equivalent) does NOT see host-level env
8
+ * vars unless the host has forwarded them via an `allowedHostCalls` entry
9
+ * exposing env resolution.
10
+ *
11
+ * @see RFCS/0035-sandbox-execution-contract.md §B
12
+ * @see SECURITY/invariants.yaml node-pack-sandbox-no-host-env-leak
13
+ */
14
+
15
+ import { describe, it } from 'vitest';
16
+
17
+ // Pending until a sandbox-executing host advertises
18
+ // `capabilities.sandbox.supported: true` AND ships a misbehaving-env-leak
19
+ // typeId. Planned shape: set a canary env var on the host process, run
20
+ // the misbehaving pack that reads `process.env`, then assert the pack's
21
+ // view of env does NOT contain the canary (unless the host forwarded it
22
+ // through an `allowedHostCalls` entry). Registered via `todo` so test
23
+ // reporters track the gap rather than reporting a vacuous PASS.
24
+
25
+ describe('sandbox-no-host-env-leak: behavioral (RFC 0035 §B)', function () {
26
+   it.todo('a misbehaving pack reading process.env does NOT see host env vars unless explicitly allowed');
27
+ });