@openwop/openwop-conformance 1.13.0 → 1.15.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. package/CHANGELOG.md +21 -0
  2. package/README.md +2 -2
  3. package/api/openapi.yaml +60 -0
  4. package/coverage.md +15 -4
  5. package/fixtures/wasm-sandbox/isolation-global.wasm +0 -0
  6. package/fixtures/wasm-sandbox/isolation-global.wat +6 -0
  7. package/fixtures/wasm-sandbox/misbehaving-capability-gate.wasm +0 -0
  8. package/fixtures/wasm-sandbox/misbehaving-capability-gate.wat +4 -0
  9. package/fixtures/wasm-sandbox/misbehaving-env.wasm +0 -0
  10. package/fixtures/wasm-sandbox/misbehaving-env.wat +4 -0
  11. package/fixtures/wasm-sandbox/misbehaving-fs.wasm +0 -0
  12. package/fixtures/wasm-sandbox/misbehaving-fs.wat +4 -0
  13. package/fixtures/wasm-sandbox/misbehaving-memory.wasm +0 -0
  14. package/fixtures/wasm-sandbox/misbehaving-memory.wat +5 -0
  15. package/fixtures/wasm-sandbox/misbehaving-network.wasm +0 -0
  16. package/fixtures/wasm-sandbox/misbehaving-network.wat +4 -0
  17. package/fixtures/wasm-sandbox/misbehaving-process.wasm +0 -0
  18. package/fixtures/wasm-sandbox/misbehaving-process.wat +4 -0
  19. package/fixtures/wasm-sandbox/misbehaving-timeout.wasm +0 -0
  20. package/fixtures/wasm-sandbox/misbehaving-timeout.wat +4 -0
  21. package/fixtures/wasm-sandbox/well-behaved-echo.wasm +0 -0
  22. package/fixtures/wasm-sandbox/well-behaved-echo.wat +2 -0
  23. package/fixtures/wasm-sandbox/well-behaved-host-fetch.wasm +0 -0
  24. package/fixtures/wasm-sandbox/well-behaved-host-fetch.wat +3 -0
  25. package/package.json +1 -1
  26. package/src/lib/discovery-capabilities.ts +18 -19
  27. package/src/lib/egressPolicy.ts +76 -0
  28. package/src/lib/otel-collector.ts +72 -0
  29. package/src/lib/profiles.ts +15 -0
  30. package/src/lib/sandbox-timeout-worker.mjs +31 -0
  31. package/src/lib/toolCatalog.ts +81 -0
  32. package/src/lib/wasm-sandbox-probe.ts +168 -0
  33. package/src/scenarios/core-standard-profile.test.ts +75 -0
  34. package/src/scenarios/egress-audience-binding.test.ts +81 -0
  35. package/src/scenarios/egress-decision-content-free.test.ts +57 -0
  36. package/src/scenarios/memory-degraded-projection.test.ts +121 -0
  37. package/src/scenarios/multi-agent-confidence-escalation.test.ts +12 -7
  38. package/src/scenarios/otel-collector-canary-inspection.test.ts +211 -0
  39. package/src/scenarios/prompt-resolution-chain-event.test.ts +113 -0
  40. package/src/scenarios/replay-observable-sequence-determinism.test.ts +192 -75
  41. package/src/scenarios/sandbox-wasm-isolation.test.ts +98 -0
  42. package/src/scenarios/sandbox-wasm-timeout.test.ts +40 -0
  43. package/src/scenarios/secret-leakage-otel-attribute.test.ts +52 -0
  44. package/src/scenarios/tool-catalog-projection.test.ts +120 -0
  45. package/src/scenarios/tool-session-lifecycle.test.ts +105 -0
  46. package/src/scenarios/workspace-cross-tenant-isolation-blackbox.test.ts +89 -0
@@ -8,87 +8,204 @@
8
8
  * Asserts (behavioral, when a host advertises `version: 4` + the contract):
9
9
  *
10
10
  * 1. A `mode: replay` fork from event-log index `fromSeq` produces an
11
- * event-log prefix `[0, fromSeq]` that is byte-equivalent to the
12
- * original run's prefix (modulo per-region clock fields per RFC 0036
13
- * §E and ULID component-T entropy when ULIDs are minted fresh).
11
+ * observable event-log prefix `[0, fromSeq]` that is byte-equivalent
12
+ * to the original run's prefix (modulo volatile per-event fields:
13
+ * eventId/ULID entropy, per-region `observedAt` clocks per RFC 0036
14
+ * §E, and the run id itself).
14
15
  *
15
- * 2. The replay's `RunSnapshot.variables`, `RunSnapshot.channels`, and
16
- * `RunSnapshot.status` at the boundary index are byte-equivalent to
17
- * the original.
16
+ * 2. (Crucially per §C.) The replay reproduces the OBSERVABLE RESULT of
17
+ * a nondeterministic tool node EVEN WHEN a fresh call would produce
18
+ * different bytes. The `conformance-phase4-nondet-tool` fixture's
19
+ * first node declares `config.nondeterministic: true`; a `version: 4`
20
+ * host MUST replay the original event-log entries for that node
21
+ * (cache the observable result) rather than re-executing it, so the
22
+ * node's terminal payload is identical across original + replay.
18
23
  *
19
- * 3. (Crucially per §C.) The replay reproduces observable output EVEN
20
- * WHEN the underlying tool call would have produced different bytes.
21
- * The reference test uses a mock tool that returns a fresh random
22
- * string on each call; the host MUST cache the original observable
23
- * result so replay returns the SAME string the original got not
24
- * the bytes a fresh call would return now.
24
+ * The `conformance-phase4-nondet-tool` fixture ships in the suite (added
25
+ * via the RFC 0041 Phase 4 fixtures commit). These assertions are now
26
+ * runnable capability-gated `it()` bodies consistent with the sibling
27
+ * `replay-divergence-at-refusal.test.ts`, which is likewise active and
28
+ * soft-skips on the same gate. They light up the moment a host advertises
29
+ * the `version: 4` replay-determinism contract; against hosts that don't
30
+ * (incl. the reference workflow-engine, which has not yet wired the
31
+ * pure-replay observable-cache path), they soft-skip honestly.
25
32
  *
26
- * Driving the assertion requires a workflow fixture whose tool call is
27
- * pure-nondeterministic (different bytes on each call) but whose
28
- * observable result is what gets cached. Reference workflow-engine ships
29
- * `core.noop` + deterministic fixtures; the `version: 4` wiring needs a
30
- * nondeterministic-tool fixture (e.g., `conformance-phase4-nondet-tool`).
31
- * Until that lands, the cross-boundary assertion is surfaced as `it.todo`
32
- * so test reporters track the gap.
33
+ * RFC 0042 §B note: RFC 0041 §C is `Active` (not yet `Accepted`), so its
34
+ * wire shape MAY shift compatibly within v1.x — a host wiring this before
35
+ * RFC 0041 graduates SHOULD advertise `multiAgent.executionModel.tier:
36
+ * 'experimental'` + `experimentalUntil` per RFC 0042 §A.
33
37
  *
34
38
  * @see RFCS/0041-multi-agent-replay-under-nondeterminism.md §C
35
39
  * @see spec/v1/replay.md §"Observable-output-sequence determinism vs bit-equivalent execution (MAE-9 closure)"
36
40
  * @see spec/v1/multi-agent-execution.md §"Replay determinism under nondeterminism (RFC 0041)"
37
41
  */
38
42
 
39
- import { describe, it } from 'vitest';
40
-
41
- // Behavioral assertions in this file are currently `it.todo` placeholders;
42
- // the `conformance-phase4-nondet-tool` fixture hasn't shipped yet. When
43
- // it does, the `it.todo` calls flip back to runnable `it(...)` bodies
44
- // that read discovery (via `driver.get('/.well-known/openwop')`), gate
45
- // on `multiAgent.executionModel.version >= 4` AND
46
- // `replayDeterminism.supported: true`, and drive the workflow through
47
- // the fixture.
48
-
49
- describe('replay-observable-sequence-determinism: prefix byte-equivalence (RFC 0041 §C)', () => {
50
- // Behavioral assertion drives a workflow with at least one node whose
51
- // underlying tool call is nondeterministic (different bytes on each
52
- // call). The assertion sequence:
53
- // 1. POST /v1/runs { workflowId: 'conformance-phase4-nondet-tool' }
54
- // → runs to completion, capturing the original event log.
55
- // 2. Capture original event-log prefix [0, N] where N is the index
56
- // after the nondeterministic-tool node fires.
57
- // 3. POST /v1/runs/{runId}:fork { mode: 'replay', fromSeq: N }
58
- // 4. Read replay event-log prefix [0, N].
59
- // 5. Assert byte-equivalence modulo the carve-outs:
60
- // - per-region observedAt timestamps (RFC 0036 §E)
61
- // - ULID component-T entropy on newly-minted eventIds
62
- // 6. Read original + replay RunSnapshot at index N; assert
63
- // variables + channels + status byte-equivalent.
64
- // Surfaced as `todo` until the `conformance-phase4-nondet-tool`
65
- // fixture ships in the suite — consistent with the sibling RFC 0041
66
- // scenarios (`replay-divergence-at-refusal.test.ts`,
67
- // `replay-llm-cache-key-portable.test.ts`).
68
- // Marked out of stable profile via RFC 0042 §B (experimental tier):
69
- // RFC 0041 §C remains Active, so its wire shape MAY shift compatibly
70
- // within v1.x. Hosts that wire this assertion before RFC 0041 graduates
71
- // to Accepted SHOULD advertise `multiAgent.executionModel.tier:
72
- // 'experimental'` + `experimentalUntil` per RFC 0042 §A. Path-to-runnable
73
- // requires: (a) host pure-replay observable-cache emission via the
74
- // `:fork mode: replay` re-dispatch path and (b) the test seam endpoint
75
- // contract for cache-hit-vs-fresh-call distinction (see
76
- // `spec/v1/host-sample-test-seams.md` for the established seam pattern).
77
- it.skip('original and replay event-log prefixes [0, fromSeq] MUST be byte-equivalent (modulo per-region clock + ULID-T entropy) — out of stable profile via RFC 0042');
78
- });
79
-
80
- describe('replay-observable-sequence-determinism: observable-result caching (RFC 0041 §C)', () => {
81
- // The load-bearing assertion: a nondeterministic tool call's OBSERVABLE
82
- // RESULT (return value + side-effects on workflow state + emitted events)
83
- // is what gets cached, not the bytes-on-the-wire of the underlying call.
84
- // The replay's reproduction of the observable sequence is what makes
85
- // this a valid determinism contract — bit-equivalent execution would
86
- // require unbounded caching (rejected per RFC 0041 §"Alternatives
87
- // considered" #2).
88
- // Marked out of stable profile via RFC 0042 §B (experimental tier):
89
- // see the prefix-byte-equivalence comment above for the same routing.
90
- // This is RFC 0041 §C's load-bearing assertion; it lands as a runnable
91
- // `it()` when RFC 0041 graduates to Accepted on first non-steward host
92
- // adoption.
93
- it.skip('replay of a workflow containing a nondeterministic tool call reproduces the original observable result, NOT a fresh call — out of stable profile via RFC 0042');
94
- });
43
+ import { describe, it, expect } from 'vitest';
44
+ import { driver } from '../lib/driver.js';
45
+ import { capabilityFamily } from '../lib/discovery-capabilities.js';
46
+
47
+ const HTTP_SKIP = !process.env.OPENWOP_BASE_URL;
48
+ const FIXTURE = 'conformance-phase4-nondet-tool';
49
+ const NONDET_NODE_ID = 'nondet-tool';
50
+
51
+ interface ExecutionModelCaps {
52
+ version?: unknown;
53
+ replayDeterminism?: { supported?: unknown };
54
+ }
55
+ interface DiscoveryDoc {
56
+ capabilities?: {
57
+ multiAgent?: { executionModel?: ExecutionModelCaps };
58
+ };
59
+ }
60
+
61
+ interface RunSnapshot {
62
+ status?: string;
63
+ }
64
+ interface RunEventDoc {
65
+ type: string;
66
+ nodeId?: string;
67
+ sequence?: number;
68
+ payload?: Record<string, unknown>;
69
+ }
70
+
71
+ async function readDiscovery(): Promise<DiscoveryDoc | null> {
72
+ try {
73
+ const res = await driver.get('/.well-known/openwop');
74
+ if (res.status !== 200) return null;
75
+ return res.json as DiscoveryDoc;
76
+ } catch {
77
+ return null;
78
+ }
79
+ }
80
+
81
+ /** Soft-skip unless the host advertises the RFC 0041 §C version-4 contract. */
82
+ async function gateOnPhase4(ctx: { skip: () => void }): Promise<boolean> {
83
+ const d = await readDiscovery();
84
+ const em = capabilityFamily<{ executionModel?: ExecutionModelCaps }>(d, 'multiAgent')?.executionModel;
85
+ const version = typeof em?.version === 'number' ? em.version : 0;
86
+ if (em?.replayDeterminism?.supported !== true || version < 4) {
87
+ ctx.skip();
88
+ return false;
89
+ }
90
+ return true;
91
+ }
92
+
93
+ async function pollUntilTerminal(runId: string): Promise<RunSnapshot> {
94
+ for (let i = 0; i < 50; i++) {
95
+ const r = await driver.get(`/v1/runs/${encodeURIComponent(runId)}`);
96
+ const snap = r.json as RunSnapshot;
97
+ if (snap.status === 'completed' || snap.status === 'failed' || snap.status === 'cancelled') {
98
+ return snap;
99
+ }
100
+ await new Promise((resolve) => setTimeout(resolve, 100));
101
+ }
102
+ throw new Error(`run ${runId} did not reach terminal within 5s`);
103
+ }
104
+
105
+ async function readEvents(runId: string): Promise<RunEventDoc[]> {
106
+ const r = await driver.get(`/v1/runs/${encodeURIComponent(runId)}/events`);
107
+ const body = r.json as { events?: RunEventDoc[] };
108
+ return body.events ?? [];
109
+ }
110
+
111
+ /**
112
+ * Strip volatile per-event fields so two runs of the same workflow are
113
+ * comparable. Removes the run id, freshly-minted event ids/ULIDs, and the
114
+ * per-region observed-at clock (RFC 0036 §E carve-out) wherever they
115
+ * appear at the event top level.
116
+ */
117
+ function stripVolatile(ev: RunEventDoc): Record<string, unknown> {
118
+ const clone = JSON.parse(JSON.stringify(ev)) as Record<string, unknown>;
119
+ for (const k of ['eventId', 'runId', 'observedAt', 'timestamp', 'occurredAt', 'emittedAt', 'id']) {
120
+ delete clone[k];
121
+ }
122
+ return clone;
123
+ }
124
+
125
+ /** Create the fixture run; returns null (with a skip) if it isn't advertised. */
126
+ async function startFixtureRun(ctx: { skip: () => void }): Promise<string | null> {
127
+ const create = await driver.post('/v1/runs', { workflowId: FIXTURE });
128
+ if (create.status === 404 || create.status === 422) {
129
+ ctx.skip(); // fixture not advertised by this host
130
+ return null;
131
+ }
132
+ expect(create.status).toBe(201);
133
+ return (create.json as { runId: string }).runId;
134
+ }
135
+
136
+ describe.skipIf(HTTP_SKIP)(
137
+ 'replay-observable-sequence-determinism: prefix byte-equivalence (RFC 0041 §C)',
138
+ () => {
139
+ it('original and replay event-log prefixes MUST be byte-equivalent (modulo per-event clock + ULID entropy)', async (ctx) => {
140
+ if (!(await gateOnPhase4(ctx))) return;
141
+
142
+ const sourceRunId = await startFixtureRun(ctx);
143
+ if (sourceRunId === null) return;
144
+ const sourceTerminal = await pollUntilTerminal(sourceRunId);
145
+ expect(sourceTerminal.status).toBe('completed');
146
+ const sourceEvents = await readEvents(sourceRunId);
147
+
148
+ const forkRes = await driver.post(`/v1/runs/${encodeURIComponent(sourceRunId)}:fork`, {
149
+ fromSeq: 0,
150
+ mode: 'replay',
151
+ });
152
+ expect(forkRes.status).toBe(201);
153
+ const replayRunId = (forkRes.json as { runId: string }).runId;
154
+ await pollUntilTerminal(replayRunId);
155
+ const replayEvents = await readEvents(replayRunId);
156
+
157
+ const sourceNorm = sourceEvents.map(stripVolatile);
158
+ const replayNorm = replayEvents.map(stripVolatile);
159
+ expect(
160
+ replayNorm,
161
+ driver.describe(
162
+ 'RFCS/0041-multi-agent-replay-under-nondeterminism.md §C',
163
+ 'a mode:replay fork MUST reproduce the original observable event-log sequence byte-for-byte modulo volatile per-event fields (eventId/ULID entropy, per-region observedAt clock)',
164
+ ),
165
+ ).toEqual(sourceNorm);
166
+ });
167
+ },
168
+ );
169
+
170
+ describe.skipIf(HTTP_SKIP)(
171
+ 'replay-observable-sequence-determinism: observable-result caching (RFC 0041 §C)',
172
+ () => {
173
+ it('replay of a nondeterministic tool node reproduces the ORIGINAL observable result, NOT a fresh call', async (ctx) => {
174
+ if (!(await gateOnPhase4(ctx))) return;
175
+
176
+ const sourceRunId = await startFixtureRun(ctx);
177
+ if (sourceRunId === null) return;
178
+ expect((await pollUntilTerminal(sourceRunId)).status).toBe('completed');
179
+ const sourceEvents = await readEvents(sourceRunId);
180
+
181
+ // The terminal event(s) for the nondeterministic node carry its
182
+ // observable result. Capture every event scoped to that node.
183
+ const sourceNodeEvents = sourceEvents.filter((e) => e.nodeId === NONDET_NODE_ID).map(stripVolatile);
184
+ expect(
185
+ sourceNodeEvents.length,
186
+ driver.describe(
187
+ 'RFCS/0041-multi-agent-replay-under-nondeterminism.md §C',
188
+ `the fixture's nondeterministic node \`${NONDET_NODE_ID}\` MUST emit at least one observable event`,
189
+ ),
190
+ ).toBeGreaterThan(0);
191
+
192
+ const forkRes = await driver.post(`/v1/runs/${encodeURIComponent(sourceRunId)}:fork`, {
193
+ fromSeq: 0,
194
+ mode: 'replay',
195
+ });
196
+ expect(forkRes.status).toBe(201);
197
+ const replayRunId = (forkRes.json as { runId: string }).runId;
198
+ await pollUntilTerminal(replayRunId);
199
+ const replayEvents = await readEvents(replayRunId);
200
+ const replayNodeEvents = replayEvents.filter((e) => e.nodeId === NONDET_NODE_ID).map(stripVolatile);
201
+
202
+ expect(
203
+ replayNodeEvents,
204
+ driver.describe(
205
+ 'RFCS/0041-multi-agent-replay-under-nondeterminism.md §C',
206
+ 'the nondeterministic tool node MUST replay its ORIGINAL observable result (cached event-log entry) rather than re-executing — bit-equivalent re-execution would require unbounded caching, rejected per RFC 0041 §"Alternatives considered" #2',
207
+ ),
208
+ ).toEqual(sourceNodeEvents);
209
+ });
210
+ },
211
+ );
@@ -0,0 +1,98 @@
1
+ /**
2
+ * RFC 0035 §B sandbox isolation — portable, server-free behavioral conformance.
3
+ *
4
+ * Drives the committed `fixtures/wasm-sandbox/*.wasm` modules through the
5
+ * suite-local `probeSandboxed` reference (see `../lib/wasm-sandbox-probe.ts`).
6
+ * Every assertion exercises real WebAssembly isolation — there are NO `it.todo`
7
+ * placeholders and NO mocks. These are the behavioral probes that graduate the
8
+ * cross-runtime `node-pack-sandbox-*` invariants from reference-impl to protocol
9
+ * tier (`SECURITY/invariants.yaml`).
10
+ *
11
+ * Coverage (six invariants, proven by construction, server-free):
12
+ * - node-pack-sandbox-fs-gated / -no-env / -network-gated / -no-process:
13
+ * a forbidden operation can only be a DECLARED IMPORT; the probe statically
14
+ * refuses any un-granted import → `sandbox_escape_attempt` + `escapeKind`.
15
+ * - capability gate: an un-granted `openwop.*` import → `sandbox_capability_denied`.
16
+ * - node-pack-sandbox-memory-cap: an access past the host memory bound traps →
17
+ * `sandbox_memory_exceeded`.
18
+ * - node-pack-sandbox-isolated-context: a fresh instance per invocation carries
19
+ * no state across calls.
20
+ *
21
+ * `node-pack-sandbox-timeout` requires thread preemption (a worker kill-timer) and
22
+ * stays reference-impl, proven by `examples/hosts/wasm-sandbox/test/sandbox.test.ts`
23
+ * (real worker kill). `node-pack-sandbox-no-eval` is JS-runtime-specific (WASM has
24
+ * no `eval`) and is exempt per RFC 0035.
25
+ *
26
+ * Spec reference:
27
+ * - https://github.com/openwop/openwop/blob/main/RFCS/0035-sandbox-execution-contract.md
28
+ */
29
+ import { describe, it, expect } from 'vitest';
30
+ import { readFileSync } from 'node:fs';
31
+ import { join } from 'node:path';
32
+ import { FIXTURES_DIR } from '../lib/paths.js';
33
+ import { probeSandboxed } from '../lib/wasm-sandbox-probe.js';
34
+
35
+ const why = (specRef: string, requirement: string): string => `${specRef} — ${requirement}`;
36
+ const dir = join(FIXTURES_DIR, 'wasm-sandbox');
37
+ const fix = (name: string): Uint8Array => new Uint8Array(readFileSync(join(dir, `${name}.wasm`)));
38
+ const BASE = { allowedHostCalls: [] as string[], memoryLimitBytes: 2 * 1024 * 1024 };
39
+
40
+ describe('sandbox-wasm-isolation: positive controls (RFC 0035 §B, server-free)', () => {
41
+ it('a well-behaved pure module runs and returns its input', () => {
42
+ const r = probeSandboxed(fix('well-behaved-echo'), BASE, 'invoke', 42);
43
+ expect(r.ok, why('RFC 0035 §B', 'a pure-compute module runs')).toBe(true);
44
+ expect(r.result).toBe(42);
45
+ });
46
+
47
+ it('a granted host capability is callable when in allowedHostCalls', () => {
48
+ const r = probeSandboxed(fix('well-behaved-host-fetch'), { ...BASE, allowedHostCalls: ['fetch'] }, 'invoke', 7);
49
+ expect(r.ok, why('RFC 0035 §B invariant 7', 'a granted openwop.* capability is callable')).toBe(true);
50
+ expect(r.result).toBe(7);
51
+ });
52
+ });
53
+
54
+ describe('sandbox-wasm-isolation: escape attempts fail closed (RFC 0035 §B 1–4, server-free)', () => {
55
+ const cases: ReadonlyArray<readonly [string, string, string]> = [
56
+ ['misbehaving-fs', 'host-fs-escape', 'node-pack-sandbox-fs-gated'],
57
+ ['misbehaving-env', 'host-env-leak', 'node-pack-sandbox-no-env'],
58
+ ['misbehaving-network', 'network-escape', 'node-pack-sandbox-network-gated'],
59
+ ['misbehaving-process', 'host-process-escape', 'node-pack-sandbox-no-process'],
60
+ ];
61
+ for (const [fixture, escapeKind, invariant] of cases) {
62
+ it(`${invariant}: ${fixture} → sandbox_escape_attempt (${escapeKind})`, () => {
63
+ const r = probeSandboxed(fix(fixture), BASE);
64
+ expect(r.code, why('RFC 0035 §B', `${invariant} fails closed before instantiation`)).toBe('sandbox_escape_attempt');
65
+ expect(r.escapeKind).toBe(escapeKind);
66
+ });
67
+ }
68
+ });
69
+
70
+ describe('sandbox-wasm-isolation: capability gate (RFC 0035 §B 7, server-free)', () => {
71
+ it('an un-granted openwop capability is denied with its name', () => {
72
+ const r = probeSandboxed(fix('misbehaving-capability-gate'), BASE);
73
+ expect(r.code, why('RFC 0035 §B invariant 7', 'undeclared host capability fails closed')).toBe('sandbox_capability_denied');
74
+ expect(r.requestedCapability).toBe('privileged');
75
+ });
76
+
77
+ it('host-fetch WITHOUT the grant is denied (the gate works both directions)', () => {
78
+ const r = probeSandboxed(fix('well-behaved-host-fetch'), BASE);
79
+ expect(r.code).toBe('sandbox_capability_denied');
80
+ expect(r.requestedCapability).toBe('fetch');
81
+ });
82
+ });
83
+
84
+ describe('sandbox-wasm-isolation: memory cap (RFC 0035 §B 5, server-free)', () => {
85
+ it('node-pack-sandbox-memory-cap: access beyond the host memory bound is sandbox_memory_exceeded', () => {
86
+ const r = probeSandboxed(fix('misbehaving-memory'), BASE);
87
+ expect(r.ok, why('RFC 0035 §B invariant 5', 'memory bound is engine-enforced')).toBe(false);
88
+ expect(r.code).toBe('sandbox_memory_exceeded');
89
+ });
90
+ });
91
+
92
+ describe('sandbox-wasm-isolation: isolated context (RFC 0035 §B 8, server-free)', () => {
93
+ it('node-pack-sandbox-isolated-context: each invocation gets a fresh instance (no cross-pack state)', () => {
94
+ const iso = fix('isolation-global');
95
+ expect(probeSandboxed(iso, BASE, 'bump').result, why('RFC 0035 §B invariant 8', 'a fresh instance starts at 0')).toBe(1);
96
+ expect(probeSandboxed(iso, BASE, 'read').result, why('RFC 0035 §B invariant 8', 'no state leaks across invocations')).toBe(0);
97
+ });
98
+ });
@@ -0,0 +1,40 @@
1
+ /**
2
+ * RFC 0035 §B invariant 6 — sandbox wall-clock timeout, worker-driven + server-free.
3
+ *
4
+ * The worker-thread counterpart to `sandbox-wasm-isolation.test.ts` (which proves
5
+ * the other six cross-runtime invariants in-process but deliberately cannot run a
6
+ * non-terminating module). A wall-clock cap can only be enforced by THREAD
7
+ * PREEMPTION — a same-thread timer cannot interrupt a synchronous WASM loop — so
8
+ * `probeTimeout` (see `../lib/wasm-sandbox-probe.ts`) spawns a worker running the
9
+ * committed `misbehaving-timeout.wasm` fixture and races a main-thread kill-timer.
10
+ *
11
+ * This is the worker-driven conformance probe that graduates
12
+ * `node-pack-sandbox-timeout` from reference-impl to protocol tier (the prior gap:
13
+ * the cap was proven only host-internally by the WASM host's `test/sandbox.test.ts`).
14
+ *
15
+ * @see RFCS/0035-sandbox-execution-contract.md §B invariant 6
16
+ * @see SECURITY/invariants.yaml node-pack-sandbox-timeout
17
+ */
18
+ import { describe, it, expect } from 'vitest';
19
+ import { readFileSync } from 'node:fs';
20
+ import { join } from 'node:path';
21
+ import { FIXTURES_DIR } from '../lib/paths.js';
22
+ import { probeTimeout } from '../lib/wasm-sandbox-probe.js';
23
+
24
+ const why = (specRef: string, requirement: string): string => `${specRef} — ${requirement}`;
25
+ const dir = join(FIXTURES_DIR, 'wasm-sandbox');
26
+ const fix = (name: string): Uint8Array => new Uint8Array(readFileSync(join(dir, `${name}.wasm`)));
27
+
28
+ describe('sandbox-wasm-timeout: wall-clock cap is engine/worker-enforced (RFC 0035 §B 6, server-free)', () => {
29
+ it('node-pack-sandbox-timeout: a non-terminating module is killed with sandbox_timeout', async () => {
30
+ const r = await probeTimeout(fix('misbehaving-timeout'), { memoryLimitBytes: 2 * 1024 * 1024, wallClockLimitMs: 300 });
31
+ expect(r.ok, why('RFC 0035 §B invariant 6', 'an over-budget invocation MUST fail')).toBe(false);
32
+ expect(r.code, why('RFC 0035 §C', 'the failure code MUST be sandbox_timeout')).toBe('sandbox_timeout');
33
+ });
34
+
35
+ it('positive control: a well-behaved module completes within the budget (the kill-timer does not false-positive)', async () => {
36
+ const r = await probeTimeout(fix('well-behaved-echo'), { memoryLimitBytes: 2 * 1024 * 1024, wallClockLimitMs: 1000 }, 'invoke', 7);
37
+ expect(r.ok, why('RFC 0035 §B', 'a within-budget invocation completes before the kill-timer')).toBe(true);
38
+ expect(r.result).toBe(7);
39
+ });
40
+ });
@@ -56,6 +56,7 @@ import { driver } from '../lib/driver.js';
56
56
  import { pollUntilTerminal } from '../lib/polling.js';
57
57
  import { isFixtureAdvertised } from '../lib/fixtures.js';
58
58
  import { capabilityFamily } from '../lib/discovery-capabilities.js';
59
+ import { getCollector, waitForRunSpans } from '../lib/otel-collector.js';
59
60
 
60
61
  const HTTP_SKIP = !process.env.OPENWOP_BASE_URL;
61
62
  const BYOK_WORKFLOW_ID = 'openwop-smoke-byok-roundtrip';
@@ -205,6 +206,57 @@ describe.skipIf(HTTP_SKIP || FIXTURE_SKIP)(
205
206
  },
206
207
  );
207
208
 
209
+ describe.skipIf(HTTP_SKIP || FIXTURE_SKIP)(
210
+ 'secret-leakage-otel-attribute: real OTLP export scrape (collector-side)',
211
+ () => {
212
+ // Distinct from the scrape-seam probe above: this asserts against what
213
+ // the host's OTLP exporter ACTUALLY shipped over the wire to the
214
+ // conformance collector, not what the host self-reports via its
215
+ // `/v1/host/sample/test/otel/spans` seam. A host could redact in its
216
+ // seam yet leak on the real export — only this catches that. Closes
217
+ // the `docs/KNOWN-LIMITS.md` "collector seam doesn't inspect span
218
+ // attributes" gap. Gated on the in-process collector being active
219
+ // (`OPENWOP_OTEL_COLLECTOR=true` + the host configured to export to it).
220
+ it('NO real-exported OTel span/metric attribute MUST contain the BYOK canary plaintext', async (ctx) => {
221
+ const collector = getCollector();
222
+ if (!collector || !CANARY_VALUE) {
223
+ ctx.skip();
224
+ return;
225
+ }
226
+ const d = await readDiscovery();
227
+ const secretsOk = capabilityFamily<{ supported?: unknown }>(d, 'secrets')?.supported === true;
228
+ const obsOk = capabilityFamily<unknown>(d, 'observability') !== undefined;
229
+ if (!secretsOk || !obsOk) {
230
+ ctx.skip();
231
+ return;
232
+ }
233
+
234
+ collector.reset();
235
+ const runId = await startByokRun();
236
+ if (runId === null) {
237
+ ctx.skip();
238
+ return;
239
+ }
240
+ const terminal = await pollUntilTerminal(runId);
241
+ expect(terminal.status).toBe('completed');
242
+
243
+ // Hosts export spans asynchronously after terminal; poll until the
244
+ // run's spans land (or the timeout elapses — an absent export is a
245
+ // separate coverage concern, not a leak).
246
+ await waitForRunSpans(runId, { timeoutMs: 8_000 });
247
+
248
+ const leaks = collector.findCanaryLeakage(CANARY_VALUE);
249
+ expect(
250
+ leaks,
251
+ driver.describe(
252
+ 'SECURITY/invariants.yaml secret-leakage-otel-attribute',
253
+ `no real-exported OTel span/metric attribute may contain the BYOK canary plaintext. Leaking surfaces: ${JSON.stringify(leaks)}`,
254
+ ),
255
+ ).toEqual([]);
256
+ });
257
+ },
258
+ );
259
+
208
260
  describe.skipIf(HTTP_SKIP || FIXTURE_SKIP)(
209
261
  'secret-leakage-otel-attribute: advertisement-shape probe (RFC 0034 §A)',
210
262
  () => {
@@ -0,0 +1,120 @@
1
+ /**
2
+ * Portable tool catalog — the `GET /v1/tools` projection (RFC 0078 §B/§F) —
3
+ * behavioral.
4
+ *
5
+ * Capability-gated on `toolCatalog.supported` (root-first per RFC 0073).
6
+ * Soft-skips when unadvertised (default) / hard-fails under
7
+ * `OPENWOP_REQUIRE_BEHAVIOR=true`. The always-on wire-shape coverage lives in
8
+ * `tool-descriptor-shape.test.ts`; this asserts host BEHAVIOR black-box on the
9
+ * NORMATIVE reads:
10
+ *
11
+ * 1. LIST (§B) — `GET /v1/tools` returns a `ToolDescriptor[]`, each
12
+ * schema-valid, `source` ∈ the closed vocab, `safetyTier` ∈ the closed
13
+ * vocab, and content-free (no credential material, SR-1).
14
+ * 2. BY-ID (§B) — `GET /v1/tools/{toolId}` returns that descriptor; an unknown
15
+ * id 404s.
16
+ * 3. AUTH-GATED — an unauthenticated `GET /v1/tools` is `401` (not public).
17
+ * 4. §F-2 NON-DISCLOSURE — a tool id known to belong to a DIFFERENT principal
18
+ * (`OPENWOP_CROSS_PRINCIPAL_TOOL_ID`) 404s for this caller, identically to
19
+ * "not found" — the authorization-scoped projection never discloses another
20
+ * principal's tools. Soft-skips when the env var is unset.
21
+ *
22
+ * Spec references:
23
+ * - https://github.com/openwop/openwop/blob/main/spec/v1/tool-catalog.md (§B/§F)
24
+ * - https://github.com/openwop/openwop/blob/main/RFCS/0078-portable-tool-catalog-and-tool-session-contract.md
25
+ */
26
+
27
+ import { describe, it, expect } from 'vitest';
28
+ import { readFileSync } from 'node:fs';
29
+ import { join } from 'node:path';
30
+ import Ajv2020 from 'ajv/dist/2020.js';
31
+ import addFormats from 'ajv-formats';
32
+ import { driver } from '../lib/driver.js';
33
+ import { behaviorGate } from '../lib/behavior-gate.js';
34
+ import { SCHEMAS_DIR } from '../lib/paths.js';
35
+ import {
36
+ readToolCatalogCap,
37
+ listTools,
38
+ getTool,
39
+ TOOL_SOURCES,
40
+ SAFETY_TIERS,
41
+ TOOL_CONTENT_FORBIDDEN,
42
+ } from '../lib/toolCatalog.js';
43
+
44
+ function loadSchema(name: string): Record<string, unknown> {
45
+ return JSON.parse(readFileSync(join(SCHEMAS_DIR, name), 'utf8')) as Record<string, unknown>;
46
+ }
47
+
48
+ function expectContentFree(d: Record<string, unknown>, where: string): void {
49
+ for (const f of TOOL_CONTENT_FORBIDDEN) {
50
+ expect(
51
+ !(f in d),
52
+ driver.describe('RFC 0078 §F (SR-1)', `${where} MUST be content-free (no ${f})`),
53
+ ).toBe(true);
54
+ }
55
+ }
56
+
57
+ describe('tool-catalog-projection (RFC 0078 §B/§F)', () => {
58
+ it('lists schema-valid ToolDescriptors, serves by-id + 404s, is auth-gated, and never discloses another principal', async () => {
59
+ const cap = await readToolCatalogCap();
60
+ if (!behaviorGate('openwop-tool-catalog', cap?.supported === true)) return;
61
+
62
+ const ajv = new Ajv2020({ strict: false, allErrors: true });
63
+ addFormats(ajv);
64
+ const validate = ajv.compile(loadSchema('tool-descriptor.schema.json'));
65
+
66
+ // ---- Leg 3: auth-gated (unauthenticated list MUST be 401) -------------
67
+ const unauth = await driver.get('/v1/tools', { authenticated: false });
68
+ expect(
69
+ unauth.status === 401,
70
+ driver.describe('tool-catalog.md §B', 'GET /v1/tools MUST require authentication (401 unauthenticated)'),
71
+ ).toBe(true);
72
+
73
+ // ---- Leg 1: the list (§B) -------------------------------------------
74
+ const tools = await listTools();
75
+ if (tools === null) return; // host advertises the cap but doesn't serve the read — soft-skip the rest
76
+
77
+ for (const t of tools) {
78
+ expect(
79
+ validate(t),
80
+ driver.describe('tool-descriptor.schema.json', `each ToolDescriptor MUST validate (${ajv.errorsText(validate.errors)})`),
81
+ ).toBe(true);
82
+ expect(
83
+ typeof t.source === 'string' && TOOL_SOURCES.includes(t.source as string),
84
+ driver.describe('tool-catalog.md §C', 'ToolDescriptor.source MUST be in the closed vocabulary'),
85
+ ).toBe(true);
86
+ expect(
87
+ typeof t.safetyTier === 'string' && SAFETY_TIERS.includes(t.safetyTier as string),
88
+ driver.describe('tool-catalog.md §C', 'ToolDescriptor.safetyTier MUST be pure|read|write|exec'),
89
+ ).toBe(true);
90
+ expectContentFree(t, 'ToolDescriptor');
91
+ }
92
+
93
+ // ---- Leg 2: by-id round-trip + unknown 404 (§B) ---------------------
94
+ if (tools.length > 0 && typeof tools[0]!.toolId === 'string') {
95
+ const id = tools[0]!.toolId as string;
96
+ const one = await getTool(id);
97
+ if (one.status === 200) {
98
+ expect(
99
+ one.descriptor?.toolId === id,
100
+ driver.describe('tool-catalog.md §B', 'GET /v1/tools/{toolId} MUST return the requested descriptor'),
101
+ ).toBe(true);
102
+ }
103
+ }
104
+ const unknown = await getTool('__conformance_nonexistent_tool__');
105
+ expect(
106
+ unknown.status === 404,
107
+ driver.describe('tool-catalog.md §B', 'GET /v1/tools/{unknown} MUST 404'),
108
+ ).toBe(true);
109
+
110
+ // ---- Leg 4: §F-2 cross-principal non-disclosure (env-gated) ---------
111
+ const crossId = process.env.OPENWOP_CROSS_PRINCIPAL_TOOL_ID;
112
+ if (crossId) {
113
+ const cross = await getTool(crossId);
114
+ expect(
115
+ cross.status === 404,
116
+ driver.describe('tool-catalog.md §F-2', 'a tool owned by a different principal MUST 404 (non-disclosure)'),
117
+ ).toBe(true);
118
+ }
119
+ });
120
+ });