npm - @openwop/openwop-conformance - Versions diffs - 1.13.0 → 1.15.0 - Mend

@openwop/openwop-conformance 1.13.0 → 1.15.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (46) hide show

package/CHANGELOG.md +21 -0
package/README.md +2 -2
package/api/openapi.yaml +60 -0
package/coverage.md +15 -4
package/fixtures/wasm-sandbox/isolation-global.wasm +0 -0
package/fixtures/wasm-sandbox/isolation-global.wat +6 -0
package/fixtures/wasm-sandbox/misbehaving-capability-gate.wasm +0 -0
package/fixtures/wasm-sandbox/misbehaving-capability-gate.wat +4 -0
package/fixtures/wasm-sandbox/misbehaving-env.wasm +0 -0
package/fixtures/wasm-sandbox/misbehaving-env.wat +4 -0
package/fixtures/wasm-sandbox/misbehaving-fs.wasm +0 -0
package/fixtures/wasm-sandbox/misbehaving-fs.wat +4 -0
package/fixtures/wasm-sandbox/misbehaving-memory.wasm +0 -0
package/fixtures/wasm-sandbox/misbehaving-memory.wat +5 -0
package/fixtures/wasm-sandbox/misbehaving-network.wasm +0 -0
package/fixtures/wasm-sandbox/misbehaving-network.wat +4 -0
package/fixtures/wasm-sandbox/misbehaving-process.wasm +0 -0
package/fixtures/wasm-sandbox/misbehaving-process.wat +4 -0
package/fixtures/wasm-sandbox/misbehaving-timeout.wasm +0 -0
package/fixtures/wasm-sandbox/misbehaving-timeout.wat +4 -0
package/fixtures/wasm-sandbox/well-behaved-echo.wasm +0 -0
package/fixtures/wasm-sandbox/well-behaved-echo.wat +2 -0
package/fixtures/wasm-sandbox/well-behaved-host-fetch.wasm +0 -0
package/fixtures/wasm-sandbox/well-behaved-host-fetch.wat +3 -0
package/package.json +1 -1
package/src/lib/discovery-capabilities.ts +18 -19
package/src/lib/egressPolicy.ts +76 -0
package/src/lib/otel-collector.ts +72 -0
package/src/lib/profiles.ts +15 -0
package/src/lib/sandbox-timeout-worker.mjs +31 -0
package/src/lib/toolCatalog.ts +81 -0
package/src/lib/wasm-sandbox-probe.ts +168 -0
package/src/scenarios/core-standard-profile.test.ts +75 -0
package/src/scenarios/egress-audience-binding.test.ts +81 -0
package/src/scenarios/egress-decision-content-free.test.ts +57 -0
package/src/scenarios/memory-degraded-projection.test.ts +121 -0
package/src/scenarios/multi-agent-confidence-escalation.test.ts +12 -7
package/src/scenarios/otel-collector-canary-inspection.test.ts +211 -0
package/src/scenarios/prompt-resolution-chain-event.test.ts +113 -0
package/src/scenarios/replay-observable-sequence-determinism.test.ts +192 -75
package/src/scenarios/sandbox-wasm-isolation.test.ts +98 -0
package/src/scenarios/sandbox-wasm-timeout.test.ts +40 -0
package/src/scenarios/secret-leakage-otel-attribute.test.ts +52 -0
package/src/scenarios/tool-catalog-projection.test.ts +120 -0
package/src/scenarios/tool-session-lifecycle.test.ts +105 -0
package/src/scenarios/workspace-cross-tenant-isolation-blackbox.test.ts +89 -0

package/src/scenarios/replay-observable-sequence-determinism.test.ts CHANGED Viewed

@@ -8,87 +8,204 @@
  * Asserts (behavioral, when a host advertises `version: 4` + the contract):
  *
  *   1. A `mode: replay` fork from event-log index `fromSeq` produces an
- *      event-log prefix `[0, fromSeq]` that is byte-equivalent to the
- *      original run's prefix (modulo per-region clock fields per RFC 0036
- *      §E and ULID component-T entropy when ULIDs are minted fresh).
+ *      observable event-log prefix `[0, fromSeq]` that is byte-equivalent
+ *      to the original run's prefix (modulo volatile per-event fields:
+ *      eventId/ULID entropy, per-region `observedAt` clocks per RFC 0036
+ *      §E, and the run id itself).
  *
- *   2. The replay's `RunSnapshot.variables`, `RunSnapshot.channels`, and
- *      `RunSnapshot.status` at the boundary index are byte-equivalent to
- *      the original.
+ *   2. (Crucially per §C.) The replay reproduces the OBSERVABLE RESULT of
+ *      a nondeterministic tool node EVEN WHEN a fresh call would produce
+ *      different bytes. The `conformance-phase4-nondet-tool` fixture's
+ *      first node declares `config.nondeterministic: true`; a `version: 4`
+ *      host MUST replay the original event-log entries for that node
+ *      (cache the observable result) rather than re-executing it, so the
+ *      node's terminal payload is identical across original + replay.
  *
- *   3. (Crucially per §C.) The replay reproduces observable output EVEN
- *      WHEN the underlying tool call would have produced different bytes.
- *      The reference test uses a mock tool that returns a fresh random
- *      string on each call; the host MUST cache the original observable
- *      result so replay returns the SAME string the original got — not
- *      the bytes a fresh call would return now.
+ * The `conformance-phase4-nondet-tool` fixture ships in the suite (added
+ * via the RFC 0041 Phase 4 fixtures commit). These assertions are now
+ * runnable capability-gated `it()` bodies — consistent with the sibling
+ * `replay-divergence-at-refusal.test.ts`, which is likewise active and
+ * soft-skips on the same gate. They light up the moment a host advertises
+ * the `version: 4` replay-determinism contract; against hosts that don't
+ * (incl. the reference workflow-engine, which has not yet wired the
+ * pure-replay observable-cache path), they soft-skip honestly.
  *
- * Driving the assertion requires a workflow fixture whose tool call is
- * pure-nondeterministic (different bytes on each call) but whose
- * observable result is what gets cached. Reference workflow-engine ships
- * `core.noop` + deterministic fixtures; the `version: 4` wiring needs a
- * nondeterministic-tool fixture (e.g., `conformance-phase4-nondet-tool`).
- * Until that lands, the cross-boundary assertion is surfaced as `it.todo`
- * so test reporters track the gap.
+ * RFC 0042 §B note: RFC 0041 §C is `Active` (not yet `Accepted`), so its
+ * wire shape MAY shift compatibly within v1.x — a host wiring this before
+ * RFC 0041 graduates SHOULD advertise `multiAgent.executionModel.tier:
+ * 'experimental'` + `experimentalUntil` per RFC 0042 §A.
  *
  * @see RFCS/0041-multi-agent-replay-under-nondeterminism.md §C
  * @see spec/v1/replay.md §"Observable-output-sequence determinism vs bit-equivalent execution (MAE-9 closure)"
  * @see spec/v1/multi-agent-execution.md §"Replay determinism under nondeterminism (RFC 0041)"
  */
-import { describe, it } from 'vitest';
-// Behavioral assertions in this file are currently `it.todo` placeholders;
-// the `conformance-phase4-nondet-tool` fixture hasn't shipped yet. When
-// it does, the `it.todo` calls flip back to runnable `it(...)` bodies
-// that read discovery (via `driver.get('/.well-known/openwop')`), gate
-// on `multiAgent.executionModel.version >= 4` AND
-// `replayDeterminism.supported: true`, and drive the workflow through
-// the fixture.
-describe('replay-observable-sequence-determinism: prefix byte-equivalence (RFC 0041 §C)', () => {
-  // Behavioral assertion drives a workflow with at least one node whose
-  // underlying tool call is nondeterministic (different bytes on each
-  // call). The assertion sequence:
-  //   1. POST /v1/runs { workflowId: 'conformance-phase4-nondet-tool' }
-  //      → runs to completion, capturing the original event log.
-  //   2. Capture original event-log prefix [0, N] where N is the index
-  //      after the nondeterministic-tool node fires.
-  //   3. POST /v1/runs/{runId}:fork { mode: 'replay', fromSeq: N }
-  //   4. Read replay event-log prefix [0, N].
-  //   5. Assert byte-equivalence modulo the carve-outs:
-  //      - per-region observedAt timestamps (RFC 0036 §E)
-  //      - ULID component-T entropy on newly-minted eventIds
-  //   6. Read original + replay RunSnapshot at index N; assert
-  //      variables + channels + status byte-equivalent.
-  // Surfaced as `todo` until the `conformance-phase4-nondet-tool`
-  // fixture ships in the suite — consistent with the sibling RFC 0041
-  // scenarios (`replay-divergence-at-refusal.test.ts`,
-  // `replay-llm-cache-key-portable.test.ts`).
-  // Marked out of stable profile via RFC 0042 §B (experimental tier):
-  // RFC 0041 §C remains Active, so its wire shape MAY shift compatibly
-  // within v1.x. Hosts that wire this assertion before RFC 0041 graduates
-  // to Accepted SHOULD advertise `multiAgent.executionModel.tier:
-  // 'experimental'` + `experimentalUntil` per RFC 0042 §A. Path-to-runnable
-  // requires: (a) host pure-replay observable-cache emission via the
-  // `:fork mode: replay` re-dispatch path and (b) the test seam endpoint
-  // contract for cache-hit-vs-fresh-call distinction (see
-  // `spec/v1/host-sample-test-seams.md` for the established seam pattern).
-  it.skip('original and replay event-log prefixes [0, fromSeq] MUST be byte-equivalent (modulo per-region clock + ULID-T entropy) — out of stable profile via RFC 0042');
-});
-describe('replay-observable-sequence-determinism: observable-result caching (RFC 0041 §C)', () => {
-  // The load-bearing assertion: a nondeterministic tool call's OBSERVABLE
-  // RESULT (return value + side-effects on workflow state + emitted events)
-  // is what gets cached, not the bytes-on-the-wire of the underlying call.
-  // The replay's reproduction of the observable sequence is what makes
-  // this a valid determinism contract — bit-equivalent execution would
-  // require unbounded caching (rejected per RFC 0041 §"Alternatives
-  // considered" #2).
-  // Marked out of stable profile via RFC 0042 §B (experimental tier):
-  // see the prefix-byte-equivalence comment above for the same routing.
-  // This is RFC 0041 §C's load-bearing assertion; it lands as a runnable
-  // `it()` when RFC 0041 graduates to Accepted on first non-steward host
-  // adoption.
-  it.skip('replay of a workflow containing a nondeterministic tool call reproduces the original observable result, NOT a fresh call — out of stable profile via RFC 0042');
-});
+import { describe, it, expect } from 'vitest';
+import { driver } from '../lib/driver.js';
+import { capabilityFamily } from '../lib/discovery-capabilities.js';
+const HTTP_SKIP = !process.env.OPENWOP_BASE_URL;
+const FIXTURE = 'conformance-phase4-nondet-tool';
+const NONDET_NODE_ID = 'nondet-tool';
+/**
+ * Subset of `multiAgent.executionModel` that the version-4 gate reads.
+ * Fields are `unknown` because they come from an unvalidated discovery
+ * response and are narrowed at the point of use.
+ */
+interface ExecutionModelCaps {
+  version?: unknown;
+  replayDeterminism?: { supported?: unknown };
+}
+/** Shape the `/.well-known/openwop` response body is cast to in this file. */
+interface DiscoveryDoc {
+  capabilities?: {
+    multiAgent?: { executionModel?: ExecutionModelCaps };
+  };
+}
+/** Body of `GET /v1/runs/{runId}` as used here; only `status` is inspected. */
+interface RunSnapshot {
+  status?: string;
+}
+/** One entry of the `events` array returned by `GET /v1/runs/{runId}/events`. */
+interface RunEventDoc {
+  type: string;
+  nodeId?: string;
+  sequence?: number;
+  payload?: Record<string, unknown>;
+}
+/**
+ * Read the host discovery document from `/.well-known/openwop`.
+ *
+ * @returns The response body cast to `DiscoveryDoc` when the status is 200;
+ *   `null` when the status differs or the request throws.
+ */
+async function readDiscovery(): Promise<DiscoveryDoc | null> {
+  let doc: DiscoveryDoc | null = null;
+  try {
+    const response = await driver.get('/.well-known/openwop');
+    if (response.status === 200) {
+      doc = response.json as DiscoveryDoc;
+    }
+  } catch {
+    // Best-effort read: any failure is reported to the caller as "no discovery".
+    doc = null;
+  }
+  return doc;
+}
+/**
+ * Capability gate for the RFC 0041 §C scenarios.
+ *
+ * Resolves `true` only when discovery advertises `multiAgent.executionModel`
+ * with `replayDeterminism.supported === true` and a `version` that is not
+ * below 4 (a non-numeric `version` counts as 0). Otherwise it calls
+ * `ctx.skip()` and resolves `false`.
+ */
+async function gateOnPhase4(ctx: { skip: () => void }): Promise<boolean> {
+  const discovery = await readDiscovery();
+  const family = capabilityFamily<{ executionModel?: ExecutionModelCaps }>(discovery, 'multiAgent');
+  const model = family?.executionModel;
+  const advertisedVersion = typeof model?.version === 'number' ? model.version : 0;
+  const replaySupported = model?.replayDeterminism?.supported === true;
+  if (replaySupported && !(advertisedVersion < 4)) {
+    return true;
+  }
+  ctx.skip();
+  return false;
+}
+/**
+ * Poll `GET /v1/runs/{runId}` until the run reports a terminal status.
+ *
+ * Makes up to 50 reads, pausing 100 ms after each non-terminal one. A
+ * response whose body is missing (null/undefined) is treated as "not
+ * terminal yet" rather than aborting the poll with a TypeError.
+ *
+ * @param runId - Run identifier; URL-encoded before being placed in the path.
+ * @returns The first snapshot whose status is `completed`, `failed` or
+ *   `cancelled`.
+ * @throws Error when no terminal status was observed after the final read;
+ *   the message carries the last status seen to aid diagnosis.
+ */
+async function pollUntilTerminal(runId: string): Promise<RunSnapshot> {
+  const terminalStatuses: ReadonlySet<string> = new Set(['completed', 'failed', 'cancelled']);
+  let lastStatus: string | undefined;
+  for (let attempt = 0; attempt < 50; attempt++) {
+    const res = await driver.get(`/v1/runs/${encodeURIComponent(runId)}`);
+    // `res.json` is an unchecked cast, so tolerate an absent body.
+    const snap = res.json as RunSnapshot | null | undefined;
+    lastStatus = snap?.status;
+    if (snap != null && lastStatus !== undefined && terminalStatuses.has(lastStatus)) {
+      return snap;
+    }
+    await new Promise((resolve) => setTimeout(resolve, 100));
+  }
+  throw new Error(`run ${runId} did not reach terminal within 5s (last status: ${lastStatus ?? 'none'})`);
+}
+/**
+ * Fetch `GET /v1/runs/{runId}/events` and return the body's `events` array,
+ * or an empty array when that field is null or undefined.
+ */
+async function readEvents(runId: string): Promise<RunEventDoc[]> {
+  const path = `/v1/runs/${encodeURIComponent(runId)}/events`;
+  const response = await driver.get(path);
+  const { events } = response.json as { events?: RunEventDoc[] };
+  return events ?? [];
+}
+/**
+ * Normalise an event so that two runs of the same workflow can be compared.
+ *
+ * Works on a JSON round-trip copy of the event and drops the top-level keys
+ * that legitimately differ between runs: the run id, freshly minted event
+ * ids/ULIDs, and the clock fields (per-region observed-at carve-out,
+ * RFC 0036 §E). Nested occurrences of these keys are left in place.
+ */
+function stripVolatile(ev: RunEventDoc): Record<string, unknown> {
+  const volatileKeys = ['eventId', 'runId', 'observedAt', 'timestamp', 'occurredAt', 'emittedAt', 'id'];
+  const copy = JSON.parse(JSON.stringify(ev)) as Record<string, unknown>;
+  volatileKeys.forEach((key) => {
+    delete copy[key];
+  });
+  return copy;
+}
+/**
+ * POST `/v1/runs` for the fixture workflow.
+ *
+ * A 404 or 422 response calls `ctx.skip()` and resolves `null`. Any other
+ * status is asserted to be 201, after which the created run's id is returned.
+ */
+async function startFixtureRun(ctx: { skip: () => void }): Promise<string | null> {
+  const response = await driver.post('/v1/runs', { workflowId: FIXTURE });
+  const status = response.status;
+  const notAdvertised = status === 404 || status === 422;
+  if (notAdvertised) {
+    // fixture not advertised by this host
+    ctx.skip();
+    return null;
+  }
+  expect(status).toBe(201);
+  const { runId } = response.json as { runId: string };
+  return runId;
+}
+// Whole-log comparison between a source run and its `mode: 'replay'` fork.
+// The suite is skipped outright when OPENWOP_BASE_URL is unset (HTTP_SKIP).
+describe.skipIf(HTTP_SKIP)(
+  'replay-observable-sequence-determinism: prefix byte-equivalence (RFC 0041 §C)',
+  () => {
+    it('original and replay event-log prefixes MUST be byte-equivalent (modulo per-event clock + ULID entropy)', async (ctx) => {
+      // Soft-skip when the host does not pass the version-4 gate or does not
+      // serve the fixture workflow.
+      if (!(await gateOnPhase4(ctx))) return;
+      const sourceRunId = await startFixtureRun(ctx);
+      if (sourceRunId === null) return;
+      // Run the fixture to completion and capture its event log.
+      const sourceTerminal = await pollUntilTerminal(sourceRunId);
+      expect(sourceTerminal.status).toBe('completed');
+      const sourceEvents = await readEvents(sourceRunId);
+      // Fork a replay from sequence 0 and wait for it to reach a terminal status.
+      const forkRes = await driver.post(`/v1/runs/${encodeURIComponent(sourceRunId)}:fork`, {
+        fromSeq: 0,
+        mode: 'replay',
+      });
+      expect(forkRes.status).toBe(201);
+      const replayRunId = (forkRes.json as { runId: string }).runId;
+      // NOTE(review): the replay's terminal status is not asserted here, only
+      // the source run's — confirm that is intended.
+      await pollUntilTerminal(replayRunId);
+      const replayEvents = await readEvents(replayRunId);
+      // Compare the complete logs after removing top-level volatile fields.
+      const sourceNorm = sourceEvents.map(stripVolatile);
+      const replayNorm = replayEvents.map(stripVolatile);
+      expect(
+        replayNorm,
+        driver.describe(
+          'RFCS/0041-multi-agent-replay-under-nondeterminism.md §C',
+          'a mode:replay fork MUST reproduce the original observable event-log sequence byte-for-byte modulo volatile per-event fields (eventId/ULID entropy, per-region observedAt clock)',
+        ),
+      ).toEqual(sourceNorm);
+    });
+  },
+);
+// Node-scoped comparison: only events whose `nodeId` is NONDET_NODE_ID are
+// compared between the source run and its replay fork. The suite is skipped
+// outright when OPENWOP_BASE_URL is unset (HTTP_SKIP).
+describe.skipIf(HTTP_SKIP)(
+  'replay-observable-sequence-determinism: observable-result caching (RFC 0041 §C)',
+  () => {
+    it('replay of a nondeterministic tool node reproduces the ORIGINAL observable result, NOT a fresh call', async (ctx) => {
+      // Soft-skip when the host does not pass the version-4 gate or does not
+      // serve the fixture workflow.
+      if (!(await gateOnPhase4(ctx))) return;
+      const sourceRunId = await startFixtureRun(ctx);
+      if (sourceRunId === null) return;
+      expect((await pollUntilTerminal(sourceRunId)).status).toBe('completed');
+      const sourceEvents = await readEvents(sourceRunId);
+      // The terminal event(s) for the nondeterministic node carry its
+      // observable result. Capture every event scoped to that node.
+      const sourceNodeEvents = sourceEvents.filter((e) => e.nodeId === NONDET_NODE_ID).map(stripVolatile);
+      // Guard against a vacuous pass: an empty source set would trivially
+      // equal an empty replay set.
+      expect(
+        sourceNodeEvents.length,
+        driver.describe(
+          'RFCS/0041-multi-agent-replay-under-nondeterminism.md §C',
+          `the fixture's nondeterministic node \`${NONDET_NODE_ID}\` MUST emit at least one observable event`,
+        ),
+      ).toBeGreaterThan(0);
+      // Fork a replay from sequence 0 and wait for it to reach a terminal status.
+      const forkRes = await driver.post(`/v1/runs/${encodeURIComponent(sourceRunId)}:fork`, {
+        fromSeq: 0,
+        mode: 'replay',
+      });
+      expect(forkRes.status).toBe(201);
+      const replayRunId = (forkRes.json as { runId: string }).runId;
+      await pollUntilTerminal(replayRunId);
+      const replayEvents = await readEvents(replayRunId);
+      // Same node filter and normalisation as applied to the source run.
+      const replayNodeEvents = replayEvents.filter((e) => e.nodeId === NONDET_NODE_ID).map(stripVolatile);
+      expect(
+        replayNodeEvents,
+        driver.describe(
+          'RFCS/0041-multi-agent-replay-under-nondeterminism.md §C',
+          'the nondeterministic tool node MUST replay its ORIGINAL observable result (cached event-log entry) rather than re-executing — bit-equivalent re-execution would require unbounded caching, rejected per RFC 0041 §"Alternatives considered" #2',
+        ),
+      ).toEqual(sourceNodeEvents);
+    });
+  },
+);

package/src/scenarios/sandbox-wasm-isolation.test.ts ADDED Viewed

@@ -0,0 +1,98 @@
+/**
+ * RFC 0035 §B sandbox isolation — portable, server-free behavioral conformance.
+ *
+ * Drives the committed `fixtures/wasm-sandbox/*.wasm` modules through the
+ * suite-local `probeSandboxed` reference (see `../lib/wasm-sandbox-probe.ts`).
+ * Every assertion exercises real WebAssembly isolation — there are NO `it.todo`
+ * placeholders and NO mocks. These are the behavioral probes that graduate the
+ * cross-runtime `node-pack-sandbox-*` invariants from reference-impl to protocol
+ * tier (`SECURITY/invariants.yaml`).
+ *
+ * Coverage (six invariants, proven by construction, server-free):
+ *   - node-pack-sandbox-fs-gated / -no-env / -network-gated / -no-process:
+ *     a forbidden operation can only be a DECLARED IMPORT; the probe statically
+ *     refuses any un-granted import → `sandbox_escape_attempt` + `escapeKind`.
+ *   - capability gate: an un-granted `openwop.*` import → `sandbox_capability_denied`.
+ *   - node-pack-sandbox-memory-cap: an access past the host memory bound traps →
+ *     `sandbox_memory_exceeded`.
+ *   - node-pack-sandbox-isolated-context: a fresh instance per invocation carries
+ *     no state across calls.
+ *
+ * `node-pack-sandbox-timeout` requires thread preemption (a worker kill-timer), so
+ * it cannot be proven in-process here; it is covered by the worker-driven
+ * `sandbox-wasm-timeout.test.ts` and, host-internally, by
+ * `examples/hosts/wasm-sandbox/test/sandbox.test.ts` (real worker kill).
+ * `node-pack-sandbox-no-eval` is JS-runtime-specific (WASM has no `eval`) and is
+ * exempt per RFC 0035.
+ *
+ * Spec reference:
+ *   - https://github.com/openwop/openwop/blob/main/RFCS/0035-sandbox-execution-contract.md
+ */
+import { describe, it, expect } from 'vitest';
+import { readFileSync } from 'node:fs';
+import { join } from 'node:path';
+import { FIXTURES_DIR } from '../lib/paths.js';
+import { probeSandboxed } from '../lib/wasm-sandbox-probe.js';
+/** Build an assertion message of the form `<specRef> — <requirement>`. */
+const why = (specRef: string, requirement: string): string => {
+  return [specRef, requirement].join(' — ');
+};
+/** Directory holding the committed WASM sandbox fixtures. */
+const dir = join(FIXTURES_DIR, 'wasm-sandbox');
+/** Read `<name>.wasm` from the fixture directory as a fresh `Uint8Array`. */
+const fix = (name: string): Uint8Array => {
+  const bytes = readFileSync(join(dir, `${name}.wasm`));
+  return new Uint8Array(bytes);
+};
+/** Default probe options: no host calls granted, 2 MiB memory limit. */
+const BASE = { allowedHostCalls: [] as string[], memoryLimitBytes: 2 * 1024 * 1024 };
+// Positive controls: modules that stay within their grants must run and
+// return the value they were invoked with.
+describe('sandbox-wasm-isolation: positive controls (RFC 0035 §B, server-free)', () => {
+  it('a well-behaved pure module runs and returns its input', () => {
+    const wasm = fix('well-behaved-echo');
+    const outcome = probeSandboxed(wasm, BASE, 'invoke', 42);
+    const reason = why('RFC 0035 §B', 'a pure-compute module runs');
+    expect(outcome.ok, reason).toBe(true);
+    expect(outcome.result).toBe(42);
+  });
+  it('a granted host capability is callable when in allowedHostCalls', () => {
+    const wasm = fix('well-behaved-host-fetch');
+    // Same defaults as BASE, with the `fetch` host call granted.
+    const withFetchGrant = { ...BASE, allowedHostCalls: ['fetch'] };
+    const outcome = probeSandboxed(wasm, withFetchGrant, 'invoke', 7);
+    const reason = why('RFC 0035 §B invariant 7', 'a granted openwop.* capability is callable');
+    expect(outcome.ok, reason).toBe(true);
+    expect(outcome.result).toBe(7);
+  });
+});
+// Table-driven: one test per misbehaving fixture, each expected to be refused
+// with `sandbox_escape_attempt` and the matching `escapeKind`.
+describe('sandbox-wasm-isolation: escape attempts fail closed (RFC 0035 §B 1–4, server-free)', () => {
+  // Columns: fixture name, expected escapeKind, invariant id.
+  const cases: ReadonlyArray<readonly [string, string, string]> = [
+    ['misbehaving-fs', 'host-fs-escape', 'node-pack-sandbox-fs-gated'],
+    ['misbehaving-env', 'host-env-leak', 'node-pack-sandbox-no-env'],
+    ['misbehaving-network', 'network-escape', 'node-pack-sandbox-network-gated'],
+    ['misbehaving-process', 'host-process-escape', 'node-pack-sandbox-no-process'],
+  ];
+  cases.forEach(([fixture, escapeKind, invariant]) => {
+    const title = `${invariant}: ${fixture} → sandbox_escape_attempt (${escapeKind})`;
+    it(title, () => {
+      const outcome = probeSandboxed(fix(fixture), BASE);
+      const reason = why('RFC 0035 §B', `${invariant} fails closed before instantiation`);
+      expect(outcome.code, reason).toBe('sandbox_escape_attempt');
+      expect(outcome.escapeKind).toBe(escapeKind);
+    });
+  });
+});
+// Capability gate: an `openwop.*` import that is not in `allowedHostCalls`
+// must be denied, and the denial must name the requested capability.
+describe('sandbox-wasm-isolation: capability gate (RFC 0035 §B 7, server-free)', () => {
+  it('an un-granted openwop capability is denied with its name', () => {
+    const outcome = probeSandboxed(fix('misbehaving-capability-gate'), BASE);
+    const reason = why('RFC 0035 §B invariant 7', 'undeclared host capability fails closed');
+    expect(outcome.code, reason).toBe('sandbox_capability_denied');
+    expect(outcome.requestedCapability).toBe('privileged');
+  });
+  it('host-fetch WITHOUT the grant is denied (the gate works both directions)', () => {
+    // Same fixture as the positive control, probed with the default (empty) grants.
+    const outcome = probeSandboxed(fix('well-behaved-host-fetch'), BASE);
+    expect(outcome.code).toBe('sandbox_capability_denied');
+    expect(outcome.requestedCapability).toBe('fetch');
+  });
+});
+// Memory cap: the misbehaving-memory fixture must fail with `sandbox_memory_exceeded`.
+describe('sandbox-wasm-isolation: memory cap (RFC 0035 §B 5, server-free)', () => {
+  it('node-pack-sandbox-memory-cap: access beyond the host memory bound is sandbox_memory_exceeded', () => {
+    const outcome = probeSandboxed(fix('misbehaving-memory'), BASE);
+    const reason = why('RFC 0035 §B invariant 5', 'memory bound is engine-enforced');
+    expect(outcome.ok, reason).toBe(false);
+    expect(outcome.code).toBe('sandbox_memory_exceeded');
+  });
+});
+// Isolated context: the same module bytes are probed twice; the second probe
+// must not observe the state change made by the first.
+describe('sandbox-wasm-isolation: isolated context (RFC 0035 §B 8, server-free)', () => {
+  it('node-pack-sandbox-isolated-context: each invocation gets a fresh instance (no cross-pack state)', () => {
+    const moduleBytes = fix('isolation-global');
+    const afterBump = probeSandboxed(moduleBytes, BASE, 'bump');
+    expect(afterBump.result, why('RFC 0035 §B invariant 8', 'a fresh instance starts at 0')).toBe(1);
+    const afterRead = probeSandboxed(moduleBytes, BASE, 'read');
+    expect(afterRead.result, why('RFC 0035 §B invariant 8', 'no state leaks across invocations')).toBe(0);
+  });
+});

package/src/scenarios/sandbox-wasm-timeout.test.ts ADDED Viewed

@@ -0,0 +1,40 @@
+/**
+ * RFC 0035 §B invariant 6 — sandbox wall-clock timeout, worker-driven + server-free.
+ *
+ * The worker-thread counterpart to `sandbox-wasm-isolation.test.ts` (which proves
+ * the other six cross-runtime invariants in-process but deliberately cannot run a
+ * non-terminating module). A wall-clock cap can only be enforced by THREAD
+ * PREEMPTION — a same-thread timer cannot interrupt a synchronous WASM loop — so
+ * `probeTimeout` (see `../lib/wasm-sandbox-probe.ts`) spawns a worker running the
+ * committed `misbehaving-timeout.wasm` fixture and races a main-thread kill-timer.
+ *
+ * This is the worker-driven conformance probe that graduates
+ * `node-pack-sandbox-timeout` from reference-impl to protocol tier (the prior gap:
+ * the cap was proven only host-internally by the WASM host's `test/sandbox.test.ts`).
+ *
+ * @see RFCS/0035-sandbox-execution-contract.md §B invariant 6
+ * @see SECURITY/invariants.yaml node-pack-sandbox-timeout
+ */
+import { describe, it, expect } from 'vitest';
+import { readFileSync } from 'node:fs';
+import { join } from 'node:path';
+import { FIXTURES_DIR } from '../lib/paths.js';
+import { probeTimeout } from '../lib/wasm-sandbox-probe.js';
+/** Build an assertion message of the form `<specRef> — <requirement>`. */
+const why = (specRef: string, requirement: string): string => {
+  return [specRef, requirement].join(' — ');
+};
+/** Directory holding the committed WASM sandbox fixtures. */
+const dir = join(FIXTURES_DIR, 'wasm-sandbox');
+/** Read `<name>.wasm` from the fixture directory as a fresh `Uint8Array`. */
+const fix = (name: string): Uint8Array => {
+  const bytes = readFileSync(join(dir, `${name}.wasm`));
+  return new Uint8Array(bytes);
+};
+// Wall-clock cap: one negative case (non-terminating module, 300 ms budget)
+// and one positive control (echo module, 1000 ms budget).
+describe('sandbox-wasm-timeout: wall-clock cap is engine/worker-enforced (RFC 0035 §B 6, server-free)', () => {
+  // 2 MiB memory limit shared by both probes.
+  const memoryLimitBytes = 2 * 1024 * 1024;
+  it('node-pack-sandbox-timeout: a non-terminating module is killed with sandbox_timeout', async () => {
+    const limits = { memoryLimitBytes, wallClockLimitMs: 300 };
+    const outcome = await probeTimeout(fix('misbehaving-timeout'), limits);
+    expect(outcome.ok, why('RFC 0035 §B invariant 6', 'an over-budget invocation MUST fail')).toBe(false);
+    expect(outcome.code, why('RFC 0035 §C', 'the failure code MUST be sandbox_timeout')).toBe('sandbox_timeout');
+  });
+  it('positive control: a well-behaved module completes within the budget (the kill-timer does not false-positive)', async () => {
+    const limits = { memoryLimitBytes, wallClockLimitMs: 1000 };
+    const outcome = await probeTimeout(fix('well-behaved-echo'), limits, 'invoke', 7);
+    expect(outcome.ok, why('RFC 0035 §B', 'a within-budget invocation completes before the kill-timer')).toBe(true);
+    expect(outcome.result).toBe(7);
+  });
+});

package/src/scenarios/secret-leakage-otel-attribute.test.ts CHANGED Viewed

@@ -56,6 +56,7 @@ import { driver } from '../lib/driver.js';
 import { pollUntilTerminal } from '../lib/polling.js';
 import { isFixtureAdvertised } from '../lib/fixtures.js';
 import { capabilityFamily } from '../lib/discovery-capabilities.js';
+import { getCollector, waitForRunSpans } from '../lib/otel-collector.js';
 const HTTP_SKIP = !process.env.OPENWOP_BASE_URL;
 const BYOK_WORKFLOW_ID = 'openwop-smoke-byok-roundtrip';
@@ -205,6 +206,57 @@ describe.skipIf(HTTP_SKIP || FIXTURE_SKIP)(
   },
 );
+describe.skipIf(HTTP_SKIP || FIXTURE_SKIP)(
+  'secret-leakage-otel-attribute: real OTLP export scrape (collector-side)',
+  () => {
+    // Collector-side counterpart to the scrape-seam probe. The assertion runs
+    // against what the conformance collector actually received from the
+    // host's OTLP exporter, not what the host self-reports through its
+    // `/v1/host/sample/test/otel/spans` seam, so a host that redacts in the
+    // seam but leaks on the real export is still caught. Closes the
+    // `docs/KNOWN-LIMITS.md` "collector seam doesn't inspect span
+    // attributes" gap. Requires the in-process collector to be active
+    // (`OPENWOP_OTEL_COLLECTOR=true` + the host configured to export to it).
+    it('NO real-exported OTel span/metric attribute MUST contain the BYOK canary plaintext', async (ctx) => {
+      const otel = getCollector();
+      if (!otel || !CANARY_VALUE) {
+        ctx.skip();
+        return;
+      }
+      // Both the secrets and observability families must be advertised.
+      const discovery = await readDiscovery();
+      const secretsCap = capabilityFamily<{ supported?: unknown }>(discovery, 'secrets');
+      const observabilityCap = capabilityFamily<unknown>(discovery, 'observability');
+      const capsAdvertised = secretsCap?.supported === true && observabilityCap !== undefined;
+      if (!capsAdvertised) {
+        ctx.skip();
+        return;
+      }
+      // Clear the collector before the run that is about to be inspected.
+      otel.reset();
+      const runId = await startByokRun();
+      if (runId === null) {
+        ctx.skip();
+        return;
+      }
+      const { status } = await pollUntilTerminal(runId);
+      expect(status).toBe('completed');
+      // Span export happens asynchronously after the run is terminal, so wait
+      // for this run's spans to arrive or for the timeout to elapse. A
+      // missing export is a separate coverage concern, not a leak.
+      await waitForRunSpans(runId, { timeoutMs: 8_000 });
+      const leaks = otel.findCanaryLeakage(CANARY_VALUE);
+      const failureMessage = driver.describe(
+        'SECURITY/invariants.yaml secret-leakage-otel-attribute',
+        `no real-exported OTel span/metric attribute may contain the BYOK canary plaintext. Leaking surfaces: ${JSON.stringify(leaks)}`,
+      );
+      expect(leaks, failureMessage).toEqual([]);
+    });
+  },
+);
 describe.skipIf(HTTP_SKIP || FIXTURE_SKIP)(
   'secret-leakage-otel-attribute: advertisement-shape probe (RFC 0034 §A)',
   () => {

package/src/scenarios/tool-catalog-projection.test.ts ADDED Viewed

@@ -0,0 +1,120 @@
+/**
+ * Portable tool catalog — the `GET /v1/tools` projection (RFC 0078 §B/§F) —
+ * behavioral.
+ *
+ * Capability-gated on `toolCatalog.supported` (root-first per RFC 0073).
+ * Soft-skips when unadvertised (default) / hard-fails under
+ * `OPENWOP_REQUIRE_BEHAVIOR=true`. The always-on wire-shape coverage lives in
+ * `tool-descriptor-shape.test.ts`; this asserts host BEHAVIOR black-box on the
+ * NORMATIVE reads:
+ *
+ *   1. LIST (§B) — `GET /v1/tools` returns a `ToolDescriptor[]`, each
+ *      schema-valid, `source` ∈ the closed vocab, `safetyTier` ∈ the closed
+ *      vocab, and content-free (no credential material, SR-1).
+ *   2. BY-ID (§B) — `GET /v1/tools/{toolId}` returns that descriptor; an unknown
+ *      id 404s.
+ *   3. AUTH-GATED — an unauthenticated `GET /v1/tools` is `401` (not public).
+ *   4. §F-2 NON-DISCLOSURE — a tool id known to belong to a DIFFERENT principal
+ *      (`OPENWOP_CROSS_PRINCIPAL_TOOL_ID`) 404s for this caller, identically to
+ *      "not found" — the authorization-scoped projection never discloses another
+ *      principal's tools. Soft-skips when the env var is unset.
+ *
+ * Spec references:
+ *   - https://github.com/openwop/openwop/blob/main/spec/v1/tool-catalog.md (§B/§F)
+ *   - https://github.com/openwop/openwop/blob/main/RFCS/0078-portable-tool-catalog-and-tool-session-contract.md
+ */
+import { describe, it, expect } from 'vitest';
+import { readFileSync } from 'node:fs';
+import { join } from 'node:path';
+import Ajv2020 from 'ajv/dist/2020.js';
+import addFormats from 'ajv-formats';
+import { driver } from '../lib/driver.js';
+import { behaviorGate } from '../lib/behavior-gate.js';
+import { SCHEMAS_DIR } from '../lib/paths.js';
+import {
+  readToolCatalogCap,
+  listTools,
+  getTool,
+  TOOL_SOURCES,
+  SAFETY_TIERS,
+  TOOL_CONTENT_FORBIDDEN,
+} from '../lib/toolCatalog.js';
+/** Read `<SCHEMAS_DIR>/<name>` as UTF-8 and return the parsed JSON object. */
+function loadSchema(name: string): Record<string, unknown> {
+  const schemaPath = join(SCHEMAS_DIR, name);
+  const raw = readFileSync(schemaPath, 'utf8');
+  return JSON.parse(raw) as Record<string, unknown>;
+}
+/**
+ * Assert that `d` carries none of the keys listed in `TOOL_CONTENT_FORBIDDEN`.
+ *
+ * @param d - Object to inspect; the check is a key-presence (`in`) test.
+ * @param where - Label for the object, interpolated into the failure message.
+ */
+function expectContentFree(d: Record<string, unknown>, where: string): void {
+  for (const forbidden of TOOL_CONTENT_FORBIDDEN) {
+    const absent = !(forbidden in d);
+    const message = driver.describe('RFC 0078 §F (SR-1)', `${where} MUST be content-free (no ${forbidden})`);
+    expect(absent, message).toBe(true);
+  }
+}
+// One test drives all four legs (auth gate, list, by-id, cross-principal)
+// behind a single capability gate. Status and id checks assert on the value
+// itself rather than on a pre-computed boolean, so a failure reports what was
+// actually received.
+describe('tool-catalog-projection (RFC 0078 §B/§F)', () => {
+  it('lists schema-valid ToolDescriptors, serves by-id + 404s, is auth-gated, and never discloses another principal', async () => {
+    const cap = await readToolCatalogCap();
+    if (!behaviorGate('openwop-tool-catalog', cap?.supported === true)) return;
+    const ajv = new Ajv2020({ strict: false, allErrors: true });
+    addFormats(ajv);
+    const validate = ajv.compile(loadSchema('tool-descriptor.schema.json'));
+    // ---- Leg 3: auth-gated (unauthenticated list MUST be 401) -------------
+    const unauth = await driver.get('/v1/tools', { authenticated: false });
+    expect(
+      unauth.status,
+      driver.describe('tool-catalog.md §B', 'GET /v1/tools MUST require authentication (401 unauthenticated)'),
+    ).toBe(401);
+    // ---- Leg 1: the list (§B) -------------------------------------------
+    const tools = await listTools();
+    if (tools === null) return; // host advertises the cap but doesn't serve the read — soft-skip the rest
+    for (const t of tools) {
+      expect(
+        validate(t),
+        driver.describe('tool-descriptor.schema.json', `each ToolDescriptor MUST validate (${ajv.errorsText(validate.errors)})`),
+      ).toBe(true);
+      expect(
+        typeof t.source === 'string' && TOOL_SOURCES.includes(t.source as string),
+        driver.describe('tool-catalog.md §C', 'ToolDescriptor.source MUST be in the closed vocabulary'),
+      ).toBe(true);
+      expect(
+        typeof t.safetyTier === 'string' && SAFETY_TIERS.includes(t.safetyTier as string),
+        driver.describe('tool-catalog.md §C', 'ToolDescriptor.safetyTier MUST be pure|read|write|exec'),
+      ).toBe(true);
+      expectContentFree(t, 'ToolDescriptor');
+    }
+    // ---- Leg 2: by-id round-trip + unknown 404 (§B) ---------------------
+    // Guarded local instead of repeated non-null assertions on `tools[0]`.
+    const first = tools[0];
+    if (first !== undefined && typeof first.toolId === 'string') {
+      const id = first.toolId as string;
+      const one = await getTool(id);
+      // NOTE(review): a listed tool whose by-id read is not 200 is tolerated
+      // here (no assertion fires) — confirm that leniency is intended.
+      if (one.status === 200) {
+        expect(
+          one.descriptor?.toolId,
+          driver.describe('tool-catalog.md §B', 'GET /v1/tools/{toolId} MUST return the requested descriptor'),
+        ).toBe(id);
+      }
+    }
+    const unknown = await getTool('__conformance_nonexistent_tool__');
+    expect(
+      unknown.status,
+      driver.describe('tool-catalog.md §B', 'GET /v1/tools/{unknown} MUST 404'),
+    ).toBe(404);
+    // ---- Leg 4: §F-2 cross-principal non-disclosure (env-gated) ---------
+    const crossId = process.env.OPENWOP_CROSS_PRINCIPAL_TOOL_ID;
+    if (crossId) {
+      const cross = await getTool(crossId);
+      expect(
+        cross.status,
+        driver.describe('tool-catalog.md §F-2', 'a tool owned by a different principal MUST 404 (non-disclosure)'),
+      ).toBe(404);
+    }
+  });
+});