@openwop/openwop-conformance 1.14.0 → 1.16.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -8,87 +8,229 @@
8
8
  * Asserts (behavioral, when a host advertises `version: 4` + the contract):
9
9
  *
10
10
  * 1. A `mode: replay` fork from event-log index `fromSeq` produces an
11
- * event-log prefix `[0, fromSeq]` that is byte-equivalent to the
12
- * original run's prefix (modulo per-region clock fields per RFC 0036
13
- * §E and ULID component-T entropy when ULIDs are minted fresh).
11
+ * observable event-log prefix `[0, fromSeq]` that is byte-equivalent
12
+ * to the original run's prefix (modulo volatile per-event fields:
13
+ * eventId/ULID entropy, per-region `observedAt` clocks per RFC 0036
14
+ * §E, and the run id itself).
14
15
  *
15
- * 2. The replay's `RunSnapshot.variables`, `RunSnapshot.channels`, and
16
- * `RunSnapshot.status` at the boundary index are byte-equivalent to
17
- * the original.
16
+ * 2. (Crucially per §C.) The replay reproduces the OBSERVABLE RESULT of
17
+ * a nondeterministic tool node EVEN WHEN a fresh call would produce
18
+ * different bytes. The `conformance-phase4-nondet-tool` fixture's
19
+ * first node declares `config.nondeterministic: true`; a `version: 4`
20
+ * host MUST replay the original event-log entries for that node
21
+ * (cache the observable result) rather than re-executing it, so the
22
+ * node's terminal payload is identical across original + replay.
18
23
  *
19
- * 3. (Crucially per §C.) The replay reproduces observable output EVEN
20
- * WHEN the underlying tool call would have produced different bytes.
21
- * The reference test uses a mock tool that returns a fresh random
22
- * string on each call; the host MUST cache the original observable
23
- * result so replay returns the SAME string the original got not
24
- * the bytes a fresh call would return now.
24
+ * The `conformance-phase4-nondet-tool` fixture ships in the suite (added
25
+ * via the RFC 0041 Phase 4 fixtures commit). These assertions are now
26
+ * runnable capability-gated `it()` bodies consistent with the sibling
27
+ * `replay-divergence-at-refusal.test.ts`, which is likewise active and
28
+ * soft-skips on the same gate. They light up the moment a host advertises
29
+ * the `version: 4` replay-determinism contract; against hosts that don't
30
+ * (incl. the reference workflow-engine, which has not yet wired the
31
+ * pure-replay observable-cache path), they soft-skip honestly.
25
32
  *
26
- * Driving the assertion requires a workflow fixture whose tool call is
27
- * pure-nondeterministic (different bytes on each call) but whose
28
- * observable result is what gets cached. Reference workflow-engine ships
29
- * `core.noop` + deterministic fixtures; the `version: 4` wiring needs a
30
- * nondeterministic-tool fixture (e.g., `conformance-phase4-nondet-tool`).
31
- * Until that lands, the cross-boundary assertion is surfaced as `it.todo`
32
- * so test reporters track the gap.
33
+ * RFC 0042 §B note: RFC 0041 §C is `Active` (not yet `Accepted`), so its
34
+ * wire shape MAY shift compatibly within v1.x — a host wiring this before
35
+ * RFC 0041 graduates SHOULD advertise `multiAgent.executionModel.tier:
36
+ * 'experimental'` + `experimentalUntil` per RFC 0042 §A.
33
37
  *
34
38
  * @see RFCS/0041-multi-agent-replay-under-nondeterminism.md §C
35
39
  * @see spec/v1/replay.md §"Observable-output-sequence determinism vs bit-equivalent execution (MAE-9 closure)"
36
40
  * @see spec/v1/multi-agent-execution.md §"Replay determinism under nondeterminism (RFC 0041)"
37
41
  */
38
42
 
39
- import { describe, it } from 'vitest';
40
-
41
- // Behavioral assertions in this file are currently `it.todo` placeholders;
42
- // the `conformance-phase4-nondet-tool` fixture hasn't shipped yet. When
43
- // it does, the `it.todo` calls flip back to runnable `it(...)` bodies
44
- // that read discovery (via `driver.get('/.well-known/openwop')`), gate
45
- // on `multiAgent.executionModel.version >= 4` AND
46
- // `replayDeterminism.supported: true`, and drive the workflow through
47
- // the fixture.
48
-
49
- describe('replay-observable-sequence-determinism: prefix byte-equivalence (RFC 0041 §C)', () => {
50
- // Behavioral assertion drives a workflow with at least one node whose
51
- // underlying tool call is nondeterministic (different bytes on each
52
- // call). The assertion sequence:
53
- // 1. POST /v1/runs { workflowId: 'conformance-phase4-nondet-tool' }
54
- // → runs to completion, capturing the original event log.
55
- // 2. Capture original event-log prefix [0, N] where N is the index
56
- // after the nondeterministic-tool node fires.
57
- // 3. POST /v1/runs/{runId}:fork { mode: 'replay', fromSeq: N }
58
- // 4. Read replay event-log prefix [0, N].
59
- // 5. Assert byte-equivalence modulo the carve-outs:
60
- // - per-region observedAt timestamps (RFC 0036 §E)
61
- // - ULID component-T entropy on newly-minted eventIds
62
- // 6. Read original + replay RunSnapshot at index N; assert
63
- // variables + channels + status byte-equivalent.
64
- // Surfaced as `todo` until the `conformance-phase4-nondet-tool`
65
- // fixture ships in the suite — consistent with the sibling RFC 0041
66
- // scenarios (`replay-divergence-at-refusal.test.ts`,
67
- // `replay-llm-cache-key-portable.test.ts`).
68
- // Marked out of stable profile via RFC 0042 §B (experimental tier):
69
- // RFC 0041 §C remains Active, so its wire shape MAY shift compatibly
70
- // within v1.x. Hosts that wire this assertion before RFC 0041 graduates
71
- // to Accepted SHOULD advertise `multiAgent.executionModel.tier:
72
- // 'experimental'` + `experimentalUntil` per RFC 0042 §A. Path-to-runnable
73
- // requires: (a) host pure-replay observable-cache emission via the
74
- // `:fork mode: replay` re-dispatch path and (b) the test seam endpoint
75
- // contract for cache-hit-vs-fresh-call distinction (see
76
- // `spec/v1/host-sample-test-seams.md` for the established seam pattern).
77
- it.skip('original and replay event-log prefixes [0, fromSeq] MUST be byte-equivalent (modulo per-region clock + ULID-T entropy) — out of stable profile via RFC 0042');
78
- });
79
-
80
- describe('replay-observable-sequence-determinism: observable-result caching (RFC 0041 §C)', () => {
81
- // The load-bearing assertion: a nondeterministic tool call's OBSERVABLE
82
- // RESULT (return value + side-effects on workflow state + emitted events)
83
- // is what gets cached, not the bytes-on-the-wire of the underlying call.
84
- // The replay's reproduction of the observable sequence is what makes
85
- // this a valid determinism contract — bit-equivalent execution would
86
- // require unbounded caching (rejected per RFC 0041 §"Alternatives
87
- // considered" #2).
88
- // Marked out of stable profile via RFC 0042 §B (experimental tier):
89
- // see the prefix-byte-equivalence comment above for the same routing.
90
- // This is RFC 0041 §C's load-bearing assertion; it lands as a runnable
91
- // `it()` when RFC 0041 graduates to Accepted on first non-steward host
92
- // adoption.
93
- it.skip('replay of a workflow containing a nondeterministic tool call reproduces the original observable result, NOT a fresh call — out of stable profile via RFC 0042');
94
- });
43
+ import { describe, it, expect } from 'vitest';
44
+ import { driver } from '../lib/driver.js';
45
+ import { capabilityFamily } from '../lib/discovery-capabilities.js';
46
+
47
+ const HTTP_SKIP = !process.env.OPENWOP_BASE_URL;
48
+ const FIXTURE = 'conformance-phase4-nondet-tool';
49
+ const NONDET_NODE_ID = 'nondet-tool';
50
+
51
+ interface ExecutionModelCaps {
52
+ version?: unknown;
53
+ replayDeterminism?: { supported?: unknown };
54
+ }
55
+ interface DiscoveryDoc {
56
+ capabilities?: {
57
+ multiAgent?: { executionModel?: ExecutionModelCaps };
58
+ };
59
+ }
60
+
61
+ interface RunSnapshot {
62
+ status?: string;
63
+ }
64
+ interface RunEventDoc {
65
+ type: string;
66
+ nodeId?: string;
67
+ sequence?: number;
68
+ payload?: Record<string, unknown>;
69
+ }
70
+
71
+ async function readDiscovery(): Promise<DiscoveryDoc | null> {
72
+ try {
73
+ const res = await driver.get('/.well-known/openwop');
74
+ if (res.status !== 200) return null;
75
+ return res.json as DiscoveryDoc;
76
+ } catch {
77
+ return null;
78
+ }
79
+ }
80
+
81
+ /** Soft-skip unless the host advertises the RFC 0041 §C version-4 contract. */
82
+ async function gateOnPhase4(ctx: { skip: () => void }): Promise<boolean> {
83
+ const d = await readDiscovery();
84
+ const em = capabilityFamily<{ executionModel?: ExecutionModelCaps }>(d, 'multiAgent')?.executionModel;
85
+ const version = typeof em?.version === 'number' ? em.version : 0;
86
+ if (em?.replayDeterminism?.supported !== true || version < 4) {
87
+ ctx.skip();
88
+ return false;
89
+ }
90
+ return true;
91
+ }
92
+
93
+ async function pollUntilTerminal(runId: string): Promise<RunSnapshot> {
94
+ for (let i = 0; i < 50; i++) {
95
+ const r = await driver.get(`/v1/runs/${encodeURIComponent(runId)}`);
96
+ const snap = r.json as RunSnapshot;
97
+ if (snap.status === 'completed' || snap.status === 'failed' || snap.status === 'cancelled') {
98
+ return snap;
99
+ }
100
+ await new Promise((resolve) => setTimeout(resolve, 100));
101
+ }
102
+ throw new Error(`run ${runId} did not reach terminal within 5s`);
103
+ }
104
+
105
+ async function readEvents(runId: string): Promise<RunEventDoc[]> {
106
+ const r = await driver.get(`/v1/runs/${encodeURIComponent(runId)}/events`);
107
+ const body = r.json as { events?: RunEventDoc[] };
108
+ return body.events ?? [];
109
+ }
110
+
111
+ /**
112
+ * Volatile field names that differ legitimately between an original run and
113
+ * its replay: freshly-minted event ids/ULIDs, the run id, and per-region
114
+ * clock fields (RFC 0036 §E carve-out). Stripped wherever they appear —
115
+ * including NESTED inside payloads — so the byte-equivalence comparison
116
+ * tolerates only these carve-outs and flags any other divergence.
117
+ */
118
+ const VOLATILE_KEYS = new Set([
119
+ 'eventId',
120
+ 'runId',
121
+ 'observedAt',
122
+ 'timestamp',
123
+ 'occurredAt',
124
+ 'emittedAt',
125
+ 'id',
126
+ ]);
127
+
128
+ /**
129
+ * Recursively strip {@link VOLATILE_KEYS} from an event so two runs of the
130
+ * same workflow are comparable. Recurses into nested objects + arrays (a
131
+ * host that buries a clock or ULID inside a payload is normalized too),
132
+ * leaving every non-volatile field intact for the equivalence assertion.
133
+ */
134
+ function stripVolatile(ev: RunEventDoc): unknown {
135
+ const walk = (node: unknown): unknown => {
136
+ if (Array.isArray(node)) return node.map(walk);
137
+ if (node !== null && typeof node === 'object') {
138
+ const out: Record<string, unknown> = {};
139
+ for (const [k, v] of Object.entries(node as Record<string, unknown>)) {
140
+ if (VOLATILE_KEYS.has(k)) continue;
141
+ out[k] = walk(v);
142
+ }
143
+ return out;
144
+ }
145
+ return node;
146
+ };
147
+ return walk(JSON.parse(JSON.stringify(ev)));
148
+ }
149
+
150
+ /** Create the fixture run; returns null (with a skip) if it isn't advertised. */
151
+ async function startFixtureRun(ctx: { skip: () => void }): Promise<string | null> {
152
+ const create = await driver.post('/v1/runs', { workflowId: FIXTURE });
153
+ if (create.status === 404 || create.status === 422) {
154
+ ctx.skip(); // fixture not advertised by this host
155
+ return null;
156
+ }
157
+ expect(create.status).toBe(201);
158
+ return (create.json as { runId: string }).runId;
159
+ }
160
+
161
+ describe.skipIf(HTTP_SKIP)(
162
+ 'replay-observable-sequence-determinism: prefix byte-equivalence (RFC 0041 §C)',
163
+ () => {
164
+ it('original and replay event-log prefixes MUST be byte-equivalent (modulo per-event clock + ULID entropy)', async (ctx) => {
165
+ if (!(await gateOnPhase4(ctx))) return;
166
+
167
+ const sourceRunId = await startFixtureRun(ctx);
168
+ if (sourceRunId === null) return;
169
+ const sourceTerminal = await pollUntilTerminal(sourceRunId);
170
+ expect(sourceTerminal.status).toBe('completed');
171
+ const sourceEvents = await readEvents(sourceRunId);
172
+
173
+ const forkRes = await driver.post(`/v1/runs/${encodeURIComponent(sourceRunId)}:fork`, {
174
+ fromSeq: 0,
175
+ mode: 'replay',
176
+ });
177
+ expect(forkRes.status).toBe(201);
178
+ const replayRunId = (forkRes.json as { runId: string }).runId;
179
+ await pollUntilTerminal(replayRunId);
180
+ const replayEvents = await readEvents(replayRunId);
181
+
182
+ const sourceNorm = sourceEvents.map(stripVolatile);
183
+ const replayNorm = replayEvents.map(stripVolatile);
184
+ expect(
185
+ replayNorm,
186
+ driver.describe(
187
+ 'RFCS/0041-multi-agent-replay-under-nondeterminism.md §C',
188
+ 'a mode:replay fork MUST reproduce the original observable event-log sequence byte-for-byte modulo volatile per-event fields (eventId/ULID entropy, per-region observedAt clock)',
189
+ ),
190
+ ).toEqual(sourceNorm);
191
+ });
192
+ },
193
+ );
194
+
195
+ describe.skipIf(HTTP_SKIP)(
196
+ 'replay-observable-sequence-determinism: observable-result caching (RFC 0041 §C)',
197
+ () => {
198
+ it('replay of a nondeterministic tool node reproduces the ORIGINAL observable result, NOT a fresh call', async (ctx) => {
199
+ if (!(await gateOnPhase4(ctx))) return;
200
+
201
+ const sourceRunId = await startFixtureRun(ctx);
202
+ if (sourceRunId === null) return;
203
+ expect((await pollUntilTerminal(sourceRunId)).status).toBe('completed');
204
+ const sourceEvents = await readEvents(sourceRunId);
205
+
206
+ // The terminal event(s) for the nondeterministic node carry its
207
+ // observable result. Capture every event scoped to that node.
208
+ const sourceNodeEvents = sourceEvents.filter((e) => e.nodeId === NONDET_NODE_ID).map(stripVolatile);
209
+ expect(
210
+ sourceNodeEvents.length,
211
+ driver.describe(
212
+ 'RFCS/0041-multi-agent-replay-under-nondeterminism.md §C',
213
+ `the fixture's nondeterministic node \`${NONDET_NODE_ID}\` MUST emit at least one observable event`,
214
+ ),
215
+ ).toBeGreaterThan(0);
216
+
217
+ const forkRes = await driver.post(`/v1/runs/${encodeURIComponent(sourceRunId)}:fork`, {
218
+ fromSeq: 0,
219
+ mode: 'replay',
220
+ });
221
+ expect(forkRes.status).toBe(201);
222
+ const replayRunId = (forkRes.json as { runId: string }).runId;
223
+ await pollUntilTerminal(replayRunId);
224
+ const replayEvents = await readEvents(replayRunId);
225
+ const replayNodeEvents = replayEvents.filter((e) => e.nodeId === NONDET_NODE_ID).map(stripVolatile);
226
+
227
+ expect(
228
+ replayNodeEvents,
229
+ driver.describe(
230
+ 'RFCS/0041-multi-agent-replay-under-nondeterminism.md §C',
231
+ 'the nondeterministic tool node MUST replay its ORIGINAL observable result (cached event-log entry) rather than re-executing — bit-equivalent re-execution would require unbounded caching, rejected per RFC 0041 §"Alternatives considered" #2',
232
+ ),
233
+ ).toEqual(sourceNodeEvents);
234
+ });
235
+ },
236
+ );
@@ -56,6 +56,7 @@ import { driver } from '../lib/driver.js';
56
56
  import { pollUntilTerminal } from '../lib/polling.js';
57
57
  import { isFixtureAdvertised } from '../lib/fixtures.js';
58
58
  import { capabilityFamily } from '../lib/discovery-capabilities.js';
59
+ import { getCollector, waitForRunSpans } from '../lib/otel-collector.js';
59
60
 
60
61
  const HTTP_SKIP = !process.env.OPENWOP_BASE_URL;
61
62
  const BYOK_WORKFLOW_ID = 'openwop-smoke-byok-roundtrip';
@@ -205,6 +206,57 @@ describe.skipIf(HTTP_SKIP || FIXTURE_SKIP)(
205
206
  },
206
207
  );
207
208
 
209
+ describe.skipIf(HTTP_SKIP || FIXTURE_SKIP)(
210
+ 'secret-leakage-otel-attribute: real OTLP export scrape (collector-side)',
211
+ () => {
212
+ // Distinct from the scrape-seam probe above: this asserts against what
213
+ // the host's OTLP exporter ACTUALLY shipped over the wire to the
214
+ // conformance collector, not what the host self-reports via its
215
+ // `/v1/host/sample/test/otel/spans` seam. A host could redact in its
216
+ // seam yet leak on the real export — only this catches that. Closes
217
+ // the `docs/KNOWN-LIMITS.md` "collector seam doesn't inspect span
218
+ // attributes" gap. Gated on the in-process collector being active
219
+ // (`OPENWOP_OTEL_COLLECTOR=true` + the host configured to export to it).
220
+ it('NO real-exported OTel span/metric attribute MUST contain the BYOK canary plaintext', async (ctx) => {
221
+ const collector = getCollector();
222
+ if (!collector || !CANARY_VALUE) {
223
+ ctx.skip();
224
+ return;
225
+ }
226
+ const d = await readDiscovery();
227
+ const secretsOk = capabilityFamily<{ supported?: unknown }>(d, 'secrets')?.supported === true;
228
+ const obsOk = capabilityFamily<unknown>(d, 'observability') !== undefined;
229
+ if (!secretsOk || !obsOk) {
230
+ ctx.skip();
231
+ return;
232
+ }
233
+
234
+ collector.reset();
235
+ const runId = await startByokRun();
236
+ if (runId === null) {
237
+ ctx.skip();
238
+ return;
239
+ }
240
+ const terminal = await pollUntilTerminal(runId);
241
+ expect(terminal.status).toBe('completed');
242
+
243
+ // Hosts export spans asynchronously after terminal; poll until the
244
+ // run's spans land (or the timeout elapses — an absent export is a
245
+ // separate coverage concern, not a leak).
246
+ await waitForRunSpans(runId, { timeoutMs: 8_000 });
247
+
248
+ const leaks = collector.findCanaryLeakage(CANARY_VALUE);
249
+ expect(
250
+ leaks,
251
+ driver.describe(
252
+ 'SECURITY/invariants.yaml secret-leakage-otel-attribute',
253
+ `no real-exported OTel span/metric attribute may contain the BYOK canary plaintext. Leaking surfaces: ${JSON.stringify(leaks)}`,
254
+ ),
255
+ ).toEqual([]);
256
+ });
257
+ },
258
+ );
259
+
208
260
  describe.skipIf(HTTP_SKIP || FIXTURE_SKIP)(
209
261
  'secret-leakage-otel-attribute: advertisement-shape probe (RFC 0034 §A)',
210
262
  () => {