@openwop/openwop-conformance 1.14.0 → 1.16.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +20 -0
- package/README.md +2 -2
- package/coverage.md +6 -4
- package/package.json +1 -1
- package/src/lib/budgetPolicy.ts +63 -0
- package/src/lib/otel-collector.ts +102 -0
- package/src/scenarios/budget-enforcement.test.ts +152 -0
- package/src/scenarios/memory-degraded-projection.test.ts +121 -0
- package/src/scenarios/otel-collector-canary-inspection.test.ts +261 -0
- package/src/scenarios/replay-observable-sequence-determinism.test.ts +217 -75
- package/src/scenarios/secret-leakage-otel-attribute.test.ts +52 -0
|
@@ -8,87 +8,229 @@
|
|
|
8
8
|
* Asserts (behavioral, when a host advertises `version: 4` + the contract):
|
|
9
9
|
*
|
|
10
10
|
* 1. A `mode: replay` fork from event-log index `fromSeq` produces an
|
|
11
|
-
* event-log prefix `[0, fromSeq]` that is byte-equivalent
|
|
12
|
-
* original run's prefix (modulo per-
|
|
13
|
-
*
|
|
11
|
+
* observable event-log prefix `[0, fromSeq]` that is byte-equivalent
|
|
12
|
+
* to the original run's prefix (modulo volatile per-event fields:
|
|
13
|
+
* eventId/ULID entropy, per-region `observedAt` clocks per RFC 0036
|
|
14
|
+
* §E, and the run id itself).
|
|
14
15
|
*
|
|
15
|
-
* 2. The replay
|
|
16
|
-
*
|
|
17
|
-
*
|
|
16
|
+
* 2. (Crucially per §C.) The replay reproduces the OBSERVABLE RESULT of
|
|
17
|
+
* a nondeterministic tool node EVEN WHEN a fresh call would produce
|
|
18
|
+
* different bytes. The `conformance-phase4-nondet-tool` fixture's
|
|
19
|
+
* first node declares `config.nondeterministic: true`; a `version: 4`
|
|
20
|
+
* host MUST replay the original event-log entries for that node
|
|
21
|
+
* (cache the observable result) rather than re-executing it, so the
|
|
22
|
+
* node's terminal payload is identical across original + replay.
|
|
18
23
|
*
|
|
19
|
-
*
|
|
20
|
-
*
|
|
21
|
-
*
|
|
22
|
-
*
|
|
23
|
-
*
|
|
24
|
-
*
|
|
24
|
+
* The `conformance-phase4-nondet-tool` fixture ships in the suite (added
|
|
25
|
+
* via the RFC 0041 Phase 4 fixtures commit). These assertions are now
|
|
26
|
+
* runnable capability-gated `it()` bodies — consistent with the sibling
|
|
27
|
+
* `replay-divergence-at-refusal.test.ts`, which is likewise active and
|
|
28
|
+
* soft-skips on the same gate. They light up the moment a host advertises
|
|
29
|
+
* the `version: 4` replay-determinism contract; against hosts that don't
|
|
30
|
+
* (incl. the reference workflow-engine, which has not yet wired the
|
|
31
|
+
* pure-replay observable-cache path), they soft-skip honestly.
|
|
25
32
|
*
|
|
26
|
-
*
|
|
27
|
-
*
|
|
28
|
-
*
|
|
29
|
-
* `
|
|
30
|
-
* nondeterministic-tool fixture (e.g., `conformance-phase4-nondet-tool`).
|
|
31
|
-
* Until that lands, the cross-boundary assertion is surfaced as `it.todo`
|
|
32
|
-
* so test reporters track the gap.
|
|
33
|
+
* RFC 0042 §B note: RFC 0041 §C is `Active` (not yet `Accepted`), so its
|
|
34
|
+
* wire shape MAY shift compatibly within v1.x — a host wiring this before
|
|
35
|
+
* RFC 0041 graduates SHOULD advertise `multiAgent.executionModel.tier:
|
|
36
|
+
* 'experimental'` + `experimentalUntil` per RFC 0042 §A.
|
|
33
37
|
*
|
|
34
38
|
* @see RFCS/0041-multi-agent-replay-under-nondeterminism.md §C
|
|
35
39
|
* @see spec/v1/replay.md §"Observable-output-sequence determinism vs bit-equivalent execution (MAE-9 closure)"
|
|
36
40
|
* @see spec/v1/multi-agent-execution.md §"Replay determinism under nondeterminism (RFC 0041)"
|
|
37
41
|
*/
|
|
38
42
|
|
|
39
|
-
import { describe, it } from 'vitest';
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
})
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
43
|
+
import { describe, it, expect } from 'vitest';
|
|
44
|
+
import { driver } from '../lib/driver.js';
|
|
45
|
+
import { capabilityFamily } from '../lib/discovery-capabilities.js';
|
|
46
|
+
|
|
47
|
+
/** True when `OPENWOP_BASE_URL` is unset or empty; the suites in this file are skipped in that case. */
const HTTP_SKIP = !process.env['OPENWOP_BASE_URL'];

/** Workflow id posted to `/v1/runs` to start the nondeterministic-tool fixture run. */
const FIXTURE = 'conformance-phase4-nondet-tool';

/** `nodeId` used to pick the nondeterministic node's entries out of a run's event log. */
const NONDET_NODE_ID = 'nondet-tool';
|
|
50
|
+
|
|
51
|
+
/** `replayDeterminism` sub-object of the execution-model capabilities. */
interface ReplayDeterminismCaps {
  /** Left as `unknown`: the gate only accepts the literal `true`. */
  supported?: unknown;
}

/**
 * Subset of `multiAgent.executionModel` that this file reads from the
 * discovery document. Fields stay `unknown` because they come from an
 * untrusted host response and are narrowed at the point of use.
 */
interface ExecutionModelCaps {
  /** Narrowed with a `typeof … === 'number'` check before it is compared. */
  version?: unknown;
  replayDeterminism?: ReplayDeterminismCaps;
}

/** `multiAgent` capability family, reduced to the one member used here. */
interface MultiAgentCaps {
  executionModel?: ExecutionModelCaps;
}

/** Shape assumed for the body served at `/.well-known/openwop`. */
interface DiscoveryDoc {
  capabilities?: {
    multiAgent?: MultiAgentCaps;
  };
}
|
|
60
|
+
|
|
61
|
+
/** Subset of the `/v1/runs/{runId}` response body that the polling loop inspects. */
interface RunSnapshot {
  /** Compared against `'completed'`, `'failed'` and `'cancelled'` to detect a terminal run. */
  status?: string;
}

/**
 * One entry of a run's event log as read from `/v1/runs/{runId}/events`.
 * Only `type` is required; everything else is optional.
 */
interface RunEventDoc {
  type: string;
  /** Used to select the events that belong to a particular node. */
  nodeId?: string;
  sequence?: number;
  /** Free-form event body; may nest objects and arrays. */
  payload?: Record<string, unknown>;
}
|
|
70
|
+
|
|
71
|
+
/**
 * Fetch the host's discovery document from `/.well-known/openwop`.
 *
 * Best-effort by design: any thrown error and any non-200 status both
 * collapse to `null`, so callers handle one "no discovery" case.
 *
 * @returns the response body cast to {@link DiscoveryDoc} (not validated), or `null`.
 */
async function readDiscovery(): Promise<DiscoveryDoc | null> {
  try {
    const response = await driver.get('/.well-known/openwop');
    return response.status === 200 ? (response.json as DiscoveryDoc) : null;
  } catch {
    // Deliberately swallowed: an unreachable or malformed endpoint is reported as "absent".
    return null;
  }
}
|
|
80
|
+
|
|
81
|
+
/**
 * Capability gate for the RFC 0041 §C version-4 contract.
 *
 * Reads the discovery document and lets the test continue only when
 * `multiAgent.executionModel.replayDeterminism.supported` is exactly `true`
 * and `multiAgent.executionModel.version` is a number that is not below 4.
 * A non-numeric `version` is treated as 0.
 *
 * @param ctx test context; `ctx.skip()` is called when the gate is closed.
 * @returns `true` when the test should proceed, `false` after requesting a skip.
 */
async function gateOnPhase4(ctx: { skip: () => void }): Promise<boolean> {
  const discovery = await readDiscovery();
  const multiAgent = capabilityFamily<{ executionModel?: ExecutionModelCaps }>(discovery, 'multiAgent');
  const executionModel = multiAgent?.executionModel;

  const advertisedVersion = typeof executionModel?.version === 'number' ? executionModel.version : 0;
  const replayUnsupported = executionModel?.replayDeterminism?.supported !== true;
  const belowPhase4 = advertisedVersion < 4;

  if (replayUnsupported || belowPhase4) {
    ctx.skip();
    return false;
  }
  return true;
}
|
|
92
|
+
|
|
93
|
+
/**
 * Poll `/v1/runs/{runId}` until the run reports a terminal status.
 *
 * A response whose JSON body is missing or has no `status` is treated as
 * "not terminal yet" and polled again, so a transient empty body ends in the
 * descriptive timeout error rather than a `TypeError`.
 *
 * @param runId id of the run to watch; URL-encoded before use.
 * @param opts optional poll budget. `attempts` (default 50) is the maximum
 *   number of reads; `intervalMs` (default 100) is the pause after each
 *   non-terminal read. The defaults give the original ~5 s budget.
 * @returns the first snapshot whose status is `completed`, `failed` or `cancelled`.
 * @throws Error when no terminal status is seen within the budget.
 */
async function pollUntilTerminal(
  runId: string,
  opts: { attempts?: number; intervalMs?: number } = {},
): Promise<RunSnapshot> {
  const { attempts = 50, intervalMs = 100 } = opts;
  // Loop-invariant, so build the URL once.
  const url = `/v1/runs/${encodeURIComponent(runId)}`;

  for (let i = 0; i < attempts; i++) {
    const r = await driver.get(url);
    // The body is host-controlled; it may be absent or not an object.
    const snap = r.json as RunSnapshot | null | undefined;
    const status = snap?.status;
    if (snap != null && (status === 'completed' || status === 'failed' || status === 'cancelled')) {
      return snap;
    }
    await new Promise((resolve) => setTimeout(resolve, intervalMs));
  }

  // With the defaults this renders as "within 5s", matching the original message.
  throw new Error(`run ${runId} did not reach terminal within ${(attempts * intervalMs) / 1000}s`);
}
|
|
104
|
+
|
|
105
|
+
/**
 * Read a run's event log from `/v1/runs/{runId}/events`.
 *
 * The body is host-controlled, so it is inspected defensively: a missing
 * body, a missing `events` member, or an `events` value that is not an array
 * all yield an empty list. Callers that need a non-empty log must check the
 * length themselves.
 *
 * @param runId id of the run whose events are read; URL-encoded before use.
 * @returns the `events` array from the response body, or `[]`.
 */
async function readEvents(runId: string): Promise<RunEventDoc[]> {
  const r = await driver.get(`/v1/runs/${encodeURIComponent(runId)}/events`);
  const body = r.json as { events?: unknown } | null | undefined;
  const events = body?.events;
  // Elements are not validated; the cast mirrors the trust the original placed in the host.
  return Array.isArray(events) ? (events as RunEventDoc[]) : [];
}
|
|
110
|
+
|
|
111
|
+
/**
 * Property names that may legitimately differ between an original run and
 * its replay: freshly minted event ids/ULIDs, the run id, and per-region
 * clock fields (the RFC 0036 §E carve-out). Matching is by exact key name,
 * at any depth, so `nodeId` is NOT affected by the `id` entry.
 */
const VOLATILE_KEYS = new Set([
  'eventId',
  'runId',
  'observedAt',
  'timestamp',
  'occurredAt',
  'emittedAt',
  'id',
]);

/**
 * Produce a comparison-ready copy of an event with every
 * {@link VOLATILE_KEYS} property removed, however deeply it is nested.
 *
 * The event is first detached through a JSON round-trip, so the input is
 * never mutated and the result contains JSON-representable data only.
 * Objects are rebuilt key by key, arrays element by element, and primitives
 * and `null` pass through unchanged.
 *
 * @param ev event to normalize.
 * @returns the normalized structure, typed `unknown` because its shape
 *   depends on which keys were dropped.
 */
function stripVolatile(ev: RunEventDoc): unknown {
  function scrub(value: unknown): unknown {
    if (value === null || typeof value !== 'object') return value;
    if (Array.isArray(value)) return value.map(scrub);

    const source = value as Record<string, unknown>;
    const kept: Record<string, unknown> = {};
    for (const key of Object.keys(source)) {
      if (!VOLATILE_KEYS.has(key)) {
        kept[key] = scrub(source[key]);
      }
    }
    return kept;
  }

  const detached: unknown = JSON.parse(JSON.stringify(ev));
  return scrub(detached);
}
|
|
149
|
+
|
|
150
|
+
/**
 * Start a run of the {@link FIXTURE} workflow via `POST /v1/runs`.
 *
 * A 404 or 422 answer is read as "this host does not offer the fixture":
 * the test is skipped through `ctx.skip()` and `null` is returned. Any other
 * status must be 201, otherwise the `expect` fails the test.
 *
 * @param ctx test context providing `skip()`.
 * @returns the `runId` taken from the response body (not validated), or `null` after a skip.
 */
async function startFixtureRun(ctx: { skip: () => void }): Promise<string | null> {
  const response = await driver.post('/v1/runs', { workflowId: FIXTURE });

  const fixtureUnavailable = response.status === 404 || response.status === 422;
  if (fixtureUnavailable) {
    ctx.skip();
    return null;
  }

  expect(response.status).toBe(201);
  const { runId } = response.json as { runId: string };
  return runId;
}
|
|
160
|
+
|
|
161
|
+
/**
 * RFC 0041 §C, assertion 1: a `mode: replay` fork taken from sequence 0 must
 * reproduce the source run's observable event log once the volatile fields
 * removed by `stripVolatile` are ignored.
 */
describe.skipIf(HTTP_SKIP)(
  'replay-observable-sequence-determinism: prefix byte-equivalence (RFC 0041 §C)',
  () => {
    it('original and replay event-log prefixes MUST be byte-equivalent (modulo per-event clock + ULID entropy)', async (ctx) => {
      // Soft-skip on hosts that do not advertise the version-4 contract.
      if (!(await gateOnPhase4(ctx))) return;

      const sourceRunId = await startFixtureRun(ctx);
      if (sourceRunId === null) return;
      const sourceTerminal = await pollUntilTerminal(sourceRunId);
      expect(sourceTerminal.status).toBe('completed');
      const sourceEvents = await readEvents(sourceRunId);

      // Non-vacuity guard: if both event reads came back empty, the equality
      // below would pass without comparing anything.
      expect(
        sourceEvents.length,
        driver.describe(
          'RFCS/0041-multi-agent-replay-under-nondeterminism.md §C',
          'the completed source run MUST expose a non-empty event log, otherwise the replay comparison is vacuous',
        ),
      ).toBeGreaterThan(0);

      const forkRes = await driver.post(`/v1/runs/${encodeURIComponent(sourceRunId)}:fork`, {
        fromSeq: 0,
        mode: 'replay',
      });
      expect(forkRes.status).toBe(201);
      const replayRunId = (forkRes.json as { runId: string }).runId;
      // Only completion of polling is required here; the event comparison
      // below is what judges the replay.
      await pollUntilTerminal(replayRunId);
      const replayEvents = await readEvents(replayRunId);

      const sourceNorm = sourceEvents.map(stripVolatile);
      const replayNorm = replayEvents.map(stripVolatile);
      expect(
        replayNorm,
        driver.describe(
          'RFCS/0041-multi-agent-replay-under-nondeterminism.md §C',
          'a mode:replay fork MUST reproduce the original observable event-log sequence byte-for-byte modulo volatile per-event fields (eventId/ULID entropy, per-region observedAt clock)',
        ),
      ).toEqual(sourceNorm);
    });
  },
);
|
|
194
|
+
|
|
195
|
+
/**
 * RFC 0041 §C, assertion 2: the events emitted for the nondeterministic
 * node must be identical in the source run and in its replay, once the
 * volatile fields removed by `stripVolatile` are ignored.
 */
describe.skipIf(HTTP_SKIP)(
  'replay-observable-sequence-determinism: observable-result caching (RFC 0041 §C)',
  () => {
    it('replay of a nondeterministic tool node reproduces the ORIGINAL observable result, NOT a fresh call', async (ctx) => {
      const gateOpen = await gateOnPhase4(ctx);
      if (!gateOpen) return;

      // Normalized events that belong to the nondeterministic node only.
      const nondetNodeEvents = (log: RunEventDoc[]): unknown[] =>
        log.filter((entry) => entry.nodeId === NONDET_NODE_ID).map(stripVolatile);

      const originalRunId = await startFixtureRun(ctx);
      if (originalRunId === null) return;
      const originalTerminal = await pollUntilTerminal(originalRunId);
      expect(originalTerminal.status).toBe('completed');
      const originalLog = await readEvents(originalRunId);

      // Every event scoped to the node is captured; together they carry its
      // observable result.
      const expectedNodeEvents = nondetNodeEvents(originalLog);
      expect(
        expectedNodeEvents.length,
        driver.describe(
          'RFCS/0041-multi-agent-replay-under-nondeterminism.md §C',
          `the fixture's nondeterministic node \`${NONDET_NODE_ID}\` MUST emit at least one observable event`,
        ),
      ).toBeGreaterThan(0);

      const forkPath = `/v1/runs/${encodeURIComponent(originalRunId)}:fork`;
      const forkResponse = await driver.post(forkPath, {
        fromSeq: 0,
        mode: 'replay',
      });
      expect(forkResponse.status).toBe(201);
      const { runId: forkedRunId } = forkResponse.json as { runId: string };
      await pollUntilTerminal(forkedRunId);
      const forkedLog = await readEvents(forkedRunId);
      const actualNodeEvents = nondetNodeEvents(forkedLog);

      expect(
        actualNodeEvents,
        driver.describe(
          'RFCS/0041-multi-agent-replay-under-nondeterminism.md §C',
          'the nondeterministic tool node MUST replay its ORIGINAL observable result (cached event-log entry) rather than re-executing — bit-equivalent re-execution would require unbounded caching, rejected per RFC 0041 §"Alternatives considered" #2',
        ),
      ).toEqual(expectedNodeEvents);
    });
  },
);
|
|
@@ -56,6 +56,7 @@ import { driver } from '../lib/driver.js';
|
|
|
56
56
|
import { pollUntilTerminal } from '../lib/polling.js';
|
|
57
57
|
import { isFixtureAdvertised } from '../lib/fixtures.js';
|
|
58
58
|
import { capabilityFamily } from '../lib/discovery-capabilities.js';
|
|
59
|
+
import { getCollector, waitForRunSpans } from '../lib/otel-collector.js';
|
|
59
60
|
|
|
60
61
|
const HTTP_SKIP = !process.env.OPENWOP_BASE_URL;
|
|
61
62
|
const BYOK_WORKFLOW_ID = 'openwop-smoke-byok-roundtrip';
|
|
@@ -205,6 +206,57 @@ describe.skipIf(HTTP_SKIP || FIXTURE_SKIP)(
|
|
|
205
206
|
},
|
|
206
207
|
);
|
|
207
208
|
|
|
209
|
+
describe.skipIf(HTTP_SKIP || FIXTURE_SKIP)(
  'secret-leakage-otel-attribute: real OTLP export scrape (collector-side)',
  () => {
    // Complements the scrape-seam probe: that one trusts what the host
    // self-reports through `/v1/host/sample/test/otel/spans`, while this one
    // inspects what the host's OTLP exporter really delivered to the
    // conformance collector. Redacting in the seam while leaking on the real
    // export is only detectable here. This closes the
    // `docs/KNOWN-LIMITS.md` "collector seam doesn't inspect span attributes"
    // gap. It runs only when the in-process collector is active
    // (`OPENWOP_OTEL_COLLECTOR=true` + the host configured to export to it).
    it('NO real-exported OTel span/metric attribute MUST contain the BYOK canary plaintext', async (ctx) => {
      // Without a collector or a canary value there is nothing to inspect.
      const otelCollector = getCollector();
      if (!otelCollector || !CANARY_VALUE) {
        ctx.skip();
        return;
      }

      // Both the secrets and the observability capability families must be advertised.
      const discovery = await readDiscovery();
      const secretsSupported =
        capabilityFamily<{ supported?: unknown }>(discovery, 'secrets')?.supported === true;
      const observabilityAdvertised = capabilityFamily<unknown>(discovery, 'observability') !== undefined;
      if (!secretsSupported || !observabilityAdvertised) {
        ctx.skip();
        return;
      }

      // Reset the collector before the run so earlier captures are not counted.
      otelCollector.reset();
      const byokRunId = await startByokRun();
      if (byokRunId === null) {
        ctx.skip();
        return;
      }
      const finalSnapshot = await pollUntilTerminal(byokRunId);
      expect(finalSnapshot.status).toBe('completed');

      // Export happens asynchronously after the run is terminal, so wait for
      // this run's spans to arrive or for the timeout to elapse. A missing
      // export is treated as a separate coverage concern rather than a leak.
      // NOTE(review): when nothing arrives, the check below passes without
      // having inspected any span — confirm that is acceptable.
      await waitForRunSpans(byokRunId, { timeoutMs: 8_000 });

      const leaks = otelCollector.findCanaryLeakage(CANARY_VALUE);
      const failureMessage = driver.describe(
        'SECURITY/invariants.yaml secret-leakage-otel-attribute',
        `no real-exported OTel span/metric attribute may contain the BYOK canary plaintext. Leaking surfaces: ${JSON.stringify(leaks)}`,
      );
      expect(leaks, failureMessage).toEqual([]);
    });
  },
);
|
|
259
|
+
|
|
208
260
|
describe.skipIf(HTTP_SKIP || FIXTURE_SKIP)(
|
|
209
261
|
'secret-leakage-otel-attribute: advertisement-shape probe (RFC 0034 §A)',
|
|
210
262
|
() => {
|