@openwop/openwop-conformance 1.13.0 → 1.15.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. package/CHANGELOG.md +21 -0
  2. package/README.md +2 -2
  3. package/api/openapi.yaml +60 -0
  4. package/coverage.md +15 -4
  5. package/fixtures/wasm-sandbox/isolation-global.wasm +0 -0
  6. package/fixtures/wasm-sandbox/isolation-global.wat +6 -0
  7. package/fixtures/wasm-sandbox/misbehaving-capability-gate.wasm +0 -0
  8. package/fixtures/wasm-sandbox/misbehaving-capability-gate.wat +4 -0
  9. package/fixtures/wasm-sandbox/misbehaving-env.wasm +0 -0
  10. package/fixtures/wasm-sandbox/misbehaving-env.wat +4 -0
  11. package/fixtures/wasm-sandbox/misbehaving-fs.wasm +0 -0
  12. package/fixtures/wasm-sandbox/misbehaving-fs.wat +4 -0
  13. package/fixtures/wasm-sandbox/misbehaving-memory.wasm +0 -0
  14. package/fixtures/wasm-sandbox/misbehaving-memory.wat +5 -0
  15. package/fixtures/wasm-sandbox/misbehaving-network.wasm +0 -0
  16. package/fixtures/wasm-sandbox/misbehaving-network.wat +4 -0
  17. package/fixtures/wasm-sandbox/misbehaving-process.wasm +0 -0
  18. package/fixtures/wasm-sandbox/misbehaving-process.wat +4 -0
  19. package/fixtures/wasm-sandbox/misbehaving-timeout.wasm +0 -0
  20. package/fixtures/wasm-sandbox/misbehaving-timeout.wat +4 -0
  21. package/fixtures/wasm-sandbox/well-behaved-echo.wasm +0 -0
  22. package/fixtures/wasm-sandbox/well-behaved-echo.wat +2 -0
  23. package/fixtures/wasm-sandbox/well-behaved-host-fetch.wasm +0 -0
  24. package/fixtures/wasm-sandbox/well-behaved-host-fetch.wat +3 -0
  25. package/package.json +1 -1
  26. package/src/lib/discovery-capabilities.ts +18 -19
  27. package/src/lib/egressPolicy.ts +76 -0
  28. package/src/lib/otel-collector.ts +72 -0
  29. package/src/lib/profiles.ts +15 -0
  30. package/src/lib/sandbox-timeout-worker.mjs +31 -0
  31. package/src/lib/toolCatalog.ts +81 -0
  32. package/src/lib/wasm-sandbox-probe.ts +168 -0
  33. package/src/scenarios/core-standard-profile.test.ts +75 -0
  34. package/src/scenarios/egress-audience-binding.test.ts +81 -0
  35. package/src/scenarios/egress-decision-content-free.test.ts +57 -0
  36. package/src/scenarios/memory-degraded-projection.test.ts +121 -0
  37. package/src/scenarios/multi-agent-confidence-escalation.test.ts +12 -7
  38. package/src/scenarios/otel-collector-canary-inspection.test.ts +211 -0
  39. package/src/scenarios/prompt-resolution-chain-event.test.ts +113 -0
  40. package/src/scenarios/replay-observable-sequence-determinism.test.ts +192 -75
  41. package/src/scenarios/sandbox-wasm-isolation.test.ts +98 -0
  42. package/src/scenarios/sandbox-wasm-timeout.test.ts +40 -0
  43. package/src/scenarios/secret-leakage-otel-attribute.test.ts +52 -0
  44. package/src/scenarios/tool-catalog-projection.test.ts +120 -0
  45. package/src/scenarios/tool-session-lifecycle.test.ts +105 -0
  46. package/src/scenarios/workspace-cross-tenant-isolation-blackbox.test.ts +89 -0
@@ -0,0 +1,81 @@
1
+ /**
2
+ * Credential-audience-bound egress (RFC 0079 §C) — behavioral KEYSTONE.
3
+ *
4
+ * Gated on `httpClient.egressPolicy.supported` (root-first per RFC 0073).
5
+ * Soft-skips when unadvertised (default) / hard-fails under
6
+ * `OPENWOP_REQUIRE_BEHAVIOR=true`. The always-on wire-shape coverage lives in
7
+ * `egress-provenance-shape.test.ts`; this asserts host BEHAVIOR — the §C
8
+ * confused-deputy MUST that backs the `egress-credential-audience-bound`
9
+ * SECURITY invariant:
10
+ *
11
+ * 1. OUT-OF-AUDIENCE — a host-issued credential bound to audience A, used for
12
+ * an egress to destination B (B ∉ A), MUST be `denied` or `downgraded`
13
+ * with `reason: "out-of-audience"`, and the credential MUST NOT be attached
14
+ * to the egress (`credentialAttached !== true`).
15
+ * 2. PROVENANCE-UNEVALUABLE — an egress whose credential provenance cannot be
16
+ * evaluated MUST be `denied` with `reason: "provenance-unevaluable"`
17
+ * (fail-closed, not fail-open).
18
+ *
19
+ * The decision is driven through the OPTIONAL host-sample egress seam
20
+ * (`POST /v1/host/sample/egress/decide`) — soft-skip on 404/405. The decision
21
+ * reason is a CLOSED enum so a host cannot spill a blocked URL/host into a
22
+ * free-form string (SR-1, asserted in `egress-decision-content-free.test.ts`).
23
+ *
24
+ * Spec references:
25
+ * - https://github.com/openwop/openwop/blob/main/spec/v1/host-capabilities.md (§"Credential provenance + egress policy")
26
+ * - https://github.com/openwop/openwop/blob/main/RFCS/0079-credential-provenance-and-egress-policy.md
27
+ * - https://github.com/openwop/openwop/blob/main/SECURITY/invariants.yaml (egress-credential-audience-bound)
28
+ */
29
+
30
+ import { describe, it, expect } from 'vitest';
31
+ import { driver } from '../lib/driver.js';
32
+ import { behaviorGate } from '../lib/behavior-gate.js';
33
+ import { readEgressPolicyCap, driveEgress, EGRESS_DECISIONS, EGRESS_REASONS } from '../lib/egressPolicy.js';
34
+
35
+ describe('egress-audience-binding (RFC 0079 §C)', () => {
36
+ it('denies/downgrades an out-of-audience egress without attaching the credential, and fails closed on unevaluable provenance', async () => {
37
+ const cap = await readEgressPolicyCap();
38
+ if (!behaviorGate('openwop-egress-audience-binding', cap?.supported === true)) return;
39
+
40
+ // ---- Leg 1: out-of-audience — deny|downgrade + credential NOT attached --
41
+ const oob = await driveEgress({ scenario: 'out-of-audience' });
42
+ if (oob === null) return; // egress seam absent — soft-skip the whole behavior
43
+ expect(
44
+ oob.decision === 'denied' || oob.decision === 'downgraded',
45
+ driver.describe('host-capabilities.md §"Credential provenance + egress policy"', 'an out-of-audience egress MUST be denied or downgraded'),
46
+ ).toBe(true);
47
+ expect(
48
+ typeof oob.decision === 'string' && EGRESS_DECISIONS.includes(oob.decision),
49
+ driver.describe('run-event-payloads.schema.json#egressDecided', 'decision MUST be in the closed enum'),
50
+ ).toBe(true);
51
+ expect(
52
+ oob.reason === 'out-of-audience',
53
+ driver.describe('RFC 0079 §C', 'an out-of-audience denial MUST carry reason "out-of-audience"'),
54
+ ).toBe(true);
55
+ expect(
56
+ oob.credentialAttached !== true,
57
+ driver.describe('SECURITY/invariants.yaml egress-credential-audience-bound', 'the host MUST NOT attach a credential whose audience excludes the destination (confused-deputy)'),
58
+ ).toBe(true);
59
+
60
+ // ---- Leg 2: provenance-unevaluable — fail closed (deny) ----------------
61
+ const uneval = await driveEgress({ scenario: 'provenance-unevaluable' });
62
+ if (uneval !== null) {
63
+ expect(
64
+ uneval.decision === 'denied',
65
+ driver.describe('RFC 0079 §C', 'an egress with unevaluable provenance MUST fail closed (denied)'),
66
+ ).toBe(true);
67
+ expect(
68
+ uneval.reason === 'provenance-unevaluable',
69
+ driver.describe('RFC 0079 §C', 'a provenance-unevaluable denial MUST carry reason "provenance-unevaluable"'),
70
+ ).toBe(true);
71
+ expect(
72
+ typeof uneval.reason === 'string' && EGRESS_REASONS.includes(uneval.reason),
73
+ driver.describe('run-event-payloads.schema.json#egressDecided', 'reason MUST be in the closed enum'),
74
+ ).toBe(true);
75
+ expect(
76
+ uneval.credentialAttached !== true,
77
+ driver.describe('SECURITY/invariants.yaml egress-credential-audience-bound', 'a fail-closed egress MUST NOT attach the credential'),
78
+ ).toBe(true);
79
+ }
80
+ });
81
+ });
@@ -0,0 +1,57 @@
1
+ /**
2
+ * Egress-decision secret non-leak (RFC 0079 §F / SR-1) — behavioral.
3
+ *
4
+ * Gated on `httpClient.egressPolicy.supported` (root-first per RFC 0073).
5
+ * Soft-skips when unadvertised (default) / hard-fails under
6
+ * `OPENWOP_REQUIRE_BEHAVIOR=true`. Backs the `egress-decision-no-secret-leak`
7
+ * guarantee: an `egress.decided` payload is metadata-only — it MUST NOT carry
8
+ * the credential value, nor spill the blocked URL/host/header/body into a
9
+ * free-form field, and its `reason` MUST be drawn from the CLOSED vocabulary
10
+ * (so a host cannot smuggle a blocked destination into the reason string).
11
+ *
12
+ * Drives the host-sample seam with a `canary` credential whose value is a known
13
+ * sentinel and asserts the sentinel never surfaces in the decision
14
+ * (`canaryLeaked !== true`) and that the payload carries none of the forbidden
15
+ * content keys. Soft-skips on 404/405.
16
+ *
17
+ * Spec references:
18
+ * - https://github.com/openwop/openwop/blob/main/spec/v1/host-capabilities.md (§"Credential provenance + egress policy")
19
+ * - https://github.com/openwop/openwop/blob/main/RFCS/0079-credential-provenance-and-egress-policy.md
20
+ */
21
+
22
+ import { describe, it, expect } from 'vitest';
23
+ import { driver } from '../lib/driver.js';
24
+ import { behaviorGate } from '../lib/behavior-gate.js';
25
+ import { readEgressPolicyCap, driveEgress, EGRESS_REASONS, EGRESS_CONTENT_FORBIDDEN } from '../lib/egressPolicy.js';
26
+
27
+ describe('egress-decision-content-free (RFC 0079 §F / SR-1)', () => {
28
+ it('never leaks the credential value or the blocked destination into the egress.decided payload', async () => {
29
+ const cap = await readEgressPolicyCap();
30
+ if (!behaviorGate('openwop-egress-decision-content-free', cap?.supported === true)) return;
31
+
32
+ const res = await driveEgress({ scenario: 'canary' });
33
+ if (res === null) return; // seam absent — soft-skip
34
+
35
+ // The canary sentinel MUST NOT appear anywhere observable.
36
+ expect(
37
+ res.canaryLeaked !== true,
38
+ driver.describe('RFC 0079 §F (SR-1)', 'the credential value (canary) MUST NOT leak into any observable surface'),
39
+ ).toBe(true);
40
+
41
+ // No forbidden content keys on the decision payload.
42
+ for (const forbidden of EGRESS_CONTENT_FORBIDDEN) {
43
+ expect(
44
+ !(forbidden in res),
45
+ driver.describe('RFC 0079 §F (SR-1)', `egress.decided MUST be content-free (no ${forbidden})`),
46
+ ).toBe(true);
47
+ }
48
+
49
+ // The reason stays in the closed vocabulary — no free-form destination spill.
50
+ if (res.reason !== undefined) {
51
+ expect(
52
+ typeof res.reason === 'string' && EGRESS_REASONS.includes(res.reason),
53
+ driver.describe('run-event-payloads.schema.json#egressDecided', 'reason MUST be in the closed enum (no free-form spill)'),
54
+ ).toBe(true);
55
+ }
56
+ });
57
+ });
@@ -0,0 +1,121 @@
1
+ /**
2
+ * Memory-capability degraded projection (RFC 0080 §C) — behavioral.
3
+ *
4
+ * Gated on `capabilities.agents.manifestRuntime` + `capabilities.memory`
5
+ * (root-first per RFC 0073). Soft-skips when either is unadvertised (default) /
6
+ * hard-fails under `OPENWOP_REQUIRE_BEHAVIOR=true`. The always-on wire-shape
7
+ * coverage lives in `memory-capability-model-shape.test.ts` (the schema fields +
8
+ * the closed dimension enum); this asserts host BEHAVIOR on the NORMATIVE
9
+ * `GET /v1/agents` inventory:
10
+ *
11
+ * §C iff-contract — for EVERY inventory entry, when the host cannot satisfy an
12
+ * agent's requested `memoryShape` it MUST stamp `memoryDegraded: true` together
13
+ * with a NON-EMPTY `degradedMemoryDimensions[]` whose members are the RFC 0080
14
+ * §A dimension names (the CLOSED enum, NOT the `memoryShape` keys) and are
15
+ * unique; a non-degraded entry MUST carry `memoryDegraded` absent or `false`
16
+ * and MUST NOT carry a non-empty `degradedMemoryDimensions`.
17
+ *
18
+ * Non-vacuity — the inventory MUST be non-empty (the cap is advertised + the
19
+ * endpoint serves). When `OPENWOP_DEGRADED_AGENT_ID` names an agent the host
20
+ * knows is degraded (an agent whose `memoryShape` exceeds host capability —
21
+ * e.g. one requesting `longTerm` on a host without long-term durability), the
22
+ * degraded branch is asserted NON-VACUOUSLY against that agent.
23
+ *
24
+ * Black-box on the normative path — no POST seam.
25
+ *
26
+ * Spec references:
27
+ * - https://github.com/openwop/openwop/blob/main/spec/v1/agent-memory.md (§"Memory capability model")
28
+ * - https://github.com/openwop/openwop/blob/main/RFCS/0080-agent-memory-capability-reconciliation.md
29
+ */
30
+
31
+ import { describe, it, expect } from 'vitest';
32
+ import { driver } from '../lib/driver.js';
33
+ import { behaviorGate } from '../lib/behavior-gate.js';
34
+ import { readCapabilityFamily } from '../lib/discovery-capabilities.js';
35
+ import { readManifestRuntimeCap, listManifestAgents } from '../lib/agentRuntime.js';
36
+
37
+ /** The CLOSED RFC 0080 §A dimension vocabulary (agent-inventory-response.schema.json
38
+ * `degradedMemoryDimensions` enum). NOT the `memoryShape` keys. */
39
+ const DIMENSIONS = [
40
+ 'read',
41
+ 'write',
42
+ 'search',
43
+ 'long-term-durability',
44
+ 'compaction',
45
+ 'attribution',
46
+ 'replay-snapshot',
47
+ 'retention',
48
+ ];
49
+
50
+ interface InventoryEntry {
51
+ agentId?: string;
52
+ memoryDegraded?: unknown;
53
+ degradedMemoryDimensions?: unknown;
54
+ [k: string]: unknown;
55
+ }
56
+
57
+ describe('memory-degraded-projection (RFC 0080 §C)', () => {
58
+ it('stamps memoryDegraded + a closed-enum degradedMemoryDimensions on degraded agents and nothing on the rest', async () => {
59
+ const mr = await readManifestRuntimeCap();
60
+ const memory = await readCapabilityFamily<Record<string, unknown>>('memory');
61
+ const advertised = mr?.supported === true && !!memory && memory.supported === true;
62
+ if (!behaviorGate('openwop-memory-degraded', advertised)) return;
63
+
64
+ const inv = await listManifestAgents();
65
+ if (inv === null) return; // host advertises the cap but doesn't serve /v1/agents — soft-skip
66
+ const agents = (inv.agents ?? []) as InventoryEntry[];
67
+
68
+ // Non-vacuity: an advertising + serving host MUST expose its inventory.
69
+ expect(
70
+ agents.length >= 1,
71
+ driver.describe('agent-memory.md §"Memory capability model"', 'GET /v1/agents MUST return the installed manifest agents'),
72
+ ).toBe(true);
73
+
74
+ // §C iff-contract on EVERY entry.
75
+ for (const a of agents) {
76
+ const degraded = a.memoryDegraded === true;
77
+ const dims = a.degradedMemoryDimensions;
78
+
79
+ if (degraded) {
80
+ expect(
81
+ Array.isArray(dims) && dims.length >= 1,
82
+ driver.describe('RFC 0080 §C', `memoryDegraded:true MUST carry a non-empty degradedMemoryDimensions (agent ${a.agentId})`),
83
+ ).toBe(true);
84
+ if (Array.isArray(dims)) {
85
+ for (const d of dims) {
86
+ expect(
87
+ typeof d === 'string' && DIMENSIONS.includes(d),
88
+ driver.describe('agent-inventory-response.schema.json', `degradedMemoryDimensions members MUST be RFC 0080 §A dimension names (got ${String(d)})`),
89
+ ).toBe(true);
90
+ }
91
+ expect(
92
+ new Set(dims as string[]).size === dims.length,
93
+ driver.describe('RFC 0080 §C', 'degradedMemoryDimensions MUST be unique'),
94
+ ).toBe(true);
95
+ }
96
+ } else {
97
+ // Not degraded ⇒ no non-empty dimension list (absent or empty both pass).
98
+ expect(
99
+ dims === undefined || (Array.isArray(dims) && dims.length === 0),
100
+ driver.describe('RFC 0080 §C', `a non-degraded entry MUST NOT carry a non-empty degradedMemoryDimensions (agent ${a.agentId})`),
101
+ ).toBe(true);
102
+ }
103
+ }
104
+
105
+ // Non-vacuous degraded branch when the host names a known-degraded agent.
106
+ const degradedId = process.env.OPENWOP_DEGRADED_AGENT_ID;
107
+ if (degradedId) {
108
+ const target = agents.find((a) => a.agentId === degradedId);
109
+ expect(
110
+ target !== undefined,
111
+ driver.describe('RFC 0080 §C', `OPENWOP_DEGRADED_AGENT_ID=${degradedId} MUST appear in the inventory`),
112
+ ).toBe(true);
113
+ if (target) {
114
+ expect(
115
+ target.memoryDegraded === true && Array.isArray(target.degradedMemoryDimensions) && target.degradedMemoryDimensions.length >= 1,
116
+ driver.describe('RFC 0080 §C', 'the named degraded agent MUST project memoryDegraded:true + a non-empty degradedMemoryDimensions'),
117
+ ).toBe(true);
118
+ }
119
+ }
120
+ });
121
+ });
@@ -49,7 +49,7 @@
49
49
  import { describe, it, expect } from 'vitest';
50
50
  import { driver } from '../lib/driver.js';
51
51
  import { isFixtureAdvertised } from '../lib/fixtures.js';
52
- import { pollUntilTerminal } from '../lib/polling.js';
52
+ import { pollUntil } from '../lib/polling.js';
53
53
  import { capabilityFamily } from '../lib/discovery-capabilities.js';
54
54
 
55
55
  const HTTP_SKIP = !process.env.OPENWOP_BASE_URL;
@@ -111,12 +111,17 @@ describe.skipIf(BEHAVIORAL_SKIP)('multi-agent-confidence-escalation: behavioral
111
111
  expect(create.status).toBe(201);
112
112
  const runId = (create.json as { runId: string }).runId;
113
113
 
114
- const terminal = await pollUntilTerminal(runId);
115
- // RFC 0039 escalation suspends the parent — NOT a terminal `completed`.
116
- // The conformance pollUntilTerminal returns when the run reaches any
117
- // settled status. RFC 0039 §A gives hosts a choice: clarify-kind
118
- // escalation (→ waiting-clarification) OR escalate-kind approval
119
- // ( waiting-approval).
114
+ // RFC 0039 confidence escalation SUSPENDS the parent (a `waiting-*` status) —
115
+ // it is NOT a terminal `completed`/`failed`/`cancelled`. So poll until the
116
+ // run either suspends or settles; polling only for terminal statuses
117
+ // (`pollUntilTerminal`, whose set is {completed,failed,cancelled}) would time
118
+ // out before the suspension is ever observed — the cause of the prior flake.
119
+ const terminal = await pollUntil(runId, (s) => {
120
+ const st = s.status as string;
121
+ return st.startsWith('waiting-') || st === 'completed' || st === 'failed' || st === 'cancelled';
122
+ });
123
+ // RFC 0039 §A gives hosts a choice: clarify-kind escalation
124
+ // (→ waiting-clarification) OR escalate-kind approval (→ waiting-approval).
120
125
  //
121
126
  // RFC 0044 routing: when the host advertises
122
127
  // `capabilities.multiAgent.executionModel.confidenceEscalationInterruptKind`
@@ -0,0 +1,211 @@
1
+ /**
2
+ * otel-collector-canary-inspection — always-on proof that the conformance
3
+ * OTel collector inspects real OTLP span attributes for secret leakage.
4
+ *
5
+ * Context: `secret-leakage-otel-attribute.test.ts` proves a host doesn't
6
+ * leak a BYOK canary on its `GET /v1/host/sample/test/otel/spans` scrape
7
+ * seam. But the scrape seam reports what the host *says* it emitted; a
8
+ * host could redact there yet still ship the plaintext over the wire via
9
+ * its real OTLP exporter. `docs/KNOWN-LIMITS.md` tracked exactly this gap:
10
+ *
11
+ * "The conformance OTel collector seam doesn't yet inspect span
12
+ * attributes; a host could pass conformance while leaking BYOK
13
+ * material on telemetry exports."
14
+ *
15
+ * `OtelCollector.findCanaryLeakage()` (added with this scenario) closes
16
+ * the harness half: it scans every captured span's name, attributes, and
17
+ * resource attributes — plus metric data-point attributes — for the
18
+ * canary, returning the offending surface. This file is the always-on,
19
+ * server-free proof that the inspector is NON-VACUOUS: it stands up a
20
+ * real collector, POSTs synthetic OTLP/HTTP-JSON payloads through the
21
+ * collector's actual ingest path, and asserts the inspector (a) catches a
22
+ * canary embedded in a span attribute / resource attribute / span name /
23
+ * metric attribute, and (b) reports zero hits on a redacted payload.
24
+ *
25
+ * Pairing this with the live, capability-gated collector scrape in
26
+ * `secret-leakage-otel-attribute.test.ts` means the invariant is proven
27
+ * end-to-end the moment a host exports OTLP to the collector — the
28
+ * inspector itself is no longer the missing piece.
29
+ *
30
+ * Runs unconditionally (no host, no network): it exercises the collector
31
+ * library directly, so it executes in CI on every run.
32
+ *
33
+ * @see conformance/src/lib/otel-collector.ts findCanaryLeakage
34
+ * @see SECURITY/invariants.yaml secret-leakage-otel-attribute
35
+ * @see docs/KNOWN-LIMITS.md "Behavior tests too coarse to fully prove an invariant"
36
+ */
37
+
38
+ import { describe, it, expect, afterEach } from 'vitest';
39
+ import { OtelCollector } from '../lib/otel-collector.js';
40
+
41
+ const CANARY = 'sk-canary-DO-NOT-LEAK-0f3a9c';
42
+ const REDACTED = '[REDACTED:openwop-conformance-canary-secret]';
43
+
44
+ /** Build an OTLP/HTTP-JSON traces export carrying the given span+resource attrs. */
45
+ function tracesPayload(opts: {
46
+ spanName: string;
47
+ spanAttrs: Record<string, string>;
48
+ resourceAttrs: Record<string, string>;
49
+ }): unknown {
50
+ const toAttrs = (m: Record<string, string>) =>
51
+ Object.entries(m).map(([key, value]) => ({ key, value: { stringValue: value } }));
52
+ return {
53
+ resourceSpans: [
54
+ {
55
+ resource: { attributes: toAttrs(opts.resourceAttrs) },
56
+ scopeSpans: [
57
+ {
58
+ scope: { name: 'openwop' },
59
+ spans: [
60
+ {
61
+ traceId: 'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa',
62
+ spanId: 'bbbbbbbbbbbbbbbb',
63
+ name: opts.spanName,
64
+ startTimeUnixNano: '1',
65
+ endTimeUnixNano: '2',
66
+ attributes: toAttrs(opts.spanAttrs),
67
+ },
68
+ ],
69
+ },
70
+ ],
71
+ },
72
+ ],
73
+ };
74
+ }
75
+
76
+ /** Build an OTLP/HTTP-JSON metrics export with one sum data point carrying attrs. */
77
+ function metricsPayload(metricName: string, attrs: Record<string, string>): unknown {
78
+ return {
79
+ resourceMetrics: [
80
+ {
81
+ scopeMetrics: [
82
+ {
83
+ scope: { name: 'openwop' },
84
+ metrics: [
85
+ {
86
+ name: metricName,
87
+ sum: {
88
+ dataPoints: [
89
+ {
90
+ asInt: '1',
91
+ attributes: Object.entries(attrs).map(([key, value]) => ({
92
+ key,
93
+ value: { stringValue: value },
94
+ })),
95
+ },
96
+ ],
97
+ },
98
+ },
99
+ ],
100
+ },
101
+ ],
102
+ },
103
+ ],
104
+ };
105
+ }
106
+
107
+ describe('otel-collector-canary-inspection: collector inspects real OTLP exports', () => {
108
+ let collector: OtelCollector | null = null;
109
+
110
+ afterEach(async () => {
111
+ if (collector) {
112
+ await collector.stop();
113
+ collector = null;
114
+ }
115
+ });
116
+
117
+ async function postTraces(payload: unknown): Promise<void> {
118
+ const res = await fetch(`${collector!.endpoint()}/v1/traces`, {
119
+ method: 'POST',
120
+ headers: { 'content-type': 'application/json' },
121
+ body: JSON.stringify(payload),
122
+ });
123
+ expect(res.status).toBeLessThan(300);
124
+ }
125
+
126
+ async function postMetrics(payload: unknown): Promise<void> {
127
+ const res = await fetch(`${collector!.endpoint()}/v1/metrics`, {
128
+ method: 'POST',
129
+ headers: { 'content-type': 'application/json' },
130
+ body: JSON.stringify(payload),
131
+ });
132
+ expect(res.status).toBeLessThan(300);
133
+ }
134
+
135
+ it('catches a canary embedded in a span attribute value', async () => {
136
+ collector = new OtelCollector();
137
+ await collector.start();
138
+ await postTraces(
139
+ tracesPayload({
140
+ spanName: 'openwop.node.execute',
141
+ spanAttrs: { 'openwop.node.id': 'n1', 'http.request.header.authorization': `Bearer ${CANARY}` },
142
+ resourceAttrs: { 'service.name': 'host' },
143
+ }),
144
+ );
145
+
146
+ const leaks = collector.findCanaryLeakage(CANARY);
147
+ expect(leaks.length).toBeGreaterThan(0);
148
+ const attrLeak = leaks.find((l) => l.surface === 'span.attribute');
149
+ expect(attrLeak).toBeDefined();
150
+ expect(attrLeak!.key).toBe('http.request.header.authorization');
151
+ expect(attrLeak!.value).toContain(CANARY);
152
+ });
153
+
154
+ it('catches a canary in a resource attribute and in a span name', async () => {
155
+ collector = new OtelCollector();
156
+ await collector.start();
157
+ await postTraces(
158
+ tracesPayload({
159
+ spanName: `openwop.run ${CANARY}`,
160
+ spanAttrs: { 'openwop.run.id': 'r1' },
161
+ resourceAttrs: { 'service.name': 'host', 'deployment.token': CANARY },
162
+ }),
163
+ );
164
+
165
+ const leaks = collector.findCanaryLeakage(CANARY);
166
+ const surfaces = new Set(leaks.map((l) => l.surface));
167
+ expect(surfaces.has('span.name')).toBe(true);
168
+ expect(surfaces.has('span.resourceAttribute')).toBe(true);
169
+ });
170
+
171
+ it('catches a canary in a metric data-point attribute', async () => {
172
+ collector = new OtelCollector();
173
+ await collector.start();
174
+ await postMetrics(metricsPayload('openwop.node.duration', { 'secret.echo': CANARY }));
175
+
176
+ const leaks = collector.findCanaryLeakage(CANARY);
177
+ const metricLeak = leaks.find((l) => l.surface === 'metric.attribute');
178
+ expect(metricLeak).toBeDefined();
179
+ expect(metricLeak!.emitterName).toBe('openwop.node.duration');
180
+ });
181
+
182
+ it('reports ZERO hits when the host redacts the canary before export (positive control)', async () => {
183
+ collector = new OtelCollector();
184
+ await collector.start();
185
+ await postTraces(
186
+ tracesPayload({
187
+ spanName: 'openwop.node.execute',
188
+ spanAttrs: { 'openwop.node.id': 'n1', 'http.request.header.authorization': `Bearer ${REDACTED}` },
189
+ resourceAttrs: { 'service.name': 'host', 'deployment.token': REDACTED },
190
+ }),
191
+ );
192
+ await postMetrics(metricsPayload('openwop.node.duration', { 'secret.echo': REDACTED }));
193
+
194
+ expect(collector.findCanaryLeakage(CANARY)).toEqual([]);
195
+ });
196
+
197
+ it('an empty or whitespace canary never produces a (vacuous) hit', async () => {
198
+ collector = new OtelCollector();
199
+ await collector.start();
200
+ await postTraces(
201
+ tracesPayload({
202
+ spanName: 'openwop.node.execute',
203
+ spanAttrs: { 'a': 'b' },
204
+ resourceAttrs: { 'service.name': 'host' },
205
+ }),
206
+ );
207
+
208
+ expect(collector.findCanaryLeakage('')).toEqual([]);
209
+ expect(collector.findCanaryLeakage(' ')).toEqual([]);
210
+ });
211
+ });
@@ -0,0 +1,113 @@
1
+ /**
2
+ * prompt-resolution-chain-event — RFC 0029 layer precedence on the PRODUCTION wire.
3
+ *
4
+ * The black-box, production-path counterpart to the three seam-driven
5
+ * `prompt-resolution-chain-{node-wins,agent-intrinsic,fallback-cascade}.test.ts`
6
+ * scenarios. Instead of the synchronous `POST /v1/host/sample/prompt/resolve`
7
+ * seam, this creates a real run from a prompt-exercising fixture, reads the
8
+ * run's DURABLE event log via the NORMATIVE `GET /v1/runs/{runId}/events/poll`
9
+ * endpoint, and asserts the `agent.promptResolved` event carries the full
10
+ * layer-by-layer precedence record (`spec/v1/prompts.md` §"Resolution chain") —
11
+ * no `/v1/host/sample/*` seam.
12
+ *
13
+ * The `agentPromptResolved` payload (`schemas/run-event-payloads.schema.json`)
14
+ * already REQUIRES `chain[]` with one `applied: true` entry + the full-traversal
15
+ * MUST, so the wire is already capable of conveying precedence without the seam.
16
+ * This is the "replace seam-gated proofs with black-box production-path
17
+ * conformance" step (independent-audit acceptance-bar item 3) for RFC 0029: once
18
+ * a host emits `agent.promptResolved`, prompt-chain precedence is proven on the
19
+ * production wire and the surface graduates INTO the `openwop-core-standard`
20
+ * floor (RFC 0088 §D Lever-2 → floor).
21
+ *
22
+ * Gating: soft-skips unless `capabilities.prompts.supported` AND the host
23
+ * actually emits `agent.promptResolved` for the run (emission is staged per
24
+ * RFC 0029 / RFC 0021 — a host advertising prompts might not yet emit the event).
25
+ *
26
+ * @see RFCS/0029-prompt-override-hierarchy.md
27
+ * @see spec/v1/prompts.md §"Resolution chain (normative)"
28
+ */
29
+ import { describe, it, expect } from 'vitest';
30
+ import { driver } from '../lib/driver.js';
31
+ import { capabilityFamily } from '../lib/discovery-capabilities.js';
32
+ import { pollUntilTerminal } from '../lib/polling.js';
33
+
34
+ const PROMPT_FIXTURE_ID = 'conformance-prompt-end-to-end';
35
+ const VALID_LAYERS = new Set([
36
+ 'run-configurable', 'node', 'agent-intrinsic', 'agent-overrides',
37
+ 'agent-library-default', 'workflow-defaults', 'host-defaults',
38
+ ]);
39
+
40
+ interface ChainEntry { layer?: unknown; source?: unknown; applied?: unknown }
41
+ interface PromptResolvedPayload { chain?: ChainEntry[]; resolved?: unknown }
42
+ interface RawEvent { type?: string; payload?: PromptResolvedPayload }
43
+
44
+ async function promptsSupported(): Promise<boolean> {
45
+ const res = await driver.get('/.well-known/openwop');
46
+ return capabilityFamily(res.json as Record<string, unknown> | undefined, 'prompts')?.supported === true;
47
+ }
48
+
49
+ describe('prompt-resolution-chain-event (black-box): agent.promptResolved carries the precedence record (RFC 0029)', () => {
50
+ it('the production agent.promptResolved event records the full four-layer resolution chain', async () => {
51
+ if (!(await promptsSupported())) return; // capability not advertised — skip
52
+
53
+ const create = await driver.post('/v1/runs', { workflowId: PROMPT_FIXTURE_ID });
54
+ if (create.status !== 201) {
55
+ // Fixture not seeded / run not accepted — not a prompt-chain failure.
56
+ // eslint-disable-next-line no-console
57
+ console.warn(`[prompt-resolution-chain-event] POST /v1/runs for ${PROMPT_FIXTURE_ID} returned ${create.status}; skipping the production-path assertion`);
58
+ return;
59
+ }
60
+ const runId = (create.json as { runId?: string }).runId;
61
+ if (!runId) return;
62
+ await pollUntilTerminal(runId);
63
+
64
+ const poll = await driver.get(`/v1/runs/${encodeURIComponent(runId)}/events/poll`);
65
+ const events = (poll.json as { events?: RawEvent[] } | undefined)?.events ?? [];
66
+ const resolved = events.filter((e) => e.type === 'agent.promptResolved');
67
+ if (resolved.length === 0) {
68
+ // Host advertises prompts but does not yet emit agent.promptResolved
69
+ // (RFC 0029 emission is staged) — soft-skip the behavioral assertion.
70
+ // eslint-disable-next-line no-console
71
+ console.warn('[prompt-resolution-chain-event] host emitted no agent.promptResolved event; skipping (RFC 0029 emission staged)');
72
+ return;
73
+ }
74
+
75
+ for (const ev of resolved) {
76
+ const chain = ev.payload?.chain;
77
+ expect(
78
+ Array.isArray(chain) && chain.length > 0,
79
+ driver.describe('prompts.md §Resolution chain', 'agent.promptResolved MUST carry a non-empty chain[] of attempted layers'),
80
+ ).toBe(true);
81
+ const entries = chain as ChainEntry[];
82
+
83
+ // Every entry is a well-formed layer record (the full-traversal shape).
84
+ for (const e of entries) {
85
+ expect(
86
+ typeof e.layer === 'string' && VALID_LAYERS.has(e.layer),
87
+ driver.describe('prompts.md §Resolution chain', `each chain entry MUST name a valid layer, got ${String(e.layer)}`),
88
+ ).toBe(true);
89
+ expect(typeof e.applied, driver.describe('prompts.md §Resolution chain', 'each chain entry MUST carry a boolean `applied`')).toBe('boolean');
90
+ }
91
+
92
+ // Exactly one layer wins (or none, when resolved is null).
93
+ const applied = entries.filter((e) => e.applied === true);
94
+ expect(
95
+ applied.length <= 1,
96
+ driver.describe('prompts.md §Resolution chain', 'AT MOST one chain entry MAY be applied: true (the winning layer)'),
97
+ ).toBe(true);
98
+
99
+ // resolved mirrors the applied entry's source (RFC 0029 §B).
100
+ if (applied.length === 1) {
101
+ expect(
102
+ ev.payload?.resolved,
103
+ driver.describe('run-event-payloads.schema.json agentPromptResolved', '`resolved` MUST mirror the applied chain entry\'s `source`'),
104
+ ).toBe(applied[0]?.source);
105
+ } else {
106
+ expect(
107
+ ev.payload?.resolved === null || ev.payload?.resolved === undefined,
108
+ driver.describe('run-event-payloads.schema.json agentPromptResolved', 'with no applied layer, `resolved` MUST be null'),
109
+ ).toBe(true);
110
+ }
111
+ }
112
+ });
113
+ });