@openwop/openwop-conformance 1.4.0 → 1.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. package/CHANGELOG.md +60 -0
  2. package/README.md +2 -2
  3. package/api/asyncapi.yaml +8 -3
  4. package/api/openapi.yaml +305 -0
  5. package/coverage.md +35 -10
  6. package/fixtures/conformance-phase4-nondet-tool.json +53 -0
  7. package/fixtures/conformance-phase4-replay-divergence.json +40 -0
  8. package/fixtures.md +5 -3
  9. package/package.json +1 -1
  10. package/schemas/README.md +2 -0
  11. package/schemas/capabilities.schema.json +176 -3
  12. package/schemas/credential-reference.schema.json +21 -0
  13. package/schemas/node-pack-manifest.schema.json +112 -1
  14. package/schemas/run-diff-response.schema.json +64 -0
  15. package/schemas/run-event-payloads.schema.json +104 -2
  16. package/schemas/run-event.schema.json +8 -1
  17. package/schemas/run-snapshot.schema.json +11 -0
  18. package/src/lib/behavior-gate.ts +51 -0
  19. package/src/lib/driver.ts +13 -1
  20. package/src/lib/saml-idp.ts +179 -0
  21. package/src/scenarios/approval-gate-events.test.ts +61 -0
  22. package/src/scenarios/approval-gate-flow.test.ts +68 -0
  23. package/src/scenarios/auth-saml-profile.test.ts +119 -0
  24. package/src/scenarios/auth-scim-profile.test.ts +65 -0
  25. package/src/scenarios/authorization-fail-closed.test.ts +80 -0
  26. package/src/scenarios/authorization-roles-shape.test.ts +83 -0
  27. package/src/scenarios/connector-manifest-validity.test.ts +142 -0
  28. package/src/scenarios/credential-payload-redaction.test.ts +93 -0
  29. package/src/scenarios/credentials-capability-shape.test.ts +90 -0
  30. package/src/scenarios/cross-engine-append-behavior.test.ts +204 -0
  31. package/src/scenarios/cross-host-traceparent-propagation.test.ts +13 -6
  32. package/src/scenarios/cross-workspace-isolation.test.ts +72 -0
  33. package/src/scenarios/deadletter-capability-shape.test.ts +59 -0
  34. package/src/scenarios/deadletter-retry-exhaustion.test.ts +62 -0
  35. package/src/scenarios/experimental-tier-shape.test.ts +192 -0
  36. package/src/scenarios/identity-owner-shape.test.ts +64 -0
  37. package/src/scenarios/multi-agent-confidence-escalation.test.ts +59 -21
  38. package/src/scenarios/multi-agent-memory-lifecycle.test.ts +87 -12
  39. package/src/scenarios/multi-region-idempotency-behavior.test.ts +203 -0
  40. package/src/scenarios/oauth-capability-shape.test.ts +97 -0
  41. package/src/scenarios/oauth-connector-redaction.test.ts +91 -0
  42. package/src/scenarios/pack-registry-isolation.test.ts +108 -0
  43. package/src/scenarios/pack-registry-publish.test.ts +1 -1
  44. package/src/scenarios/prompt-mutation-workspace-membership-enforced.test.ts +126 -0
  45. package/src/scenarios/prompt-read-workspace-membership-enforced.test.ts +183 -0
  46. package/src/scenarios/replay-divergence-at-refusal.test.ts +187 -7
  47. package/src/scenarios/replay-observable-sequence-determinism.test.ts +20 -6
  48. package/src/scenarios/run-diff.test.ts +143 -0
  49. package/src/scenarios/sandbox-capability-gate-respected.test.ts +15 -13
  50. package/src/scenarios/sandbox-memory-cap.test.ts +7 -8
  51. package/src/scenarios/sandbox-mvp-behavior.test.ts +280 -0
  52. package/src/scenarios/sandbox-no-cross-pack-mutation.test.ts +14 -13
  53. package/src/scenarios/sandbox-no-host-env-leak.test.ts +14 -21
  54. package/src/scenarios/sandbox-no-host-fs-escape.test.ts +20 -15
  55. package/src/scenarios/sandbox-no-host-process-escape.test.ts +18 -13
  56. package/src/scenarios/sandbox-no-network-escape.test.ts +14 -31
  57. package/src/scenarios/sandbox-timeout-cap.test.ts +7 -8
  58. package/src/scenarios/scheduling-capability-shape.test.ts +81 -0
  59. package/src/scenarios/scheduling-cron-fires-once.test.ts +66 -0
  60. package/src/scenarios/secret-leakage-otel-attribute.test.ts +241 -0
  61. package/src/scenarios/spec-corpus-validity.test.ts +2 -2
@@ -0,0 +1,59 @@
1
+ /**
2
+ * deadletter-capability-shape — RFC 0053 §A advertisement-shape verification.
3
+ *
4
+ * Status: DRAFT. RFC 0053 (dead-letter routing & failure sinks) is `Draft`.
5
+ * The `capabilities.deadLetter` block has landed in
6
+ * `schemas/capabilities.schema.json`.
7
+ *
8
+ * Always runs (shape-only): when the host advertises
9
+ * `capabilities.deadLetter`, its fields MUST be well-formed.
10
+ *
11
+ * What this scenario asserts:
12
+ * 1. `capabilities.deadLetter` is either absent or a well-formed object.
13
+ * 2. When `supported: true`, `retentionDays` (when present) is an integer ≥ 1
14
+ * (RFC 0053 §A).
15
+ *
16
+ * @see RFCS/0053-dead-letter-routing-and-failure-sinks.md
17
+ * @see spec/v1/host-capabilities.md §host.deadLetter
18
+ */
19
+
20
+ import { describe, it, expect } from 'vitest';
21
+ import { driver } from '../lib/driver.js';
22
+
23
+ interface DiscoveryDeadLetter {
24
+ supported?: boolean;
25
+ retentionDays?: number;
26
+ }
27
+
28
+ interface DiscoveryDoc {
29
+ capabilities?: { deadLetter?: DiscoveryDeadLetter };
30
+ }
31
+
32
+ async function readDeadLetter(): Promise<DiscoveryDeadLetter | null> {
33
+ const res = await driver.get('/.well-known/openwop');
34
+ const body = res.json as DiscoveryDoc | undefined;
35
+ return body?.capabilities?.deadLetter ?? null;
36
+ }
37
+
38
+ describe('deadletter-capability-shape: advertisement shape (RFC 0053 §A)', () => {
39
+ it('capabilities.deadLetter is either absent or well-formed', async () => {
40
+ const dl = await readDeadLetter();
41
+ if (dl === null) return; // host doesn't advertise deadLetter at all
42
+ expect(
43
+ typeof dl.supported,
44
+ driver.describe(
45
+ 'capabilities.schema.json §deadLetter',
46
+ 'capabilities.deadLetter.supported MUST be a boolean when deadLetter is advertised',
47
+ ),
48
+ ).toBe('boolean');
49
+ });
50
+
51
+ it('retentionDays is an integer >= 1 when present + supported', async () => {
52
+ const dl = await readDeadLetter();
53
+ if (!dl?.supported || dl.retentionDays === undefined) return;
54
+ expect(
55
+ Number.isInteger(dl.retentionDays) && dl.retentionDays >= 1,
56
+ driver.describe('RFC 0053 §A', `capabilities.deadLetter.retentionDays MUST be an integer >= 1, got: ${dl.retentionDays}`),
57
+ ).toBe(true);
58
+ });
59
+ });
@@ -0,0 +1,62 @@
1
+ /**
2
+ * deadletter-retry-exhaustion — RFC 0053 §C behavioral verification.
3
+ *
4
+ * Status: DRAFT. RFC 0053 (dead-letter routing & failure sinks) is `Draft`.
5
+ *
6
+ * Capability-gated: skips when the host does not advertise
7
+ * `capabilities.deadLetter.supported = true`.
8
+ *
9
+ * What this scenario asserts (via the optional
10
+ * `POST /v1/host/sample/deadletter/exhaust` test seam, which drives a node
11
+ * that deterministically exhausts a short retry policy):
12
+ * 1. Retry exhaustion → `run.dead_lettered` — the host emits the event
13
+ * carrying `{ runId, reason, attempts }` (RFC 0053 §C.1).
14
+ * 2. Fork-eligibility — the dead-lettered run remains forkable per RFC 0011
15
+ * within the retention window (RFC 0053 §C.2).
16
+ *
17
+ * Hosts without the seam soft-skip the behavioral probes (404). Retention
18
+ * purge is part of the deferred retention scenario (needs a clock seam).
19
+ *
20
+ * @see RFCS/0053-dead-letter-routing-and-failure-sinks.md
21
+ * @see spec/v1/host-capabilities.md §host.deadLetter
22
+ */
23
+
24
+ import { describe, it, expect } from 'vitest';
25
+ import { driver } from '../lib/driver.js';
26
+
27
+ interface DiscoveryDoc {
28
+ capabilities?: { deadLetter?: { supported?: boolean } };
29
+ }
30
+
31
+ async function deadLetterSupported(): Promise<boolean> {
32
+ const res = await driver.get('/.well-known/openwop');
33
+ return (res.json as DiscoveryDoc | undefined)?.capabilities?.deadLetter?.supported === true;
34
+ }
35
+
36
+ describe('deadletter-retry-exhaustion: retry exhaustion → dead-lettered + fork-eligible (RFC 0053 §C)', () => {
37
+ it('a retry-exhausted run emits run.dead_lettered with attempts', async () => {
38
+ if (!(await deadLetterSupported())) return; // capability-gated
39
+ const res = await driver.post('/v1/host/sample/deadletter/exhaust', { scenario: 'exhaust-retries' });
40
+ if (res.status === 404) return; // seam unwired — soft-skip
41
+ const body = res.json as { event?: { type?: string; payload?: { attempts?: number; runId?: string } } } | undefined;
42
+ expect(
43
+ body?.event?.type,
44
+ driver.describe('RFC 0053 §C.1', 'retry exhaustion MUST emit a run.dead_lettered event'),
45
+ ).toBe('run.dead_lettered');
46
+ expect(
47
+ typeof body?.event?.payload?.attempts === 'number' && body.event.payload.attempts >= 1,
48
+ driver.describe('RFC 0053 §C.1', 'run.dead_lettered MUST carry the total attempts (>= 1)'),
49
+ ).toBe(true);
50
+ });
51
+
52
+ it('the dead-lettered run is fork-eligible (RFC 0011)', async () => {
53
+ if (!(await deadLetterSupported())) return; // capability-gated
54
+ const res = await driver.post('/v1/host/sample/deadletter/exhaust', { scenario: 'fork-after-dead-letter' });
55
+ if (res.status === 404) return; // seam unwired — soft-skip
56
+ const body = res.json as { forkEligible?: boolean } | undefined;
57
+ expect(
58
+ body?.forkEligible,
59
+ driver.describe('RFC 0053 §C.2', 'a dead-lettered run MUST remain fork-eligible within the retention window'),
60
+ ).toBe(true);
61
+ });
62
+ });
@@ -0,0 +1,192 @@
1
+ /**
2
+ * experimental-tier-shape — RFC 0042 §A + §B + §D advertisement-shape probes.
3
+ *
4
+ * RFC 0042 lands the audit's "Active RFC → experimental carve-out" pattern as
5
+ * an optional `tier ∈ {"stable", "experimental"}` field on capability
6
+ * advertisements, paired with a required `experimentalUntil` ISO-8601 sunset
7
+ * date when `tier === "experimental"`. This scenario asserts:
8
+ *
9
+ * 1. Schema discipline: when `multiAgent.executionModel` advertises `tier:
10
+ * "experimental"`, `experimentalUntil` MUST be present + match
11
+ * `YYYY-MM-DD` + be ≤ 365 days in the future.
12
+ * 2. Default-mode soft-skip routing: scenarios consuming
13
+ * `experimentalGate()` honor the tier — the helper returns `false`
14
+ * under default mode for `tier: "experimental"` capabilities so the
15
+ * scenario soft-skips with a dedicated log line.
16
+ * 3. Sunset detection: a host advertising `experimentalUntil` in the
17
+ * past MUST fail discovery validation (host responsibility — the
18
+ * conformance probe simply asserts that the date format and bound
19
+ * hold for hosts that DO advertise correctly).
20
+ *
21
+ * The scenario lives at three describe levels per the RFC 0042 §D
22
+ * "Conformance suite changes" contract.
23
+ *
24
+ * @see RFCS/0042-experimental-capability-tier.md
25
+ * @see schemas/capabilities.schema.json §multiAgent.executionModel.tier
26
+ * @see conformance/src/lib/behavior-gate.ts experimentalGate()
27
+ */
28
+
29
+ import { describe, it, expect } from 'vitest';
30
+ import { driver } from '../lib/driver.js';
31
+ import { experimentalGate } from '../lib/behavior-gate.js';
32
+
33
+ const HTTP_SKIP = !process.env.OPENWOP_BASE_URL;
34
+
35
+ interface DiscoveryDoc {
36
+ capabilities?: {
37
+ multiAgent?: {
38
+ executionModel?: {
39
+ supported?: unknown;
40
+ tier?: unknown;
41
+ experimentalUntil?: unknown;
42
+ };
43
+ };
44
+ };
45
+ }
46
+
47
+ async function readDiscovery(): Promise<DiscoveryDoc | null> {
48
+ try {
49
+ const res = await driver.get('/.well-known/openwop');
50
+ if (res.status !== 200) return null;
51
+ return res.json as DiscoveryDoc;
52
+ } catch {
53
+ return null;
54
+ }
55
+ }
56
+
57
+ describe.skipIf(HTTP_SKIP)('experimental-tier-shape: §A schema discipline (RFC 0042 §A)', () => {
58
+ it('multiAgent.executionModel.tier (when present) MUST be one of {stable, experimental}', async (ctx) => {
59
+ const d = await readDiscovery();
60
+ const em = d?.capabilities?.multiAgent?.executionModel;
61
+ if (em === undefined) {
62
+ ctx.skip();
63
+ return;
64
+ }
65
+ if (em.tier === undefined) {
66
+ ctx.skip(); // tier is optional with default 'stable'
67
+ return;
68
+ }
69
+ expect(
70
+ em.tier === 'stable' || em.tier === 'experimental',
71
+ driver.describe(
72
+ 'RFCS/0042-experimental-capability-tier.md §A',
73
+ 'multiAgent.executionModel.tier MUST be one of the canonical enum values',
74
+ ),
75
+ ).toBe(true);
76
+ });
77
+
78
+ it('when tier === "experimental", experimentalUntil MUST be present + valid date', async (ctx) => {
79
+ const d = await readDiscovery();
80
+ const em = d?.capabilities?.multiAgent?.executionModel;
81
+ if (em === undefined || em.tier !== 'experimental') {
82
+ ctx.skip();
83
+ return;
84
+ }
85
+
86
+ expect(
87
+ typeof em.experimentalUntil,
88
+ driver.describe(
89
+ 'RFCS/0042-experimental-capability-tier.md §B',
90
+ 'when tier is "experimental", experimentalUntil MUST be present (the §B sunset-rule contract)',
91
+ ),
92
+ ).toBe('string');
93
+
94
+ const dateStr = em.experimentalUntil as string;
95
+ expect(
96
+ /^\d{4}-\d{2}-\d{2}$/.test(dateStr),
97
+ driver.describe(
98
+ 'RFCS/0042-experimental-capability-tier.md §B',
99
+ 'experimentalUntil MUST match YYYY-MM-DD',
100
+ ),
101
+ ).toBe(true);
102
+
103
+ const parsed = new Date(dateStr + 'T00:00:00Z');
104
+ expect(
105
+ !Number.isNaN(parsed.getTime()),
106
+ driver.describe(
107
+ 'RFCS/0042-experimental-capability-tier.md §B',
108
+ 'experimentalUntil MUST parse as a valid ISO-8601 date',
109
+ ),
110
+ ).toBe(true);
111
+ });
112
+
113
+ it('experimentalUntil MUST be ≤ 365 days in the future (sunset bound)', async (ctx) => {
114
+ const d = await readDiscovery();
115
+ const em = d?.capabilities?.multiAgent?.executionModel;
116
+ if (em === undefined || em.tier !== 'experimental') {
117
+ ctx.skip();
118
+ return;
119
+ }
120
+ if (typeof em.experimentalUntil !== 'string') {
121
+ ctx.skip(); // shape probe above will fail; don't double-fail
122
+ return;
123
+ }
124
+ const target = new Date((em.experimentalUntil as string) + 'T00:00:00Z').getTime();
125
+ const now = Date.now();
126
+ const daysAhead = (target - now) / (1000 * 60 * 60 * 24);
127
+ expect(
128
+ daysAhead <= 365,
129
+ driver.describe(
130
+ 'RFCS/0042-experimental-capability-tier.md §B',
131
+ `experimentalUntil MUST be ≤ 365 days from now (got ${Math.floor(daysAhead)} days; advertised ${em.experimentalUntil})`,
132
+ ),
133
+ ).toBe(true);
134
+ });
135
+
136
+ it('sunset detection: experimentalUntil in the past is non-conformant', async (ctx) => {
137
+ const d = await readDiscovery();
138
+ const em = d?.capabilities?.multiAgent?.executionModel;
139
+ if (em === undefined || em.tier !== 'experimental') {
140
+ ctx.skip();
141
+ return;
142
+ }
143
+ if (typeof em.experimentalUntil !== 'string') {
144
+ ctx.skip();
145
+ return;
146
+ }
147
+ const target = new Date((em.experimentalUntil as string) + 'T00:00:00Z').getTime();
148
+ const now = Date.now();
149
+ expect(
150
+ target >= now,
151
+ driver.describe(
152
+ 'RFCS/0042-experimental-capability-tier.md §B',
153
+ `experimentalUntil MUST NOT be in the past (advertised ${em.experimentalUntil}; host MUST either flip tier to stable, retract the advertisement, or re-advertise with a future date + open deprecation RFC)`,
154
+ ),
155
+ ).toBe(true);
156
+ });
157
+ });
158
+
159
+ describe.skipIf(HTTP_SKIP)('experimental-tier-shape: §D experimentalGate helper routing (RFC 0042 §D)', () => {
160
+ it('experimentalGate returns false for tier="experimental" without OPENWOP_REQUIRE_EXPERIMENTAL', () => {
161
+ // Helper-level behavioral probe — no host needed, this is a pure
162
+ // function-routing assertion against the imported helper.
163
+ const prevReqExp = process.env.OPENWOP_REQUIRE_EXPERIMENTAL;
164
+ delete process.env.OPENWOP_REQUIRE_EXPERIMENTAL;
165
+ try {
166
+ const result = experimentalGate('test-profile', true, 'experimental', '2027-05-22');
167
+ expect(
168
+ result,
169
+ driver.describe(
170
+ 'RFCS/0042-experimental-capability-tier.md §D',
171
+ 'default mode + tier="experimental" MUST soft-skip — helper returns false',
172
+ ),
173
+ ).toBe(false);
174
+ } finally {
175
+ if (prevReqExp !== undefined) process.env.OPENWOP_REQUIRE_EXPERIMENTAL = prevReqExp;
176
+ }
177
+ });
178
+
179
+ it('experimentalGate routes through behaviorGate when tier === undefined or "stable"', () => {
180
+ const prevReqBeh = process.env.OPENWOP_REQUIRE_BEHAVIOR;
181
+ delete process.env.OPENWOP_REQUIRE_BEHAVIOR;
182
+ try {
183
+ // Stable + advertised → proceed.
184
+ expect(experimentalGate('test-stable', true, 'stable')).toBe(true);
185
+ expect(experimentalGate('test-stable-undef', true, undefined)).toBe(true);
186
+ // Stable + NOT advertised, default mode → skip (returns false, no throw).
187
+ expect(experimentalGate('test-not-adv', false, 'stable')).toBe(false);
188
+ } finally {
189
+ if (prevReqBeh !== undefined) process.env.OPENWOP_REQUIRE_BEHAVIOR = prevReqBeh;
190
+ }
191
+ });
192
+ });
@@ -0,0 +1,64 @@
1
+ /**
2
+ * identity-owner-shape — RFC 0048 §C verification.
3
+ *
4
+ * Status: DRAFT. RFC 0048 (tenant·workspace·principal identity model) is
5
+ * `Draft`. The optional `RunSnapshot.owner` triple has landed in
6
+ * `schemas/run-snapshot.schema.json`.
7
+ *
8
+ * Server-free schema validation of the owner triple:
9
+ * - Positive: `{ tenant }` and `{ tenant, workspace, principal }` validate.
10
+ * - Negative: missing `tenant` (required), or an unknown property, is rejected.
11
+ *
12
+ * The owner subschema is self-contained (no external $ref), so it compiles
13
+ * standalone via ajv.
14
+ *
15
+ * @see RFCS/0048-tenant-workspace-principal-identity-model.md
16
+ * @see schemas/run-snapshot.schema.json properties.owner
17
+ */
18
+
19
+ import { describe, it, expect } from 'vitest';
20
+ import { readFileSync } from 'node:fs';
21
+ import { join } from 'node:path';
22
+ import Ajv2020 from 'ajv/dist/2020.js';
23
+ import { SCHEMAS_DIR } from '../lib/paths.js';
24
+
25
+ interface SnapshotSchema {
26
+ $schema: string;
27
+ properties: { owner?: Record<string, unknown> };
28
+ }
29
+
30
+ const snapshot = JSON.parse(
31
+ readFileSync(join(SCHEMAS_DIR, 'run-snapshot.schema.json'), 'utf8'),
32
+ ) as SnapshotSchema;
33
+
34
+ describe('category: identity owner-triple shape (RFC 0048 §C)', () => {
35
+ it('run-snapshot.schema.json defines an optional owner triple', () => {
36
+ expect(
37
+ snapshot.properties.owner,
38
+ 'RFC 0048 §C: RunSnapshot MUST define an optional `owner` object',
39
+ ).toBeDefined();
40
+ });
41
+
42
+ const ajv = new Ajv2020({ allErrors: true, strict: false });
43
+ const ownerSchema = { $schema: snapshot.$schema, ...(snapshot.properties.owner as Record<string, unknown>) };
44
+ const validate = ajv.compile(ownerSchema);
45
+
46
+ it('positive: tenant-only owner validates', () => {
47
+ expect(validate({ tenant: 'acme' }), JSON.stringify(validate.errors)).toBe(true);
48
+ });
49
+
50
+ it('positive: full triple validates', () => {
51
+ expect(
52
+ validate({ tenant: 'acme', workspace: 'ws-eng', principal: 'user_42' }),
53
+ JSON.stringify(validate.errors),
54
+ ).toBe(true);
55
+ });
56
+
57
+ it('negative: owner missing tenant is rejected (tenant is required)', () => {
58
+ expect(validate({ workspace: 'ws-eng' })).toBe(false);
59
+ });
60
+
61
+ it('negative: unknown owner property is rejected (additionalProperties:false)', () => {
62
+ expect(validate({ tenant: 'acme', role: 'admin' })).toBe(false);
63
+ });
64
+ });
@@ -1,15 +1,16 @@
1
1
  /**
2
2
  * multi-agent-confidence-escalation — RFC 0039 §A behavioral.
3
3
  *
4
- * Status: ACTIVE (advertisement-shape + behavioral). RFC 0039 Phase 2
5
- * filed Draft graduated Active 2026-05-22 in the same commit chain as
6
- * this scenario. Capability-gated on
4
+ * Status: ACTIVE (advertisement-shape + behavioral). RFC 0039
5
+ * (multi-agent execution model `version: 2`) was filed as Draft and graduated
6
+ * to Active on 2026-05-22 in the same commit chain as this scenario.
7
+ * Capability-gated on
7
8
  * `capabilities.multiAgent.executionModel.supported: true` AND
8
9
  * `capabilities.multiAgent.executionModel.version >= 2` AND fixture
9
- * availability. Hosts that advertise only Phase 1 (version: 1) soft-skip
10
- * cleanly — the confidence-floor MUST applies only at version >= 2.
10
+ * availability. Hosts that advertise only `version: 1` soft-skip
11
+ * cleanly — the confidence-floor MUST applies only at `version >= 2`.
11
12
  *
12
- * Asserts (behavioral when host advertises Phase 2):
13
+ * Asserts (behavioral when host advertises `version >= 2`):
13
14
  *
14
15
  * 1. Advertisement shape: confidenceEscalationFloor (when present) MUST be
15
16
  * a number in [0.5, 1.0]; floor < 0.5 is non-conformant per RFC 0039 §A.
@@ -37,11 +38,11 @@
37
38
  * interrupt fires AND BEFORE any `core.workflowChain.event` with
38
39
  * `phase: 'dispatch.began'` for the escalated decision's intended
39
40
  * next-worker"). This is the load-bearing test that distinguishes
40
- * Phase 2 from Phase 1: Phase 1 hosts dispatch unconditionally; Phase 2
41
- * hosts gate on confidence.
41
+ * `version: 2` from `version: 1`: `version: 1` hosts dispatch
42
+ * unconditionally; `version: 2` hosts gate on confidence.
42
43
  *
43
44
  * @see RFCS/0039-multi-agent-confidence-and-memory-lifecycle.md §A
44
- * @see spec/v1/multi-agent-execution.md §"Confidence escalation (RFC 0039 Phase 2)"
45
+ * @see spec/v1/multi-agent-execution.md §"Confidence escalation (RFC 0039)"
45
46
  * @see schemas/run-event-payloads.schema.json §coreWorkflowChainConfidenceEscalated
46
47
  */
47
48
 
@@ -63,6 +64,7 @@ interface DiscoveryDoc {
63
64
  supported?: unknown;
64
65
  version?: unknown;
65
66
  confidenceEscalationFloor?: unknown;
67
+ confidenceEscalationInterruptKind?: unknown;
66
68
  };
67
69
  };
68
70
  };
@@ -102,24 +104,60 @@ describe.skipIf(BEHAVIORAL_SKIP)('multi-agent-confidence-escalation: behavioral
102
104
  const supported = d?.capabilities?.multiAgent?.executionModel?.supported === true;
103
105
  const versionRaw = d?.capabilities?.multiAgent?.executionModel?.version;
104
106
  const version = typeof versionRaw === 'number' ? versionRaw : 0;
105
- if (!supported || version < 2) return; // soft-skip — Phase 1 hosts pass via this absence
107
+ if (!supported || version < 2) return; // soft-skip — `version: 1` hosts pass via this absence
106
108
 
107
109
  const create = await driver.post('/v1/runs', { workflowId: FIXTURE });
108
110
  expect(create.status).toBe(201);
109
111
  const runId = (create.json as { runId: string }).runId;
110
112
 
111
113
  const terminal = await pollUntilTerminal(runId);
112
- // Phase 2 escalation suspends the parent — NOT a terminal `completed`.
114
+ // RFC 0039 escalation suspends the parent — NOT a terminal `completed`.
113
115
  // The conformance pollUntilTerminal returns when the run reaches any
114
- // settled status; we expect `waiting-clarification` or equivalent
115
- // non-completed status carrying an open clarification interrupt.
116
- expect(
117
- terminal.status,
118
- driver.describe(
119
- 'RFCS/0039-multi-agent-confidence-and-memory-lifecycle.md §A + spec/v1/interrupt.md',
120
- 'a host emitting `interrupt.kind: "clarification"` MUST surface the run as `waiting-clarification` per spec/v1/interrupt.md §"Interrupt kinds"; low-confidence decision MUST NOT reach `completed` because no dispatch fired',
121
- ),
122
- ).toBe('waiting-clarification');
116
+ // settled status. RFC 0039 §A gives hosts a choice: clarify-kind
117
+ // escalation (→ waiting-clarification) OR escalate-kind approval
118
+ // (→ waiting-approval).
119
+ //
120
+ // RFC 0044 routing: when the host advertises
121
+ // `capabilities.multiAgent.executionModel.confidenceEscalationInterruptKind`
122
+ // the scenario derives the expected terminal-status from that advertisement
123
+ // (canonical kinds map 1:1 to waiting-clarification / waiting-approval per
124
+ // `interrupt.md`; vendor `x-host-<host>-<kind>` kinds accept any waiting-*
125
+ // status — the host's own interrupt.md mapping determines the suffix).
126
+ // When the host does NOT advertise the field, fall back to the canonical
127
+ // either-status check.
128
+ const advertisedKind = d?.capabilities?.multiAgent?.executionModel?.confidenceEscalationInterruptKind;
129
+ const isVendorKind = typeof advertisedKind === 'string' && /^x-host-[a-z][a-z0-9-]*-[a-z][a-z0-9-]*$/.test(advertisedKind);
130
+ const isCanonicalKind = advertisedKind === 'clarification' || advertisedKind === 'approval';
131
+
132
+ if (isCanonicalKind) {
133
+ const expectedStatus = advertisedKind === 'clarification' ? 'waiting-clarification' : 'waiting-approval';
134
+ expect(
135
+ terminal.status,
136
+ driver.describe(
137
+ 'RFCS/0044-confidence-escalation-interrupt-kind-advertisement.md §B',
138
+ `host advertising confidenceEscalationInterruptKind: "${advertisedKind}" MUST surface the run as "${expectedStatus}" per spec/v1/interrupt.md §"Interrupt kinds"`,
139
+ ),
140
+ ).toBe(expectedStatus);
141
+ } else if (isVendorKind) {
142
+ const status = terminal.status as string;
143
+ expect(
144
+ typeof status === 'string' && status.startsWith('waiting-'),
145
+ driver.describe(
146
+ 'RFCS/0044-confidence-escalation-interrupt-kind-advertisement.md §B',
147
+ `host advertising vendor confidenceEscalationInterruptKind ("${advertisedKind}") MUST surface the run as a waiting-* status; the suffix is determined by the host's interrupt.md mapping (see the host's vendor-extensions doc per RFC 0044 §C)`,
148
+ ),
149
+ ).toBe(true);
150
+ } else {
151
+ // No advertisement — fall back to the canonical either-status check.
152
+ const acceptedStatuses = ['waiting-clarification', 'waiting-approval'];
153
+ expect(
154
+ acceptedStatuses.includes(terminal.status as string),
155
+ driver.describe(
156
+ 'RFCS/0039-multi-agent-confidence-and-memory-lifecycle.md §A + spec/v1/interrupt.md',
157
+ 'a host below the confidence floor MUST surface the run as `waiting-clarification` (clarify-kind escalation) OR `waiting-approval` (escalate-kind escalation) per RFC 0039 §A; the low-confidence decision MUST NOT reach `completed` because no dispatch fired',
158
+ ),
159
+ ).toBe(true);
160
+ }
123
161
 
124
162
  const eventsRes = await driver.get(`/v1/runs/${encodeURIComponent(runId)}/events`);
125
163
  expect(eventsRes.status).toBe(200);
@@ -151,7 +189,7 @@ describe.skipIf(BEHAVIORAL_SKIP)('multi-agent-confidence-escalation: behavioral
151
189
  'confidence-escalated causationId MUST point at the runOrchestrator.decided that surfaced the low-confidence decision',
152
190
  ).toBe('runOrchestrator.decided');
153
191
 
154
- // Load-bearing: NO dispatch event fired. Phase 2 gates BEFORE the loop.
192
+ // Load-bearing: NO dispatch event fired. RFC 0039 gates BEFORE the loop.
155
193
  const chainEvents = events.filter((e) => e.type === 'core.workflowChain.event');
156
194
  expect(
157
195
  chainEvents.length,
@@ -108,17 +108,92 @@ describe.skipIf(HTTP_SKIP)('multi-agent-memory-lifecycle: behavioral (RFC 0039
108
108
  // Until a memory-advertising Phase 2 host wires the seam, the contract
109
109
  // is documentation-only — surfaced as `todo` so test reporters track
110
110
  // the gap rather than reporting a vacuous PASS.
111
- it.todo('MAE-2 cross-run TTL: child write expiresAt MUST be anchored at child write time, not parent start');
111
+ // MAE-2 is still out of stable profile via RFC 0042 §B (experimental
112
+ // tier): RFC 0039 §B Half B (MAE-2 + MAE-3) landed on MyndHyve
113
+ // 2026-05-23 via commit `a51f7bbd` (`snapshotAtSeq()` +
114
+ // `crossChildMemoryConcurrency: 'strict'`). The MAE-2 cross-run-ttl-
115
+ // roundtrip seam (POST /v1/host/sample/test/memory/cross-run-ttl-
116
+ // roundtrip) is still open per host-sample-test-seams.md §"Open seams"
117
+ // — no host has wired the seam endpoint yet, so the behavioral
118
+ // assertion stays `it.skip`. Hosts that implement Half B SHOULD
119
+ // advertise `multiAgent.executionModel.tier: 'experimental'` per
120
+ // RFC 0042 §A until the seam contract is wired.
121
+ it.skip('MAE-2 cross-run TTL: child write expiresAt MUST be anchored at child write time, not parent start — out of stable profile via RFC 0042');
112
122
 
113
- // Behavioral assertion lands when the host implements the snapshot
114
- // mechanism per RFC 0039 §B. The assertion drives:
115
- // 1. Run a workflow that writes MemoryEntry { key: 'k', value: 'v1' } at index 10.
116
- // 2. Write MemoryEntry { key: 'k', value: 'v2' } at index 20.
117
- // 3. POST /v1/runs/{runId}:fork { fromSeq: 15 }.
118
- // 4. Forked run reads MemoryEntry { key: 'k' }; MUST return 'v1' (not 'v2').
119
- // 5. Alternative compliance: fork refused with
120
- // error.code: 'replay_memory_snapshot_unavailable' AND
121
- // details.fromSeq === 15.
122
- // Silent substitution of v2 (current state) is non-conformant.
123
- it.todo('MAE-3 replay snapshot: fork from past index MUST return memory-as-of-index OR refuse with replay_memory_snapshot_unavailable');
123
+ // MAE-3 flipped to behavioral 2026-05-25: MyndHyve workflow-runtime
124
+ // revision `00206-tdh` advertises Phase 2 + memory and honors the
125
+ // POST /v1/runs/{runId}:fork mode:replay contract per
126
+ // host-sample-test-seams.md §"Canonical-endpoint conformance hooks"
127
+ // §9. The seam reuses the canonical fork endpoint plus the
128
+ // OPENWOP_TEST_EXPIRED_REPLAY_RUN_ID env-var convention (parallel
129
+ // naming to OPENWOP_TEST_EXPIRED_RUN_ID used by
130
+ // production-retention-expiry). Soft-skips on Phase 1 hosts, Phase 2
131
+ // hosts without memory, and hosts that have not seeded the env var.
132
+ it('MAE-3 replay snapshot refusal: fork mode:replay against a past-retention runId MUST return 422 replay_memory_snapshot_unavailable with documented envelope; silent substitution is non-conformant', async (ctx) => {
133
+ const d = await readDiscovery();
134
+ if (d === null) {
135
+ ctx.skip();
136
+ return;
137
+ }
138
+ const v = d.capabilities?.multiAgent?.executionModel?.version;
139
+ const memorySupported = d.capabilities?.memory?.supported;
140
+ const phase2OrLater = typeof v === 'number' && v >= 2;
141
+ const expiredRunId = process.env.OPENWOP_TEST_EXPIRED_REPLAY_RUN_ID;
142
+ if (!phase2OrLater || memorySupported !== true || !expiredRunId) {
143
+ ctx.skip();
144
+ return;
145
+ }
146
+
147
+ const fromSeq = 0;
148
+ const res = await driver.post(`/v1/runs/${encodeURIComponent(expiredRunId)}:fork`, {
149
+ mode: 'replay',
150
+ fromSeq,
151
+ });
152
+
153
+ expect(
154
+ res.status,
155
+ driver.describe(
156
+ 'RFCS/0039-multi-agent-confidence-and-memory-lifecycle.md §B MAE-3',
157
+ 'fork mode:replay against a past-retention runId MUST refuse with 422; silent substitution of current memory is non-conformant',
158
+ ),
159
+ ).toBe(422);
160
+
161
+ const body = res.json as {
162
+ error?: unknown;
163
+ details?: { fromSeq?: unknown; sourceRunId?: unknown; reason?: unknown };
164
+ } | null;
165
+
166
+ expect(
167
+ body?.error,
168
+ driver.describe(
169
+ 'RFCS/0039-multi-agent-confidence-and-memory-lifecycle.md §B MAE-3',
170
+ 'refusal envelope error code MUST be "replay_memory_snapshot_unavailable" (distinct from the pre-flight invalid_from_seq gate)',
171
+ ),
172
+ ).toBe('replay_memory_snapshot_unavailable');
173
+
174
+ expect(
175
+ body?.details?.fromSeq,
176
+ driver.describe(
177
+ 'RFCS/0039-multi-agent-confidence-and-memory-lifecycle.md §B MAE-3',
178
+ 'refusal envelope details.fromSeq MUST echo the requested fromSeq',
179
+ ),
180
+ ).toBe(fromSeq);
181
+
182
+ expect(
183
+ body?.details?.sourceRunId,
184
+ driver.describe(
185
+ 'RFCS/0039-multi-agent-confidence-and-memory-lifecycle.md §B MAE-3',
186
+ 'refusal envelope details.sourceRunId MUST echo the runId from the URL',
187
+ ),
188
+ ).toBe(expiredRunId);
189
+
190
+ const reason = body?.details?.reason;
191
+ expect(
192
+ reason === 'retention_expired' || reason === 'event_log_unavailable',
193
+ driver.describe(
194
+ 'RFCS/0039-multi-agent-confidence-and-memory-lifecycle.md §B MAE-3',
195
+ 'refusal envelope details.reason MUST be one of {"retention_expired", "event_log_unavailable"}',
196
+ ),
197
+ ).toBe(true);
198
+ });
124
199
  });