@openwop/openwop-conformance 1.3.0 → 1.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +91 -1
- package/README.md +3 -2
- package/api/asyncapi.yaml +8 -0
- package/api/openapi.yaml +371 -1
- package/coverage.md +25 -5
- package/fixtures/conformance-envelope-nl-to-format-engaged.json +41 -0
- package/fixtures/conformance-envelope-recovery-applied.json +39 -0
- package/fixtures/conformance-envelope-refusal.json +38 -0
- package/fixtures/conformance-envelope-retry-attempted.json +39 -0
- package/fixtures/conformance-envelope-retry-exhausted.json +38 -0
- package/fixtures/conformance-envelope-truncated.json +39 -0
- package/fixtures/conformance-envelope-truncation-cap-exhaustion.json +39 -0
- package/fixtures/conformance-model-capability-insufficient.json +25 -0
- package/fixtures/conformance-multi-agent-confidence-escalation.json +49 -0
- package/fixtures/conformance-multi-agent-handoff-child.json +27 -0
- package/fixtures/conformance-multi-agent-handoff.json +49 -0
- package/fixtures/conformance-prompt-all-four-kinds.json +39 -0
- package/fixtures/conformance-prompt-end-to-end.json +33 -0
- package/fixtures/conformance-subworkflow-mid-run-mutation-child.json +31 -0
- package/fixtures/conformance-subworkflow-mid-run-mutation.json +33 -0
- package/fixtures/openwop-smoke-cost-emit.json +37 -0
- package/fixtures/prompt-templates/conformance-prompt-few-shot-2.json +14 -0
- package/fixtures/prompt-templates/conformance-prompt-few-shot.json +14 -0
- package/fixtures/prompt-templates/conformance-prompt-schema-hint.json +14 -0
- package/fixtures/prompt-templates/conformance-prompt-secret-redaction.json +23 -0
- package/fixtures/prompt-templates/conformance-prompt-trust-marker.json +23 -0
- package/fixtures/prompt-templates/conformance-prompt-writer-system.json +15 -0
- package/fixtures/prompt-templates/conformance-prompt-writer-user.json +15 -0
- package/fixtures.md +39 -0
- package/package.json +1 -1
- package/schemas/README.md +5 -0
- package/schemas/agent-manifest.schema.json +16 -0
- package/schemas/capabilities.schema.json +375 -1
- package/schemas/envelopes/clarification.request.schema.json +9 -0
- package/schemas/envelopes/error.schema.json +4 -0
- package/schemas/envelopes/schema.request.schema.json +4 -0
- package/schemas/envelopes/schema.response.schema.json +1 -1
- package/schemas/node-pack-manifest.schema.json +28 -0
- package/schemas/orchestrator-decision.schema.json +12 -0
- package/schemas/prompt-kind.schema.json +8 -0
- package/schemas/prompt-pack-manifest.schema.json +80 -0
- package/schemas/prompt-ref.schema.json +40 -0
- package/schemas/prompt-template.schema.json +149 -0
- package/schemas/registry-version-manifest.schema.json +5 -0
- package/schemas/run-ancestry-response.schema.json +54 -0
- package/schemas/run-event-payloads.schema.json +479 -11
- package/schemas/run-event.schema.json +15 -1
- package/schemas/run-snapshot.schema.json +3 -2
- package/schemas/workflow-definition.schema.json +19 -1
- package/src/lib/llm-cache-key-recipe.ts +68 -0
- package/src/scenarios/aiEnvelope.contractRefusal.test.ts +104 -13
- package/src/scenarios/aiEnvelope.correlationReplay.test.ts +32 -15
- package/src/scenarios/aiEnvelope.redaction.test.ts +6 -5
- package/src/scenarios/aiEnvelope.schemaDrift.test.ts +5 -5
- package/src/scenarios/aiEnvelope.trustBoundaryPropagation.test.ts +211 -12
- package/src/scenarios/aiEnvelope.universalKinds.test.ts +7 -7
- package/src/scenarios/blob-presign-expiry.test.ts +7 -7
- package/src/scenarios/cache-ttl-expiry.test.ts +6 -6
- package/src/scenarios/cost-attribution.test.ts +124 -11
- package/src/scenarios/cross-engine-append-ordering.test.ts +99 -0
- package/src/scenarios/cross-host-ancestry-endpoint.test.ts +136 -0
- package/src/scenarios/cross-host-causation-shape.test.ts +117 -0
- package/src/scenarios/cross-host-traceparent-propagation.test.ts +60 -0
- package/src/scenarios/envelope-completion-distinguishes-truncation.test.ts +223 -0
- package/src/scenarios/envelope-nl-to-format-engaged.test.ts +152 -0
- package/src/scenarios/envelope-reasoning-secret-redaction.test.ts +343 -0
- package/src/scenarios/envelope-reasoning-shape.test.ts +190 -0
- package/src/scenarios/envelope-recovery-applied.test.ts +229 -0
- package/src/scenarios/envelope-refusal-shape.test.ts +289 -0
- package/src/scenarios/envelope-retry-attempted.test.ts +258 -0
- package/src/scenarios/envelope-retry-exhausted.test.ts +168 -0
- package/src/scenarios/envelope-tier-one-subset-static.test.ts +229 -0
- package/src/scenarios/envelope-truncated.test.ts +136 -0
- package/src/scenarios/envelope-truncation-cap-exhaustion.test.ts +144 -0
- package/src/scenarios/envelope-variant-discriminator-static.test.ts +152 -0
- package/src/scenarios/fixtures-valid.test.ts +123 -15
- package/src/scenarios/kv-ttl-expiry.test.ts +7 -7
- package/src/scenarios/model-capability-insufficient.test.ts +221 -0
- package/src/scenarios/model-capability-substituted.test.ts +203 -0
- package/src/scenarios/multi-agent-confidence-escalation.test.ts +164 -0
- package/src/scenarios/multi-agent-handoff-state-machine.test.ts +167 -0
- package/src/scenarios/multi-agent-memory-lifecycle.test.ts +124 -0
- package/src/scenarios/multi-region-idempotency.test.ts +58 -0
- package/src/scenarios/node-module-required-capabilities-shape.test.ts +185 -0
- package/src/scenarios/prompt-all-four-kinds-events.test.ts +198 -0
- package/src/scenarios/prompt-composed-secret-redaction.test.ts +178 -0
- package/src/scenarios/prompt-composed-trust-marker.test.ts +165 -0
- package/src/scenarios/prompt-end-to-end-events.test.ts +202 -0
- package/src/scenarios/prompt-list-and-fetch.test.ts +207 -0
- package/src/scenarios/prompt-mutable-lifecycle.test.ts +216 -0
- package/src/scenarios/prompt-pack-install.test.ts +187 -0
- package/src/scenarios/prompt-render-deterministic.test.ts +240 -0
- package/src/scenarios/prompt-resolution-chain-agent-intrinsic.test.ts +140 -0
- package/src/scenarios/prompt-resolution-chain-fallback-cascade.test.ts +172 -0
- package/src/scenarios/prompt-resolution-chain-node-wins.test.ts +144 -0
- package/src/scenarios/prompt-template-shape.test.ts +359 -0
- package/src/scenarios/queue-ack-nack-dlq.test.ts +7 -7
- package/src/scenarios/queue-publish-consume-roundtrip.test.ts +7 -7
- package/src/scenarios/replay-divergence-at-refusal.test.ts +134 -0
- package/src/scenarios/replay-llm-cache-key-portable.test.ts +197 -0
- package/src/scenarios/replay-llm-cache-key.test.ts +1 -40
- package/src/scenarios/replay-observable-sequence-determinism.test.ts +80 -0
- package/src/scenarios/sandbox-capability-gate-respected.test.ts +31 -0
- package/src/scenarios/sandbox-memory-cap.test.ts +61 -0
- package/src/scenarios/sandbox-no-cross-pack-mutation.test.ts +35 -0
- package/src/scenarios/sandbox-no-host-env-leak.test.ts +38 -0
- package/src/scenarios/sandbox-no-host-fs-escape.test.ts +91 -0
- package/src/scenarios/sandbox-no-host-process-escape.test.ts +30 -0
- package/src/scenarios/sandbox-no-network-escape.test.ts +49 -0
- package/src/scenarios/sandbox-timeout-cap.test.ts +61 -0
- package/src/scenarios/search-bm25-roundtrip.test.ts +7 -7
- package/src/scenarios/spec-corpus-validity.test.ts +34 -6
- package/src/scenarios/sql-transaction-atomicity.test.ts +6 -6
- package/src/scenarios/stream-subscribe-from-beginning.test.ts +7 -7
- package/src/scenarios/subworkflow-input-mapping.test.ts +70 -4
- package/src/scenarios/table-cursor-pagination.test.ts +7 -7
- package/src/scenarios/table-schema-enforcement.test.ts +7 -7
- package/src/scenarios/vector-knn-roundtrip.test.ts +7 -7
|
@@ -0,0 +1,221 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* model-capability-insufficient — RFC 0031 §B step 4 + §D runtime behavior.
|
|
3
|
+
*
|
|
4
|
+
* Capability-gated on `capabilities.modelCapabilities.supported: true`.
|
|
5
|
+
* Drives the host's `POST /v1/host/sample/test/evaluate-model-capability-gate`
|
|
6
|
+
* seam through the refusal branches of the §B 4-step dispatch flow.
|
|
7
|
+
*
|
|
8
|
+
* @see RFCS/0031-envelope-variants-and-model-capabilities.md §B step 4 + §D
|
|
9
|
+
* @see schemas/run-event-payloads.schema.json §modelCapabilityInsufficient
|
|
10
|
+
*/
|
|
11
|
+
|
|
12
|
+
import { describe, it, expect } from 'vitest';
|
|
13
|
+
import { driver } from '../lib/driver.js';
|
|
14
|
+
|
|
15
|
+
// True when OPENWOP_BASE_URL is unset/empty; passed to `describe.skipIf` so the HTTP-backed suites below are skipped.
const HTTP_SKIP = !process.env.OPENWOP_BASE_URL;
|
|
16
|
+
|
|
17
|
+
/** Routing decisions the capability-gate seam can report. */
type GateRoute = 'dispatch' | 'substitute' | 'refuse';

/** The `outcome` object of a gate response; every field is optional on the wire. */
interface GateDecision {
  route?: GateRoute;
  missingCapabilities?: string[];
  fallbackAttempted?: boolean;
}

/** The event the seam reports alongside its decision. */
interface EmittedGateEvent {
  type?: string;
  payload?: Record<string, unknown>;
}

/**
 * Response body of `POST /v1/host/sample/test/evaluate-model-capability-gate`
 * as consumed by the refusal scenarios in this file.
 */
interface GateResponse {
  outcome?: GateDecision;
  event?: EmittedGateEvent | null;
}
|
|
25
|
+
|
|
26
|
+
/**
 * Posts `input` to the host's model-capability-gate test seam.
 *
 * @param input Synthetic gate input forwarded verbatim as the request body.
 * @returns The HTTP status together with the response JSON viewed as a `GateResponse`
 *          (an unchecked cast — callers inspect `status` before trusting `body`).
 */
async function evaluateGate(input: Record<string, unknown>): Promise<{ status: number; body: GateResponse }> {
  const reply = await driver.post('/v1/host/sample/test/evaluate-model-capability-gate', input);
  const status = reply.status;
  const body = reply.json as GateResponse;
  return { status, body };
}
|
|
30
|
+
|
|
31
|
+
/**
 * Refusal branches of the RFC 0031 §B dispatch flow, driven through the host's
 * `evaluate-model-capability-gate` test seam. Each case returns early
 * (soft-skip) when the seam answers 404.
 */
describe.skipIf(HTTP_SKIP)('model-capability-insufficient: dispatch refusal (RFC 0031 §B step 4 + §D)', () => {
  it('unmet + NO fallbackModel declared → refuse with fallbackAttempted: false', async () => {
    const { status, body } = await evaluateGate({
      // The module intentionally declares no fallbackModel.
      module: { requiredModelCapabilities: ['structured-output', 'reasoning'] },
      activeProvider: 'unknown-vendor',
      activeModel: 'unknown-model',
      substitutionSupported: true,
      supportedProviders: ['unknown-vendor'],
      nodeId: 'editor-node',
    });
    if (status === 404) return;

    // Routing decision reported by the gate.
    expect(
      body.outcome?.route,
      driver.describe(
        'RFCS/0031-envelope-variants-and-model-capabilities.md §B step 4',
        'unmet capability + no fallbackModel declared → host MUST refuse',
      ),
    ).toBe('refuse');
    expect(
      body.outcome?.fallbackAttempted,
      driver.describe(
        'schemas/run-event-payloads.schema.json §modelCapabilityInsufficient',
        'fallbackAttempted MUST be false when no fallbackModel was declared on the NodeModule',
      ),
    ).toBe(false);

    // Event the host would emit for the refusal.
    expect(
      body.event?.type,
      driver.describe(
        'RFCS/0031-envelope-variants-and-model-capabilities.md §D',
        'refuse path MUST emit `model.capability.insufficient` BEFORE the node failure',
      ),
    ).toBe('model.capability.insufficient');
    const emitted: Record<string, unknown> = body.event?.payload ?? {};
    expect(emitted.nodeId).toBe('editor-node');
    expect(emitted.provider).toBe('unknown-vendor');
    expect(emitted.fallbackAttempted).toBe(false);
    expect(Array.isArray(emitted.missingCapabilities)).toBe(true);
  });

  it('unmet + fallback declared but provider NOT in supportedProviders → refuse with fallbackAttempted: true', async () => {
    const { status, body } = await evaluateGate({
      module: {
        requiredModelCapabilities: ['structured-output'],
        fallbackModel: { provider: 'unauthenticated-vendor', model: 'foo' },
      },
      activeProvider: 'unknown-vendor',
      activeModel: 'unknown-model',
      substitutionSupported: true,
      // The fallback's provider is absent from this list, so the host cannot
      // authenticate against it (RFC 0031 §B step 3, final clause).
      supportedProviders: ['anthropic', 'unknown-vendor'],
    });
    if (status === 404) return;

    expect(body.outcome?.route).toBe('refuse');
    expect(
      body.outcome?.fallbackAttempted,
      driver.describe(
        'RFCS/0031-envelope-variants-and-model-capabilities.md §B step 3',
        'fallback provider NOT in capabilities.aiProviders.supported[] → host cannot authenticate → fallbackAttempted MUST be true (the attempt failed at credential resolution)',
      ),
    ).toBe(true);
    expect(body.event?.type).toBe('model.capability.insufficient');
  });

  it('unmet + substitutionSupported: false (host posture) → refuse with fallbackAttempted: false', async () => {
    const { status, body } = await evaluateGate({
      module: {
        requiredModelCapabilities: ['structured-output'],
        fallbackModel: { provider: 'anthropic', model: 'claude-opus-4-7' },
      },
      activeProvider: 'unknown-vendor',
      activeModel: 'unknown-model',
      substitutionSupported: false,
      supportedProviders: ['anthropic', 'unknown-vendor'],
    });
    if (status === 404) return;

    expect(body.outcome?.route).toBe('refuse');
    expect(
      body.outcome?.fallbackAttempted,
      driver.describe(
        'RFCS/0031-envelope-variants-and-model-capabilities.md §E',
        'capabilities.modelCapabilities.substitutionSupported: false → host MUST NOT attempt fallback even when NodeModule.fallbackModel is declared → fallbackAttempted MUST be false (no attempt was made)',
      ),
    ).toBe(false);
  });

  it('recursive fallback NOT permitted — fallback that itself fails capability check → refuse with fallbackAttempted: true', async () => {
    // The fallback's provider IS listed in supportedProviders, but (per the
    // seam's probe map) 'unknown-vendor-2' advertises no capabilities, so the
    // fallback itself fails the check. The gate must refuse with
    // fallbackAttempted: true (RFC 0031 §"Unresolved questions" #3).
    const { status, body } = await evaluateGate({
      module: {
        requiredModelCapabilities: ['structured-output'],
        fallbackModel: { provider: 'unknown-vendor-2', model: 'fallback-model' },
      },
      activeProvider: 'unknown-vendor',
      activeModel: 'unknown-model',
      substitutionSupported: true,
      supportedProviders: ['unknown-vendor', 'unknown-vendor-2'],
    });
    if (status === 404) return;

    expect(body.outcome?.route).toBe('refuse');
    expect(
      body.outcome?.fallbackAttempted,
      driver.describe(
        'RFCS/0031-envelope-variants-and-model-capabilities.md §"Unresolved questions" #3',
        'recursive fallback NOT permitted — when the declared fallback model itself fails the capability check, host MUST refuse with fallbackAttempted: true (NOT chain to another fallback)',
      ),
    ).toBe(true);
  });
});
|
|
144
|
+
|
|
145
|
+
// End-to-end pipeline: a fixture-declared workflow whose only node carries a
|
|
146
|
+
// NodeModule with `requiredModelCapabilities: ['nonexistent-capability-9b3f']`
|
|
147
|
+
// (registered as `conformance.modelCapability.insufficient` on the reference
|
|
148
|
+
// host). The executor's model-capability gate at dispatch time refuses with
|
|
149
|
+
// `capability_not_provided` AND emits `model.capability.insufficient` into
|
|
150
|
+
// the run event log per RFC 0031 §D. Capability-gated AND fixture-gated:
|
|
151
|
+
// soft-skips when either is absent.
|
|
152
|
+
|
|
153
|
+
import { pollUntilTerminal } from '../lib/polling.js';
|
|
154
|
+
import { isFixtureAdvertised } from '../lib/fixtures.js';
|
|
155
|
+
|
|
156
|
+
// Workflow id of the fixture used by the end-to-end suite: checked via `isFixtureAdvertised` and passed as `workflowId` when creating runs.
const E2E_FIXTURE = 'conformance-model-capability-insufficient';
|
|
157
|
+
|
|
158
|
+
/**
 * End-to-end refusal through the executor, using the fixture workflow named by
 * `E2E_FIXTURE`. Each case soft-skips (returns early) when that fixture is not
 * advertised by the host.
 */
describe.skipIf(HTTP_SKIP)('model-capability-insufficient: end-to-end refusal through executor', () => {
  it('workflow with a node declaring requiredModelCapabilities the active provider does not satisfy fails with RunSnapshot.error.code = "capability_not_provided" AND emits model.capability.insufficient into the run event log BEFORE node.failed', async () => {
    if (!isFixtureAdvertised(E2E_FIXTURE)) return; // fixture not seeded — soft-skip

    const create = await driver.post('/v1/runs', { workflowId: E2E_FIXTURE });
    expect(create.status).toBe(201);
    const runId = (create.json as { runId: string }).runId;

    const terminal = await pollUntilTerminal(runId, { timeoutMs: 10_000 });
    expect(terminal.status).toBe('failed');
    expect(
      (terminal as { error?: { code?: string } }).error?.code,
      driver.describe(
        'RFCS/0031-envelope-variants-and-model-capabilities.md §B step 4',
        'unmet capability without viable fallback MUST fail with error.code = "capability_not_provided"',
      ),
    ).toBe('capability_not_provided');

    const eventsRes = await driver.get(`/v1/runs/${encodeURIComponent(runId)}/events`);
    expect(eventsRes.status).toBe(200);
    const events = ((eventsRes.json as { events?: Array<{ type: string }> } | undefined)?.events ?? []);
    const insufficientIdx = events.findIndex((e) => e.type === 'model.capability.insufficient');
    const nodeFailedIdx = events.findIndex((e) => e.type === 'node.failed');
    // Both events must exist before their relative order means anything.
    expect(insufficientIdx, 'model.capability.insufficient MUST appear in the event log').toBeGreaterThanOrEqual(0);
    expect(nodeFailedIdx, 'node.failed MUST appear in the event log').toBeGreaterThanOrEqual(0);
    expect(
      insufficientIdx < nodeFailedIdx,
      driver.describe(
        'RFCS/0031-envelope-variants-and-model-capabilities.md §D',
        'model.capability.insufficient MUST be emitted BEFORE node.failed (cause precedes effect)',
      ),
    ).toBe(true);
  });

  it('NO envelope emission occurs after the refusal (no node.completed, provider.usage, or envelope-reliability events)', async () => {
    if (!isFixtureAdvertised(E2E_FIXTURE)) return; // fixture not seeded — soft-skip

    const create = await driver.post('/v1/runs', { workflowId: E2E_FIXTURE });
    expect(create.status).toBe(201);
    const runId = (create.json as { runId: string }).runId;
    await pollUntilTerminal(runId, { timeoutMs: 10_000 });

    const eventsRes = await driver.get(`/v1/runs/${encodeURIComponent(runId)}/events`);
    // A failed fetch would otherwise fall through to an empty event list and
    // make the "nothing leaked" assertion below pass vacuously.
    expect(eventsRes.status).toBe(200);
    const events = ((eventsRes.json as { events?: Array<{ type: string }> } | undefined)?.events ?? []);
    // Event types that would indicate the node ran or an envelope was produced.
    const forbidden = [
      'node.completed',
      'provider.usage',
      'envelope.retry.attempted',
      'envelope.retry.exhausted',
      'envelope.refusal',
      'envelope.truncated',
      'envelope.nlToFormat.engaged',
      'envelope.recovery.applied',
    ];
    const leaked = events.filter((e) => forbidden.includes(e.type)).map((e) => e.type);
    expect(
      leaked,
      driver.describe(
        'RFCS/0031-envelope-variants-and-model-capabilities.md §B step 4',
        'a refused dispatch MUST NOT emit any downstream envelope-emission events — the node never ran',
      ),
    ).toEqual([]);
  });
});
|
|
@@ -0,0 +1,203 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* model-capability-substituted — RFC 0031 §B step 3 + §D + §F runtime behavior.
|
|
3
|
+
*
|
|
4
|
+
* Capability-gated on `capabilities.modelCapabilities.supported: true`.
|
|
5
|
+
*
|
|
6
|
+
* Drives the host's `POST /v1/host/sample/test/evaluate-model-capability-gate`
|
|
7
|
+
* seam with synthetic inputs that hit each branch of the §B 4-step dispatch
|
|
8
|
+
* flow. The seam runs the pure `evaluateModelCapabilityGate()` evaluator
|
|
9
|
+
* and returns both the routing outcome AND the event payload the host
|
|
10
|
+
* would emit. Conformance asserts the decision-matrix + the event payload
|
|
11
|
+
* shape per RFC 0031 §D `modelCapabilitySubstituted`.
|
|
12
|
+
*
|
|
13
|
+
* @see RFCS/0031-envelope-variants-and-model-capabilities.md §B + §D + §F
|
|
14
|
+
* @see spec/v1/host-capabilities.md §"Model-capability declarations"
|
|
15
|
+
* @see schemas/run-event-payloads.schema.json §modelCapabilitySubstituted
|
|
16
|
+
*/
|
|
17
|
+
|
|
18
|
+
import { describe, it, expect } from 'vitest';
|
|
19
|
+
import { driver } from '../lib/driver.js';
|
|
20
|
+
|
|
21
|
+
// True when OPENWOP_BASE_URL is unset/empty; passed to `describe.skipIf` so the HTTP-backed suites below are skipped.
const HTTP_SKIP = !process.env.OPENWOP_BASE_URL;
|
|
22
|
+
|
|
23
|
+
/**
 * The `capabilities.modelCapabilities` block as read from discovery. Fields are
 * typed `unknown` because the scenarios assert their runtime types themselves.
 */
interface ModelCapabilitiesAdvertisement {
  supported?: unknown;
  substitutionSupported?: unknown;
  advertised?: unknown;
}

/** The slice of the `/.well-known/openwop` document this file inspects. */
interface DiscoveryDoc {
  capabilities?: { modelCapabilities?: ModelCapabilitiesAdvertisement };
}
|
|
32
|
+
|
|
33
|
+
/**
 * Routing outcome reported by the capability-gate seam. All fields are
 * optional; which ones a host fills in for each route is not visible here.
 */
interface GateOutcome {
  /** Routing decision. */
  route?: 'dispatch' | 'substitute' | 'refuse';

  // Provider/model pair the request started with.
  originalProvider?: string;
  originalModel?: string;

  // Provider/model pair of the fallback.
  fallbackProvider?: string;
  fallbackModel?: string;

  /** Capability identifiers reported as missing. */
  missingCapabilities?: string[];
  /** Whether a fallback was attempted. */
  fallbackAttempted?: boolean;
}
|
|
42
|
+
|
|
43
|
+
/**
 * Response body of the capability-gate seam: the routing outcome plus the
 * event reported with it. `event` may be `null`; the dispatch scenario in this
 * file expects `null` there.
 */
interface GateResponse {
  outcome?: GateOutcome;
  event?: null | {
    type?: string;
    payload?: Record<string, unknown>;
  };
}
|
|
47
|
+
|
|
48
|
+
/**
 * Fetches the host's `/.well-known/openwop` discovery document.
 *
 * @returns The response JSON viewed as a `DiscoveryDoc`, or `null` when the
 *          request throws or the status is anything other than 200.
 */
async function readDiscovery(): Promise<DiscoveryDoc | null> {
  try {
    const reply = await driver.get('/.well-known/openwop');
    return reply.status === 200 ? (reply.json as DiscoveryDoc) : null;
  } catch {
    // Best-effort read: any failure is reported to the caller as "no document".
    return null;
  }
}
|
|
57
|
+
|
|
58
|
+
/**
 * Posts `input` to the host's model-capability-gate test seam.
 *
 * @param input Synthetic gate input forwarded verbatim as the request body.
 * @returns The HTTP status together with the response JSON viewed as a `GateResponse`
 *          (an unchecked cast — callers inspect `status` before trusting `body`).
 */
async function evaluateGate(input: Record<string, unknown>): Promise<{ status: number; body: GateResponse }> {
  const reply = await driver.post('/v1/host/sample/test/evaluate-model-capability-gate', input);
  const status = reply.status;
  const body = reply.json as GateResponse;
  return { status, body };
}
|
|
62
|
+
|
|
63
|
+
/**
 * Shape check of the advertised `capabilities.modelCapabilities` block.
 * Soft-skips when discovery is unavailable or the block is not advertised.
 */
describe.skipIf(HTTP_SKIP)('model-capability-substituted: advertisement shape (RFC 0031 §E)', () => {
  it('capabilities.modelCapabilities (when present) conforms to RFC 0031 §E', async () => {
    const discovery = await readDiscovery();
    if (discovery === null) return;
    const block = discovery.capabilities?.modelCapabilities;
    if (block === undefined) return;

    expect(
      typeof block.supported,
      driver.describe(
        'schemas/capabilities.schema.json §modelCapabilities',
        'capabilities.modelCapabilities.supported MUST be boolean when the block is advertised',
      ),
    ).toBe('boolean');

    if (block.advertised !== undefined) {
      expect(
        Array.isArray(block.advertised),
        driver.describe('RFCS/0031-envelope-variants-and-model-capabilities.md §E', 'modelCapabilities.advertised MUST be an array of capability identifiers'),
      ).toBe(true);

      // An identifier is acceptable when it is spec-reserved or a host extension.
      const specReserved = ['structured-output', 'discriminator-enum', 'long-context', 'reasoning', 'function-calling'];
      const hostExtension = /^x-host-[a-z][a-z0-9-]*-[a-z][a-z0-9-]*$/;
      for (const entry of block.advertised as unknown[]) {
        expect(typeof entry, 'each advertised identifier MUST be a string').toBe('string');
        const identifier = String(entry);
        const recognised = specReserved.includes(identifier) || hostExtension.test(identifier);
        expect(
          recognised,
          driver.describe(
            'RFCS/0031-envelope-variants-and-model-capabilities.md §C',
            `advertised identifier "${identifier}" MUST be spec-reserved (structured-output, discriminator-enum, long-context, reasoning, function-calling) or match the x-host-<host>-<key> extension pattern`,
          ),
        ).toBe(true);
      }
    }

    if (block.substitutionSupported !== undefined) {
      expect(typeof block.substitutionSupported, 'substitutionSupported MUST be boolean').toBe('boolean');
    }
  });
});
|
|
101
|
+
|
|
102
|
+
/**
 * Decision matrix of the RFC 0031 §B dispatch flow, driven through the host's
 * `evaluate-model-capability-gate` test seam: dispatch, substitute, and the
 * host-posture refusal. Each case soft-skips (returns early) when the seam
 * answers 404.
 */
describe.skipIf(HTTP_SKIP)('model-capability-substituted: dispatch behavior (RFC 0031 §B step 3 + §D)', () => {
  it('all required capabilities met → outcome: dispatch (no event emitted)', async () => {
    const r = await evaluateGate({
      module: { requiredModelCapabilities: ['structured-output', 'function-calling'] },
      activeProvider: 'anthropic',
      activeModel: 'claude-3-5-sonnet',
      substitutionSupported: true,
      supportedProviders: ['anthropic', 'openai'],
    });
    if (r.status === 404) return; // host doesn't expose the seam
    expect(
      r.body.outcome?.route,
      driver.describe(
        'RFCS/0031-envelope-variants-and-model-capabilities.md §B step 2',
        'all required model capabilities met → route MUST be "dispatch" (gate is a no-op)',
      ),
    ).toBe('dispatch');
    expect(r.body.event, 'no event emitted when gate is a no-op').toBeNull();
  });

  it('unmet + fallback declared + authenticatable → outcome: substitute + event with originalProvider/originalModel/fallbackProvider/fallbackModel/missingCapabilities', async () => {
    const r = await evaluateGate({
      module: {
        requiredModelCapabilities: ['structured-output', 'long-context'],
        fallbackModel: { provider: 'anthropic', model: 'claude-opus-4-7' },
      },
      // The seam's probe map returns the spec-known capability set for known
      // providers. 'unknown-vendor' is not one of them, so the gate sees an
      // empty advertised set for the active provider and the requirements go
      // unmet. The declared fallback ('anthropic') IS in supportedProviders
      // and advertises structured-output + long-context, so the gate is
      // expected to substitute rather than refuse.
      activeProvider: 'unknown-vendor',
      activeModel: 'unknown-model',
      substitutionSupported: true,
      supportedProviders: ['anthropic', 'openai', 'unknown-vendor'],
      nodeId: 'writer-node',
    });
    if (r.status === 404) return;
    expect(
      r.body.outcome?.route,
      driver.describe(
        'RFCS/0031-envelope-variants-and-model-capabilities.md §B step 3',
        'unmet capability + declared fallback + fallback provider authenticatable → route MUST be "substitute"',
      ),
    ).toBe('substitute');
    expect(
      r.body.event?.type,
      driver.describe(
        'RFCS/0031-envelope-variants-and-model-capabilities.md §D',
        'substitute path MUST emit `model.capability.substituted`',
      ),
    ).toBe('model.capability.substituted');
    const payload = (r.body.event?.payload ?? {}) as Record<string, unknown>;
    expect(payload.nodeId, 'payload.nodeId MUST mirror the request').toBe('writer-node');
    expect(payload.originalProvider).toBe('unknown-vendor');
    expect(payload.originalModel).toBe('unknown-model');
    expect(payload.fallbackProvider).toBe('anthropic');
    expect(payload.fallbackModel).toBe('claude-opus-4-7');
    expect(
      Array.isArray(payload.missingCapabilities) &&
        (payload.missingCapabilities as string[]).includes('structured-output'),
      driver.describe(
        'schemas/run-event-payloads.schema.json §modelCapabilitySubstituted',
        'missingCapabilities[] MUST include the subset of required capabilities the active model did not satisfy',
      ),
    ).toBe(true);
  });

  it('unmet + substitutionSupported: false → outcome: refuse with fallbackAttempted: false (host posture override)', async () => {
    const r = await evaluateGate({
      module: {
        requiredModelCapabilities: ['structured-output'],
        // Fallback declared but the gate refuses BEFORE attempting because the
        // host's posture is "no substitution" per RFC 0031 §E.
        fallbackModel: { provider: 'anthropic', model: 'claude-opus-4-7' },
      },
      activeProvider: 'unknown-vendor',
      activeModel: 'unknown-model',
      substitutionSupported: false,
      supportedProviders: ['anthropic', 'unknown-vendor'],
    });
    if (r.status === 404) return;
    expect(
      r.body.outcome?.route,
      driver.describe(
        'RFCS/0031-envelope-variants-and-model-capabilities.md §E',
        'capabilities.modelCapabilities.substitutionSupported: false → host MUST refuse on any unmet capability even when NodeModule.fallbackModel is declared',
      ),
    ).toBe('refuse');
    expect(r.body.event?.type).toBe('model.capability.insufficient');
    expect(
      // Optional chaining all the way down: an event without a payload must
      // fail this assertion with the cited message, not throw a TypeError.
      r.body.event?.payload?.fallbackAttempted,
      driver.describe(
        'schemas/run-event-payloads.schema.json §modelCapabilityInsufficient',
        'fallbackAttempted MUST be false when the refusal is driven by substitutionSupported: false (host posture, not fallback failure)',
      ),
    ).toBe(false);
  });
});
|
|
@@ -0,0 +1,164 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* multi-agent-confidence-escalation — RFC 0039 §A behavioral.
|
|
3
|
+
*
|
|
4
|
+
* Status: ACTIVE (advertisement-shape + behavioral). RFC 0039 Phase 2
|
|
5
|
+
* filed Draft → graduated Active 2026-05-22 in the same commit chain as
|
|
6
|
+
* this scenario. Capability-gated on
|
|
7
|
+
* `capabilities.multiAgent.executionModel.supported: true` AND
|
|
8
|
+
* `capabilities.multiAgent.executionModel.version >= 2` AND fixture
|
|
9
|
+
* availability. Hosts that advertise only Phase 1 (version: 1) soft-skip
|
|
10
|
+
* cleanly — the confidence-floor MUST applies only at version >= 2.
|
|
11
|
+
*
|
|
12
|
+
* Asserts (behavioral when host advertises Phase 2):
|
|
13
|
+
*
|
|
14
|
+
* 1. Advertisement shape: confidenceEscalationFloor (when present) MUST be
|
|
15
|
+
* a number in [0.5, 1.0]; floor < 0.5 is non-conformant per RFC 0039 §A.
|
|
16
|
+
*
|
|
17
|
+
* 2. A run driven by the fixture's low-confidence (0.3) mockDispatchPlan
|
|
18
|
+
* reaches a `waiting-clarification` terminal-suspension status — NOT
|
|
19
|
+
* `completed`. The clarification interrupt MUST surface so the operator
|
|
20
|
+
* can confirm-or-adjust the supervisor's marginal decision.
|
|
21
|
+
*
|
|
22
|
+
* 3. The parent run's event log contains exactly ONE
|
|
23
|
+
* `core.workflowChain.confidence-escalated` event, with:
|
|
24
|
+
* - payload.confidence === 0.3
|
|
25
|
+
* - payload.floor in [0.5, 1.0] (whatever floor the host advertised
|
|
26
|
+
* — spec default 0.5, operator stricter is permitted)
|
|
27
|
+
* - payload.escalationKind === 'clarify' (the reference host emits
|
|
28
|
+
* clarify; hosts choosing 'escalate' would also be conformant)
|
|
29
|
+
* - payload.workerId === the dispatch's first nextWorkerIds entry
|
|
30
|
+
* - payload.originalDecision carries the verbatim OrchestratorDecision
|
|
31
|
+
* AND causationId chains back to the `runOrchestrator.decided` event
|
|
32
|
+
* that emitted the low-confidence decision.
|
|
33
|
+
*
|
|
34
|
+
* 4. The event log contains ZERO `core.workflowChain.event` records — the
|
|
35
|
+
* escalation fired BEFORE any dispatch.began event per RFC 0039 §A
|
|
36
|
+
* ("the escalation event MUST appear in the run event log BEFORE the
|
|
37
|
+
* interrupt fires AND BEFORE any `core.workflowChain.event` with
|
|
38
|
+
* `phase: 'dispatch.began'` for the escalated decision's intended
|
|
39
|
+
* next-worker"). This is the load-bearing test that distinguishes
|
|
40
|
+
* Phase 2 from Phase 1: Phase 1 hosts dispatch unconditionally; Phase 2
|
|
41
|
+
* hosts gate on confidence.
|
|
42
|
+
*
|
|
43
|
+
* @see RFCS/0039-multi-agent-confidence-and-memory-lifecycle.md §A
|
|
44
|
+
* @see spec/v1/multi-agent-execution.md §"Confidence escalation (RFC 0039 Phase 2)"
|
|
45
|
+
* @see schemas/run-event-payloads.schema.json §coreWorkflowChainConfidenceEscalated
|
|
46
|
+
*/
|
|
47
|
+
|
|
48
|
+
import { describe, it, expect } from 'vitest';
|
|
49
|
+
import { driver } from '../lib/driver.js';
|
|
50
|
+
import { isFixtureAdvertised } from '../lib/fixtures.js';
|
|
51
|
+
import { pollUntilTerminal } from '../lib/polling.js';
|
|
52
|
+
|
|
53
|
+
// Workflow fixture that drives the low-confidence orchestrator decision.
const FIXTURE = 'conformance-multi-agent-confidence-escalation';

// No base URL configured: every HTTP-backed suite in this file is skipped.
const HTTP_SKIP = !process.env.OPENWOP_BASE_URL;

// The behavioral suite additionally needs the fixture to be advertised.
// `||` short-circuits, so the fixture lookup is not attempted when HTTP is already skipped.
const BEHAVIORAL_SKIP = HTTP_SKIP || !isFixtureAdvertised(FIXTURE);
|
|
56
|
+
|
|
57
|
+
/** Subset of a run event-log record that this suite reads. */
interface RunEvent {
  /** Event type discriminator, e.g. `core.workflowChain.confidence-escalated`. */
  type: string;
  /** Identifier of this event; compared against other events' `causationId`. */
  eventId?: string;
  /** `eventId` of the event this one chains back to. */
  causationId?: string;
  /** Event-specific payload; narrowed at the point of use. */
  payload?: Record<string, unknown>;
}
|
|
58
|
+
|
|
59
|
+
/**
 * Fields of `capabilities.multiAgent.executionModel` read by this suite.
 * All are `unknown` because they come from an untrusted host response and
 * are narrowed with runtime checks before use.
 */
interface ExecutionModelCapability {
  supported?: unknown;
  version?: unknown;
  confidenceEscalationFloor?: unknown;
}

/** Slice of the discovery document (`/.well-known/openwop`) that this suite inspects. */
interface DiscoveryDoc {
  capabilities?: {
    multiAgent?: {
      executionModel?: ExecutionModelCapability;
    };
  };
}
|
|
70
|
+
|
|
71
|
+
/**
 * Fetches the host's discovery document from `/.well-known/openwop`.
 *
 * Best-effort by design: resolves to `null` (never rejects) when the request
 * throws, the status is not 200, or the response body is not a JSON object.
 *
 * @returns The discovery document, or `null` when none could be read.
 */
async function readDiscovery(): Promise<DiscoveryDoc | null> {
  try {
    const res = await driver.get('/.well-known/openwop');
    if (res.status !== 200) return null;
    const body: unknown = res.json;
    // A 200 with a missing or non-object body must not leak out as a "document":
    // callers only guard against `null`, so `undefined` would make them throw
    // on the first property access.
    if (typeof body !== 'object' || body === null || Array.isArray(body)) return null;
    return body as DiscoveryDoc;
  } catch {
    // Discovery being unreachable is treated the same as "nothing advertised".
    return null;
  }
}
|
|
80
|
+
|
|
81
|
+
// Capability-shape suite: validates the advertised escalation floor, if any.
// Each early return is a soft-skip — a host that advertises nothing passes.
describe.skipIf(HTTP_SKIP)('multi-agent-confidence-escalation: capability shape (RFC 0039 §A)', () => {
  it('confidenceEscalationFloor (when advertised) MUST be in [0.5, 1.0]', async () => {
    const discovery = await readDiscovery();
    if (discovery === null) return;

    const executionModel = discovery.capabilities?.multiAgent?.executionModel;
    if (executionModel === undefined) return;

    const advertisedFloor = executionModel.confidenceEscalationFloor;
    if (advertisedFloor === undefined) return;

    // The field is `unknown`, so check the type before the range.
    const floorIsValid =
      typeof advertisedFloor === 'number' &&
      Number.isFinite(advertisedFloor) &&
      advertisedFloor >= 0.5 &&
      advertisedFloor <= 1.0;
    const failureMessage = driver.describe(
      'RFCS/0039-multi-agent-confidence-and-memory-lifecycle.md §A',
      'confidenceEscalationFloor MUST be number in [0.5, 1.0]; values below the spec floor are non-conformant',
    );
    expect(floorIsValid, failureMessage).toBe(true);
  });
});
|
|
98
|
+
|
|
99
|
+
// Behavioral suite: drives the low-confidence fixture and checks that the host
// escalates (event + clarification suspension) instead of dispatching.
describe.skipIf(BEHAVIORAL_SKIP)('multi-agent-confidence-escalation: behavioral (RFC 0039 §A)', () => {
  it('happy-path: low-confidence decision → confidence-escalated event + clarification interrupt + zero dispatch events', async () => {
    const d = await readDiscovery();
    const supported = d?.capabilities?.multiAgent?.executionModel?.supported === true;
    const versionRaw = d?.capabilities?.multiAgent?.executionModel?.version;
    const version = typeof versionRaw === 'number' ? versionRaw : 0;
    if (!supported || version < 2) return; // soft-skip — Phase 1 hosts pass via this absence

    const create = await driver.post('/v1/runs', { workflowId: FIXTURE });
    expect(create.status).toBe(201);
    const runId = (create.json as { runId: string }).runId;
    // Fail here rather than polling and fetching `/v1/runs/undefined/...`.
    expect(typeof runId, 'create-run response did not carry a string runId').toBe('string');

    const terminal = await pollUntilTerminal(runId);
    // Phase 2 escalation suspends the parent — NOT a terminal `completed`.
    // The conformance pollUntilTerminal returns when the run reaches any
    // settled status; we expect `waiting-clarification` or equivalent
    // non-completed status carrying an open clarification interrupt.
    expect(
      terminal.status,
      driver.describe(
        'RFCS/0039-multi-agent-confidence-and-memory-lifecycle.md §A + spec/v1/interrupt.md',
        'a host emitting `interrupt.kind: "clarification"` MUST surface the run as `waiting-clarification` per spec/v1/interrupt.md §"Interrupt kinds"; low-confidence decision MUST NOT reach `completed` because no dispatch fired',
      ),
    ).toBe('waiting-clarification');

    const eventsRes = await driver.get(`/v1/runs/${encodeURIComponent(runId)}/events`);
    expect(eventsRes.status).toBe(200);
    // A missing `events` array is treated as an empty log, so the count assertion below reports it.
    const events = ((eventsRes.json as { events?: RunEvent[] } | undefined)?.events ?? []);

    const escalated = events.filter((e) => e.type === 'core.workflowChain.confidence-escalated');
    expect(escalated.length, driver.describe(
      'RFCS/0039-multi-agent-confidence-and-memory-lifecycle.md §A',
      'low-confidence decision MUST emit exactly one core.workflowChain.confidence-escalated event',
    )).toBe(1);

    // Safe: the length assertion above throws unless exactly one event exists.
    const ev = escalated[0]!;
    // NOTE(review): the file header also lists payload.workerId and
    // payload.originalDecision as part of the contract; neither is asserted here.
    const payload = (ev.payload ?? {}) as { confidence?: number; floor?: number; escalationKind?: string; workerId?: string };
    expect(payload.confidence, 'payload.confidence echoes the decision').toBe(0.3);
    expect(
      typeof payload.floor === 'number' && payload.floor >= 0.5 && payload.floor <= 1.0,
      'payload.floor is the host-advertised floor (in [0.5, 1.0])',
    ).toBe(true);
    expect(
      payload.escalationKind === 'clarify' || payload.escalationKind === 'escalate',
      'payload.escalationKind ∈ {clarify, escalate}',
    ).toBe(true);

    // Causation chain: escalation event causes back to the runOrchestrator.decided
    // that named the worker.
    // Require a real causationId first: with it undefined, the lookup below
    // would match any event lacking an eventId (undefined === undefined) and
    // could pass against an unrelated `runOrchestrator.decided` record.
    const causationId = ev.causationId;
    expect(
      typeof causationId === 'string' && causationId.length > 0,
      'confidence-escalated event MUST carry a causationId so the causation chain can be resolved',
    ).toBe(true);
    const decidedEvent = events.find((e) => e.eventId === causationId);
    expect(
      decidedEvent?.type,
      'confidence-escalated causationId MUST point at the runOrchestrator.decided that surfaced the low-confidence decision',
    ).toBe('runOrchestrator.decided');

    // Load-bearing: NO dispatch event fired. Phase 2 gates BEFORE the loop.
    const chainEvents = events.filter((e) => e.type === 'core.workflowChain.event');
    expect(
      chainEvents.length,
      driver.describe(
        'RFCS/0039-multi-agent-confidence-and-memory-lifecycle.md §A',
        'low-confidence decision MUST NOT produce any core.workflowChain.event records — the escalation fires before any dispatch.began per the spec ordering',
      ),
    ).toBe(0);
  });
});
|