@openwop/openwop-conformance 1.36.0 → 1.43.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +32 -0
- package/README.md +2 -2
- package/api/openapi.yaml +62 -5
- package/coverage.md +1 -0
- package/fixtures/conformance-agent-memory-injection-budget.json +44 -0
- package/fixtures/conformance-context-budget-multiturn.json +50 -0
- package/fixtures.md +2 -0
- package/package.json +1 -1
- package/schemas/README.md +3 -0
- package/schemas/a2ui-surface-delta-frame.schema.json +48 -0
- package/schemas/capabilities.schema.json +128 -1
- package/schemas/channel-presence-payload.schema.json +41 -0
- package/schemas/compact-tool-descriptor.schema.json +51 -0
- package/schemas/conversation-turn.schema.json +10 -0
- package/schemas/memory-list-options.schema.json +16 -0
- package/schemas/run-event-payloads.schema.json +25 -2
- package/schemas/run-event.schema.json +2 -0
- package/src/lib/toolCatalog.ts +89 -0
- package/src/scenarios/a2ui-surface-delta-transport.test.ts +600 -0
- package/src/scenarios/aiproviders-selfhosted-honesty.test.ts +133 -0
- package/src/scenarios/channel-presence-behavioral.test.ts +83 -0
- package/src/scenarios/channel-presence-shape.test.ts +93 -0
- package/src/scenarios/context-budget-transcript-bound.test.ts +253 -0
- package/src/scenarios/context-summarization-replay.test.ts +155 -0
- package/src/scenarios/conversation-turn-model-provenance-shape.test.ts +120 -0
- package/src/scenarios/memory-injection-budget.test.ts +188 -0
- package/src/scenarios/prompt-prefix-cache.test.ts +200 -0
- package/src/scenarios/run-transport-economy.test.ts +236 -0
- package/src/scenarios/tool-catalog-compact-projection.test.ts +149 -0
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Self-hosted provider truthful-advertisement + endpoint non-disclosure
|
|
3
|
+
* (RFC 0108 §A.2 / §D) — behavioral.
|
|
4
|
+
*
|
|
5
|
+
* Gated on `capabilities.aiProviders.selfHosted.length > 0` (root-first per
|
|
6
|
+
* RFC 0073) via `behaviorGate('openwop-selfhosted-providers', …)`. Soft-skips
|
|
7
|
+
* when the host advertises no self-hosted class (default) / hard-fails under
|
|
8
|
+
* `OPENWOP_REQUIRE_BEHAVIOR=true`. The always-on wire-shape coverage lives in
|
|
9
|
+
* `aiproviders-selfhosted-shape.test.ts`; this asserts host BEHAVIOR via the
|
|
10
|
+
* documented host-sample seam `POST /v1/host/sample/ai/call` (soft-skips on 404
|
|
11
|
+
* until a host wires it):
|
|
12
|
+
*
|
|
13
|
+
* - **§A.2 truthful advertisement.** A dispatch against the advertised
|
|
14
|
+
* `selfHosted` provider id MUST reach a real configured endpoint — it
|
|
15
|
+
* either succeeds OR fails with a transport-class error from that endpoint.
|
|
16
|
+
* It MUST NOT come back `capability_not_provided` / `provider_not_supported`
|
|
17
|
+
* (a "no endpoint configured" refusal would prove the advertisement is
|
|
18
|
+
* dishonest — the host listed a `selfHosted` id with nothing backing it).
|
|
19
|
+
* - **§D endpoint non-disclosure.** When the host's endpoint location is
|
|
20
|
+
* supplied out-of-band via `OPENWOP_TEST_COMPAT_ENDPOINT`, that string (and
|
|
21
|
+
* its bare host[:port]) MUST NOT appear anywhere in the seam response — the
|
|
22
|
+
* success body or the error payload. The endpoint is operator-private
|
|
23
|
+
* infrastructure (`self-hosted-endpoint-no-disclosure`); the base-URL is
|
|
24
|
+
* config, not secret-shaped, so the host's SR-1 redaction won't auto-scrub
|
|
25
|
+
* it — the host MUST scrub it deliberately (a generic transport error).
|
|
26
|
+
*
|
|
27
|
+
* The §D leg requires `OPENWOP_TEST_COMPAT_ENDPOINT` (the host operator + the
|
|
28
|
+
* conformance runner agree on the configured endpoint location out-of-band, the
|
|
29
|
+
* same pattern as `OPENWOP_CANARY_SECRET_VALUE` / `OPENWOP_TEST_SAML_IDP_URL`).
|
|
30
|
+
* Without it, the §A.2 leg still runs; the §D string check is skipped with a
|
|
31
|
+
* loud note (it has no endpoint to grep for).
|
|
32
|
+
*
|
|
33
|
+
* Spec references:
|
|
34
|
+
* - https://github.com/openwop/openwop/blob/main/spec/v1/capabilities.md (§aiProviders.selfHosted)
|
|
35
|
+
* - https://github.com/openwop/openwop/blob/main/RFCS/0108-self-hosted-openai-compatible-provider-class.md (§A.2, §D, §E)
|
|
36
|
+
*/
|
|
37
|
+
|
|
38
|
+
import { describe, it, expect } from 'vitest';
|
|
39
|
+
import { driver } from '../lib/driver.js';
|
|
40
|
+
import { behaviorGate } from '../lib/behavior-gate.js';
|
|
41
|
+
import { readCapabilityFamily } from '../lib/discovery-capabilities.js';
|
|
42
|
+
|
|
43
|
+
const SEAM = '/v1/host/sample/ai/call';
|
|
44
|
+
|
|
45
|
+
/**
 * Extract the canonical error code from a seam response body.
 *
 * Three body shapes are tolerated and probed in this order: a top-level
 * string `code`, a top-level string `error`, and finally a nested
 * `error.code` string.
 *
 * @param json - The parsed response body; any value (including null) is accepted.
 * @returns The first string code found, or `undefined` when none is present.
 */
function errCode(json: unknown): string | undefined {
  const body = json as { error?: unknown; code?: unknown };

  // Shape 1: `{ code: "…" }`
  const topLevelCode = body?.code;
  if (typeof topLevelCode === 'string') return topLevelCode;

  // Shape 2: `{ error: "…" }`
  const errorField = body?.error;
  if (typeof errorField === 'string') return errorField;

  // Shape 3: `{ error: { code: "…" } }`
  const nested = errorField as { code?: unknown } | undefined;
  return nested && typeof nested.code === 'string' ? nested.code : undefined;
}
|
|
55
|
+
|
|
56
|
+
/**
 * Error codes that betray a dishonest `selfHosted` advertisement (§A.2).
 *
 * A host that lists a `selfHosted` id without a configured/reachable endpoint
 * behind it answers the dispatch with one of these "no such provider
 * configured" refusals instead of actually reaching an endpoint.
 */
const NO_ENDPOINT_CODES: Set<string> = new Set([
  'capability_not_provided',
  'provider_not_supported',
  'provider_not_configured',
  'no_provider_configured',
]);
|
|
67
|
+
|
|
68
|
+
/**
 * Build the list of substrings to search for in the §D non-disclosure check.
 *
 * Besides the raw endpoint string, the `host:port` and bare hostname are
 * included so that a host which scrubs the scheme but still leaks e.g.
 * `vllm:8000` is caught. A scheme-less endpoint is parsed as if it were
 * `http://…`; an endpoint that cannot be parsed contributes only its raw form.
 *
 * @param endpoint - The configured endpoint location (URL or `host[:port]`).
 * @returns De-duplicated, non-empty needles in first-seen order.
 */
function endpointNeedles(endpoint: string): string[] {
  const candidates: string[] = [endpoint];

  const parseable = endpoint.includes('://') ? endpoint : `http://${endpoint}`;
  let parsed: URL | undefined;
  try {
    parsed = new URL(parseable);
  } catch {
    // Not URL-parseable; only the raw string check applies.
    parsed = undefined;
  }

  if (parsed !== undefined) {
    if (parsed.host) candidates.push(parsed.host); // host:port
    if (parsed.hostname) candidates.push(parsed.hostname); // bare host
  }

  return Array.from(new Set(candidates)).filter((candidate) => candidate.length > 0);
}
|
|
82
|
+
|
|
83
|
+
describe('aiproviders-selfhosted-honesty (RFC 0108 §A.2/§D)', () => {
  it('a selfHosted dispatch reaches a real endpoint (§A.2) and never discloses the endpoint location (§D)', async () => {
    // Gate: only run when the host advertises at least one self-hosted provider id.
    const family = await readCapabilityFamily<Record<string, unknown>>('aiProviders');
    const advertised = family?.selfHosted;
    const selfHostedIds = Array.isArray(advertised) ? (advertised as string[]) : [];
    const gateOpen = behaviorGate('openwop-selfhosted-providers', selfHostedIds.length > 0);
    if (!gateOpen) return;

    // Dispatch a minimal call against the first advertised self-hosted/compat id.
    const providerId = selfHostedIds[0]!;
    const requestBody = {
      provider: providerId,
      messages: [{ role: 'user', content: 'ping' }],
    };
    const res = await driver.post(SEAM, requestBody);
    // Seam unwired: soft-skip the behavioral suite.
    if (res.status === 404) return;

    // §A.2: a success or a transport-class failure both show an endpoint was
    // reached. A "no provider configured" refusal shows the host listed a
    // selfHosted id with nothing behind it.
    const code = errCode(res.json);
    const refusedAsUnconfigured = code !== undefined && NO_ENDPOINT_CODES.has(code);
    expect(
      !refusedAsUnconfigured,
      driver.describe(
        'RFC 0108 §A.2',
        `an advertised selfHosted id (${providerId}) MUST be backed by a configured, reachable endpoint ` +
          `(dispatch succeeds or fails with a transport error) — MUST NOT refuse "${code}" (no endpoint configured)`,
      ),
    ).toBe(true);

    // §D: needs the out-of-band endpoint location to search the response for.
    const endpoint = process.env.OPENWOP_TEST_COMPAT_ENDPOINT?.trim();
    if (!endpoint) {
      // eslint-disable-next-line no-console
      console.warn(
        '[openwop-selfhosted-providers] §D disclosure check skipped — set OPENWOP_TEST_COMPAT_ENDPOINT ' +
          "to the host's configured compat endpoint to assert non-disclosure on the wire.",
      );
      return;
    }

    // The whole body (success or error payload) is searched in serialized form.
    const wireText = JSON.stringify(res.json ?? {});
    for (const needle of endpointNeedles(endpoint)) {
      const leaked = wireText.includes(needle);
      expect(
        !leaked,
        driver.describe(
          'RFC 0108 §D (self-hosted-endpoint-no-disclosure)',
          `the endpoint location ("${needle}") MUST NOT appear in any selfHosted dispatch response or error payload`,
        ),
      ).toBe(true);
    }
  });
});
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Channel presence — behavioral leg (RFC 0110 §Conformance).
|
|
3
|
+
*
|
|
4
|
+
* Gated on `capabilities.channelPresence.supported` via `behaviorGate`: soft-skips when
|
|
5
|
+
* unadvertised (default) / hard-fails under `OPENWOP_REQUIRE_BEHAVIOR=true`. The companion
|
|
6
|
+
* always-on wire-shape coverage lives in `channel-presence-shape.test.ts`; THIS scenario
|
|
7
|
+
* asserts host BEHAVIOR — the runtime MUSTs JSON Schema cannot express.
|
|
8
|
+
*
|
|
9
|
+
* RFC 0110 carries presence over a host SSE (a held connection a conformance client can't
|
|
10
|
+
* assert against), and mints no normative client route to open presence. The driver
|
|
11
|
+
* therefore reads a live `channel.presence` snapshot via the conformance-only seam
|
|
12
|
+
* `POST /v1/host/sample/channel-presence/snapshot` (`host-sample-test-seams.md`), which
|
|
13
|
+
* routes through the SAME membership gate + the closed payload the host applies in
|
|
14
|
+
* production (a transient join → snapshot, so `present` is non-vacuous). The seam is
|
|
15
|
+
* OPTIONAL — the scenario soft-skips on `404`/`405`; a capability-advertising host whose
|
|
16
|
+
* presence is bound to a product flow (e.g. the openwop-app channels SSE) witnesses
|
|
17
|
+
* instead via its own host-side route test + an `INTEROP-MATRIX.md` row (the RFC 0086
|
|
18
|
+
* dual-staging, as in `multi-party-conversation-behavioral.test.ts`).
|
|
19
|
+
*
|
|
20
|
+
* Behavioral MUSTs asserted (RFC 0110 §Proposal):
|
|
21
|
+
* 1. SHAPE — a snapshot is the CLOSED `{ conversationId, present, typing }` (no field
|
|
22
|
+
* beyond it — the no-PII guard).
|
|
23
|
+
* 2. MEMBERS-ONLY — every ref in `present`/`typing` is an opaque RFC 0041 subject ref
|
|
24
|
+
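* (`user:`/`agent:`), never PII, and a subset of the channel's members. (Asserted here: the ref shape and `typing ⊆ present`; channel membership itself is not independently checked by this scenario.)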
|
|
25
|
+
* 3. NON-VACUOUS — the snapshotting member appears in `present`.
|
|
26
|
+
*
|
|
27
|
+
* Spec references:
|
|
28
|
+
* - https://github.com/openwop/openwop/blob/main/RFCS/0110-channel-presence.md
|
|
29
|
+
* - https://github.com/openwop/openwop/blob/main/spec/v1/host-sample-test-seams.md
|
|
30
|
+
*/
|
|
31
|
+
|
|
32
|
+
import { describe, it, expect } from 'vitest';
|
|
33
|
+
import { driver } from '../lib/driver.js';
|
|
34
|
+
import { behaviorGate } from '../lib/behavior-gate.js';
|
|
35
|
+
import { readCapabilityFamily } from '../lib/discovery-capabilities.js';
|
|
36
|
+
|
|
37
|
+
/** Behavior-gate profile name for the channel-presence capability. */
const PROFILE = 'openwop-channel-presence';
/** Opaque subject ref: a `user:` or `agent:` prefix followed by a non-empty id. */
const SUBJECT_REF = /^(user|agent):.+/;
/** The only keys a closed `channel.presence` snapshot may carry. */
const SNAPSHOT_KEYS: ReadonlySet<string> = new Set(['conversationId', 'present', 'typing']);

/**
 * A `channel.presence` snapshot as returned by the conformance seam.
 * `typing` is optional on the wire (a snapshot with nobody typing is valid per
 * `channel-presence-payload.schema.json`), so it must not be required here.
 */
interface PresenceSnapshot { conversationId: string; present: string[]; typing?: string[] }

describe('channel-presence-behavioral (RFC 0110 §Conformance)', () => {
  it('a presence snapshot is the closed, members-only, non-vacuous channel.presence shape', async () => {
    const cap = await readCapabilityFamily<{ supported?: boolean }>('channelPresence');
    const advertised = cap?.supported === true;
    if (!behaviorGate(PROFILE, advertised)) return;

    const conversationId = 'conf:channel-presence:c1';
    const res = await driver.post('/v1/host/sample/channel-presence/snapshot', { conversationId, member: 'user:conformance-runner' });
    if (res.status === 404 || res.status === 405) return; // seam unwired — host witnesses via its own route test (dual-staging)

    expect(
      res.status === 200,
      driver.describe('RFC 0110 §Proposal', 'the presence snapshot seam MUST return 200 for a member'),
    ).toBe(true);

    // Fail with a spec-referenced message (not a bare TypeError) when the body
    // is not a JSON object at all.
    const body: unknown = res.json;
    expect(
      typeof body === 'object' && body !== null && !Array.isArray(body),
      driver.describe('RFC 0110 §Proposal', 'the presence snapshot MUST be a JSON object'),
    ).toBe(true);
    const snap = body as PresenceSnapshot;

    // MUST 1 — CLOSED shape: nothing beyond conversationId / present / typing
    // (no PII field rides). `typing` may be omitted, so the check is "no
    // unexpected key" rather than "exactly these three keys".
    const unexpectedKeys = Object.keys(snap).filter((key) => !SNAPSHOT_KEYS.has(key));
    expect(
      unexpectedKeys.join(','),
      driver.describe('RFC 0110 §Proposal', 'channel.presence carries ONLY conversationId/present/typing (no PII)'),
    ).toBe('');
    expect(snap.conversationId, driver.describe('RFC 0110 §Proposal', 'the snapshot echoes the conversationId')).toBe(conversationId);
    expect(
      Array.isArray(snap.present),
      driver.describe('RFC 0110 §Proposal', 'present is REQUIRED and MUST be an array of subject refs'),
    ).toBe(true);
    // An omitted `typing` means nobody is typing.
    const typing = snap.typing ?? [];
    expect(
      Array.isArray(typing),
      driver.describe('RFC 0110 §Proposal', 'typing, when present, MUST be an array of subject refs'),
    ).toBe(true);

    // MUST 2 — every ref is an opaque RFC 0041 subject ref (no PII).
    for (const ref of [...snap.present, ...typing]) {
      expect(
        SUBJECT_REF.test(ref),
        driver.describe('RFC 0110 §Proposal / RFC 0041', `present/typing ref "${ref}" MUST be an opaque user:/agent: subject ref`),
      ).toBe(true);
    }
    // typing is a subset of present.
    for (const ref of typing) {
      expect(snap.present.includes(ref), driver.describe('RFC 0110 §Proposal', 'every typing ref MUST also be present')).toBe(true);
    }

    // MUST 3 — NON-VACUOUS: the snapshotting member is present.
    expect(
      snap.present.includes('user:conformance-runner'),
      driver.describe('RFC 0110 §Proposal', 'the snapshotting member MUST appear in present (members-only, real)'),
    ).toBe(true);
  });
});
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Channel presence — `channel.presence` ephemeral event (RFC 0110).
|
|
3
|
+
*
|
|
4
|
+
* Always-on, server-free schema-shape probe. Verifies the additive, normative
|
|
5
|
+
* RFC 0110 wire facts on the published schemas:
|
|
6
|
+
*
|
|
7
|
+
* 1. `channel-presence-payload.schema.json` validates a conforming presence
|
|
8
|
+
* snapshot (`{ conversationId, present[], typing? }`), REQUIRES
|
|
9
|
+
* conversationId + present, and is CLOSED (`additionalProperties: false`) —
|
|
10
|
+
* the no-PII guard: no `ip`/`location`/free-text can ride the payload.
|
|
11
|
+
* 2. `typing` is OPTIONAL — a snapshot with nobody typing validates.
|
|
12
|
+
* 3. `run-event.schema.json` enumerates `channel.presence` as a RunEvent type.
|
|
13
|
+
* 4. `capabilities.schema.json` declares `channelPresence` with `supported`,
|
|
14
|
+
* closed.
|
|
15
|
+
*
|
|
16
|
+
* The host-side MUSTs — presence is membership-gated (every ref a current
|
|
17
|
+
* participant; never delivered to a non-member) and EPHEMERAL (never persisted
|
|
18
|
+
* to the replayable log; replay/`:fork`-invisible) — are behavioral contracts
|
|
19
|
+
* gated on `channelPresence.supported`, landing at the reference-host
|
|
20
|
+
* implementation (RFC 0110 §Conformance). This scenario asserts the wire SHAPE.
|
|
21
|
+
*
|
|
22
|
+
* Normative references:
|
|
23
|
+
* - RFCS/0110-channel-presence.md (§Proposal / §Conformance)
|
|
24
|
+
* - schemas/channel-presence-payload.schema.json
|
|
25
|
+
* - schemas/run-event.schema.json (the channel.presence type)
|
|
26
|
+
* - schemas/capabilities.schema.json (channelPresence)
|
|
27
|
+
*
|
|
28
|
+
* @see RFCS/0110-channel-presence.md
|
|
29
|
+
*/
|
|
30
|
+
|
|
31
|
+
import { describe, it, expect } from 'vitest';
|
|
32
|
+
import { readFileSync } from 'node:fs';
|
|
33
|
+
import { join } from 'node:path';
|
|
34
|
+
import Ajv2020 from 'ajv/dist/2020.js';
|
|
35
|
+
import addFormats from 'ajv-formats';
|
|
36
|
+
import { SCHEMAS_DIR } from '../lib/paths.js';
|
|
37
|
+
|
|
38
|
+
/**
 * Format an assertion message as "<spec reference> — <requirement>".
 *
 * @param specRef - The normative reference the assertion enforces.
 * @param requirement - The requirement text shown when the assertion fails.
 * @returns The two parts joined by an em dash surrounded by spaces.
 */
const why = (specRef: string, requirement: string): string => {
  return [specRef, requirement].join(' — ');
};
|
|
39
|
+
|
|
40
|
+
/**
 * Read and parse a published JSON schema from `SCHEMAS_DIR`.
 *
 * @param name - Schema file name relative to `SCHEMAS_DIR`.
 * @returns The parsed schema document (not validated beyond JSON parsing).
 */
function loadSchema(name: string): Record<string, unknown> {
  const schemaPath = join(SCHEMAS_DIR, name);
  const schemaText = readFileSync(schemaPath, 'utf8');
  return JSON.parse(schemaText) as Record<string, unknown>;
}
|
|
43
|
+
|
|
44
|
+
describe('channel-presence-shape: the channel.presence payload (RFC 0110 §Proposal, server-free)', () => {
  // One validator compiled from the published payload schema, shared by all cases.
  const validator = new Ajv2020({ strict: false, allErrors: true });
  addFormats(validator);
  const validatePresence = validator.compile(loadSchema('channel-presence-payload.schema.json'));
  const SPEC = 'RFC 0110 §Proposal';

  it('a conforming presence snapshot (present + typing) validates', () => {
    const payload = { conversationId: 'chan-eng', present: ['user:alice', 'agent:iris'], typing: ['user:alice'] };
    const accepted = validatePresence(payload);
    expect(accepted, why(SPEC, 'a conforming channel.presence payload MUST validate')).toBe(true);
  });

  it('typing is OPTIONAL — a snapshot with nobody typing validates', () => {
    const payload = { conversationId: 'chan-eng', present: ['user:alice'] };
    const accepted = validatePresence(payload);
    expect(accepted, why(SPEC, 'typing is optional')).toBe(true);
  });

  it('conversationId and present are REQUIRED', () => {
    const withoutConversationId = validatePresence({ present: ['user:a'] });
    expect(withoutConversationId, why(SPEC, 'conversationId is required')).toBe(false);

    const withoutPresent = validatePresence({ conversationId: 'c' });
    expect(withoutPresent, why(SPEC, 'present is required')).toBe(false);
  });

  it('the payload is CLOSED — a non-subject-ref field (ip/location) MUST be rejected (the no-PII guard)', () => {
    // An extra `ip` key stands in for any PII-shaped field.
    const payload = { conversationId: 'c', present: ['user:a'], ip: '10.0.0.1' };
    const accepted = validatePresence(payload);
    expect(accepted, why(SPEC, 'channel.presence MUST forbid extra keys — no PII rides the payload')).toBe(false);
  });
});
|
|
75
|
+
|
|
76
|
+
describe('channel-presence-shape: event type + capability advertisement (RFC 0110 §Conformance, server-free)', () => {
  it('run-event.schema.json enumerates `channel.presence` as a RunEvent type', () => {
    // The serialized schema is searched for the quoted type name.
    const serializedSchema = JSON.stringify(loadSchema('run-event.schema.json'));
    const declared = serializedSchema.includes('"channel.presence"');
    expect(declared, why('RFC 0110 §Proposal', 'channel.presence MUST be a declared RunEvent type')).toBe(true);
  });

  it('capabilities.schema.json declares channelPresence with supported, closed', () => {
    const capabilities = loadSchema('capabilities.schema.json');
    const rootProperties = capabilities.properties as Record<string, Record<string, unknown>>;
    const channelPresence = rootProperties.channelPresence as
      | { properties?: Record<string, unknown>; required?: string[]; additionalProperties?: boolean }
      | undefined;
    const SPEC = 'RFC 0110 §Conformance';

    expect(channelPresence, why(SPEC, 'capabilities.channelPresence MUST be declared')).toBeDefined();
    expect(channelPresence?.properties?.supported, why(SPEC, 'channelPresence.supported MUST be declared')).toBeDefined();
    expect(channelPresence?.required, why(SPEC, 'supported MUST be required on the block')).toContain('supported');
    expect(channelPresence?.additionalProperties, why(SPEC, 'the block MUST be closed')).toBe(false);
  });
});
|
|
@@ -0,0 +1,253 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* RFC 0111 — Context Economy: transcript token budget.
|
|
3
|
+
*
|
|
4
|
+
* Verifies the OPT-IN per-turn token bound on the orchestrator transcript
|
|
5
|
+
* (`spec/v1/multi-agent-execution.md` §"Context economy"). A host advertising
|
|
6
|
+
* `multiAgent.executionModel.contextBudget.transcriptTokenBudget` MUST NOT feed
|
|
7
|
+
* more than that many tokens of transcript to any single orchestrator turn,
|
|
8
|
+
* measured in the advertised `tokenCounter` unit.
|
|
9
|
+
*
|
|
10
|
+
* Capability-gated on `multiAgent.executionModel.contextBudget.transcriptTokenBudget`
|
|
11
|
+
* being PRESENT (root-first per RFC 0073) via `behaviorGate`. The assembled
|
|
12
|
+
* transcript is host-internal and never crosses the wire, so the scenario reads
|
|
13
|
+
* the host's own per-iteration accounting via the OPTIONAL conformance seam
|
|
14
|
+
* `GET /v1/host/sample/agent/transcript-window?runId=…&iteration=N`
|
|
15
|
+
* (`host-sample-test-seams.md` §14): `{ tokenCounter, tokenCount, eventIds,
|
|
16
|
+
* summarizedRanges }`. The seam is OPTIONAL — the scenario soft-skips on
|
|
17
|
+
* `404`/`405` (the RFC defers reference-host implementation).
|
|
18
|
+
*
|
|
19
|
+
* Asserts, for each iteration the host reports:
|
|
20
|
+
* 1. `tokenCounter` equals the advertised `contextBudget.tokenCounter`.
|
|
21
|
+
* 2. `tokenCount ≤ transcriptTokenBudget` (the per-turn bound).
|
|
22
|
+
* 3. CROSS-CHECK — the harness independently reads the events named in
|
|
23
|
+
* `eventIds` from the run event-log (`/v1/host/sample/test/runs/:runId/events`)
|
|
24
|
+
* and confirms every named id is a real persisted event of the run, so the
|
|
25
|
+
* host's reported accounting is internally consistent (not fabricated).
|
|
26
|
+
* 4. RECENT-TAIL — `eventIds` are a contiguous most-recent suffix of the run's
|
|
27
|
+
* eligible event-log entries (no older event included while a newer eligible
|
|
28
|
+
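* one is dropped). NOTE: the scenario currently asserts only the no-repeated-id part of this property; suffix contiguity against the event log is not checked.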
|
|
29
|
+
* 5. SUMMARIZED-RANGE — every `summarizedRanges[].summaryRef` has a matching
|
|
30
|
+
* `context.summarized` event in the run event-log.
|
|
31
|
+
*
|
|
32
|
+
* Honest non-vacuity ceiling (RFC 0111 §"Conformance seam"): the model-facing
|
|
33
|
+
* prompt is genuinely host-internal, so this proves the host's DECLARED
|
|
34
|
+
* accounting is internally consistent + within budget — it cannot black-box-prove
|
|
35
|
+
* the host feeds nothing additional off-seam. The capability is advertise-and-attest.
|
|
36
|
+
*
|
|
37
|
+
* @see RFCS/0111-context-economy.md
|
|
38
|
+
* @see spec/v1/multi-agent-execution.md §"Context economy (RFC 0111)"
|
|
39
|
+
* @see spec/v1/host-sample-test-seams.md §14
|
|
40
|
+
*/
|
|
41
|
+
|
|
42
|
+
import { describe, it, expect } from 'vitest';
|
|
43
|
+
import { driver } from '../lib/driver.js';
|
|
44
|
+
import { pollUntilTerminal } from '../lib/polling.js';
|
|
45
|
+
import { behaviorGate } from '../lib/behavior-gate.js';
|
|
46
|
+
import { isFixtureAdvertised } from '../lib/fixtures.js';
|
|
47
|
+
import { readCapabilityFamily } from '../lib/discovery-capabilities.js';
|
|
48
|
+
import { queryTestEvents } from '../lib/event-log-query.js';
|
|
49
|
+
|
|
50
|
+
/** Workflow fixture that drives the multi-turn orchestrator run. */
const FIXTURE = 'conformance-context-budget-multiturn';
/** Behavior-gate profile name for this scenario. */
const PROFILE = 'openwop-context-budget';
/** Upper bound on how many iterations of the transcript-window seam are probed. */
const MAX_ITERATIONS_PROBED = 16;

// Capability shapes, outermost first. Every field is optional: the scenario
// reads them defensively from the discovery document.

/** The `multiAgent` capability family. */
interface MultiAgentCap {
  readonly executionModel?: ExecutionModelCap;
}

/** `multiAgent.executionModel`. */
interface ExecutionModelCap {
  readonly contextBudget?: ContextBudgetCap;
}

/** `multiAgent.executionModel.contextBudget`. */
interface ContextBudgetCap {
  readonly transcriptTokenBudget?: number;
  readonly tokenCounter?: string;
  readonly summarization?: SummarizationCap;
}

/** `multiAgent.executionModel.contextBudget.summarization`. */
interface SummarizationCap {
  readonly supported?: boolean;
  readonly strategy?: string;
  readonly keepLastTurns?: number;
}
|
|
70
|
+
|
|
71
|
+
// ── cast-free typed accessors (no `as`) ──────────────────────────────────
|
|
72
|
+
/** True for a non-null, non-array object. */
function isRecord(v: unknown): v is Record<string, unknown> {
  if (v === null || typeof v !== 'object') return false;
  return !Array.isArray(v);
}

/** True when `v` is a primitive string. */
function isString(v: unknown): v is string {
  return typeof v === 'string';
}

/** True when `v` is a primitive number (no finiteness check is applied). */
function isNumber(v: unknown): v is number {
  return typeof v === 'number';
}

/** `v` when it is a string, otherwise `undefined`. */
function stringOf(v: unknown): string | undefined {
  if (isString(v)) return v;
  return undefined;
}

/** `v` when it is a number, otherwise `undefined`. */
function numberOf(v: unknown): number | undefined {
  if (isNumber(v)) return v;
  return undefined;
}

/** `v` itself when it is an array whose elements are all strings, otherwise `undefined`. */
function stringArrayOf(v: unknown): string[] | undefined {
  if (!Array.isArray(v)) return undefined;
  return v.every(isString) ? v : undefined;
}

/** The string `runId` property of a record, otherwise `undefined`. */
function runIdOf(v: unknown): string | undefined {
  if (!isRecord(v)) return undefined;
  return stringOf(v['runId']);
}
|
|
93
|
+
|
|
94
|
+
/** One summarized span of the transcript, as reported by the seam. */
interface SummarizedRange {
  readonly summaryRef: string;
  readonly replacedTurns: string[];
}

/** The per-iteration accounting returned by the transcript-window seam. */
interface TranscriptWindow {
  readonly tokenCounter: string;
  readonly tokenCount: number;
  readonly eventIds: string[];
  readonly summarizedRanges: SummarizedRange[];
}

/**
 * Parse one `summarizedRanges[]` entry.
 *
 * @returns The typed range, or `undefined` unless `v` is a record with a
 *   string `summaryRef` and an all-string `replacedTurns` array.
 */
function summarizedRangeOf(v: unknown): SummarizedRange | undefined {
  if (!isRecord(v)) return undefined;
  const summaryRef = stringOf(v['summaryRef']);
  const replacedTurns = stringArrayOf(v['replacedTurns']);
  if (summaryRef === undefined || replacedTurns === undefined) return undefined;
  return { summaryRef, replacedTurns };
}

/**
 * Parse the seam response into a typed window — undefined if the shape is wrong.
 *
 * `tokenCounter`, `tokenCount` and `eventIds` are required. `summarizedRanges`
 * is tolerated as absent (or null) and then treated as empty, but a value that
 * is present and not an array, or an array containing a malformed entry, is a
 * shape error: silently accepting it would make the summarized-range checks
 * downstream vacuous.
 *
 * @param v - The parsed seam response body.
 * @returns The typed window, or `undefined` when the shape is wrong.
 */
function transcriptWindowOf(v: unknown): TranscriptWindow | undefined {
  if (!isRecord(v)) return undefined;
  const tokenCounter = stringOf(v['tokenCounter']);
  const tokenCount = numberOf(v['tokenCount']);
  const eventIds = stringArrayOf(v['eventIds']);
  if (tokenCounter === undefined || tokenCount === undefined || eventIds === undefined) return undefined;

  const rawRanges = v['summarizedRanges'];
  const summarizedRanges: SummarizedRange[] = [];
  if (rawRanges === undefined || rawRanges === null) {
    // Nothing reported — no summarized ranges.
    return { tokenCounter, tokenCount, eventIds, summarizedRanges };
  }
  if (!Array.isArray(rawRanges)) return undefined; // wrong container type → fail loudly via caller
  for (const r of rawRanges) {
    const parsed = summarizedRangeOf(r);
    if (parsed === undefined) return undefined; // malformed range → fail loudly via caller
    summarizedRanges.push(parsed);
  }
  return { tokenCounter, tokenCount, eventIds, summarizedRanges };
}
|
|
131
|
+
|
|
132
|
+
describe('context-budget-transcript-bound (RFC 0111 §"Context economy")', () => {
  it('bounds the per-turn transcript to transcriptTokenBudget with an internally-consistent, recent-tail accounting', async () => {
    // Capability gate: the budget must be advertised, and the fixture available.
    const multiAgent = await readCapabilityFamily<MultiAgentCap>('multiAgent');
    const cb = multiAgent?.executionModel?.contextBudget;
    const budget = numberOf(cb?.transcriptTokenBudget);
    const gateOpen = behaviorGate(PROFILE, budget !== undefined);
    if (!gateOpen) return;
    if (!isFixtureAdvertised(FIXTURE)) return; // fixture-gated soft-skip

    const advertisedCounter = stringOf(cb?.tokenCounter);
    expect(
      advertisedCounter,
      driver.describe('RFC 0111', 'tokenCounter MUST be advertised when transcriptTokenBudget is present (schema if/then)'),
    ).toBeDefined();

    // Start the multi-turn orchestrator run and wait for it to finish.
    const created = await driver.post('/v1/runs', { workflowId: FIXTURE });
    expect(created.status).toBe(201);
    const runId = runIdOf(created.json);
    expect(runId, 'POST /v1/runs MUST return a runId').toBeDefined();
    if (runId === undefined) return;
    await pollUntilTerminal(runId);

    // Collect one window per iteration from the OPTIONAL transcript-window seam.
    const windows: Array<{ iteration: number; window: TranscriptWindow }> = [];
    for (let iteration = 1; iteration <= MAX_ITERATIONS_PROBED; iteration += 1) {
      const seamUrl = `/v1/host/sample/agent/transcript-window?runId=${encodeURIComponent(runId)}&iteration=${iteration}`;
      const res = await driver.get(seamUrl);

      const notRouted = res.status === 404 || res.status === 405;
      if (notRouted && iteration === 1) return; // seam unwired — soft-skip the whole scenario
      if (notRouted) break; // iterations exhausted
      const pastLastTurn = res.status === 400 || res.status === 422;
      if (pastLastTurn) break; // iteration past the run's last turn

      expect(
        res.status === 200,
        driver.describe('host-sample-test-seams.md §14', 'the transcript-window seam MUST return 200 for a valid iteration'),
      ).toBe(true);

      const window = transcriptWindowOf(res.json);
      expect(
        window,
        driver.describe('host-sample-test-seams.md §14', 'the seam MUST return { tokenCounter, tokenCount, eventIds, summarizedRanges }'),
      ).toBeDefined();
      if (window === undefined) return;
      windows.push({ iteration, window });
    }

    // Non-vacuity: a wired seam MUST report at least one iteration.
    expect(windows.length, 'a wired transcript-window seam MUST report at least one orchestrator iteration').toBeGreaterThan(0);

    // Independent event-log read for the cross-checks (OPTIONAL seam). When it
    // is unavailable, the log-based checks below are skipped.
    const q = await queryTestEvents(runId);
    const logEventIds = new Set<string>();
    const summarizedRefs = new Set<string>();
    if (q.ok) {
      for (const event of q.events) {
        logEventIds.add(event.eventId);
        if (event.type !== 'context.summarized') continue;
        const ref = stringOf(event.payload['summaryRef']);
        if (ref !== undefined) summarizedRefs.add(ref);
      }
    }
    const logAvailable = q.ok;

    for (const { iteration, window } of windows) {
      // 1 — the seam reports the same counter unit the host advertises.
      expect(
        window.tokenCounter,
        driver.describe('RFC 0111', `iteration ${iteration}: seam tokenCounter MUST equal the advertised contextBudget.tokenCounter`),
      ).toBe(advertisedCounter);

      // 2 — the per-turn token bound.
      if (budget !== undefined) {
        expect(
          window.tokenCount,
          driver.describe('RFC 0111', `iteration ${iteration}: tokenCount MUST NOT exceed transcriptTokenBudget`),
        ).toBeLessThanOrEqual(budget);
      }

      // 3 — internal consistency: every named id is a real persisted event.
      if (logAvailable) {
        for (const id of window.eventIds) {
          const persisted = logEventIds.has(id);
          expect(
            persisted,
            driver.describe('RFC 0111 §"Conformance seam"', `iteration ${iteration}: eventId "${id}" in the seam accounting MUST be a real persisted run event`),
          ).toBe(true);
        }
      }

      // 4 — recent-tail: ids are unique (no double-count inflating the window).
      const distinctCount = new Set(window.eventIds).size;
      expect(
        distinctCount,
        driver.describe('RFC 0111 §"Conformance seam"', `iteration ${iteration}: eventIds MUST be a tail with no repeated entry`),
      ).toBe(window.eventIds.length);

      // 5 — every summarized range references a recorded context.summarized event.
      if (logAvailable) {
        for (const range of window.summarizedRanges) {
          const recorded = summarizedRefs.has(range.summaryRef);
          expect(
            recorded,
            driver.describe('RFC 0111', `iteration ${iteration}: summarizedRanges summaryRef "${range.summaryRef}" MUST have a matching context.summarized event`),
          ).toBe(true);
        }
      }
    }

    // keepLastTurns verbatim — a kept turn is fed verbatim, never inside a
    // summarized range. Checked against the last reported window only.
    const keepLastTurns = numberOf(cb?.summarization?.keepLastTurns);
    const finalEntry = windows[windows.length - 1];
    if (keepLastTurns !== undefined && keepLastTurns > 0 && finalEntry !== undefined) {
      const last = finalEntry.window;
      const summarizedIds = new Set<string>(last.summarizedRanges.flatMap((range) => range.replacedTurns));
      const tailStart = Math.max(0, last.eventIds.length - keepLastTurns);
      for (const id of last.eventIds.slice(tailStart)) {
        expect(
          summarizedIds.has(id),
          driver.describe('RFC 0111', `a kept (verbatim) turn "${id}" MUST NOT appear inside a summarized range`),
        ).toBe(false);
      }
    }
  });
});
|