@openwop/openwop-conformance 1.2.0 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +65 -0
- package/README.md +2 -2
- package/api/redocly.yaml +15 -0
- package/coverage.md +2 -1
- package/fixtures/conformance-agent-reasoning-streaming.json +37 -0
- package/fixtures/conformance-dispatch-cancellable-child.json +27 -0
- package/fixtures/conformance-dispatch-deterministic-fail-child.json +30 -0
- package/fixtures/conformance-dispatch-input-mapping-no-default.json +49 -0
- package/fixtures/conformance-dispatch-per-worker-override.json +59 -0
- package/fixtures/conformance-subworkflow-input-mapping-no-default.json +33 -0
- package/fixtures.md +6 -0
- package/package.json +1 -1
- package/schemas/capabilities.schema.json +16 -0
- package/schemas/core-conformance-mock-agent-config.schema.json +5 -0
- package/schemas/run-event-payloads.schema.json +35 -1
- package/schemas/run-event.schema.json +2 -0
- package/src/lib/driver.ts +15 -0
- package/src/lib/env.ts +51 -0
- package/src/lib/event-log-query.ts +62 -0
- package/src/lib/fixtures.ts +38 -1
- package/src/lib/host-toggle.ts +54 -0
- package/src/lib/multi-agent-capabilities.ts +10 -0
- package/src/lib/otel-scrape.ts +59 -0
- package/src/scenarios/agentReasoningStreaming.test.ts +193 -0
- package/src/scenarios/aiEnvelope.capBreached.test.ts +97 -9
- package/src/scenarios/aiEnvelope.contractRefusal.test.ts +128 -10
- package/src/scenarios/aiEnvelope.correlationReplay.test.ts +236 -21
- package/src/scenarios/aiEnvelope.redaction.test.ts +204 -24
- package/src/scenarios/aiEnvelope.schemaDrift.test.ts +158 -19
- package/src/scenarios/aiEnvelope.trustBoundaryPropagation.test.ts +59 -8
- package/src/scenarios/aiEnvelope.universalKinds.test.ts +100 -9
- package/src/scenarios/blob-presign-expiry.test.ts +35 -2
- package/src/scenarios/blob-roundtrip.test.ts +0 -0
- package/src/scenarios/cache-ttl-expiry.test.ts +28 -2
- package/src/scenarios/dispatch-cross-worker-handoff.test.ts +34 -3
- package/src/scenarios/dispatch-input-mapping.test.ts +75 -6
- package/src/scenarios/dispatch-output-mapping.test.ts +96 -6
- package/src/scenarios/fixtures-gating.test.ts +139 -1
- package/src/scenarios/kv-ttl-expiry.test.ts +33 -2
- package/src/scenarios/otel-trace-propagation-subworkflow.test.ts +19 -0
- package/src/scenarios/pack-registry-publish.test.ts +231 -51
- package/src/scenarios/provider-usage.test.ts +185 -0
- package/src/scenarios/queue-ack-nack-dlq.test.ts +57 -3
- package/src/scenarios/queue-publish-consume-roundtrip.test.ts +43 -3
- package/src/scenarios/replay-llm-cache-key.test.ts +166 -25
- package/src/scenarios/search-bm25-roundtrip.test.ts +47 -2
- package/src/scenarios/sql-transaction-atomicity.test.ts +31 -2
- package/src/scenarios/stream-subscribe-from-beginning.test.ts +39 -2
- package/src/scenarios/subworkflow-input-mapping.test.ts +77 -7
- package/src/scenarios/table-cursor-pagination.test.ts +40 -2
- package/src/scenarios/table-schema-enforcement.test.ts +39 -2
- package/src/scenarios/vector-knn-roundtrip.test.ts +43 -3
- package/src/scenarios/workflow-chain-host-expansion.test.ts +202 -0
package/src/lib/env.ts
CHANGED
|
@@ -25,6 +25,28 @@
|
|
|
25
25
|
* hosts go strict-mode green without falsifying capability claims.
|
|
26
26
|
* Example for SQLite:
|
|
27
27
|
* OPENWOP_OPTED_OUT_PROFILES=openwop-production,openwop-auth-mtls
|
|
28
|
+
*
|
|
29
|
+
* OPENWOP_OPTED_OUT_FIXTURES — comma-separated fixture ids (or
|
|
30
|
+
* trailing-`*` globs) the host operator has DELIBERATELY chosen
|
|
31
|
+
* not to honor. Applied in `lib/fixtures.ts` by filtering matching
|
|
32
|
+
* entries out of the cached advertised-fixture set, so any
|
|
33
|
+
* scenario gated via `isFixtureAdvertised(...)` skips cleanly.
|
|
34
|
+
* Use when a host auto-loads every `conformance-*.json` on disk
|
|
35
|
+
* (so the fixture id IS in the discovery doc) but the host doesn't
|
|
36
|
+
* implement the gated feature. Symmetric to `OPENWOP_OPTED_OUT_
|
|
37
|
+
* PROFILES` for the fixture-id axis. Example for SQLite:
|
|
38
|
+
* OPENWOP_OPTED_OUT_FIXTURES=conformance-dispatch-*,conformance-subworkflow-input-mapping*
|
|
39
|
+
*
|
|
40
|
+
* OPENWOP_OPTED_OUT_SCENARIOS — comma-separated scenario ids that
|
|
41
|
+
* individual tests consult to skip themselves where neither
|
|
42
|
+
* profile-opt-out nor fixture-opt-out is fine-grained enough
|
|
43
|
+
* (e.g., OTel trace-inheritance across `core.subWorkflow` —
|
|
44
|
+
* `conformance-subworkflow-parent` is correctly advertised because
|
|
45
|
+
* non-OTel subworkflow scenarios pass, but the host doesn't
|
|
46
|
+
* propagate traceparent across the dispatch boundary). Use
|
|
47
|
+
* `isScenarioOptedOut(scenarioId)` from `env.ts` in the test's
|
|
48
|
+
* skip predicate. Reserved for cases where the suite-wide
|
|
49
|
+
* skip mechanisms can't carry the granularity.
|
|
28
50
|
*/
|
|
29
51
|
|
|
30
52
|
export interface ConformanceEnv {
|
|
@@ -84,3 +106,32 @@ export function loadEnv(): ConformanceEnv {
|
|
|
84
106
|
};
|
|
85
107
|
return cached;
|
|
86
108
|
}
|
|
109
|
+
|
|
110
|
+
/**
 * Reports whether the operator listed `scenarioId` in the
 * `OPENWOP_OPTED_OUT_SCENARIOS` environment variable (comma-separated
 * scenario ids). Intended for a test's `describe.skipIf` predicate when
 * neither profile-opt-out nor fixture-opt-out is granular enough.
 *
 * The helper is silent: callers format and log their own skip message.
 *
 * `process.env` is re-read on every call (no memoization), so unit tests
 * can mutate `process.env.OPENWOP_OPTED_OUT_SCENARIOS` between cases
 * without having to invalidate a cache.
 *
 * Matching is exact after trimming whitespace around each entry. Blank
 * entries (from a trailing comma or `a,,b`) are ignored, so an empty
 * `scenarioId` is never reported as opted out.
 *
 * @param scenarioId Scenario id to look up.
 * @returns `true` when a non-blank entry equals `scenarioId`.
 */
export function isScenarioOptedOut(scenarioId: string): boolean {
  const raw = process.env.OPENWOP_OPTED_OUT_SCENARIOS?.trim() ?? '';
  if (raw.length === 0) return false;
  return raw
    .split(',')
    .map((entry) => entry.trim())
    // Skip blank entries so `a,` / `a,,b` cannot match an empty id.
    .some((entry) => entry.length > 0 && entry === scenarioId);
}
|
|
131
|
+
|
|
132
|
+
/**
 * Test-only hook: drops the memoized `loadEnv()` result by setting the
 * module-level `cached` slot back to `null`, so the next `loadEnv()`
 * call re-reads `process.env`. Call it from any test that changes the
 * environment variables `loadEnv()` consumes part-way through a suite.
 */
export function __resetEnvCacheForTests(): void {
  cached = null;
}
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Driver helpers for the test-only event-log query seam
|
|
3
|
+
* (`GET /v1/host/sample/test/runs/:runId/events`).
|
|
4
|
+
*
|
|
5
|
+
* Used by aiEnvelope engine-projection scenarios that verify the
|
|
6
|
+
* spec-prescribed events the host MUST emit on each envelope outcome
|
|
7
|
+
* (per RFC 0021 §A point 1-7 + interrupt.md + capabilities.md
|
|
8
|
+
* §"cap.breached"). All operations soft-skip on HTTP 404 — hosts
|
|
9
|
+
* without the seam keep the existing advertisement-shape coverage.
|
|
10
|
+
*
|
|
11
|
+
* Reset semantics: callers SHOULD `resetTestSeam()` in their test's
|
|
12
|
+
* `afterEach` (or scope each test to a unique runId) to keep state
|
|
13
|
+
* from leaking across scenarios.
|
|
14
|
+
*/
|
|
15
|
+
|
|
16
|
+
import { driver } from './driver.js';
|
|
17
|
+
|
|
18
|
+
/**
 * One entry of the `events` array returned by the test-only event-log
 * query seam. Only `causationId`, `nodeId` and `contentTrust` are
 * optional; all fields are read-only.
 */
export interface TestEvent {
  /** Identifier of this event. */
  readonly eventId: string;
  /** Identifier of the run the event belongs to. */
  readonly runId: string;
  /** Event type name (the scenarios query e.g. `cap.breached`, `node.failed`). */
  readonly type: string;
  /** Event-type-specific payload; callers narrow individual keys themselves. */
  readonly payload: Record<string, unknown>;
  /** Timestamp as a string — format not visible here; presumably ISO-8601, confirm against the host. */
  readonly timestamp: string;
  /** Numeric sequence value reported by the host. */
  readonly sequence: number;
  /** Present when the host links the event to a causing id. */
  readonly causationId?: string;
  /** Present when the event is attributed to a node. */
  readonly nodeId?: string;
  /** Trust label, when the host reports one. */
  readonly contentTrust?: 'trusted' | 'untrusted';
}
|
|
29
|
+
|
|
30
|
+
/**
 * Result of an event-log seam query, discriminated on `ok` and then on
 * `reason`:
 *  - `ok: true` — the seam answered HTTP 200; `events` holds the rows.
 *  - `seam_unavailable` — the seam answered HTTP 404 (callers soft-skip).
 *  - `http_error` — any other status, carried in `status`.
 */
export type QueryOutcome =
  | { ok: true; events: TestEvent[] }
  | { ok: false; reason: 'seam_unavailable' }
  | { ok: false; reason: 'http_error'; status: number };
|
|
34
|
+
|
|
35
|
+
/**
 * Query the test-only event log for a run, with optional filters.
 *
 * Issues `GET /v1/host/sample/test/runs/:runId/events`, appending each
 * non-empty filter field as a query parameter of the same name.
 *
 * @param runId  Run whose events are requested (URL-encoded into the path).
 * @param filter Optional `type` / `correlationId` / `causationId` /
 *               `nodeId` constraints; empty or missing fields are not sent.
 * @returns `seam_unavailable` on HTTP 404, `http_error` (with the status)
 *          on any other non-200 response, otherwise `ok: true` with the
 *          events. A 200 response whose body is missing, or whose
 *          `events` member is not an array, yields an empty list.
 */
export async function queryTestEvents(
  runId: string,
  filter: { type?: string; correlationId?: string; causationId?: string; nodeId?: string } = {},
): Promise<QueryOutcome> {
  const qs = new URLSearchParams();
  for (const key of ['type', 'correlationId', 'causationId', 'nodeId'] as const) {
    const value = filter[key];
    if (value) qs.set(key, value);
  }
  // Serialize once; an empty query must not leave a dangling `?`.
  const query = qs.toString();
  const url = `/v1/host/sample/test/runs/${encodeURIComponent(runId)}/events${query ? '?' + query : ''}`;
  const res = await driver.get(url);
  if (res.status === 404) return { ok: false, reason: 'seam_unavailable' };
  if (res.status !== 200) return { ok: false, reason: 'http_error', status: res.status };
  // Defensive decode: tolerate a 200 with no parsed body or a malformed
  // `events` member instead of throwing or passing junk to callers.
  const rawEvents = (res.json as { events?: unknown } | null | undefined)?.events;
  const events = Array.isArray(rawEvents) ? (rawEvents as TestEvent[]) : [];
  return { ok: true, events };
}
|
|
52
|
+
|
|
53
|
+
/**
 * Reset the test-only event log + capability overlay (suite teardown)
 * by POSTing an empty object to `/v1/host/sample/test/reset`. The
 * response is not inspected.
 */
export async function resetTestSeam(): Promise<void> {
  const emptyBody = {};
  await driver.post('/v1/host/sample/test/reset', emptyBody);
}
|
|
57
|
+
|
|
58
|
+
/**
 * Probe whether the event-log seam is exposed, so a scenario can
 * soft-skip early. Queries the events of the placeholder run id
 * `__probe__` and reports whether that query came back `ok`.
 */
export async function isEventLogSeamAvailable(): Promise<boolean> {
  const { ok } = await queryTestEvents('__probe__');
  return ok;
}
|
package/src/lib/fixtures.ts
CHANGED
|
@@ -26,6 +26,16 @@
|
|
|
26
26
|
* This module is sync. The async fetch lives in `setup.ts` which calls
|
|
27
27
|
* `setAdvertisedFixtures(...)` from a top-level `await`.
|
|
28
28
|
*
|
|
29
|
+
* Honest opt-out (symmetric to `OPENWOP_OPTED_OUT_PROFILES`):
|
|
30
|
+
* `OPENWOP_OPTED_OUT_FIXTURES` (CSV, supports trailing `*` glob)
|
|
31
|
+
* subtracts matching fixture-ids from the cached set even when the
|
|
32
|
+
* host advertises them. Operators use this when the host happens to
|
|
33
|
+
* carry a fixture file (e.g., it auto-loads every `conformance-*.json`
|
|
34
|
+
* on disk) but does NOT implement the underlying feature — so the
|
|
35
|
+
* gated scenario should skip instead of running and failing. The
|
|
36
|
+
* subtraction happens at cache-population time, so the predicate
|
|
37
|
+
* remains a single sync set lookup at scenario-evaluation time.
|
|
38
|
+
*
|
|
29
39
|
* @see spec/v1/capabilities.md §`fixtures`
|
|
30
40
|
* @see spec/v1/profiles.md §`openwop-fixtures`
|
|
31
41
|
* @see RFCS/0003-fixture-gating.md
|
|
@@ -35,19 +45,46 @@ import type { DiscoveryPayload } from './profiles.js';
|
|
|
35
45
|
|
|
36
46
|
let _advertisedFixtures: ReadonlySet<string> | null = null;
|
|
37
47
|
|
|
48
|
+
/**
 * Builds the "is this fixture id opted out?" predicate from
 * `OPENWOP_OPTED_OUT_FIXTURES`. The variable is a comma-separated list;
 * each non-blank entry is either an exact fixture id or, when it ends
 * in `*`, a prefix glob. An unset or blank variable yields a predicate
 * that always answers `false`.
 *
 * @returns A function answering whether `id` equals an exact entry or
 *          starts with one of the glob prefixes.
 */
function loadOptedOutPredicate(): (id: string) => boolean {
  const csv = (process.env.OPENWOP_OPTED_OUT_FIXTURES ?? '').trim();
  if (csv === '') return () => false;

  const exactIds = new Set<string>();
  const globPrefixes: string[] = [];
  csv.split(',').forEach((piece) => {
    const token = piece.trim();
    if (token === '') return; // blank entries (e.g. trailing comma) are ignored
    if (token.endsWith('*')) {
      // Keep everything before the trailing `*` as the prefix to match.
      globPrefixes.push(token.slice(0, -1));
    } else {
      exactIds.add(token);
    }
  });

  return (id) => {
    if (exactIds.has(id)) return true;
    for (const prefix of globPrefixes) {
      if (id.startsWith(prefix)) return true;
    }
    return false;
  };
}
|
|
68
|
+
|
|
38
69
|
/**
 * Fills the advertised-fixture cache from a discovery-doc payload.
 *
 * Malformed input never throws: a `null`/`undefined` payload, or one
 * whose `fixtures` member is not an array, stores an empty set ("no
 * fixtures advertised") so the suite stays resilient against host bugs
 * in the discovery surface. Array entries that are not non-empty
 * strings are dropped.
 *
 * `OPENWOP_OPTED_OUT_FIXTURES` is applied here as well: ids matched by
 * the opt-out predicate are left out of the cache, so later lookups
 * remain a single synchronous set-membership test.
 *
 * @param c Discovery payload, or `null`/`undefined` when none is available.
 */
export function setAdvertisedFixtures(c: DiscoveryPayload | null | undefined): void {
  const kept = new Set<string>();
  if (c != null && Array.isArray(c.fixtures)) {
    // The env var is read only when there is something to filter.
    const isOptedOut = loadOptedOutPredicate();
    for (const entry of c.fixtures) {
      if (typeof entry !== 'string' || entry.length === 0) continue;
      if (isOptedOut(entry)) continue;
      kept.add(entry);
    }
  }
  _advertisedFixtures = kept;
}
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Capability-toggle harness primitive — driver helper for the
|
|
3
|
+
* env-gated test-seam endpoint at
|
|
4
|
+
* `POST /v1/host/sample/test/capability-toggle`.
|
|
5
|
+
*
|
|
6
|
+
* Lets refusal-case scenarios (RFC 0022 §C HVMAP-1a-refusal,
|
|
7
|
+
* HVMAP-2-refusal, etc.) flip a capability flag off temporarily,
|
|
8
|
+
* exercise the host's refusal path, then restore the default.
|
|
9
|
+
*
|
|
10
|
+
* All operations soft-skip on HTTP 404 — hosts that don't expose the
|
|
11
|
+
* seam keep the existing advertisement-shape coverage intact.
|
|
12
|
+
*
|
|
13
|
+
* Reset semantics: callers MUST `resetHostCapabilities()` in their
|
|
14
|
+
* test's `afterEach` (or equivalent) to keep state from leaking
|
|
15
|
+
* across scenarios.
|
|
16
|
+
*/
|
|
17
|
+
|
|
18
|
+
import { driver } from './driver.js';
|
|
19
|
+
|
|
20
|
+
/**
 * Result of a capability-toggle seam call, discriminated on `ok` and
 * then on `reason`:
 *  - `ok: true` — HTTP 200; `overlay` is the overlay map the host returned.
 *  - `seam_unavailable` — HTTP 404, the host does not expose the seam.
 *  - `http_error` — any other status, carried in `status`.
 */
export type ToggleOutcome =
  | { ok: true; overlay: Record<string, boolean> }
  | { ok: false; reason: 'seam_unavailable' }
  | { ok: false; reason: 'http_error'; status: number };
|
|
24
|
+
|
|
25
|
+
/** Path of the env-gated capability-toggle test seam. */
const CAPABILITY_TOGGLE_PATH = '/v1/host/sample/test/capability-toggle';

/**
 * POSTs one request to the capability-toggle seam and decodes the
 * response: HTTP 404 → `seam_unavailable`, any other non-200 →
 * `http_error` with the status, HTTP 200 → the returned overlay (an
 * empty map when the body or its `overlay` member is missing).
 */
async function postCapabilityToggle(
  body: { name: string; value: boolean | null } | { reset: true },
): Promise<ToggleOutcome> {
  const res = await driver.post(CAPABILITY_TOGGLE_PATH, body);
  if (res.status === 404) return { ok: false, reason: 'seam_unavailable' };
  if (res.status !== 200) return { ok: false, reason: 'http_error', status: res.status };
  // Null-safe decode in case a 200 response carries no parsed body.
  const parsed = res.json as { overlay?: Record<string, boolean> } | null | undefined;
  return { ok: true, overlay: parsed?.overlay ?? {} };
}

/**
 * Set a capability flag's overlay value. `value: null` removes the
 * overlay entry (restoring the host's hard-coded default).
 *
 * @param name  Capability flag name sent to the host.
 * @param value Overlay value, or `null` to remove the entry.
 * @returns The decoded toggle outcome (see `ToggleOutcome`).
 */
export async function setHostCapability(
  name: string,
  value: boolean | null,
): Promise<ToggleOutcome> {
  return postCapabilityToggle({ name, value });
}

/**
 * Clear ALL capability overlay entries on the host by posting
 * `{ reset: true }` to the toggle seam.
 *
 * @returns The decoded toggle outcome (see `ToggleOutcome`).
 */
export async function resetHostCapabilities(): Promise<ToggleOutcome> {
  return postCapabilityToggle({ reset: true });
}
|
|
46
|
+
|
|
47
|
+
/**
 * Probe whether the host exposes the capability-toggle seam at all, so
 * a scenario can soft-skip early when it does not (the refusal contract
 * is still spec-normative; the test just can't drive it from outside).
 *
 * The probe removes the overlay entry for the placeholder flag
 * `__probe__` (`value: null`) and reports whether that call came back
 * `ok`.
 */
export async function isToggleAvailable(): Promise<boolean> {
  const { ok } = await setHostCapability('__probe__', null);
  return ok;
}
|
|
@@ -37,6 +37,9 @@ interface AgentCaps {
|
|
|
37
37
|
| {
|
|
38
38
|
verbosity: 'summary' | 'full' | 'off' | undefined;
|
|
39
39
|
tokenLimit: number | undefined;
|
|
40
|
+
/** RFC 0024. When true, host may emit `agent.reasoning.delta`
|
|
41
|
+
* events in addition to the closing `agent.reasoned`. */
|
|
42
|
+
streaming: boolean;
|
|
40
43
|
}
|
|
41
44
|
| undefined;
|
|
42
45
|
}
|
|
@@ -84,6 +87,7 @@ export function setMultiAgentCapabilities(c: DiscoveryPayload | null | undefined
|
|
|
84
87
|
typeof (reasoningRaw as Record<string, unknown>).tokenLimit === 'number'
|
|
85
88
|
? ((reasoningRaw as Record<string, unknown>).tokenLimit as number)
|
|
86
89
|
: undefined,
|
|
90
|
+
streaming: asBoolean((reasoningRaw as Record<string, unknown>).streaming),
|
|
87
91
|
}
|
|
88
92
|
: undefined;
|
|
89
93
|
_agentCaps = {
|
|
@@ -113,6 +117,12 @@ export function getReasoningVerbosity(): 'summary' | 'full' | 'off' | undefined
|
|
|
113
117
|
return _agentCaps?.reasoning?.verbosity;
|
|
114
118
|
}
|
|
115
119
|
|
|
120
|
+
/**
 * RFC 0024 — whether the cached agent capabilities say the host emits
 * incremental `agent.reasoning.delta` events while a reasoning block is
 * still open. Answers `false` when no capabilities are cached, when the
 * `reasoning` section is absent, or when its `streaming` flag is not
 * exactly `true`.
 */
export function isReasoningStreamingSupported(): boolean {
  const reasoning = _agentCaps?.reasoning;
  if (reasoning == null) return false;
  return reasoning.streaming === true;
}
|
|
125
|
+
|
|
116
126
|
/** Phase 2 — host supports the named modelClass. */
|
|
117
127
|
export function hasModelClass(modelClass: string): boolean {
|
|
118
128
|
return _agentCaps?.modelClasses.has(modelClass) === true;
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Driver helpers for the OTel + debug-bundle test seams (E.2 + E.3).
|
|
3
|
+
*
|
|
4
|
+
* Used by aiEnvelope + cost-attribution scenarios that need to verify
|
|
5
|
+
* span-attribute redaction (no BYOK canary in OTel attributes) and
|
|
6
|
+
* debug-bundle export shape.
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
import { driver } from './driver.js';
|
|
10
|
+
|
|
11
|
+
/**
 * One entry of the `spans` array returned by the test-only OTel span
 * seam. Only `envelopeId` and `runId` are optional; all fields are
 * read-only.
 */
export interface TestSpan {
  /** Identifier of the span. */
  readonly spanId: string;
  /** Span name. */
  readonly name: string;
  /** Span attributes; values are limited to string, number or boolean. */
  readonly attributes: Record<string, string | number | boolean>;
  /** Present when the span is associated with an envelope. */
  readonly envelopeId?: string;
  /** Present when the span is associated with a run. */
  readonly runId?: string;
  /** Timestamp as a string — format not visible here; presumably ISO-8601, confirm against the host. */
  readonly timestamp: string;
}
|
|
19
|
+
|
|
20
|
+
/**
 * Shape of the `bundle` object returned by the debug-bundle export
 * seam. All fields are read-only.
 */
export interface DebugBundle {
  /** Run the bundle was exported for. */
  readonly runId: string;
  /** Events included in the bundle; element shape is left untyped here. */
  readonly events: unknown[];
  /** Spans included in the bundle. */
  readonly spans: TestSpan[];
  /** Export time as a string — format not visible here; confirm against the host. */
  readonly exportedAt: string;
}
|
|
26
|
+
|
|
27
|
+
/**
 * Result of an OTel / debug-bundle seam call, discriminated on `ok` and
 * then on `reason`:
 *  - `ok: true` — the call succeeded; `data` carries the payload `T`.
 *  - `seam_unavailable` — HTTP 404, the host does not expose the seam.
 *  - `http_error` — any other failure, with the status in `status`.
 */
export type ScrapeOutcome<T> =
  | { ok: true; data: T }
  | { ok: false; reason: 'seam_unavailable' }
  | { ok: false; reason: 'http_error'; status: number };
|
|
31
|
+
|
|
32
|
+
/**
 * Query the test-only OTel span seam
 * (`GET /v1/host/sample/test/otel/spans`), with optional filters.
 *
 * @param filter Optional `envelopeId` / `runId` / `name` constraints;
 *               each non-empty field is sent as a query parameter of
 *               the same name.
 * @returns `seam_unavailable` on HTTP 404, `http_error` (with the status)
 *          on any other non-200 response, otherwise `ok: true` with the
 *          spans. A 200 response whose body is missing, or whose `spans`
 *          member is not an array, yields an empty list.
 */
export async function queryTestSpans(
  filter: { envelopeId?: string; runId?: string; name?: string } = {},
): Promise<ScrapeOutcome<TestSpan[]>> {
  const qs = new URLSearchParams();
  for (const key of ['envelopeId', 'runId', 'name'] as const) {
    const value = filter[key];
    if (value) qs.set(key, value);
  }
  // Serialize once; an empty query must not leave a dangling `?`.
  const query = qs.toString();
  const url = `/v1/host/sample/test/otel/spans${query ? '?' + query : ''}`;
  const res = await driver.get(url);
  if (res.status === 404) return { ok: false, reason: 'seam_unavailable' };
  if (res.status !== 200) return { ok: false, reason: 'http_error', status: res.status };
  // Defensive decode: tolerate a 200 with no parsed body or a malformed
  // `spans` member instead of throwing or passing junk to callers.
  const rawSpans = (res.json as { spans?: unknown } | null | undefined)?.spans;
  const data = Array.isArray(rawSpans) ? (rawSpans as TestSpan[]) : [];
  return { ok: true, data };
}
|
|
46
|
+
|
|
47
|
+
/**
 * Ask the host to export a debug bundle for a run via
 * `POST /v1/host/sample/test/debug-bundle/export`.
 *
 * @param runId Run to export, sent as `{ runId }` in the request body.
 * @returns `seam_unavailable` on HTTP 404, `http_error` (with the status)
 *          on any other non-200 response, otherwise `ok: true` with the
 *          bundle. A 200 response that carries no `bundle` — including
 *          one with no parsed body at all — is reported as `http_error`
 *          with a synthetic status of 500.
 */
export async function exportDebugBundle(runId: string): Promise<ScrapeOutcome<DebugBundle>> {
  const res = await driver.post('/v1/host/sample/test/debug-bundle/export', { runId });
  if (res.status === 404) return { ok: false, reason: 'seam_unavailable' };
  if (res.status !== 200) return { ok: false, reason: 'http_error', status: res.status };
  // Null-safe decode so an empty 200 body takes the "missing bundle"
  // path below instead of throwing.
  const bundle = (res.json as { bundle?: DebugBundle } | null | undefined)?.bundle;
  if (!bundle) return { ok: false, reason: 'http_error', status: 500 };
  return { ok: true, data: bundle };
}
|
|
55
|
+
|
|
56
|
+
/**
 * Probe whether the OTel span seam is exposed, so a scenario can
 * soft-skip early. Queries spans for the placeholder run id `__probe__`
 * and reports whether that query came back `ok`.
 */
export async function isOtelSeamAvailable(): Promise<boolean> {
  const { ok } = await queryTestSpans({ runId: '__probe__' });
  return ok;
}
|
|
@@ -0,0 +1,193 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* RFC 0024 — streaming `agent.reasoning.delta` events.
|
|
3
|
+
*
|
|
4
|
+
* Verifies that hosts advertising `capabilities.agents.reasoning.streaming: true`
|
|
5
|
+
* emit incremental `agent.reasoning.delta` events while a reasoning
|
|
6
|
+
* block is still open, followed by exactly one closing `agent.reasoned`
|
|
7
|
+
* event carrying the full authoritative content.
|
|
8
|
+
*
|
|
9
|
+
* Capability-gated: skips when the host doesn't advertise
|
|
10
|
+
* `capabilities.agents.supported: true` AND
|
|
11
|
+
* `capabilities.agents.reasoning.streaming: true`, OR when reasoning
|
|
12
|
+
* verbosity is `'off'`.
|
|
13
|
+
*
|
|
14
|
+
* Driven by the `core.conformance.mock-agent` typeId (RFC 0023)
|
|
15
|
+
* extended with `mockReasoning.streamChunks` per RFC 0024 §"Conformance"
|
|
16
|
+
* (see `schemas/core-conformance-mock-agent-config.schema.json`).
|
|
17
|
+
*
|
|
18
|
+
* @see RFCS/0024-agent-reasoning-streaming.md
|
|
19
|
+
* @see schemas/run-event-payloads.schema.json §`agentReasoningDelta`
|
|
20
|
+
* @see schemas/capabilities.schema.json §`agents.reasoning.streaming`
|
|
21
|
+
*/
|
|
22
|
+
|
|
23
|
+
import { describe, it, expect } from 'vitest';
|
|
24
|
+
import { driver } from '../lib/driver.js';
|
|
25
|
+
import { pollUntilTerminal } from '../lib/polling.js';
|
|
26
|
+
import { isFixtureAdvertised } from '../lib/fixtures.js';
|
|
27
|
+
import {
|
|
28
|
+
isAgentSupported,
|
|
29
|
+
isReasoningStreamingSupported,
|
|
30
|
+
getReasoningVerbosity,
|
|
31
|
+
} from '../lib/multi-agent-capabilities.js';
|
|
32
|
+
|
|
33
|
+
const FIXTURE = 'conformance-agent-reasoning-streaming';
/** Expected concatenation of the fixture's `streamChunks` — kept in sync
 *  with `conformance/fixtures/conformance-agent-reasoning-streaming.json`.
 *  When the fixture changes, this constant changes with it. */
const EXPECTED_CHUNKS = [
  'Let me think about this. ',
  'First, the user is asking a question. ',
  'Therefore, I should respond clearly.',
] as const;
const EXPECTED_FULL = EXPECTED_CHUNKS.join('');

/** Skip the whole scenario unless the host advertises agent support,
 *  reasoning streaming, a reasoning verbosity other than `'off'`, and
 *  the fixture itself. */
const SKIP =
  !isAgentSupported() ||
  !isReasoningStreamingSupported() ||
  getReasoningVerbosity() === 'off' ||
  !isFixtureAdvertised(FIXTURE);

/** Spec reference shared by every normative assertion in this file. */
const RFC_SECTION = 'RFCS/0024-agent-reasoning-streaming.md §Proposal';

/** Minimal view of a run event as consumed by these tests. */
interface LoggedEvent {
  readonly type: string;
  readonly payload?: Record<string, unknown>;
}

/**
 * Starts a fresh run of the fixture, waits for it to reach a terminal
 * state, and returns its event list. Asserts the run was created (201)
 * and the events endpoint answered 200, and accepts both response
 * shapes — a bare array or `{ events: [...] }` — so every test applies
 * the same checks and the same tolerance.
 */
async function runFixtureAndFetchEvents(): Promise<LoggedEvent[]> {
  const create = await driver.post('/v1/runs', { workflowId: FIXTURE });
  expect(create.status).toBe(201);
  const runId = (create.json as { runId: string }).runId;

  await pollUntilTerminal(runId);

  const events = await driver.get(`/v1/runs/${encodeURIComponent(runId)}/events`);
  expect(events.status).toBe(200);
  const body = events.json as LoggedEvent[] | { events: LoggedEvent[] };
  return Array.isArray(body) ? body : body.events;
}

describe.skipIf(SKIP)('agentReasoningStreaming: RFC 0024 incremental + closing event contract', () => {
  it('emits N agent.reasoning.delta events followed by exactly one closing agent.reasoned', async () => {
    const list = await runFixtureAndFetchEvents();

    const deltas = list.filter((e) => e.type === 'agent.reasoning.delta');
    const finals = list.filter((e) => e.type === 'agent.reasoned');

    expect(
      deltas.length,
      driver.describe(
        RFC_SECTION,
        'streaming host MUST emit one agent.reasoning.delta per streamChunks entry',
      ),
    ).toBe(EXPECTED_CHUNKS.length);
    expect(
      finals.length,
      driver.describe(
        RFC_SECTION,
        'streaming host MUST emit exactly one closing agent.reasoned event after the deltas',
      ),
    ).toBe(1);
  });

  it('agent.reasoning.delta `sequence` starts at 0 and increments by 1 within the block', async () => {
    const list = await runFixtureAndFetchEvents();

    const deltas = list.filter((e) => e.type === 'agent.reasoning.delta');
    // Non-numeric sequences are dropped here; the exact-equality check
    // below then fails on the resulting length mismatch.
    const sequences = deltas
      .map((e) => e.payload?.sequence)
      .filter((s): s is number => typeof s === 'number');

    expect(
      sequences,
      driver.describe(
        RFC_SECTION,
        '`sequence` MUST start at 0 and increment by 1 per delta within a block',
      ),
    ).toEqual(EXPECTED_CHUNKS.map((_, i) => i));
  });

  it('closing agent.reasoned.reasoning is the concatenation of the deltas (authoritative)', async () => {
    const list = await runFixtureAndFetchEvents();

    const finalEvent = list.find((e) => e.type === 'agent.reasoned');
    expect(finalEvent, 'closing agent.reasoned must be present').toBeDefined();
    const reasoning = finalEvent?.payload?.reasoning;
    expect(typeof reasoning, 'closing event MUST carry a reasoning string').toBe('string');
    // The mock-agent's contract: closing reasoning equals concat(streamChunks).
    // Real hosts MAY transform at finalize (summary truncation, redaction);
    // for the mock-agent fixture, no transform applies — exact equality.
    expect(
      reasoning,
      driver.describe(
        RFC_SECTION,
        'closing agent.reasoned.reasoning is authoritative; for the mock-agent fixture, equals delta concatenation',
      ),
    ).toBe(EXPECTED_FULL);
  });

  it('agentId is consistent across all streaming + closing events in a block', async () => {
    const list = await runFixtureAndFetchEvents();

    const relevant = list.filter(
      (e) => e.type === 'agent.reasoning.delta' || e.type === 'agent.reasoned',
    );
    // NOTE(review): events with a missing or empty agentId are filtered
    // out rather than failed, so they do not affect this assertion —
    // confirm that leniency is intended.
    const agentIds = new Set(
      relevant
        .map((e) => e.payload?.agentId)
        .filter((a): a is string => typeof a === 'string' && a.length > 0),
    );

    expect(
      agentIds.size,
      driver.describe(
        RFC_SECTION,
        'agentId MUST be consistent across all `agent.reasoning.delta` events AND the closing `agent.reasoned` for a given block',
      ),
    ).toBe(1);
  });

  it('all agent.reasoning.delta events arrive BEFORE the closing agent.reasoned', async () => {
    const arr = await runFixtureAndFetchEvents();

    const closingIdx = arr.findIndex((e) => e.type === 'agent.reasoned');
    expect(closingIdx, 'closing event present').toBeGreaterThan(-1);
    const lastDeltaIdx = arr.map((e) => e.type).lastIndexOf('agent.reasoning.delta');

    // Guard against vacuous pass: a host advertising streaming but
    // emitting ZERO deltas would otherwise pass `-1 < closingIdx`
    // trivially. The fixture configures 3 streamChunks, so at least
    // one delta MUST appear in the event log.
    expect(
      lastDeltaIdx,
      driver.describe(
        RFC_SECTION,
        'streaming host MUST emit at least one `agent.reasoning.delta` for a fixture with non-empty `streamChunks`',
      ),
    ).toBeGreaterThan(-1);
    expect(
      lastDeltaIdx,
      driver.describe(
        RFC_SECTION,
        'every `agent.reasoning.delta` MUST precede the closing `agent.reasoned` for the same block',
      ),
    ).toBeLessThan(closingIdx);
  });
});
|
|
@@ -161,13 +161,101 @@ describe('aiEnvelope.capBreached: behavioral cap enforcement (FINAL v1.1)', () =
|
|
|
161
161
|
});
|
|
162
162
|
});
|
|
163
163
|
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
it
|
|
172
|
-
|
|
164
|
+
// E.1 engine-projection via the test-only event-log seam. The acceptor
|
|
165
|
+
// returns the breached outcome; the seam projects it onto cap.breached +
|
|
166
|
+
// node.failed per capabilities.md §"Engine-enforced limits". Tests
|
|
167
|
+
// soft-skip on HTTP 404 when the seam isn't exposed.
|
|
168
|
+
import { queryTestEvents, isEventLogSeamAvailable, resetTestSeam } from '../lib/event-log-query.js';
|
|
169
|
+
|
|
170
|
+
// Engine-projection checks for cap breaches, driven through the
// test-only event-log seam. Every test soft-skips (returns early) when
// the seam is unavailable. Once a test has started talking to the seam,
// `resetTestSeam()` runs in a `finally` so a failed assertion or an
// early return cannot leak event-log state into later scenarios.
describe('aiEnvelope.capBreached: engine projection via event-log seam (capabilities.md §"cap.breached")', () => {
  it('breached outcome projects to cap.breached { kind: "envelopes" } event with causationId chain', async () => {
    if (!(await isEventLogSeamAvailable())) return;
    // Unique run id per test keeps this test's events separate from others.
    const runId = `r-cap-env-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`;
    const correlationId = `${runId}:node-1:turn-0:cap-env`;
    try {
      const r = await accept(
        {
          type: 'error',
          schemaVersion: 1,
          envelopeId: 'env-proj-cap-env',
          correlationId,
          payload: { code: 'x', message: 'y' },
          meta: baseMeta,
        },
        {
          counters: { envelopesPerTurn: { current: 32, cap: 32 } },
          projectTo: { runId, nodeId: 'node-1' },
        },
      );
      if (r.status === 404) return;
      expect(r.body.status).toBe('breached');

      const events = await queryTestEvents(runId, { type: 'cap.breached' });
      if (!events.ok) return;
      expect(
        events.events.length,
        driver.describe('capabilities.md §"Engine-enforced limits and the cap.breached event"', 'breached outcome MUST project to exactly one cap.breached event'),
      ).toBe(1);
      const evt = events.events[0]!;
      expect(evt.payload.kind).toBe('envelopes');
      expect(evt.causationId).toBe(correlationId);
    } finally {
      await resetTestSeam();
    }
  });

  it('cap.breached payload includes limit, observed, and nodeId per capabilities.md', async () => {
    if (!(await isEventLogSeamAvailable())) return;
    const runId = `r-cap-payload-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`;
    try {
      await accept(
        {
          type: 'clarification.request',
          schemaVersion: 1,
          envelopeId: 'env-proj-cap-clar',
          correlationId: `${runId}:node-2:turn-0:cap`,
          payload: { questions: [{ id: 'q1', question: 'why?' }] },
          meta: baseMeta,
        },
        {
          counters: { clarificationRounds: { current: 5, cap: 5 } },
          projectTo: { runId, nodeId: 'node-2' },
        },
      );
      const events = await queryTestEvents(runId, { type: 'cap.breached' });
      // NOTE(review): an empty event list soft-skips here rather than
      // failing, unlike the "exactly one" check in the first test.
      if (!events.ok || events.events.length === 0) return;
      const evt = events.events[0]!;
      expect(evt.payload.kind).toBe('clarification');
      expect(
        typeof evt.payload.limit,
        driver.describe('capabilities.md §"cap.breached"', 'payload.limit MUST be present as a number'),
      ).toBe('number');
      expect(evt.payload.nodeId).toBe('node-2');
    } finally {
      await resetTestSeam();
    }
  });

  it('cap.breached MUST be paired with a terminal node.failed transition', async () => {
    if (!(await isEventLogSeamAvailable())) return;
    const runId = `r-cap-fail-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`;
    try {
      await accept(
        {
          type: 'schema.request',
          schemaVersion: 1,
          envelopeId: 'env-proj-cap-fail',
          correlationId: `${runId}:node-3:turn-0:cap`,
          payload: { envelopeType: 'vendor.acme.foo' },
          meta: baseMeta,
        },
        {
          counters: { schemaRounds: { current: 3, cap: 3 } },
          projectTo: { runId, nodeId: 'node-3' },
        },
      );
      const breached = await queryTestEvents(runId, { type: 'cap.breached' });
      const failed = await queryTestEvents(runId, { type: 'node.failed' });
      if (!breached.ok || !failed.ok) return;
      expect(breached.events.length).toBe(1);
      expect(
        failed.events.length,
        driver.describe('capabilities.md §"cap.breached"', 'cap.breached MUST be paired with a terminal node.failed event'),
      ).toBe(1);
      expect((failed.events[0]!.payload.error as { code?: string }).code).toBe('cap_breached');
    } finally {
      await resetTestSeam();
    }
  });
});
|