@openwop/openwop-conformance 1.10.0 → 1.12.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +48 -0
- package/README.md +2 -2
- package/api/asyncapi.yaml +70 -0
- package/api/openapi.yaml +268 -1
- package/coverage.md +33 -2
- package/fixtures/oauth-providers/synthetic.json +38 -0
- package/fixtures.md +10 -0
- package/package.json +1 -1
- package/schemas/README.md +12 -0
- package/schemas/agent-deployment-transition.schema.json +49 -0
- package/schemas/agent-deployment.schema.json +54 -0
- package/schemas/agent-eval-suite.schema.json +140 -0
- package/schemas/agent-inventory-response.schema.json +25 -0
- package/schemas/agent-manifest.schema.json +5 -0
- package/schemas/agent-org-chart.schema.json +82 -0
- package/schemas/agent-ref.schema.json +12 -2
- package/schemas/agent-roster-entry.schema.json +81 -0
- package/schemas/agent-roster-response.schema.json +21 -0
- package/schemas/budget-policy.schema.json +18 -0
- package/schemas/capabilities.schema.json +277 -0
- package/schemas/credential-provenance.schema.json +18 -0
- package/schemas/eval-summary.schema.json +92 -0
- package/schemas/node-pack-manifest.schema.json +17 -0
- package/schemas/org-chart-responsibility-view.schema.json +26 -0
- package/schemas/run-event-payloads.schema.json +286 -3
- package/schemas/run-event.schema.json +19 -0
- package/schemas/tool-descriptor.schema.json +63 -0
- package/schemas/trigger-subscription.schema.json +26 -0
- package/src/lib/agentOrgChart.ts +82 -0
- package/src/lib/agentRoster.ts +76 -0
- package/src/lib/liveRuntime.ts +59 -0
- package/src/lib/profiles.ts +157 -0
- package/src/lib/runtimeRequires.ts +38 -0
- package/src/lib/safeFetch.ts +87 -0
- package/src/lib/triggerBridge.ts +74 -0
- package/src/scenarios/agent-deployment-shape.test.ts +139 -0
- package/src/scenarios/agent-eval-suite-shape.test.ts +167 -0
- package/src/scenarios/agent-live-allowlist-enforced.test.ts +53 -0
- package/src/scenarios/agent-live-invocation-bracket.test.ts +98 -0
- package/src/scenarios/agent-live-runtime-shape.test.ts +98 -0
- package/src/scenarios/agent-live-structured-output.test.ts +58 -0
- package/src/scenarios/agent-org-chart-scoping.test.ts +137 -0
- package/src/scenarios/agent-org-chart-shape.test.ts +127 -0
- package/src/scenarios/agent-platform-profile.test.ts +158 -0
- package/src/scenarios/agent-roster-attribution.test.ts +179 -0
- package/src/scenarios/agent-roster-shape.test.ts +146 -0
- package/src/scenarios/budget-policy-shape.test.ts +136 -0
- package/src/scenarios/egress-provenance-shape.test.ts +137 -0
- package/src/scenarios/memory-capability-model-shape.test.ts +186 -0
- package/src/scenarios/oauth-authorization-code-roundtrip.test.ts +145 -0
- package/src/scenarios/org-position-no-authority-escalation.test.ts +78 -0
- package/src/scenarios/runtime-requires-install-gate.test.ts +92 -0
- package/src/scenarios/runtime-requires-shape.test.ts +134 -0
- package/src/scenarios/safefetch-behavior.test.ts +99 -0
- package/src/scenarios/safefetch-live-audit.test.ts +175 -0
- package/src/scenarios/spec-corpus-validity.test.ts +19 -3
- package/src/scenarios/tool-descriptor-shape.test.ts +133 -0
- package/src/scenarios/trigger-bridge-delivery.test.ts +126 -0
- package/src/scenarios/trigger-bridge-shape.test.ts +135 -0
- package/src/scenarios/x-openwop-form-pack-manifest.test.ts +155 -0
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Agent deployment lifecycle — record + binding + event shapes (RFC 0082).
|
|
3
|
+
*
|
|
4
|
+
* Always-on, server-free schema-shape probe. Verifies that:
|
|
5
|
+
* - `capabilities.agents.deployment` is declared with its `supported` /
|
|
6
|
+
* `channels` / `canary` / `rollback` / `states` sub-flags.
|
|
7
|
+
* - `agent-deployment.schema.json` compiles and round-trips a conforming
|
|
8
|
+
* deployment record, and rejects malformed ones (an out-of-enum `state`;
|
|
9
|
+
* `canaryPercent` out of 0..100).
|
|
10
|
+
* - the `AgentRef` `channel` XOR `version` rule holds: each alone (and
|
|
11
|
+
* neither) validates; both together is rejected (the `not` clause).
|
|
12
|
+
* - the four `deployment.*` payload $defs validate conforming content-free
|
|
13
|
+
* payloads and reject malformed ones.
|
|
14
|
+
* - the four `deployment.*` payloads are CONTENT-FREE: a `deployment.promoted`
|
|
15
|
+
* carrying a `manifestBody`, and a `deployment.state.changed` carrying a
|
|
16
|
+
* `prompt`, are rejected (`additionalProperties:false`). This is the public
|
|
17
|
+
* test for the protocol-tier SECURITY invariant `deployment-event-no-content-leak`.
|
|
18
|
+
* - `agent.invocation.started` carries the additive recorded-fact
|
|
19
|
+
* `resolvedAgentVersion` / `resolvedChannel` fields (RFC 0082 §B).
|
|
20
|
+
* - all four event names appear in the RunEventType enum.
|
|
21
|
+
*
|
|
22
|
+
* Behavioral assertions (the authz → approvalGate → eval-verify → promotion path,
|
|
23
|
+
* the fail-closed denial, the §B replay re-read of `resolvedAgentVersion`) are
|
|
24
|
+
* gated on `capabilities.agents.deployment.supported` and land in
|
|
25
|
+
* `agent-deployment-lifecycle.test.ts` (deferred per RFC 0082 §Conformance —
|
|
26
|
+
* reference host deferred). This scenario asserts the wire contract, not host
|
|
27
|
+
* behavior.
|
|
28
|
+
*
|
|
29
|
+
* Spec references:
|
|
30
|
+
* - https://github.com/openwop/openwop/blob/main/spec/v1/agent-deployment.md
|
|
31
|
+
* - https://github.com/openwop/openwop/blob/main/RFCS/0082-agent-deployment-lifecycle.md
|
|
32
|
+
* - https://github.com/openwop/openwop/blob/main/SECURITY/invariants.yaml (deployment-event-no-content-leak)
|
|
33
|
+
*/
|
|
34
|
+
|
|
35
|
+
import { describe, it, expect } from 'vitest';
|
|
36
|
+
import { readFileSync } from 'node:fs';
|
|
37
|
+
import { join } from 'node:path';
|
|
38
|
+
import Ajv2020 from 'ajv/dist/2020.js';
|
|
39
|
+
import addFormats from 'ajv-formats';
|
|
40
|
+
import { SCHEMAS_DIR } from '../lib/paths.js';
|
|
41
|
+
|
|
42
|
+
/**
 * Server-free assertion-message helper.
 *
 * @param specRef - the spec / RFC section the assertion is anchored to
 * @param requirement - the normative requirement being asserted
 * @returns the two parts joined as "specRef — requirement"
 */
function why(specRef: string, requirement: string): string {
  return `${specRef} — ${requirement}`;
}
|
|
44
|
+
|
|
45
|
+
/**
 * Reads and parses one JSON Schema file from `SCHEMAS_DIR`.
 *
 * @param name - file name inside `SCHEMAS_DIR`, e.g. `capabilities.schema.json`
 * @returns the parsed document, typed as a plain record
 * @throws if the file cannot be read or is not valid JSON
 */
function loadSchema(name: string): Record<string, unknown> {
  const schemaPath = join(SCHEMAS_DIR, name);
  const rawText = readFileSync(schemaPath, 'utf8');
  return JSON.parse(rawText) as Record<string, unknown>;
}
|
|
48
|
+
|
|
49
|
+
describe('agent-deployment-shape: capability advertisement (RFC 0082, server-free)', () => {
  // Structural view of the nested `properties` maps this probe walks.
  type FlagHolder = { properties?: Record<string, unknown> };
  type AgentsNode = { properties?: Record<string, FlagHolder> };

  it('the capabilities schema declares agents.deployment with its sub-flags', () => {
    const capabilities = loadSchema('capabilities.schema.json');
    const topLevel = capabilities.properties as Record<string, AgentsNode>;
    const deployment = topLevel.agents?.properties?.deployment;

    expect(deployment, why('capabilities.md §agents', 'agents.deployment MUST be declared')).toBeDefined();

    // Every sub-flag must be declared on the capability block.
    const requiredFlags = ['supported', 'channels', 'canary', 'rollback', 'states'];
    requiredFlags.forEach((flag) => {
      expect(
        deployment?.properties?.[flag],
        why('agent-deployment.md §F', `agents.deployment.${flag} MUST be declared`),
      ).toBeDefined();
    });
  });
});
|
|
63
|
+
|
|
64
|
+
describe('agent-deployment-shape: deployment record + AgentRef binding (RFC 0082, server-free)', () => {
  // One Ajv instance compiles both schemas under test.
  const validator = new Ajv2020({ strict: false, allErrors: true });
  addFormats(validator);
  const validateRecord = validator.compile(loadSchema('agent-deployment.schema.json'));
  const validateAgentRef = validator.compile(loadSchema('agent-ref.schema.json'));

  it('AgentDeployment validates a conforming record and rejects a bad state / out-of-range canary', () => {
    const conforming = {
      agentId: 'core.openwop.agents.support-resolver',
      version: '2.4.0',
      state: 'active',
      canaryPercent: 10,
      channels: ['stable'],
    };
    // Each negative case changes exactly one field of the conforming record.
    const outOfEnumState = { ...conforming, state: 'live' };
    const canaryTooHigh = { ...conforming, canaryPercent: 150 };

    expect(validateRecord(conforming), why('RFC 0082 §C', 'a conforming deployment record MUST validate')).toBe(true);
    expect(validateRecord(outOfEnumState), why('RFC 0082 §C', 'an out-of-enum state MUST be rejected')).toBe(false);
    expect(validateRecord(canaryTooHigh), why('RFC 0082 §C', 'canaryPercent > 100 MUST be rejected')).toBe(false);
  });

  it('AgentRef channel XOR version: each alone and neither validate; both is rejected (RFC 0082 §A)', () => {
    const agentId = 'core.x.y.z';
    // [candidate ref, requirement text, expected validation result]
    const cases: ReadonlyArray<readonly [Record<string, string>, string, boolean]> = [
      [{ agentId, version: '1.0.0' }, 'version-only AgentRef MUST validate', true],
      [{ agentId, channel: 'stable' }, 'channel-only AgentRef MUST validate', true],
      [{ agentId }, 'a ref with neither version nor channel MUST validate (host default)', true],
      [{ agentId, version: '1.0.0', channel: 'stable' }, 'a ref with BOTH version and channel MUST be rejected', false],
    ];

    for (const [ref, requirement, expected] of cases) {
      expect(validateAgentRef(ref), why('RFC 0082 §A', requirement)).toBe(expected);
    }
  });
});
|
|
84
|
+
|
|
85
|
+
describe('agent-deployment-shape: deployment.* event payloads (RFC 0082, server-free)', () => {
  const payloads = loadSchema('run-event-payloads.schema.json');
  const ajv = new Ajv2020({ strict: false, allErrors: true });
  addFormats(ajv);
  ajv.addSchema(payloads, 'payloads');

  // Validators are resolved by JSON-pointer into the registered payload schema.
  const promoted = ajv.getSchema('payloads#/$defs/deploymentPromoted');
  const rolledBack = ajv.getSchema('payloads#/$defs/deploymentRolledBack');
  const canary = ajv.getSchema('payloads#/$defs/deploymentCanaryAdjusted');
  const stateChanged = ajv.getSchema('payloads#/$defs/deploymentStateChanged');

  /**
   * Asserts that a `$def` validator was resolved and returns it narrowed.
   *
   * Every test goes through this so that a missing `$def` fails with a
   * readable assertion message instead of a "not a function" TypeError at the
   * call site.
   *
   * @param validator - the result of `ajv.getSchema(...)`
   * @param defName - the `$defs` key, used in the failure message
   * @returns the validator, typed as defined
   */
  function mustExist<T>(validator: T | undefined, defName: string): T {
    expect(validator, `the ${defName} $def MUST exist`).toBeTruthy();
    return validator as T;
  }

  it('deployment.promoted validates a content-free promotion record and requires toVersion + toState', () => {
    const validatePromoted = mustExist(promoted, 'deploymentPromoted');
    expect(
      validatePromoted({ agentId: 'core.openwop.agents.support-resolver', toVersion: '2.4.0', toState: 'active', channel: 'stable', canaryPercent: 10, evalRunId: 'run_abc' }),
      why('RFC 0082 §D', 'a conforming deployment.promoted payload MUST validate'),
    ).toBe(true);
    expect(
      validatePromoted({ agentId: 'a' }),
      why('RFC 0082 §D', 'deployment.promoted without toVersion/toState MUST be rejected'),
    ).toBe(false);
  });

  it('deployment.rolled-back / canary.adjusted / state.changed validate conforming records', () => {
    const validateRolledBack = mustExist(rolledBack, 'deploymentRolledBack');
    const validateCanary = mustExist(canary, 'deploymentCanaryAdjusted');
    const validateStateChanged = mustExist(stateChanged, 'deploymentStateChanged');

    expect(
      validateRolledBack({ agentId: 'a', fromVersion: '2.4.0', toVersion: '2.3.1', rollbackPointer: '2.3.1' }),
      why('RFC 0082 §D', 'a conforming deployment.rolled-back MUST validate'),
    ).toBe(true);
    expect(
      validateCanary({ agentId: 'a', version: '2.4.0', fromPercent: 10, toPercent: 50 }),
      why('RFC 0082 §D', 'a conforming deployment.canary.adjusted MUST validate'),
    ).toBe(true);
    expect(
      validateStateChanged({ agentId: 'a', version: '2.4.0', fromState: 'active', toState: 'paused' }),
      why('RFC 0082 §D', 'a conforming deployment.state.changed MUST validate'),
    ).toBe(true);
    expect(
      validateStateChanged({ agentId: 'a', version: '2.4.0', fromState: 'active', toState: 'live' }),
      why('RFC 0082 §D', 'an out-of-enum toState MUST be rejected'),
    ).toBe(false);
  });

  it('deployment.* events are content-free — a manifest body and a prompt are rejected (deployment-event-no-content-leak)', () => {
    const validatePromoted = mustExist(promoted, 'deploymentPromoted');
    const validateStateChanged = mustExist(stateChanged, 'deploymentStateChanged');

    expect(
      validatePromoted({ agentId: 'a', toVersion: '2.4.0', toState: 'active', manifestBody: '{...}' }),
      why('SECURITY invariant deployment-event-no-content-leak', 'a deployment.promoted MUST NOT carry a manifest body'),
    ).toBe(false);
    expect(
      validateStateChanged({ agentId: 'a', version: '2.4.0', fromState: 'active', toState: 'paused', prompt: 'system: …' }),
      why('SECURITY invariant deployment-event-no-content-leak', 'a deployment.state.changed MUST NOT carry prompt content'),
    ).toBe(false);
  });
});
|
|
123
|
+
|
|
124
|
+
describe('agent-deployment-shape: §B recorded-fact pin + enum (RFC 0082, server-free)', () => {
  it('agent.invocation.started carries the additive recorded-fact resolvedAgentVersion / resolvedChannel', () => {
    const payloads = loadSchema('run-event-payloads.schema.json');
    const defs = payloads.$defs as Record<string, { properties?: Record<string, unknown> }>;
    // A missing $def or `properties` map falls back to {} so the assertions
    // below fail with their own messages rather than a TypeError.
    const started = defs.agentInvocationStarted?.properties ?? {};

    expect(
      started.resolvedAgentVersion,
      why('RFC 0082 §B', 'agent.invocation.started.resolvedAgentVersion MUST be declared (the channel pin)'),
    ).toBeDefined();
    expect(
      started.resolvedChannel,
      why('RFC 0082 §B', 'agent.invocation.started.resolvedChannel MUST be declared'),
    ).toBeDefined();
  });

  it('all four deployment event names appear in the RunEventType enum', () => {
    const runEvent = loadSchema('run-event.schema.json');
    const enumVals = (runEvent.$defs as Record<string, { enum?: string[] }>).RunEventType?.enum ?? [];
    const expectedNames = ['deployment.promoted', 'deployment.rolled-back', 'deployment.canary.adjusted', 'deployment.state.changed'];

    for (const eventName of expectedNames) {
      // The message names the missing event so a failure is readable without
      // scanning the dumped enum array.
      expect(
        enumVals,
        why('run-event.schema.json#RunEventType', `${eventName} MUST be listed in the RunEventType enum`),
      ).toContain(eventName);
    }
  });
});
|
|
@@ -0,0 +1,167 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Agent evaluation — suite + summary + event shapes (RFC 0081).
|
|
3
|
+
*
|
|
4
|
+
* Always-on, server-free schema-shape probe. Verifies that:
|
|
5
|
+
* - `capabilities.agents.evalSuite` is declared with its `supported` / `modes`
|
|
6
|
+
* sub-flags.
|
|
7
|
+
* - the `AgentEvalSuite` + `EvalSummary` schemas compile and round-trip a
|
|
8
|
+
* conforming artifact, and reject malformed ones (a bad `suiteId`; a
|
|
9
|
+
* `thresholds.passScore` out of 0..1).
|
|
10
|
+
* - the `eval.started` / `eval.scored` / `eval.completed` payload $defs
|
|
11
|
+
* validate conforming content-free payloads and reject malformed ones.
|
|
12
|
+
* - both the summary and the per-task `eval.scored` payload are CONTENT-FREE:
|
|
13
|
+
* an `EvalSummary` carrying a task-output body and a `safetyFinding` carrying
|
|
14
|
+
* an excerpt are rejected. This is the public test for the protocol-tier
|
|
15
|
+
* SECURITY invariant `eval-summary-no-content-leak`.
|
|
16
|
+
* - all three event names appear in the RunEventType enum.
|
|
17
|
+
*
|
|
18
|
+
* Behavioral assertions (the eval-run event ordering, per-task scoring, the
|
|
19
|
+
* EvalSummary round-trip against a live host, the `mode: "eval"` 501 on
|
|
20
|
+
* unadvertised hosts) are gated on `capabilities.agents.evalSuite.supported` and
|
|
21
|
+
* land in `agent-eval-run.test.ts` (deferred per RFC 0081 §Conformance — reference
|
|
22
|
+
* host deferred). This scenario asserts the wire contract, not host behavior.
|
|
23
|
+
*
|
|
24
|
+
* Spec references:
|
|
25
|
+
* - https://github.com/openwop/openwop/blob/main/spec/v1/agent-evaluation.md
|
|
26
|
+
* - https://github.com/openwop/openwop/blob/main/RFCS/0081-agent-evaluation-and-scorecards.md
|
|
27
|
+
* - https://github.com/openwop/openwop/blob/main/SECURITY/invariants.yaml (eval-summary-no-content-leak)
|
|
28
|
+
*/
|
|
29
|
+
|
|
30
|
+
import { describe, it, expect } from 'vitest';
|
|
31
|
+
import { readFileSync } from 'node:fs';
|
|
32
|
+
import { join } from 'node:path';
|
|
33
|
+
import Ajv2020 from 'ajv/dist/2020.js';
|
|
34
|
+
import addFormats from 'ajv-formats';
|
|
35
|
+
import { SCHEMAS_DIR } from '../lib/paths.js';
|
|
36
|
+
|
|
37
|
+
/**
 * Server-free assertion-message helper. Mirrors driver.describe's
 * "spec — requirement" shape without requiring OPENWOP_BASE_URL.
 *
 * @param specRef - the spec / RFC section the assertion is anchored to
 * @param requirement - the normative requirement being asserted
 * @returns the two parts joined as "specRef — requirement"
 */
function why(specRef: string, requirement: string): string {
  return `${specRef} — ${requirement}`;
}
|
|
39
|
+
|
|
40
|
+
/**
 * Reads and parses one JSON Schema file from `SCHEMAS_DIR`.
 *
 * @param name - file name inside `SCHEMAS_DIR`, e.g. `eval-summary.schema.json`
 * @returns the parsed document, typed as a plain record
 * @throws if the file cannot be read or is not valid JSON
 */
function loadSchema(name: string): Record<string, unknown> {
  const schemaPath = join(SCHEMAS_DIR, name);
  const rawText = readFileSync(schemaPath, 'utf8');
  return JSON.parse(rawText) as Record<string, unknown>;
}
|
|
43
|
+
|
|
44
|
+
describe('agent-eval-suite-shape: capability advertisement (RFC 0081, server-free)', () => {
  // Structural view of the nested `properties` maps this probe walks.
  type FlagHolder = { properties?: Record<string, unknown> };
  type AgentsNode = { properties?: Record<string, FlagHolder> };

  it('the capabilities schema declares agents.evalSuite with its sub-flags', () => {
    const capabilities = loadSchema('capabilities.schema.json');
    const topLevel = capabilities.properties as Record<string, AgentsNode>;
    const evalSuite = topLevel.agents?.properties?.evalSuite;

    expect(evalSuite, why('capabilities.md §agents', 'agents.evalSuite MUST be declared')).toBeDefined();

    // Every sub-flag must be declared on the capability block.
    const requiredFlags = ['supported', 'modes'];
    requiredFlags.forEach((flag) => {
      expect(
        evalSuite?.properties?.[flag],
        why('agent-evaluation.md §Capability advertisement', `agents.evalSuite.${flag} MUST be declared`),
      ).toBeDefined();
    });
  });
});
|
|
61
|
+
|
|
62
|
+
describe('agent-eval-suite-shape: AgentEvalSuite + EvalSummary schemas (RFC 0081, server-free)', () => {
  // One Ajv instance compiles both schemas under test.
  const validator = new Ajv2020({ strict: false, allErrors: true });
  addFormats(validator);
  const validateSuite = validator.compile(loadSchema('agent-eval-suite.schema.json'));
  const validateSummary = validator.compile(loadSchema('eval-summary.schema.json'));

  it('AgentEvalSuite validates a conforming suite and rejects a malformed suiteId / out-of-range threshold', () => {
    const conforming = {
      suiteId: 'core.openwop.evals.support-resolver',
      version: '1.0.0',
      modes: ['golden', 'regression'],
      thresholds: { passScore: 0.8 },
      tasks: [
        { taskId: 'refund-window', input: { q: 'refund policy?' }, expected: { kind: 'golden', match: { strategy: 'contains', value: '30 days' } } },
      ],
    };
    // Negative: suiteId must carry the `.evals.` infix.
    const missingInfix = { ...conforming, suiteId: 'core.openwop.support-resolver' };
    // Negative: passScore out of 0..1.
    const passScoreTooHigh = { ...conforming, thresholds: { passScore: 1.5 } };

    expect(validateSuite(conforming), why('RFC 0081 §A', 'a conforming AgentEvalSuite MUST validate')).toBe(true);
    expect(validateSuite(missingInfix), why('RFC 0081 §A', 'a suiteId without the `.evals.` infix MUST be rejected')).toBe(false);
    expect(validateSuite(passScoreTooHigh), why('RFC 0081 §A', 'thresholds.passScore > 1 MUST be rejected')).toBe(false);
  });

  it('EvalSummary validates a conforming scorecard and rejects an out-of-range score', () => {
    const conforming = {
      suiteId: 'core.openwop.evals.support-resolver',
      suiteVersion: '1.0.0',
      aggregateScore: 0.86,
      passed: true,
      taskCount: 2,
      passedCount: 2,
      tasks: [{ taskId: 'refund-window', score: 0.9, passed: true, safetyFindings: [{ kind: 'jailbreak', severity: 'low' }] }],
    };
    const scoreTooHigh = { ...conforming, aggregateScore: 1.4 };

    expect(validateSummary(conforming), why('RFC 0081 §C', 'a conforming EvalSummary MUST validate')).toBe(true);
    expect(validateSummary(scoreTooHigh), why('RFC 0081 §C', 'aggregateScore > 1 MUST be rejected')).toBe(false);
  });

  it('EvalSummary is content-free — a task-output body and a safety-finding excerpt are rejected (eval-summary-no-content-leak)', () => {
    const envelope = { suiteId: 'core.openwop.evals.x', suiteVersion: '1.0.0', aggregateScore: 0.5, passed: false, taskCount: 1, passedCount: 0 };

    // Negative: a per-task entry carrying the output body.
    const taskWithOutputBody = { taskId: 't1', score: 0.5, passed: false, taskOutput: 'the model said …' };
    expect(
      validateSummary({ ...envelope, tasks: [taskWithOutputBody] }),
      why('SECURITY invariant eval-summary-no-content-leak', 'an EvalSummary task entry MUST NOT carry an output body'),
    ).toBe(false);

    // Negative: a safety finding carrying excerpted content rather than a {kind, severity} descriptor.
    const taskWithExcerpt = {
      taskId: 't1',
      score: 0.5,
      passed: false,
      safetyFindings: [{ kind: 'pii-leak', severity: 'high', excerpt: 'SSN 123-45-6789' }],
    };
    expect(
      validateSummary({ ...envelope, tasks: [taskWithExcerpt] }),
      why('SECURITY invariant eval-summary-no-content-leak', 'a safetyFinding MUST NOT carry excerpted content'),
    ).toBe(false);
  });
});
|
|
113
|
+
|
|
114
|
+
describe('agent-eval-suite-shape: eval event payloads (RFC 0081, server-free)', () => {
  const payloads = loadSchema('run-event-payloads.schema.json');
  const ajv = new Ajv2020({ strict: false, allErrors: true });
  addFormats(ajv);
  ajv.addSchema(payloads, 'payloads');

  // Validators are resolved by JSON-pointer into the registered payload schema.
  const started = ajv.getSchema('payloads#/$defs/evalStarted');
  const scored = ajv.getSchema('payloads#/$defs/evalScored');
  const completed = ajv.getSchema('payloads#/$defs/evalCompleted');

  it('eval.started validates a content-free start record and requires the suite provenance', () => {
    expect(started, 'the evalStarted $def MUST exist').toBeTruthy();
    expect(
      started!({ suiteId: 'core.openwop.evals.support-resolver', suiteVersion: '1.0.0', taskCount: 12, modes: ['golden'] }),
      why('RFC 0081 §C', 'a conforming eval.started payload MUST validate'),
    ).toBe(true);
    expect(
      started!({ suiteId: 'core.openwop.evals.x' }),
      why('RFC 0081 §C', 'eval.started without suiteVersion/taskCount/modes MUST be rejected'),
    ).toBe(false);
  });

  it('eval.scored validates a content-free per-task score and requires score + passed', () => {
    expect(scored, 'the evalScored $def MUST exist').toBeTruthy();
    expect(
      scored!({ taskId: 'refund-window', score: 0.9, passed: true, costUsd: 0.012 }),
      why('RFC 0081 §C', 'a conforming eval.scored payload MUST validate'),
    ).toBe(true);
    expect(
      scored!({ taskId: 'refund-window' }),
      why('RFC 0081 §C', 'eval.scored without score/passed MUST be rejected'),
    ).toBe(false);
  });

  it('eval.completed validates a content-free aggregate record', () => {
    expect(completed, 'the evalCompleted $def MUST exist').toBeTruthy();

    const conforming = { aggregateScore: 0.86, passed: true, taskCount: 12, passedCount: 11, regressionVsBaseline: 0.04 };
    expect(
      completed!(conforming),
      why('RFC 0081 §C', 'a conforming eval.completed payload MUST validate'),
    ).toBe(true);

    // Change ONLY aggregateScore on the payload that just validated. A bare
    // `{ aggregateScore: 2 }` also drops every other field, so its rejection
    // could come from a missing field rather than from the range bound.
    expect(
      completed!({ ...conforming, aggregateScore: 2 }),
      why('RFC 0081 §C', 'eval.completed with an out-of-range aggregateScore MUST be rejected'),
    ).toBe(false);
  });

  it('all three eval event names appear in the RunEventType enum', () => {
    const runEvent = loadSchema('run-event.schema.json');
    const enumVals = (runEvent.$defs as Record<string, { enum?: string[] }>).RunEventType?.enum ?? [];

    for (const eventName of ['eval.started', 'eval.scored', 'eval.completed']) {
      // The message names the missing event so a failure is readable without
      // scanning the dumped enum array.
      expect(
        enumVals,
        why('run-event.schema.json#RunEventType', `${eventName} MUST be listed in the RunEventType enum`),
      ).toContain(eventName);
    }
  });
});
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Live manifest-dispatch tool-allowlist enforcement (RFC 0077 §F-1) —
|
|
3
|
+
* behavioral.
|
|
4
|
+
*
|
|
5
|
+
* Gated on `capabilities.agents.liveRuntime.supported` (root-first per RFC 0073).
|
|
6
|
+
* Soft-skips when unadvertised (default) / hard-fails under
|
|
7
|
+
* `OPENWOP_REQUIRE_BEHAVIOR=true`.
|
|
8
|
+
*
|
|
9
|
+
* Asserts the §F-1 safety carry-forward: a live invocation MUST NOT call a tool
|
|
10
|
+
* outside the agent's `toolAllowlist` (the per-tool application of the RFC 0002
|
|
11
|
+
* §A14 mandatory-allowlist floor). Driven by the `attemptTool` seam param naming
|
|
12
|
+
* a disallowed tool; the invocation MUST NOT emit an `agent.toolCalled` for it
|
|
13
|
+
* (a refused/failed outcome is acceptable, a silent successful call is not).
|
|
14
|
+
* Soft-skips when the seam/hook is unwired.
|
|
15
|
+
*
|
|
16
|
+
* Spec references:
|
|
17
|
+
* - https://github.com/openwop/openwop/blob/main/spec/v1/multi-agent-execution.md (§"Live manifest dispatch")
|
|
18
|
+
* - https://github.com/openwop/openwop/blob/main/RFCS/0077-agent-run-lifecycle-and-live-manifest-dispatch.md (§F-1)
|
|
19
|
+
* - https://github.com/openwop/openwop/blob/main/RFCS/0002-agent-identity-and-handoff.md (§A14 toolAllowlist)
|
|
20
|
+
*/
|
|
21
|
+
|
|
22
|
+
import { describe, it, expect } from 'vitest';
|
|
23
|
+
import { driver } from '../lib/driver.js';
|
|
24
|
+
import { behaviorGate } from '../lib/behavior-gate.js';
|
|
25
|
+
import { readLiveRuntimeCap, invokeLive } from '../lib/liveRuntime.js';
|
|
26
|
+
import { queryTestEvents, isEventLogSeamAvailable, resetTestSeam } from '../lib/event-log-query.js';
|
|
27
|
+
|
|
28
|
+
/** Tool id passed as the `attemptTool` seam param; the invocation must never report calling it. */
const DISALLOWED_TOOL = 'conformance-disallowed-tool';

describe('agent-live-allowlist-enforced (RFC 0077 §F-1)', () => {
  it('does not call a tool outside the agent toolAllowlist', async () => {
    const cap = await readLiveRuntimeCap();
    if (!behaviorGate('openwop-live-allowlist-enforced', cap?.supported === true)) return;

    if (!(await isEventLogSeamAvailable())) return; // soft-skip
    const res = await invokeLive({ source: 'run-api', attemptTool: DISALLOWED_TOOL });
    if (res === null || !res.runId) return; // seam/hook absent — soft-skip

    // A run was started from here on, so the reset runs on every exit path:
    // a failed assertion, an unreadable event log, or normal completion.
    try {
      const q = await queryTestEvents(res.runId, { type: 'agent.toolCalled' });
      if (!q.ok) return;

      // The tool identifier may be carried under any of three payload keys.
      const calledDisallowed = q.events.some((e) => {
        const tool = e.payload.tool ?? e.payload.toolId ?? e.payload.name;
        return tool === DISALLOWED_TOOL;
      });
      expect(
        calledDisallowed === false,
        driver.describe('RFC 0077 §F-1 / RFC 0002 §A14', 'a live invocation MUST NOT call a tool outside the agent toolAllowlist'),
      ).toBe(true);
    } finally {
      await resetTestSeam();
    }
  });
});
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Live manifest-dispatch invocation bracket (RFC 0077 §E) — behavioral.
|
|
3
|
+
*
|
|
4
|
+
* Gated on `capabilities.agents.liveRuntime.supported` (root-first per RFC 0073).
|
|
5
|
+
* Soft-skips when unadvertised (default) / hard-fails under
|
|
6
|
+
* `OPENWOP_REQUIRE_BEHAVIOR=true`. The always-on wire-shape coverage lives in
|
|
7
|
+
* `agent-live-runtime-shape.test.ts`; this asserts host BEHAVIOR: a live
|
|
8
|
+
* invocation brackets its `agent.*` family with
|
|
9
|
+
* `agent.invocation.started` (FIRST agent-scoped event) and
|
|
10
|
+
* `agent.invocation.completed` (LAST), with a matching `invocationId`, a
|
|
11
|
+
* `source` in the enum, an `outcome` in the enum, and both events content-free
|
|
12
|
+
* (no prompt/result body).
|
|
13
|
+
*
|
|
14
|
+
* Drives the OPTIONAL `POST /v1/host/sample/agents/live-invoke` seam + reads the
|
|
15
|
+
* bracket back via the test event-log seam (both deferred per RFC 0077
|
|
16
|
+
* §Conformance — soft-skip on 404).
|
|
17
|
+
*
|
|
18
|
+
* Spec references:
|
|
19
|
+
* - https://github.com/openwop/openwop/blob/main/spec/v1/multi-agent-execution.md (§"Live manifest dispatch")
|
|
20
|
+
* - https://github.com/openwop/openwop/blob/main/RFCS/0077-agent-run-lifecycle-and-live-manifest-dispatch.md
|
|
21
|
+
*/
|
|
22
|
+
|
|
23
|
+
import { describe, it, expect } from 'vitest';
|
|
24
|
+
import { driver } from '../lib/driver.js';
|
|
25
|
+
import { behaviorGate } from '../lib/behavior-gate.js';
|
|
26
|
+
import { readLiveRuntimeCap, invokeLive } from '../lib/liveRuntime.js';
|
|
27
|
+
import { queryTestEvents, isEventLogSeamAvailable, resetTestSeam } from '../lib/event-log-query.js';
|
|
28
|
+
|
|
29
|
+
/** Accepted values for `agent.invocation.started.payload.source`. */
const SOURCES = ['workflow-node', 'run-api', 'chat-mention'];

/** Accepted values for `agent.invocation.completed.payload.outcome`. */
const OUTCOMES = ['completed', 'handed-off', 'escalated', 'refused', 'failed'];

/**
 * Returns true for every event type in the `agent.` family.
 *
 * The two bracket events (`agent.invocation.started` / `.completed`) already
 * carry the `agent.` prefix, so the prefix test alone covers them.
 */
const AGENT_SCOPED = (t: string): boolean => t.startsWith('agent.');
|
|
32
|
+
|
|
33
|
+
describe('agent-live-invocation-bracket (RFC 0077 §E)', () => {
  it('brackets a live invocation with started-first / completed-last + matching invocationId, content-free', async () => {
    const cap = await readLiveRuntimeCap();
    if (!behaviorGate('openwop-live-invocation-bracket', cap?.supported === true)) return;

    if (!(await isEventLogSeamAvailable())) return; // event-log seam absent — soft-skip
    const res = await invokeLive({ source: 'run-api' });
    if (res === null || !res.runId) return; // live-invoke seam absent — soft-skip

    // A run was started from here on, so the reset runs on every exit path:
    // a failed assertion, an unreadable event log, or normal completion.
    try {
      const q = await queryTestEvents(res.runId);
      if (!q.ok) return;
      // Sort a copy by sequence so the ordering checks do not depend on the
      // order in which the seam returned the events.
      const events = q.events.slice().sort((a, b) => a.sequence - b.sequence);

      const started = events.filter((e) => e.type === 'agent.invocation.started');
      const completed = events.filter((e) => e.type === 'agent.invocation.completed');
      // This assertion throws when either side is missing, so both arrays are
      // non-empty below.
      expect(
        started.length >= 1 && completed.length >= 1,
        driver.describe('multi-agent-execution.md §"Live manifest dispatch"', 'a live invocation MUST emit agent.invocation.started + agent.invocation.completed'),
      ).toBe(true);

      const start = started[0]!;
      const end = completed[completed.length - 1]!;

      // §E ordering: started is the FIRST agent-scoped event, completed the LAST.
      const agentScoped = events.filter((e) => AGENT_SCOPED(e.type));
      expect(
        agentScoped[0]?.type === 'agent.invocation.started',
        driver.describe('RFC 0077 §E', 'agent.invocation.started MUST be the first agent-scoped event of the invocation'),
      ).toBe(true);
      expect(
        agentScoped[agentScoped.length - 1]?.type === 'agent.invocation.completed',
        driver.describe('RFC 0077 §E', 'agent.invocation.completed MUST be the last agent-scoped event of the invocation'),
      ).toBe(true);

      // Matching invocationId across the bracket.
      const startId = start.payload.invocationId;
      const endId = end.payload.invocationId;
      expect(
        typeof startId === 'string' && startId === endId,
        driver.describe('run-event-payloads.schema.json#agentInvocation*', 'the bracket MUST share one invocationId'),
      ).toBe(true);

      // Enum discipline.
      expect(
        typeof start.payload.source === 'string' && SOURCES.includes(start.payload.source as string),
        driver.describe('run-event-payloads.schema.json#agentInvocationStarted', 'source MUST be workflow-node|run-api|chat-mention'),
      ).toBe(true);
      expect(
        typeof end.payload.outcome === 'string' && OUTCOMES.includes(end.payload.outcome as string),
        driver.describe('run-event-payloads.schema.json#agentInvocationCompleted', 'outcome MUST be in the closed enum'),
      ).toBe(true);

      // Content-free: identifiers + metadata only, never prompt/result body.
      for (const evt of [start, end]) {
        for (const forbidden of ['prompt', 'result', 'body', 'input', 'output', 'apiKey', 'secret', 'credentials', 'token']) {
          expect(
            !(forbidden in evt.payload),
            driver.describe('RFC 0077', `agent.invocation.* MUST be content-free (no ${forbidden})`),
          ).toBe(true);
        }
      }
    } finally {
      await resetTestSeam();
    }
  });
});
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Live manifest dispatch — capability + invocation-event shapes (RFC 0077).
|
|
3
|
+
*
|
|
4
|
+
* Always-on, server-free schema-shape probe. Verifies that:
|
|
5
|
+
* - `capabilities.agents.liveRuntime` is declared on the capabilities schema
|
|
6
|
+
* (with the `supported` / `structuredOutput` / `confidenceEscalation` /
|
|
7
|
+
* `sources` sub-flags).
|
|
8
|
+
* - the `agent.invocation.started` + `agent.invocation.completed` payload
|
|
9
|
+
* $defs validate conforming content-free payloads and reject malformed
|
|
10
|
+
* ones (a `started` missing `source`; a `completed` with an out-of-enum
|
|
11
|
+
* `outcome`).
|
|
12
|
+
* - both event names appear in the RunEventType enum.
|
|
13
|
+
*
|
|
14
|
+
* Behavioral assertions (the started→completed bracket ordering, structured-
|
|
15
|
+
* output enforcement, toolAllowlist enforcement) are gated on
|
|
16
|
+
* `capabilities.agents.liveRuntime.supported` and soft-skip until a reference
|
|
17
|
+
* host wires the live-invoke seam (RFC 0077 §Conformance — reference host
|
|
18
|
+
* deferred). This scenario asserts the wire contract, not host behavior.
|
|
19
|
+
*
|
|
20
|
+
* Spec references:
|
|
21
|
+
* - https://github.com/openwop/openwop/blob/main/spec/v1/multi-agent-execution.md §"Live manifest dispatch"
|
|
22
|
+
* - https://github.com/openwop/openwop/blob/main/RFCS/0077-agent-run-lifecycle-and-live-manifest-dispatch.md
|
|
23
|
+
*/
|
|
24
|
+
|
|
25
|
+
import { describe, it, expect } from 'vitest';
|
|
26
|
+
import { readFileSync } from 'node:fs';
|
|
27
|
+
import { join } from 'node:path';
|
|
28
|
+
import Ajv2020 from 'ajv/dist/2020.js';
|
|
29
|
+
import addFormats from 'ajv-formats';
|
|
30
|
+
import { SCHEMAS_DIR } from '../lib/paths.js';
|
|
31
|
+
|
|
32
|
+
/**
 * Builds an assertion message for server-free scenarios.
 *
 * Produces the same "spec — requirement" shape as driver.describe, but as a
 * plain string helper so it can be used without OPENWOP_BASE_URL.
 *
 * @param specRef - the spec section / schema pointer the assertion traces to.
 * @param requirement - the normative statement being asserted.
 * @returns `specRef` and `requirement` joined by " — ".
 */
const why = (specRef: string, requirement: string): string => {
  const parts = [specRef, requirement];
  return parts.join(' — ');
};
|
|
34
|
+
|
|
35
|
+
/**
 * Reads a schema file from SCHEMAS_DIR and parses it as JSON.
 *
 * @param name - file name of the schema, resolved relative to SCHEMAS_DIR.
 * @returns the parsed document, typed as a loose string-keyed record.
 * @throws whatever readFileSync / JSON.parse throw for a missing or malformed file.
 */
function loadSchema(name: string): Record<string, unknown> {
  const schemaPath = join(SCHEMAS_DIR, name);
  const rawText = readFileSync(schemaPath, 'utf8');
  const parsed = JSON.parse(rawText) as Record<string, unknown>;
  return parsed;
}
|
|
38
|
+
|
|
39
|
+
describe('agent-live-runtime-shape: capability advertisement (RFC 0077, server-free)', () => {
  it('the capabilities schema declares agents.liveRuntime with its sub-flags', () => {
    // Minimal structural views of the schema: just enough nesting to walk
    // properties.agents.properties.liveRuntime.properties.<flag>.
    type LeafNode = { properties?: Record<string, unknown> };
    type BranchNode = { properties?: Record<string, LeafNode> };

    const capabilitiesSchema = loadSchema('capabilities.schema.json');
    const topLevelProps = capabilitiesSchema.properties as Record<string, BranchNode>;
    const liveRuntime = topLevelProps.agents?.properties?.liveRuntime;

    // The liveRuntime object itself must be present on the schema…
    const liveRuntimeMessage = why('capabilities.md §agents', 'agents.liveRuntime MUST be declared');
    expect(liveRuntime, liveRuntimeMessage).toBeDefined();

    // …and so must each of its sub-flags.
    const requiredFlags = ['supported', 'structuredOutput', 'confidenceEscalation', 'sources'];
    requiredFlags.forEach((flag) => {
      const declaration = liveRuntime?.properties?.[flag];
      const flagMessage = why(
        'multi-agent-execution.md §Live manifest dispatch',
        `agents.liveRuntime.${flag} MUST be declared`,
      );
      expect(declaration, flagMessage).toBeDefined();
    });
  });
});
|
|
56
|
+
|
|
57
|
+
describe('agent-live-runtime-shape: invocation event payloads (RFC 0077, server-free)', () => {
  // One Ajv instance with the payloads schema registered under the key
  // 'payloads', so individual $defs can be resolved by JSON pointer below.
  const payloads = loadSchema('run-event-payloads.schema.json');
  const ajv = new Ajv2020({ strict: false, allErrors: true });
  addFormats(ajv);
  ajv.addSchema(payloads, 'payloads');

  // getSchema yields undefined when the $def is absent; each test asserts
  // existence before invoking the validator.
  const started = ajv.getSchema('payloads#/$defs/agentInvocationStarted');
  const completed = ajv.getSchema('payloads#/$defs/agentInvocationCompleted');

  it('agent.invocation.started validates a content-free start record and requires source', () => {
    expect(started, 'the agentInvocationStarted $def MUST exist').toBeTruthy();

    // Everything except `source`. The negative case reuses this object
    // verbatim so that `source` is the ONLY difference between the accepted
    // and the rejected payload — a rejection cannot be caused by some other
    // missing field.
    const startedWithoutSource = {
      invocationId: 'inv-1',
      agentId: 'vendor.acme.review.code-reviewer',
      modelClass: 'coding',
      toolSurfaceCount: 3,
      memoryBound: false,
    };

    expect(
      started!({ ...startedWithoutSource, source: 'run-api' }),
      why('RFC 0077 §C', 'a conforming agent.invocation.started payload MUST validate'),
    ).toBe(true);
    // Negative: missing source — every invocation must record its entry point.
    expect(
      started!(startedWithoutSource),
      why('RFC 0077 §C', 'agent.invocation.started without source MUST be rejected'),
    ).toBe(false);
  });

  it('agent.invocation.completed validates a content-free outcome record and pins the outcome enum', () => {
    expect(completed, 'the agentInvocationCompleted $def MUST exist').toBeTruthy();

    const validCompleted = {
      invocationId: 'inv-1',
      agentId: 'vendor.acme.review.code-reviewer',
      outcome: 'completed',
      schemaValidated: true,
      confidence: 0.91,
    };

    expect(
      completed!(validCompleted),
      why('RFC 0077 §C', 'a conforming agent.invocation.completed payload MUST validate'),
    ).toBe(true);
    // Negative: out-of-enum outcome — the canonical value is `completed`, not `done`.
    // Only `outcome` is overridden; every other field is the one that just
    // validated, so the rejection is attributable to the enum alone.
    expect(
      completed!({ ...validCompleted, outcome: 'done' }),
      why('RFC 0077 §C', 'agent.invocation.completed with an out-of-enum outcome MUST be rejected'),
    ).toBe(false);
  });

  it('both invocation event names appear in the RunEventType enum', () => {
    const runEvent = loadSchema('run-event.schema.json');
    // A missing RunEventType (or a missing enum on it) degrades to [], which
    // makes the toContain assertions below fail rather than throw.
    const enumVals = (runEvent.$defs as Record<string, { enum?: string[] }>).RunEventType?.enum ?? [];
    expect(enumVals).toContain('agent.invocation.started');
    expect(enumVals).toContain('agent.invocation.completed');
  });
});
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Live manifest-dispatch structured-output enforcement (RFC 0077 §B step 6) —
|
|
3
|
+
* behavioral.
|
|
4
|
+
*
|
|
5
|
+
* Gated on `capabilities.agents.liveRuntime.structuredOutput` (root-first per
|
|
6
|
+
* RFC 0073) — itself meaningful only alongside `liveRuntime.supported`.
|
|
7
|
+
* Soft-skips when unadvertised (default) / hard-fails under
|
|
8
|
+
* `OPENWOP_REQUIRE_BEHAVIOR=true`.
|
|
9
|
+
*
|
|
10
|
+
* Asserts the §B step-6 MUST: when the host advertises `structuredOutput` and an
|
|
11
|
+
* agent declares a `handoff.returnSchemaRef`, a terminal result that VIOLATES
|
|
12
|
+
* that schema MUST fail the invocation (`agent.invocation.completed.outcome ===
|
|
13
|
+
* "failed"`, `schemaValidated !== true`) rather than ship a non-conforming
|
|
14
|
+
* result as `completed`. Driven by the `forceInvalidResult` seam param so the
|
|
15
|
+
* assertion is deterministic; soft-skips when the seam/hook is unwired.
|
|
16
|
+
*
|
|
17
|
+
* Spec references:
|
|
18
|
+
* - https://github.com/openwop/openwop/blob/main/spec/v1/multi-agent-execution.md (§"Live manifest dispatch")
|
|
19
|
+
* - https://github.com/openwop/openwop/blob/main/RFCS/0077-agent-run-lifecycle-and-live-manifest-dispatch.md (§B step 6)
|
|
20
|
+
*/
|
|
21
|
+
|
|
22
|
+
import { describe, it, expect } from 'vitest';
|
|
23
|
+
import { driver } from '../lib/driver.js';
|
|
24
|
+
import { behaviorGate } from '../lib/behavior-gate.js';
|
|
25
|
+
import { readLiveRuntimeCap, invokeLive } from '../lib/liveRuntime.js';
|
|
26
|
+
import { queryTestEvents, isEventLogSeamAvailable, resetTestSeam } from '../lib/event-log-query.js';
|
|
27
|
+
|
|
28
|
+
describe('agent-live-structured-output (RFC 0077 §B step 6)', () => {
  it('fails the invocation on a result that violates handoff.returnSchemaRef', async () => {
    const cap = await readLiveRuntimeCap();
    // structuredOutput is a sub-flag of a supported liveRuntime; gate on both.
    const advertised = cap?.supported === true && cap?.structuredOutput === true;
    if (!behaviorGate('openwop-live-structured-output', advertised)) return;

    if (!(await isEventLogSeamAvailable())) return; // soft-skip
    const res = await invokeLive({
      source: 'run-api',
      returnSchemaRef: 'conformance-strict-handoff',
      forceInvalidResult: true,
    });
    if (res === null || !res.runId) return; // seam/hook absent — soft-skip

    // From here on an invocation has actually been driven, so the seam reset
    // must run on EVERY exit path: a failing expect() throws, and the
    // no-events early return below would otherwise skip it as well.
    // NOTE(review): resetTestSeam presumably clears per-run seam state —
    // confirm against lib/event-log-query.
    try {
      const q = await queryTestEvents(res.runId, { type: 'agent.invocation.completed' });
      if (!q.ok || !q.events[0]) return;
      // Inspect the LAST completed event returned for the run.
      const payload = q.events[q.events.length - 1]!.payload;

      expect(
        payload.outcome === 'failed',
        driver.describe('RFC 0077 §B step 6', 'a result violating handoff.returnSchemaRef MUST fail the invocation (outcome "failed"), not ship as completed'),
      ).toBe(true);
      expect(
        payload.schemaValidated !== true,
        driver.describe('RFC 0077 §B step 6', 'schemaValidated MUST NOT be true for a schema-violating result'),
      ).toBe(true);
    } finally {
      await resetTestSeam();
    }
  });
});
|