@openwop/openwop-conformance 1.15.0 → 1.18.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -42,7 +42,7 @@ import {
42
42
  DEPLOYMENT_STATES,
43
43
  DEPLOYMENT_CONTENT_FORBIDDEN,
44
44
  } from '../lib/agentDeployment.js';
45
- import { queryTestEvents, isEventLogSeamAvailable, resetTestSeam } from '../lib/event-log-query.js';
45
+ import { queryTestEvents, requireEvents, isEventLogSeamAvailable, resetTestSeam } from '../lib/event-log-query.js';
46
46
 
47
47
  function loadSchema(name: string): Record<string, unknown> {
48
48
  return JSON.parse(readFileSync(join(SCHEMAS_DIR, name), 'utf8')) as Record<string, unknown>;
@@ -71,76 +71,99 @@ describe('agent-deployment-lifecycle (RFC 0082 §B/§E)', () => {
71
71
  const promote = await driveDeploymentTransition({ scenario: 'promote' });
72
72
  if (promote === null) return; // deployment seam unwired — soft-skip the whole behavioral suite
73
73
 
74
- if (promote.record) {
74
+ // The host has ADVERTISED agents.deployment AND wired the seam — missing
75
+ // evidence is a FAILURE, not a soft-skip. A successful promote MUST return
76
+ // a runId + a schema-valid record + emit ≥1 content-free deployment.promoted.
77
+ expect(
78
+ typeof promote.runId === 'string' && (promote.runId as string).length > 0,
79
+ driver.describe('agent-deployment.md §E', 'a wired promote MUST return the runId'),
80
+ ).toBe(true);
81
+ expect(
82
+ promote.record !== undefined && promote.record !== null,
83
+ driver.describe('agent-deployment.md §E', 'a successful promote MUST return the deployment record'),
84
+ ).toBe(true);
85
+ expect(
86
+ validateRecord(promote.record),
87
+ driver.describe('agent-deployment.schema.json', `a promoted deployment record MUST validate (${ajv.errorsText(validateRecord.errors)})`),
88
+ ).toBe(true);
89
+
90
+ const promotedEvents = requireEvents(
91
+ await queryTestEvents(promote.runId as string, { type: 'deployment.promoted' }),
92
+ 'deployment.promoted',
93
+ );
94
+ expect(
95
+ promotedEvents.length >= 1,
96
+ driver.describe('agent-deployment.md §E', 'a successful promote MUST emit at least one deployment.promoted'),
97
+ ).toBe(true);
98
+ for (const e of promotedEvents) {
99
+ expectContentFree(e.payload, 'deployment.promoted');
75
100
  expect(
76
- validateRecord(promote.record),
77
- driver.describe(
78
- 'agent-deployment.schema.json',
79
- `a promoted deployment record MUST validate (${ajv.errorsText(validateRecord.errors)})`,
80
- ),
101
+ typeof e.payload.toState === 'string' && DEPLOYMENT_STATES.includes(e.payload.toState as string),
102
+ driver.describe('run-event-payloads.schema.json#/$defs/deploymentPromoted', 'toState MUST be in the seven-state vocabulary'),
103
+ ).toBe(true);
104
+ expect(
105
+ typeof e.payload.toVersion === 'string' && (e.payload.toVersion as string).length > 0,
106
+ driver.describe('agent-deployment.md §D', 'deployment.promoted MUST carry the promoted toVersion'),
81
107
  ).toBe(true);
82
- }
83
- if (promote.runId) {
84
- const pq = await queryTestEvents(promote.runId, { type: 'deployment.promoted' });
85
- if (pq.ok) {
86
- for (const e of pq.events) {
87
- expectContentFree(e.payload, 'deployment.promoted');
88
- expect(
89
- typeof e.payload.toState === 'string' && DEPLOYMENT_STATES.includes(e.payload.toState as string),
90
- driver.describe('run-event-payloads.schema.json#/$defs/deploymentPromoted', 'toState MUST be in the seven-state vocabulary'),
91
- ).toBe(true);
92
- expect(
93
- typeof e.payload.toVersion === 'string' && (e.payload.toVersion as string).length > 0,
94
- driver.describe('agent-deployment.md §D', 'deployment.promoted MUST carry the promoted toVersion'),
95
- ).toBe(true);
96
- }
97
- }
98
108
  }
99
109
 
100
110
  // ---- Leg 2: fail-closed authz (§E-1; deployment-promotion-fail-closed) -
101
111
  const unauth = await driveDeploymentTransition({ scenario: 'unauthorized' });
102
- if (unauth && unauth.runId) {
103
- expect(
104
- unauth.allowed !== true,
105
- driver.describe('agent-deployment.md §E-1', 'a principal without deploy:promote MUST be denied (fail-closed)'),
106
- ).toBe(true);
107
- const uq = await queryTestEvents(unauth.runId, { type: 'deployment.promoted' });
108
- if (uq.ok) {
109
- expect(
110
- uq.events.length === 0,
111
- driver.describe('SECURITY invariant deployment-promotion-fail-closed', 'a denied transition MUST emit NO deployment.promoted'),
112
- ).toBe(true);
113
- }
114
- }
112
+ expect(
113
+ unauth !== null && typeof unauth.runId === 'string' && (unauth.runId as string).length > 0,
114
+ driver.describe('agent-deployment.md §E-1', 'the unauthorized scenario MUST return a runId to evidence the fail-closed denial'),
115
+ ).toBe(true);
116
+ expect(
117
+ unauth!.allowed !== true,
118
+ driver.describe('agent-deployment.md §E-1', 'a principal without deploy:promote MUST be denied (fail-closed)'),
119
+ ).toBe(true);
120
+ const unauthPromoted = requireEvents(
121
+ await queryTestEvents(unauth!.runId as string, { type: 'deployment.promoted' }),
122
+ 'deployment.promoted (unauthorized)',
123
+ );
124
+ expect(
125
+ unauthPromoted.length === 0,
126
+ driver.describe('SECURITY invariant deployment-promotion-fail-closed', 'a denied transition MUST emit NO deployment.promoted'),
127
+ ).toBe(true);
115
128
 
116
129
  // ---- Leg 3: eval-gate-unmet denial (§E-3) ----------------------------
117
130
  const evalUnmet = await driveDeploymentTransition({ scenario: 'eval-gate-unmet' });
118
- if (evalUnmet && evalUnmet.runId) {
119
- expect(
120
- evalUnmet.error === 'eval_gate_unmet' || evalUnmet.allowed !== true,
121
- driver.describe('agent-deployment.md §E-3', 'a promote whose eval evidence has passed:false MUST be denied (eval_gate_unmet)'),
122
- ).toBe(true);
123
- const eq = await queryTestEvents(evalUnmet.runId, { type: 'deployment.promoted' });
124
- if (eq.ok) {
125
- expect(
126
- eq.events.length === 0,
127
- driver.describe('agent-deployment.md §E-3', 'an unmet eval gate MUST emit NO deployment.promoted'),
128
- ).toBe(true);
129
- }
130
- }
131
+ expect(
132
+ evalUnmet !== null && typeof evalUnmet.runId === 'string' && (evalUnmet.runId as string).length > 0,
133
+ driver.describe('agent-deployment.md §E-3', 'the eval-gate-unmet scenario MUST return a runId to evidence the denial'),
134
+ ).toBe(true);
135
+ expect(
136
+ evalUnmet!.error === 'eval_gate_unmet' || evalUnmet!.allowed !== true,
137
+ driver.describe('agent-deployment.md §E-3', 'a promote whose eval evidence has passed:false MUST be denied (eval_gate_unmet)'),
138
+ ).toBe(true);
139
+ const evalUnmetPromoted = requireEvents(
140
+ await queryTestEvents(evalUnmet!.runId as string, { type: 'deployment.promoted' }),
141
+ 'deployment.promoted (eval-gate-unmet)',
142
+ );
143
+ expect(
144
+ evalUnmetPromoted.length === 0,
145
+ driver.describe('agent-deployment.md §E-3', 'an unmet eval gate MUST emit NO deployment.promoted'),
146
+ ).toBe(true);
131
147
 
132
148
  // ---- Leg 4: channel-resolution pin (§B) ------------------------------
133
149
  const pin = await driveDeploymentTransition({ scenario: 'channel-pin', channel: 'stable' });
134
- if (pin && pin.runId) {
135
- const iq = await queryTestEvents(pin.runId, { type: 'agent.invocation.started' });
136
- if (iq.ok && iq.events.length > 0) {
137
- const started = iq.events.sort((a, b) => a.sequence - b.sequence)[0]!;
138
- expect(
139
- typeof started.payload.resolvedAgentVersion === 'string' && (started.payload.resolvedAgentVersion as string).length > 0,
140
- driver.describe('agent-deployment.md §B', 'a @channel-bound run MUST record resolvedAgentVersion on agent.invocation.started (the recorded fact a replay re-reads)'),
141
- ).toBe(true);
142
- }
143
- }
150
+ expect(
151
+ pin !== null && typeof pin.runId === 'string' && (pin.runId as string).length > 0,
152
+ driver.describe('agent-deployment.md §B', 'the channel-pin scenario MUST return a runId'),
153
+ ).toBe(true);
154
+ const invEvents = requireEvents(
155
+ await queryTestEvents(pin!.runId as string, { type: 'agent.invocation.started' }),
156
+ 'agent.invocation.started (channel-pin)',
157
+ );
158
+ expect(
159
+ invEvents.length >= 1,
160
+ driver.describe('agent-deployment.md §B', 'a @channel-bound run MUST emit agent.invocation.started'),
161
+ ).toBe(true);
162
+ const startedInv = invEvents.sort((a, b) => a.sequence - b.sequence)[0]!;
163
+ expect(
164
+ typeof startedInv.payload.resolvedAgentVersion === 'string' && (startedInv.payload.resolvedAgentVersion as string).length > 0,
165
+ driver.describe('agent-deployment.md §B', 'a @channel-bound run MUST record resolvedAgentVersion on agent.invocation.started (the recorded fact a replay re-reads)'),
166
+ ).toBe(true);
144
167
 
145
168
  await resetTestSeam();
146
169
  });
@@ -38,7 +38,7 @@ import {
38
38
  getEvalSummary,
39
39
  EVAL_CONTENT_FORBIDDEN,
40
40
  } from '../lib/agentEval.js';
41
- import { queryTestEvents, isEventLogSeamAvailable, resetTestSeam } from '../lib/event-log-query.js';
41
+ import { queryTestEvents, requireEvents, isEventLogSeamAvailable, resetTestSeam } from '../lib/event-log-query.js';
42
42
 
43
43
  function loadSchema(name: string): Record<string, unknown> {
44
44
  return JSON.parse(readFileSync(join(SCHEMAS_DIR, name), 'utf8')) as Record<string, unknown>;
@@ -61,83 +61,110 @@ describe('agent-eval-run (RFC 0081 §B/§C)', () => {
61
61
 
62
62
  const run = await driveEvalRun({ modes: ['golden'] });
63
63
  if (run === null) return; // eval-run seam unwired — soft-skip the whole behavioral suite
64
- if (!run.runId) return;
65
64
 
66
- // ---- Legs 1+2: eval.* ordering + content-free (§C) -------------------
67
- const startedQ = await queryTestEvents(run.runId, { type: 'eval.started' });
68
- const scoredQ = await queryTestEvents(run.runId, { type: 'eval.scored' });
69
- const completedQ = await queryTestEvents(run.runId, { type: 'eval.completed' });
65
+ // From here the host has ADVERTISED agents.evalSuite AND wired the eval-run
66
+ // seam — missing evidence is a FAILURE, not a soft-skip. A host claiming the
67
+ // capability MUST produce the runId, the full eval.* sequence, and the
68
+ // normative EvalSummary, or it is advertising a capability it doesn't deliver.
69
+ expect(
70
+ typeof run.runId === 'string' && run.runId.length > 0,
71
+ driver.describe('agent-evaluation.md §B', 'a wired eval-run seam MUST return the projected runId'),
72
+ ).toBe(true);
73
+ const runId = run.runId as string;
70
74
 
71
- if (startedQ.ok && scoredQ.ok && startedQ.events.length > 0) {
72
- const started = startedQ.events.sort((a, b) => a.sequence - b.sequence)[0]!;
75
+ // ---- Legs 1+2: eval.* ordering + content-free (§C) -------------------
76
+ const startedQ = await queryTestEvents(runId, { type: 'eval.started' });
77
+ const scoredQ = await queryTestEvents(runId, { type: 'eval.scored' });
78
+ const completedQ = await queryTestEvents(runId, { type: 'eval.completed' });
73
79
 
74
- // eval.started precedes every eval.scored (§C ordering).
75
- for (const s of scoredQ.events) {
76
- expect(
77
- started.sequence < s.sequence,
78
- driver.describe('agent-evaluation.md §C', 'eval.started MUST precede every eval.scored'),
79
- ).toBe(true);
80
- }
80
+ // The event-log seam MUST return the eval.* events for a wired eval run
81
+ // (requireEvents hard-fails if a leg's query is not ok — no vacuous pass).
82
+ const startedEvents = requireEvents(startedQ, 'eval.started');
83
+ const scoredEvents = requireEvents(scoredQ, 'eval.scored');
84
+ const completedEvents = requireEvents(completedQ, 'eval.completed');
81
85
 
82
- if (completedQ.ok && completedQ.events.length > 0) {
83
- const completed = completedQ.events.sort((a, b) => a.sequence - b.sequence)[completedQ.events.length - 1]!;
84
- for (const s of scoredQ.events) {
85
- expect(
86
- s.sequence < completed.sequence,
87
- driver.describe('agent-evaluation.md §C', 'every eval.scored MUST precede eval.completed'),
88
- ).toBe(true);
89
- }
90
- // eval.scored is emitted once per task (count == eval.completed.taskCount).
91
- if (typeof completed.payload.taskCount === 'number') {
92
- expect(
93
- scoredQ.events.length === completed.payload.taskCount,
94
- driver.describe('agent-evaluation.md §C', 'one eval.scored per task (count == eval.completed.taskCount)'),
95
- ).toBe(true);
96
- }
97
- expectContentFree(completed.payload, 'eval.completed');
98
- }
86
+ // eval.started exactly once (FIRST); eval.completed exactly once (LAST);
87
+ // ≥1 eval.scored — a wired eval run MUST emit the full sequence.
88
+ expect(
89
+ startedEvents.length === 1,
90
+ driver.describe('agent-evaluation.md §C', 'an eval run MUST emit exactly one eval.started'),
91
+ ).toBe(true);
92
+ expect(
93
+ scoredEvents.length >= 1,
94
+ driver.describe('agent-evaluation.md §C', 'an eval run MUST emit at least one eval.scored'),
95
+ ).toBe(true);
96
+ expect(
97
+ completedEvents.length === 1,
98
+ driver.describe('agent-evaluation.md §C', 'an eval run MUST emit exactly one eval.completed'),
99
+ ).toBe(true);
100
+ const started = startedEvents[0]!;
101
+ const completed = completedEvents[0]!;
99
102
 
100
- // each eval.scored content-free + score 0..1, passed boolean.
101
- for (const s of scoredQ.events) {
102
- expectContentFree(s.payload, 'eval.scored');
103
- expect(
104
- typeof s.payload.score === 'number' && (s.payload.score as number) >= 0 && (s.payload.score as number) <= 1,
105
- driver.describe('run-event-payloads.schema.json#/$defs/evalScored', 'eval.scored.score MUST be in 0..1'),
106
- ).toBe(true);
107
- expect(
108
- typeof s.payload.passed === 'boolean',
109
- driver.describe('run-event-payloads.schema.json#/$defs/evalScored', 'eval.scored.passed MUST be a boolean'),
110
- ).toBe(true);
111
- }
112
- expectContentFree(started.payload, 'eval.started');
103
+ // Ordering: eval.started precedes every eval.scored precedes eval.completed.
104
+ for (const s of scoredEvents) {
105
+ expect(
106
+ started.sequence < s.sequence,
107
+ driver.describe('agent-evaluation.md §C', 'eval.started MUST precede every eval.scored'),
108
+ ).toBe(true);
109
+ expect(
110
+ s.sequence < completed.sequence,
111
+ driver.describe('agent-evaluation.md §C', 'every eval.scored MUST precede eval.completed'),
112
+ ).toBe(true);
113
113
  }
114
114
 
115
- // ---- Leg 3: NORMATIVE EvalSummary read (§C) --------------------------
116
- const { status, summary } = await getEvalSummary(run.runId);
117
- if (status === 200 && summary) {
118
- const ajv = new Ajv2020({ strict: false, allErrors: true });
119
- addFormats(ajv);
120
- const validate = ajv.compile(loadSchema('eval-summary.schema.json'));
115
+ // One eval.scored per task (count == eval.completed.taskCount).
116
+ expect(
117
+ typeof completed.payload.taskCount === 'number',
118
+ driver.describe('run-event-payloads.schema.json#/$defs/evalCompleted', 'eval.completed MUST carry a numeric taskCount'),
119
+ ).toBe(true);
120
+ expect(
121
+ scoredEvents.length === completed.payload.taskCount,
122
+ driver.describe('agent-evaluation.md §C', 'one eval.scored per task (count == eval.completed.taskCount)'),
123
+ ).toBe(true);
124
+
125
+ // Content-free (§C / eval-summary-no-content-leak) + score ∈ 0..1, passed boolean.
126
+ expectContentFree(started.payload, 'eval.started');
127
+ expectContentFree(completed.payload, 'eval.completed');
128
+ for (const s of scoredEvents) {
129
+ expectContentFree(s.payload, 'eval.scored');
130
+ expect(
131
+ typeof s.payload.score === 'number' && (s.payload.score as number) >= 0 && (s.payload.score as number) <= 1,
132
+ driver.describe('run-event-payloads.schema.json#/$defs/evalScored', 'eval.scored.score MUST be in 0..1'),
133
+ ).toBe(true);
121
134
  expect(
122
- validate(summary),
123
- driver.describe(
124
- 'eval-summary.schema.json',
125
- `GET /v1/runs/{runId}/eval-summary MUST return a schema-valid EvalSummary (${ajv.errorsText(validate.errors)})`,
126
- ),
135
+ typeof s.payload.passed === 'boolean',
136
+ driver.describe('run-event-payloads.schema.json#/$defs/evalScored', 'eval.scored.passed MUST be a boolean'),
127
137
  ).toBe(true);
138
+ }
128
139
 
129
- const tasks = (summary.tasks as Array<Record<string, unknown>> | undefined) ?? [];
130
- const passedCount = summary.passedCount as number | undefined;
131
- const taskCount = summary.taskCount as number | undefined;
132
- if (typeof passedCount === 'number' && typeof taskCount === 'number') {
133
- expect(
134
- passedCount <= taskCount,
135
- driver.describe('agent-evaluation.md §C', 'EvalSummary.passedCount MUST NOT exceed taskCount'),
136
- ).toBe(true);
137
- }
138
- for (const t of tasks) {
139
- expectContentFree(t, 'EvalSummary.tasks[]');
140
- }
140
+ // ---- Leg 3: NORMATIVE EvalSummary read (§C) — MUST serve a 200 -----
141
+ const { status, summary } = await getEvalSummary(runId);
142
+ expect(
143
+ status === 200 && summary !== undefined,
144
+ driver.describe('agent-evaluation.md §C', `GET /v1/runs/{runId}/eval-summary MUST serve a 200 EvalSummary for a completed eval run (got ${status})`),
145
+ ).toBe(true);
146
+ const sum = summary as Record<string, unknown>;
147
+ const ajv = new Ajv2020({ strict: false, allErrors: true });
148
+ addFormats(ajv);
149
+ const validate = ajv.compile(loadSchema('eval-summary.schema.json'));
150
+ expect(
151
+ validate(sum),
152
+ driver.describe('eval-summary.schema.json', `EvalSummary MUST be schema-valid (${ajv.errorsText(validate.errors)})`),
153
+ ).toBe(true);
154
+
155
+ const tasks = (sum.tasks as Array<Record<string, unknown>> | undefined) ?? [];
156
+ const passedCount = sum.passedCount as number | undefined;
157
+ const taskCount = sum.taskCount as number | undefined;
158
+ expect(
159
+ typeof passedCount === 'number' && typeof taskCount === 'number',
160
+ driver.describe('eval-summary.schema.json', 'EvalSummary MUST carry numeric passedCount + taskCount'),
161
+ ).toBe(true);
162
+ expect(
163
+ (passedCount as number) <= (taskCount as number),
164
+ driver.describe('agent-evaluation.md §C', 'EvalSummary.passedCount MUST NOT exceed taskCount'),
165
+ ).toBe(true);
166
+ for (const t of tasks) {
167
+ expectContentFree(t, 'EvalSummary.tasks[]');
141
168
  }
142
169
 
143
170
  await resetTestSeam();
@@ -0,0 +1,68 @@
1
+ /**
2
+ * openwop-agent-platform — LIVE aggregate-evidence (RFC 0085 §C) — behavioral.
3
+ *
4
+ * The `Active → Accepted` bar for the meta-profile. Capability-gated on a host
5
+ * CLAIMING the operational annex — i.e. its live discovery `profiles[]` includes
6
+ * `openwop-agent-platform`. Soft-skips when unclaimed (default) / hard-fails
7
+ * under `OPENWOP_REQUIRE_BEHAVIOR=true`.
8
+ *
9
+ * The always-on derivation legs in `agent-platform-profile.test.ts` prove the
10
+ * §B predicate logic against synthetic payloads; THIS asserts the §C/§D
11
+ * honest-advertisement rule against the LIVE discovery doc: a host MAY advertise
12
+ * `openwop-agent-platform` only if its real wire satisfies the §B floor
13
+ * predicate — the platform claim is **backed by** the per-capability evidence
14
+ * (each constituent cap's gated scenario — agent-manifest-runtime,
15
+ * agent-live-*, tool-catalog/hooks, safe-fetch, provider-usage, prompts, memory,
16
+ * feedback, replay, + the governance scenarios — runs in this same suite run and
17
+ * must pass), never asserted on the profile string alone.
18
+ *
19
+ * When the operator declares the cert tier `full`
20
+ * (`OPENWOP_AGENT_PLATFORM_TIER=full`), the full predicate (all governance terms
21
+ * + tenant installScope) MUST hold non-vacuously.
22
+ *
23
+ * Spec references:
24
+ * - https://github.com/openwop/openwop/blob/main/spec/v1/agent-platform-profile.md (§C/§D)
25
+ * - https://github.com/openwop/openwop/blob/main/RFCS/0085-agent-platform-meta-profile.md
26
+ */
27
+
28
+ import { describe, it, expect } from 'vitest';
29
+ import { driver } from '../lib/driver.js';
30
+ import { behaviorGate } from '../lib/behavior-gate.js';
31
+ import { isAgentPlatformPartial, isAgentPlatformFull, agentPlatformStatus, agentPlatformSatisfiedTerms } from '../lib/profiles.js';
32
+
33
+ describe('agent-platform-aggregate-evidence (RFC 0085 §C)', () => {
34
+ it('a host claiming openwop-agent-platform satisfies the §B floor on live discovery; full when the operator certifies full', async () => {
35
+ const res = await driver.get('/.well-known/openwop', { authenticated: false });
36
+ const disco = (res.status === 200 ? res.json : null) as Record<string, unknown> | null;
37
+ const profiles = Array.isArray(disco?.profiles) ? (disco!.profiles as unknown[]) : [];
38
+ const claims = disco !== null && profiles.includes('openwop-agent-platform');
39
+ if (!behaviorGate('openwop-agent-platform', claims)) return;
40
+
41
+ // §C / §D honest-advertisement: the profile claim MUST be backed by the §B
42
+ // floor predicate holding on the live discovery payload — never asserted on
43
+ // the profile string alone.
44
+ expect(
45
+ isAgentPlatformPartial(disco!),
46
+ driver.describe('agent-platform-profile.md §C', 'claiming openwop-agent-platform MUST satisfy the §B floor predicate on live discovery (claim backed by per-capability evidence)'),
47
+ ).toBe(true);
48
+
49
+ const status = agentPlatformStatus(disco!);
50
+ expect(
51
+ status === 'partial' || status === 'full',
52
+ driver.describe('agent-platform-profile.md §D', 'a claimed openwop-agent-platform host MUST derive to partial or full, never none'),
53
+ ).toBe(true);
54
+
55
+ // Non-vacuous FULL bar: when the operator declares the cert tier `full`,
56
+ // every governance term + tenant installScope MUST hold, and all 16 §D terms MUST be satisfied.
57
+ if (process.env.OPENWOP_AGENT_PLATFORM_TIER === 'full') {
58
+ expect(
59
+ isAgentPlatformFull(disco!),
60
+ driver.describe('agent-platform-profile.md §B/§D', 'a host certifying `full` MUST satisfy every governance term: authorization + tenant installScope + memory.attribution + debugBundle + triggerBridge + httpClient.egressPolicy'),
61
+ ).toBe(true);
62
+ expect(
63
+ agentPlatformSatisfiedTerms(disco!).length,
64
+ driver.describe('agent-platform-profile.md §D', 'a host certifying `full` satisfies all 16 §D terms'),
65
+ ).toBe(16);
66
+ }
67
+ });
68
+ });
@@ -13,10 +13,11 @@
13
13
  * missing any reports `partial`, never `full` (the honest-advertisement rule).
14
14
  * - `capabilities.nondeterminismPolicy.declared` is declared in the schema.
15
15
  *
16
- * The LIVE aggregate-evidence assertion (does every required constituent scenario
17
- * actually pass against a host claiming `full`?) is the `Active → Accepted` step
18
- * per RFC 0085 §C — naturally gated on a reference host reaching partial/full, and
19
- * deferred here. This scenario asserts the discovery-predicate derivation only.
16
+ * The LIVE aggregate-evidence assertion (the §C honest-advertisement rule on a
17
+ * host claiming `openwop-agent-platform`) is the `Active → Accepted` step per RFC
18
+ * 0085 §C — capability-gated, server-requiring, and lives in the sibling
19
+ * `agent-platform-aggregate-evidence.test.ts`. THIS scenario asserts the
20
+ * discovery-predicate derivation only (always-on, server-free).
20
21
  *
21
22
  * Spec references:
22
23
  * - https://github.com/openwop/openwop/blob/main/spec/v1/agent-platform-profile.md
@@ -0,0 +1,152 @@
1
+ /**
2
+ * Budget enforcement — the §C lifecycle + §D hard-stop (RFC 0084) — behavioral.
3
+ *
4
+ * Gated on `capabilities.budget.supported` (root-first per RFC 0073). Soft-skips
5
+ * when unadvertised (default) / hard-fails under `OPENWOP_REQUIRE_BEHAVIOR=true`.
6
+ * The always-on wire-shape coverage lives in `budget-policy-shape.test.ts`; this
7
+ * asserts host BEHAVIOR via the `POST /v1/host/sample/budget/run` seam + the test
8
+ * event-log seam:
9
+ *
10
+ * 1. HARD COST EXHAUST (§C/§D, requires `enforce:"hard"`) — a hard-cost run
11
+ * accrues to exhaustion, emitting in strict sequence:
12
+ * `budget.reserved` → `budget.consumed` → `budget.threshold.crossed{percent}`
13
+ * → `budget.exhausted` → `cap.breached{kind:"budget-cost"}` →
14
+ * `run.failed{error:"budget_exhausted"}`.
15
+ * 2. MODEL DENIED (§D model policy) — a run whose model violates the budget
16
+ * allow/deny list is refused with `budget_model_denied` BEFORE the provider
17
+ * call (no model call, fail-closed).
18
+ * 3. ADVISORY (§D, `enforce:"advisory"`) — the same accrual emits the
19
+ * `budget.*` events but does NOT stop the run (no `cap.breached`, no
20
+ * `run.failed{budget_exhausted}`).
21
+ * 4. CONTENT-FREE (SR-1 / `budget-no-pricing-leak`) — every `budget.*` payload
22
+ * carries only dimension/limit/consumed/remaining/percent scalars, never a
23
+ * provider pricing table or per-token rate.
24
+ *
25
+ * Spec references:
26
+ * - https://github.com/openwop/openwop/blob/main/spec/v1/budget-policy.md (§C/§D)
27
+ * - https://github.com/openwop/openwop/blob/main/RFCS/0084-budget-quota-and-cost-policy.md
28
+ * - https://github.com/openwop/openwop/blob/main/SECURITY/invariants.yaml (budget-no-pricing-leak)
29
+ */
30
+
31
+ import { describe, it, expect } from 'vitest';
32
+ import { driver } from '../lib/driver.js';
33
+ import { behaviorGate } from '../lib/behavior-gate.js';
34
+ import { readBudgetCap, driveBudgetRun, BUDGET_CAP_KINDS, BUDGET_CONTENT_FORBIDDEN } from '../lib/budgetPolicy.js';
35
+ import { queryTestEvents, isEventLogSeamAvailable, resetTestSeam } from '../lib/event-log-query.js';
36
+ import type { TestEvent } from '../lib/event-log-query.js';
37
+
38
+ function seq(events: TestEvent[], type: string): number {
39
+ const e = events.find((x) => x.type === type);
40
+ return e ? e.sequence : -1;
41
+ }
42
+
43
+ function expectContentFree(events: TestEvent[]): void {
44
+ for (const e of events.filter((x) => x.type.startsWith('budget.'))) {
45
+ for (const f of BUDGET_CONTENT_FORBIDDEN) {
46
+ expect(
47
+ !(f in e.payload),
48
+ driver.describe('RFC 0084 §F (SR-1) / budget-no-pricing-leak', `budget.* MUST be content-free (no ${f})`),
49
+ ).toBe(true);
50
+ }
51
+ }
52
+ }
53
+
54
+ describe('budget-enforcement (RFC 0084 §C/§D)', () => {
55
+ it('runs the reserved→consumed→threshold→exhausted→cap.breached→run.failed chain, refuses denied models, and honors advisory mode', async () => {
56
+ const cap = await readBudgetCap();
57
+ if (!behaviorGate('openwop-budget-enforcement', cap?.supported === true)) return;
58
+ if (!(await isEventLogSeamAvailable())) return; // event-log seam absent — soft-skip
59
+
60
+ // ---- Leg 1: hard cost exhaust (§C/§D) -------------------------------
61
+ const hard = await driveBudgetRun({ scenario: 'hard-cost-exhaust' });
62
+ if (hard === null) return; // budget seam absent — soft-skip the whole behavior
63
+ if (hard.runId) {
64
+ const q = await queryTestEvents(hard.runId);
65
+ if (q.ok) {
66
+ const ev = q.events.slice().sort((a, b) => a.sequence - b.sequence);
67
+ const reserved = seq(ev, 'budget.reserved');
68
+ const threshold = seq(ev, 'budget.threshold.crossed');
69
+ const exhausted = seq(ev, 'budget.exhausted');
70
+ const failed = seq(ev, 'run.failed');
71
+ const capBreached = ev.find((e) => e.type === 'cap.breached' && typeof e.payload.kind === 'string' && (e.payload.kind as string).startsWith('budget-'));
72
+
73
+ expect(
74
+ reserved >= 0 && exhausted >= 0,
75
+ driver.describe('budget-policy.md §C', 'a hard budget run MUST emit budget.reserved + budget.exhausted'),
76
+ ).toBe(true);
77
+ // §C ordering: reserved < threshold.crossed < exhausted < run.failed.
78
+ if (threshold >= 0) {
79
+ expect(
80
+ reserved < threshold && threshold < exhausted,
81
+ driver.describe('RFC 0084 §C', 'ordering MUST be reserved < threshold.crossed < exhausted'),
82
+ ).toBe(true);
83
+ const tc = ev.find((e) => e.type === 'budget.threshold.crossed');
84
+ expect(
85
+ typeof tc?.payload.percent === 'number',
86
+ driver.describe('run-event-payloads.schema.json#budgetThresholdCrossed', 'threshold.crossed MUST carry a numeric percent'),
87
+ ).toBe(true);
88
+ }
89
+ // §D hard-stop: exhausted → cap.breached{budget-*} → run.failed{budget_exhausted}.
90
+ expect(
91
+ capBreached !== undefined,
92
+ driver.describe('RFC 0084 §D', 'exhaustion MUST emit cap.breached with a budget-* kind'),
93
+ ).toBe(true);
94
+ if (capBreached) {
95
+ expect(
96
+ BUDGET_CAP_KINDS.includes(capBreached.payload.kind as string),
97
+ driver.describe('RFC 0084 §D', 'cap.breached.kind MUST be in the closed budget vocabulary'),
98
+ ).toBe(true);
99
+ expect(
100
+ exhausted <= capBreached.sequence && capBreached.sequence <= failed,
101
+ driver.describe('RFC 0084 §D', 'ordering MUST be exhausted ≤ cap.breached ≤ run.failed'),
102
+ ).toBe(true);
103
+ }
104
+ const failedEvt = ev.find((e) => e.type === 'run.failed');
105
+ expect(
106
+ failedEvt?.payload.error === 'budget_exhausted',
107
+ driver.describe('RFC 0084 §D', 'a hard-budget overrun MUST fail the run with error budget_exhausted'),
108
+ ).toBe(true);
109
+ expectContentFree(ev);
110
+ }
111
+ }
112
+
113
+ // ---- Leg 2: model denied (§D model policy, fail-closed) -------------
114
+ const denied = await driveBudgetRun({ scenario: 'model-denied' });
115
+ if (denied !== null) {
116
+ expect(
117
+ denied.error === 'budget_model_denied',
118
+ driver.describe('RFC 0084 §D', 'a model violating the budget allow/deny list MUST be refused with budget_model_denied'),
119
+ ).toBe(true);
120
+ expect(
121
+ denied.modelCalled !== true,
122
+ driver.describe('RFC 0084 §D', 'a denied model MUST be refused BEFORE the provider call (fail-closed)'),
123
+ ).toBe(true);
124
+ }
125
+
126
+ // ---- Leg 3: advisory mode emits events but never stops --------------
127
+ if (cap?.enforce === 'advisory' || cap?.enforce === undefined) {
128
+ const adv = await driveBudgetRun({ scenario: 'advisory' });
129
+ if (adv !== null && adv.runId) {
130
+ const q = await queryTestEvents(adv.runId);
131
+ if (q.ok) {
132
+ const ev = q.events;
133
+ const hasBudgetEvents = ev.some((e) => e.type.startsWith('budget.'));
134
+ const stopped = ev.some(
135
+ (e) =>
136
+ (e.type === 'cap.breached' && typeof e.payload.kind === 'string' && (e.payload.kind as string).startsWith('budget-')) ||
137
+ (e.type === 'run.failed' && e.payload.error === 'budget_exhausted'),
138
+ );
139
+ if (hasBudgetEvents) {
140
+ expect(
141
+ !stopped,
142
+ driver.describe('RFC 0084 §D', 'advisory enforcement MUST emit budget.* events without stopping the run'),
143
+ ).toBe(true);
144
+ }
145
+ expectContentFree(ev);
146
+ }
147
+ }
148
+ }
149
+
150
+ await resetTestSeam();
151
+ });
152
+ });