@openwop/openwop-conformance 1.15.0 → 1.18.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +34 -0
- package/README.md +2 -2
- package/coverage.md +4 -2
- package/package.json +1 -1
- package/schemas/run-event-payloads.schema.json +2 -2
- package/src/lib/budgetPolicy.ts +63 -0
- package/src/lib/event-log-query.ts +18 -0
- package/src/lib/otel-collector.ts +34 -4
- package/src/scenarios/agent-deployment-lifecycle.test.ts +82 -59
- package/src/scenarios/agent-eval-run.test.ts +95 -68
- package/src/scenarios/agent-platform-aggregate-evidence.test.ts +68 -0
- package/src/scenarios/agent-platform-profile.test.ts +5 -4
- package/src/scenarios/budget-enforcement.test.ts +152 -0
- package/src/scenarios/otel-collector-canary-inspection.test.ts +50 -0
- package/src/scenarios/replay-observable-sequence-determinism.test.ts +35 -10
- package/src/scenarios/trigger-bridge-delivery.test.ts +92 -56
|
@@ -42,7 +42,7 @@ import {
|
|
|
42
42
|
DEPLOYMENT_STATES,
|
|
43
43
|
DEPLOYMENT_CONTENT_FORBIDDEN,
|
|
44
44
|
} from '../lib/agentDeployment.js';
|
|
45
|
-
import { queryTestEvents, isEventLogSeamAvailable, resetTestSeam } from '../lib/event-log-query.js';
|
|
45
|
+
import { queryTestEvents, requireEvents, isEventLogSeamAvailable, resetTestSeam } from '../lib/event-log-query.js';
|
|
46
46
|
|
|
47
47
|
function loadSchema(name: string): Record<string, unknown> {
|
|
48
48
|
return JSON.parse(readFileSync(join(SCHEMAS_DIR, name), 'utf8')) as Record<string, unknown>;
|
|
@@ -71,76 +71,99 @@ describe('agent-deployment-lifecycle (RFC 0082 §B/§E)', () => {
|
|
|
71
71
|
const promote = await driveDeploymentTransition({ scenario: 'promote' });
|
|
72
72
|
if (promote === null) return; // deployment seam unwired — soft-skip the whole behavioral suite
|
|
73
73
|
|
|
74
|
-
|
|
74
|
+
// The host has ADVERTISED agents.deployment AND wired the seam — missing
|
|
75
|
+
// evidence is a FAILURE, not a soft-skip. A successful promote MUST return
|
|
76
|
+
// a runId + a schema-valid record + emit ≥1 content-free deployment.promoted.
|
|
77
|
+
expect(
|
|
78
|
+
typeof promote.runId === 'string' && (promote.runId as string).length > 0,
|
|
79
|
+
driver.describe('agent-deployment.md §E', 'a wired promote MUST return the runId'),
|
|
80
|
+
).toBe(true);
|
|
81
|
+
expect(
|
|
82
|
+
promote.record !== undefined && promote.record !== null,
|
|
83
|
+
driver.describe('agent-deployment.md §E', 'a successful promote MUST return the deployment record'),
|
|
84
|
+
).toBe(true);
|
|
85
|
+
expect(
|
|
86
|
+
validateRecord(promote.record),
|
|
87
|
+
driver.describe('agent-deployment.schema.json', `a promoted deployment record MUST validate (${ajv.errorsText(validateRecord.errors)})`),
|
|
88
|
+
).toBe(true);
|
|
89
|
+
|
|
90
|
+
const promotedEvents = requireEvents(
|
|
91
|
+
await queryTestEvents(promote.runId as string, { type: 'deployment.promoted' }),
|
|
92
|
+
'deployment.promoted',
|
|
93
|
+
);
|
|
94
|
+
expect(
|
|
95
|
+
promotedEvents.length >= 1,
|
|
96
|
+
driver.describe('agent-deployment.md §E', 'a successful promote MUST emit at least one deployment.promoted'),
|
|
97
|
+
).toBe(true);
|
|
98
|
+
for (const e of promotedEvents) {
|
|
99
|
+
expectContentFree(e.payload, 'deployment.promoted');
|
|
75
100
|
expect(
|
|
76
|
-
|
|
77
|
-
driver.describe(
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
),
|
|
101
|
+
typeof e.payload.toState === 'string' && DEPLOYMENT_STATES.includes(e.payload.toState as string),
|
|
102
|
+
driver.describe('run-event-payloads.schema.json#/$defs/deploymentPromoted', 'toState MUST be in the seven-state vocabulary'),
|
|
103
|
+
).toBe(true);
|
|
104
|
+
expect(
|
|
105
|
+
typeof e.payload.toVersion === 'string' && (e.payload.toVersion as string).length > 0,
|
|
106
|
+
driver.describe('agent-deployment.md §D', 'deployment.promoted MUST carry the promoted toVersion'),
|
|
81
107
|
).toBe(true);
|
|
82
|
-
}
|
|
83
|
-
if (promote.runId) {
|
|
84
|
-
const pq = await queryTestEvents(promote.runId, { type: 'deployment.promoted' });
|
|
85
|
-
if (pq.ok) {
|
|
86
|
-
for (const e of pq.events) {
|
|
87
|
-
expectContentFree(e.payload, 'deployment.promoted');
|
|
88
|
-
expect(
|
|
89
|
-
typeof e.payload.toState === 'string' && DEPLOYMENT_STATES.includes(e.payload.toState as string),
|
|
90
|
-
driver.describe('run-event-payloads.schema.json#/$defs/deploymentPromoted', 'toState MUST be in the seven-state vocabulary'),
|
|
91
|
-
).toBe(true);
|
|
92
|
-
expect(
|
|
93
|
-
typeof e.payload.toVersion === 'string' && (e.payload.toVersion as string).length > 0,
|
|
94
|
-
driver.describe('agent-deployment.md §D', 'deployment.promoted MUST carry the promoted toVersion'),
|
|
95
|
-
).toBe(true);
|
|
96
|
-
}
|
|
97
|
-
}
|
|
98
108
|
}
|
|
99
109
|
|
|
100
110
|
// ---- Leg 2: fail-closed authz (§E-1; deployment-promotion-fail-closed) -
|
|
101
111
|
const unauth = await driveDeploymentTransition({ scenario: 'unauthorized' });
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
112
|
+
expect(
|
|
113
|
+
unauth !== null && typeof unauth.runId === 'string' && (unauth.runId as string).length > 0,
|
|
114
|
+
driver.describe('agent-deployment.md §E-1', 'the unauthorized scenario MUST return a runId to evidence the fail-closed denial'),
|
|
115
|
+
).toBe(true);
|
|
116
|
+
expect(
|
|
117
|
+
unauth!.allowed !== true,
|
|
118
|
+
driver.describe('agent-deployment.md §E-1', 'a principal without deploy:promote MUST be denied (fail-closed)'),
|
|
119
|
+
).toBe(true);
|
|
120
|
+
const unauthPromoted = requireEvents(
|
|
121
|
+
await queryTestEvents(unauth!.runId as string, { type: 'deployment.promoted' }),
|
|
122
|
+
'deployment.promoted (unauthorized)',
|
|
123
|
+
);
|
|
124
|
+
expect(
|
|
125
|
+
unauthPromoted.length === 0,
|
|
126
|
+
driver.describe('SECURITY invariant deployment-promotion-fail-closed', 'a denied transition MUST emit NO deployment.promoted'),
|
|
127
|
+
).toBe(true);
|
|
115
128
|
|
|
116
129
|
// ---- Leg 3: eval-gate-unmet denial (§E-3) ----------------------------
|
|
117
130
|
const evalUnmet = await driveDeploymentTransition({ scenario: 'eval-gate-unmet' });
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
+
expect(
|
|
132
|
+
evalUnmet !== null && typeof evalUnmet.runId === 'string' && (evalUnmet.runId as string).length > 0,
|
|
133
|
+
driver.describe('agent-deployment.md §E-3', 'the eval-gate-unmet scenario MUST return a runId to evidence the denial'),
|
|
134
|
+
).toBe(true);
|
|
135
|
+
expect(
|
|
136
|
+
evalUnmet!.error === 'eval_gate_unmet' || evalUnmet!.allowed !== true,
|
|
137
|
+
driver.describe('agent-deployment.md §E-3', 'a promote whose eval evidence has passed:false MUST be denied (eval_gate_unmet)'),
|
|
138
|
+
).toBe(true);
|
|
139
|
+
const evalUnmetPromoted = requireEvents(
|
|
140
|
+
await queryTestEvents(evalUnmet!.runId as string, { type: 'deployment.promoted' }),
|
|
141
|
+
'deployment.promoted (eval-gate-unmet)',
|
|
142
|
+
);
|
|
143
|
+
expect(
|
|
144
|
+
evalUnmetPromoted.length === 0,
|
|
145
|
+
driver.describe('agent-deployment.md §E-3', 'an unmet eval gate MUST emit NO deployment.promoted'),
|
|
146
|
+
).toBe(true);
|
|
131
147
|
|
|
132
148
|
// ---- Leg 4: channel-resolution pin (§B) ------------------------------
|
|
133
149
|
const pin = await driveDeploymentTransition({ scenario: 'channel-pin', channel: 'stable' });
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
150
|
+
expect(
|
|
151
|
+
pin !== null && typeof pin.runId === 'string' && (pin.runId as string).length > 0,
|
|
152
|
+
driver.describe('agent-deployment.md §B', 'the channel-pin scenario MUST return a runId'),
|
|
153
|
+
).toBe(true);
|
|
154
|
+
const invEvents = requireEvents(
|
|
155
|
+
await queryTestEvents(pin!.runId as string, { type: 'agent.invocation.started' }),
|
|
156
|
+
'agent.invocation.started (channel-pin)',
|
|
157
|
+
);
|
|
158
|
+
expect(
|
|
159
|
+
invEvents.length >= 1,
|
|
160
|
+
driver.describe('agent-deployment.md §B', 'a @channel-bound run MUST emit agent.invocation.started'),
|
|
161
|
+
).toBe(true);
|
|
162
|
+
const startedInv = invEvents.sort((a, b) => a.sequence - b.sequence)[0]!;
|
|
163
|
+
expect(
|
|
164
|
+
typeof startedInv.payload.resolvedAgentVersion === 'string' && (startedInv.payload.resolvedAgentVersion as string).length > 0,
|
|
165
|
+
driver.describe('agent-deployment.md §B', 'a @channel-bound run MUST record resolvedAgentVersion on agent.invocation.started (the recorded fact a replay re-reads)'),
|
|
166
|
+
).toBe(true);
|
|
144
167
|
|
|
145
168
|
await resetTestSeam();
|
|
146
169
|
});
|
|
@@ -38,7 +38,7 @@ import {
|
|
|
38
38
|
getEvalSummary,
|
|
39
39
|
EVAL_CONTENT_FORBIDDEN,
|
|
40
40
|
} from '../lib/agentEval.js';
|
|
41
|
-
import { queryTestEvents, isEventLogSeamAvailable, resetTestSeam } from '../lib/event-log-query.js';
|
|
41
|
+
import { queryTestEvents, requireEvents, isEventLogSeamAvailable, resetTestSeam } from '../lib/event-log-query.js';
|
|
42
42
|
|
|
43
43
|
function loadSchema(name: string): Record<string, unknown> {
|
|
44
44
|
return JSON.parse(readFileSync(join(SCHEMAS_DIR, name), 'utf8')) as Record<string, unknown>;
|
|
@@ -61,83 +61,110 @@ describe('agent-eval-run (RFC 0081 §B/§C)', () => {
|
|
|
61
61
|
|
|
62
62
|
const run = await driveEvalRun({ modes: ['golden'] });
|
|
63
63
|
if (run === null) return; // eval-run seam unwired — soft-skip the whole behavioral suite
|
|
64
|
-
if (!run.runId) return;
|
|
65
64
|
|
|
66
|
-
//
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
65
|
+
// From here the host has ADVERTISED agents.evalSuite AND wired the eval-run
|
|
66
|
+
// seam — missing evidence is a FAILURE, not a soft-skip. A host claiming the
|
|
67
|
+
// capability MUST produce the runId, the full eval.* sequence, and the
|
|
68
|
+
// normative EvalSummary, or it is advertising a capability it doesn't deliver.
|
|
69
|
+
expect(
|
|
70
|
+
typeof run.runId === 'string' && run.runId.length > 0,
|
|
71
|
+
driver.describe('agent-evaluation.md §B', 'a wired eval-run seam MUST return the projected runId'),
|
|
72
|
+
).toBe(true);
|
|
73
|
+
const runId = run.runId as string;
|
|
70
74
|
|
|
71
|
-
|
|
72
|
-
|
|
75
|
+
// ---- Legs 1+2: eval.* ordering + content-free (§C) -------------------
|
|
76
|
+
const startedQ = await queryTestEvents(runId, { type: 'eval.started' });
|
|
77
|
+
const scoredQ = await queryTestEvents(runId, { type: 'eval.scored' });
|
|
78
|
+
const completedQ = await queryTestEvents(runId, { type: 'eval.completed' });
|
|
73
79
|
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
).toBe(true);
|
|
80
|
-
}
|
|
80
|
+
// The event-log seam MUST return the eval.* events for a wired eval run
|
|
81
|
+
// (requireEvents hard-fails if a leg's query is not ok — no vacuous pass).
|
|
82
|
+
const startedEvents = requireEvents(startedQ, 'eval.started');
|
|
83
|
+
const scoredEvents = requireEvents(scoredQ, 'eval.scored');
|
|
84
|
+
const completedEvents = requireEvents(completedQ, 'eval.completed');
|
|
81
85
|
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
}
|
|
86
|
+
// eval.started exactly once (FIRST); eval.completed exactly once (LAST);
|
|
87
|
+
// ≥1 eval.scored — a wired eval run MUST emit the full sequence.
|
|
88
|
+
expect(
|
|
89
|
+
startedEvents.length === 1,
|
|
90
|
+
driver.describe('agent-evaluation.md §C', 'an eval run MUST emit exactly one eval.started'),
|
|
91
|
+
).toBe(true);
|
|
92
|
+
expect(
|
|
93
|
+
scoredEvents.length >= 1,
|
|
94
|
+
driver.describe('agent-evaluation.md §C', 'an eval run MUST emit at least one eval.scored'),
|
|
95
|
+
).toBe(true);
|
|
96
|
+
expect(
|
|
97
|
+
completedEvents.length === 1,
|
|
98
|
+
driver.describe('agent-evaluation.md §C', 'an eval run MUST emit exactly one eval.completed'),
|
|
99
|
+
).toBe(true);
|
|
100
|
+
const started = startedEvents[0]!;
|
|
101
|
+
const completed = completedEvents[0]!;
|
|
99
102
|
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
).toBe(true);
|
|
111
|
-
}
|
|
112
|
-
expectContentFree(started.payload, 'eval.started');
|
|
103
|
+
// Ordering: eval.started precedes every eval.scored precedes eval.completed.
|
|
104
|
+
for (const s of scoredEvents) {
|
|
105
|
+
expect(
|
|
106
|
+
started.sequence < s.sequence,
|
|
107
|
+
driver.describe('agent-evaluation.md §C', 'eval.started MUST precede every eval.scored'),
|
|
108
|
+
).toBe(true);
|
|
109
|
+
expect(
|
|
110
|
+
s.sequence < completed.sequence,
|
|
111
|
+
driver.describe('agent-evaluation.md §C', 'every eval.scored MUST precede eval.completed'),
|
|
112
|
+
).toBe(true);
|
|
113
113
|
}
|
|
114
114
|
|
|
115
|
-
//
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
115
|
+
// One eval.scored per task (count == eval.completed.taskCount).
|
|
116
|
+
expect(
|
|
117
|
+
typeof completed.payload.taskCount === 'number',
|
|
118
|
+
driver.describe('run-event-payloads.schema.json#/$defs/evalCompleted', 'eval.completed MUST carry a numeric taskCount'),
|
|
119
|
+
).toBe(true);
|
|
120
|
+
expect(
|
|
121
|
+
scoredEvents.length === completed.payload.taskCount,
|
|
122
|
+
driver.describe('agent-evaluation.md §C', 'one eval.scored per task (count == eval.completed.taskCount)'),
|
|
123
|
+
).toBe(true);
|
|
124
|
+
|
|
125
|
+
// Content-free (§C / eval-summary-no-content-leak) + score ∈ 0..1, passed boolean.
|
|
126
|
+
expectContentFree(started.payload, 'eval.started');
|
|
127
|
+
expectContentFree(completed.payload, 'eval.completed');
|
|
128
|
+
for (const s of scoredEvents) {
|
|
129
|
+
expectContentFree(s.payload, 'eval.scored');
|
|
130
|
+
expect(
|
|
131
|
+
typeof s.payload.score === 'number' && (s.payload.score as number) >= 0 && (s.payload.score as number) <= 1,
|
|
132
|
+
driver.describe('run-event-payloads.schema.json#/$defs/evalScored', 'eval.scored.score MUST be in 0..1'),
|
|
133
|
+
).toBe(true);
|
|
121
134
|
expect(
|
|
122
|
-
|
|
123
|
-
driver.describe(
|
|
124
|
-
'eval-summary.schema.json',
|
|
125
|
-
`GET /v1/runs/{runId}/eval-summary MUST return a schema-valid EvalSummary (${ajv.errorsText(validate.errors)})`,
|
|
126
|
-
),
|
|
135
|
+
typeof s.payload.passed === 'boolean',
|
|
136
|
+
driver.describe('run-event-payloads.schema.json#/$defs/evalScored', 'eval.scored.passed MUST be a boolean'),
|
|
127
137
|
).toBe(true);
|
|
138
|
+
}
|
|
128
139
|
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
140
|
+
// ---- Leg 3: NORMATIVE EvalSummary read (§C) — MUST serve a 200 -------
|
|
141
|
+
const { status, summary } = await getEvalSummary(runId);
|
|
142
|
+
expect(
|
|
143
|
+
status === 200 && summary !== undefined,
|
|
144
|
+
driver.describe('agent-evaluation.md §C', `GET /v1/runs/{runId}/eval-summary MUST serve a 200 EvalSummary for a completed eval run (got ${status})`),
|
|
145
|
+
).toBe(true);
|
|
146
|
+
const sum = summary as Record<string, unknown>;
|
|
147
|
+
const ajv = new Ajv2020({ strict: false, allErrors: true });
|
|
148
|
+
addFormats(ajv);
|
|
149
|
+
const validate = ajv.compile(loadSchema('eval-summary.schema.json'));
|
|
150
|
+
expect(
|
|
151
|
+
validate(sum),
|
|
152
|
+
driver.describe('eval-summary.schema.json', `EvalSummary MUST be schema-valid (${ajv.errorsText(validate.errors)})`),
|
|
153
|
+
).toBe(true);
|
|
154
|
+
|
|
155
|
+
const tasks = (sum.tasks as Array<Record<string, unknown>> | undefined) ?? [];
|
|
156
|
+
const passedCount = sum.passedCount as number | undefined;
|
|
157
|
+
const taskCount = sum.taskCount as number | undefined;
|
|
158
|
+
expect(
|
|
159
|
+
typeof passedCount === 'number' && typeof taskCount === 'number',
|
|
160
|
+
driver.describe('eval-summary.schema.json', 'EvalSummary MUST carry numeric passedCount + taskCount'),
|
|
161
|
+
).toBe(true);
|
|
162
|
+
expect(
|
|
163
|
+
(passedCount as number) <= (taskCount as number),
|
|
164
|
+
driver.describe('agent-evaluation.md §C', 'EvalSummary.passedCount MUST NOT exceed taskCount'),
|
|
165
|
+
).toBe(true);
|
|
166
|
+
for (const t of tasks) {
|
|
167
|
+
expectContentFree(t, 'EvalSummary.tasks[]');
|
|
141
168
|
}
|
|
142
169
|
|
|
143
170
|
await resetTestSeam();
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* openwop-agent-platform — LIVE aggregate-evidence (RFC 0085 §C) — behavioral.
|
|
3
|
+
*
|
|
4
|
+
* The `Active → Accepted` bar for the meta-profile. Capability-gated on a host
|
|
5
|
+
* CLAIMING the operational annex — i.e. its live discovery `profiles[]` includes
|
|
6
|
+
* `openwop-agent-platform`. Soft-skips when unclaimed (default) / hard-fails
|
|
7
|
+
* under `OPENWOP_REQUIRE_BEHAVIOR=true`.
|
|
8
|
+
*
|
|
9
|
+
* The always-on derivation legs in `agent-platform-profile.test.ts` prove the
|
|
10
|
+
* §B predicate logic against synthetic payloads; THIS asserts the §C/§D
|
|
11
|
+
* honest-advertisement rule against the LIVE discovery doc: a host MAY advertise
|
|
12
|
+
* `openwop-agent-platform` only if its real wire satisfies the §B floor
|
|
13
|
+
* predicate — the platform claim is **backed by** the per-capability evidence
|
|
14
|
+
* (each constituent cap's gated scenario — agent-manifest-runtime,
|
|
15
|
+
* agent-live-*, tool-catalog/hooks, safe-fetch, provider-usage, prompts, memory,
|
|
16
|
+
* feedback, replay, + the governance scenarios — runs in this same suite run and
|
|
17
|
+
* must pass), never asserted on the profile string alone.
|
|
18
|
+
*
|
|
19
|
+
* When the operator declares the cert tier `full`
|
|
20
|
+
* (`OPENWOP_AGENT_PLATFORM_TIER=full`), the full predicate (all governance terms
|
|
21
|
+
* + tenant installScope) MUST hold non-vacuously.
|
|
22
|
+
*
|
|
23
|
+
* Spec references:
|
|
24
|
+
* - https://github.com/openwop/openwop/blob/main/spec/v1/agent-platform-profile.md (§C/§D)
|
|
25
|
+
* - https://github.com/openwop/openwop/blob/main/RFCS/0085-agent-platform-meta-profile.md
|
|
26
|
+
*/
|
|
27
|
+
|
|
28
|
+
import { describe, it, expect } from 'vitest';
|
|
29
|
+
import { driver } from '../lib/driver.js';
|
|
30
|
+
import { behaviorGate } from '../lib/behavior-gate.js';
|
|
31
|
+
import { isAgentPlatformPartial, isAgentPlatformFull, agentPlatformStatus, agentPlatformSatisfiedTerms } from '../lib/profiles.js';
|
|
32
|
+
|
|
33
|
+
describe('agent-platform-aggregate-evidence (RFC 0085 §C)', () => {
|
|
34
|
+
it('a host claiming openwop-agent-platform satisfies the §B floor on live discovery; full when the operator certifies full', async () => {
|
|
35
|
+
const res = await driver.get('/.well-known/openwop', { authenticated: false });
|
|
36
|
+
const disco = (res.status === 200 ? res.json : null) as Record<string, unknown> | null;
|
|
37
|
+
const profiles = Array.isArray(disco?.profiles) ? (disco!.profiles as unknown[]) : [];
|
|
38
|
+
const claims = disco !== null && profiles.includes('openwop-agent-platform');
|
|
39
|
+
if (!behaviorGate('openwop-agent-platform', claims)) return;
|
|
40
|
+
|
|
41
|
+
// §C / §D honest-advertisement: the profile claim MUST be backed by the §B
|
|
42
|
+
// floor predicate holding on the live discovery payload — never asserted on
|
|
43
|
+
// the profile string alone.
|
|
44
|
+
expect(
|
|
45
|
+
isAgentPlatformPartial(disco!),
|
|
46
|
+
driver.describe('agent-platform-profile.md §C', 'claiming openwop-agent-platform MUST satisfy the §B floor predicate on live discovery (claim backed by per-capability evidence)'),
|
|
47
|
+
).toBe(true);
|
|
48
|
+
|
|
49
|
+
const status = agentPlatformStatus(disco!);
|
|
50
|
+
expect(
|
|
51
|
+
status === 'partial' || status === 'full',
|
|
52
|
+
driver.describe('agent-platform-profile.md §D', 'a claimed openwop-agent-platform host MUST derive to partial or full, never none'),
|
|
53
|
+
).toBe(true);
|
|
54
|
+
|
|
55
|
+
// Non-vacuous FULL bar: when the operator declares the cert tier `full`,
|
|
56
|
+
// every governance term + tenant installScope MUST hold + all 16 §D terms.
|
|
57
|
+
if (process.env.OPENWOP_AGENT_PLATFORM_TIER === 'full') {
|
|
58
|
+
expect(
|
|
59
|
+
isAgentPlatformFull(disco!),
|
|
60
|
+
driver.describe('agent-platform-profile.md §B/§D', 'a host certifying `full` MUST satisfy every governance term: authorization + tenant installScope + memory.attribution + debugBundle + triggerBridge + httpClient.egressPolicy'),
|
|
61
|
+
).toBe(true);
|
|
62
|
+
expect(
|
|
63
|
+
agentPlatformSatisfiedTerms(disco!).length,
|
|
64
|
+
driver.describe('agent-platform-profile.md §D', 'a host certifying `full` satisfies all 16 §D terms'),
|
|
65
|
+
).toBe(16);
|
|
66
|
+
}
|
|
67
|
+
});
|
|
68
|
+
});
|
|
@@ -13,10 +13,11 @@
|
|
|
13
13
|
* missing any reports `partial`, never `full` (the honest-advertisement rule).
|
|
14
14
|
* - `capabilities.nondeterminismPolicy.declared` is declared in the schema.
|
|
15
15
|
*
|
|
16
|
-
* The LIVE aggregate-evidence assertion (
|
|
17
|
-
*
|
|
18
|
-
*
|
|
19
|
-
*
|
|
16
|
+
* The LIVE aggregate-evidence assertion (the §C honest-advertisement rule on a
|
|
17
|
+
* host claiming `openwop-agent-platform`) is the `Active → Accepted` step per RFC
|
|
18
|
+
* 0085 §C — capability-gated, server-requiring, and lives in the sibling
|
|
19
|
+
* `agent-platform-aggregate-evidence.test.ts`. THIS scenario asserts the
|
|
20
|
+
* discovery-predicate derivation only (always-on, server-free).
|
|
20
21
|
*
|
|
21
22
|
* Spec references:
|
|
22
23
|
* - https://github.com/openwop/openwop/blob/main/spec/v1/agent-platform-profile.md
|
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Budget enforcement — the §C lifecycle + §D hard-stop (RFC 0084) — behavioral.
|
|
3
|
+
*
|
|
4
|
+
* Gated on `capabilities.budget.supported` (root-first per RFC 0073). Soft-skips
|
|
5
|
+
* when unadvertised (default) / hard-fails under `OPENWOP_REQUIRE_BEHAVIOR=true`.
|
|
6
|
+
* The always-on wire-shape coverage lives in `budget-policy-shape.test.ts`; this
|
|
7
|
+
* asserts host BEHAVIOR via the `POST /v1/host/sample/budget/run` seam + the test
|
|
8
|
+
* event-log seam:
|
|
9
|
+
*
|
|
10
|
+
* 1. HARD COST EXHAUST (§C/§D, requires `enforce:"hard"`) — a hard-cost run
|
|
11
|
+
* accrues to exhaustion, emitting in strict sequence:
|
|
12
|
+
* `budget.reserved` → `budget.consumed` → `budget.threshold.crossed{percent}`
|
|
13
|
+
* → `budget.exhausted` → `cap.breached{kind:"budget-cost"}` →
|
|
14
|
+
* `run.failed{error:"budget_exhausted"}`.
|
|
15
|
+
* 2. MODEL DENIED (§D model policy) — a run whose model violates the budget
|
|
16
|
+
* allow/deny list is refused with `budget_model_denied` BEFORE the provider
|
|
17
|
+
* call (no model call, fail-closed).
|
|
18
|
+
* 3. ADVISORY (§D, `enforce:"advisory"`) — the same accrual emits the
|
|
19
|
+
* `budget.*` events but does NOT stop the run (no `cap.breached`, no
|
|
20
|
+
* `run.failed{budget_exhausted}`).
|
|
21
|
+
* 4. CONTENT-FREE (SR-1 / `budget-no-pricing-leak`) — every `budget.*` payload
|
|
22
|
+
* carries only dimension/limit/consumed/remaining/percent scalars, never a
|
|
23
|
+
* provider pricing table or per-token rate.
|
|
24
|
+
*
|
|
25
|
+
* Spec references:
|
|
26
|
+
* - https://github.com/openwop/openwop/blob/main/spec/v1/budget-policy.md (§C/§D)
|
|
27
|
+
* - https://github.com/openwop/openwop/blob/main/RFCS/0084-budget-quota-and-cost-policy.md
|
|
28
|
+
* - https://github.com/openwop/openwop/blob/main/SECURITY/invariants.yaml (budget-no-pricing-leak)
|
|
29
|
+
*/
|
|
30
|
+
|
|
31
|
+
import { describe, it, expect } from 'vitest';
|
|
32
|
+
import { driver } from '../lib/driver.js';
|
|
33
|
+
import { behaviorGate } from '../lib/behavior-gate.js';
|
|
34
|
+
import { readBudgetCap, driveBudgetRun, BUDGET_CAP_KINDS, BUDGET_CONTENT_FORBIDDEN } from '../lib/budgetPolicy.js';
|
|
35
|
+
import { queryTestEvents, isEventLogSeamAvailable, resetTestSeam } from '../lib/event-log-query.js';
|
|
36
|
+
import type { TestEvent } from '../lib/event-log-query.js';
|
|
37
|
+
|
|
38
|
+
function seq(events: TestEvent[], type: string): number {
|
|
39
|
+
const e = events.find((x) => x.type === type);
|
|
40
|
+
return e ? e.sequence : -1;
|
|
41
|
+
}
|
|
42
|
+
|
|
43
|
+
function expectContentFree(events: TestEvent[]): void {
|
|
44
|
+
for (const e of events.filter((x) => x.type.startsWith('budget.'))) {
|
|
45
|
+
for (const f of BUDGET_CONTENT_FORBIDDEN) {
|
|
46
|
+
expect(
|
|
47
|
+
!(f in e.payload),
|
|
48
|
+
driver.describe('RFC 0084 §F (SR-1) / budget-no-pricing-leak', `budget.* MUST be content-free (no ${f})`),
|
|
49
|
+
).toBe(true);
|
|
50
|
+
}
|
|
51
|
+
}
|
|
52
|
+
}
|
|
53
|
+
|
|
54
|
+
describe('budget-enforcement (RFC 0084 §C/§D)', () => {
|
|
55
|
+
it('runs the reserved→consumed→threshold→exhausted→cap.breached→run.failed chain, refuses denied models, and honors advisory mode', async () => {
|
|
56
|
+
const cap = await readBudgetCap();
|
|
57
|
+
if (!behaviorGate('openwop-budget-enforcement', cap?.supported === true)) return;
|
|
58
|
+
if (!(await isEventLogSeamAvailable())) return; // event-log seam absent — soft-skip
|
|
59
|
+
|
|
60
|
+
// ---- Leg 1: hard cost exhaust (§C/§D) -------------------------------
|
|
61
|
+
const hard = await driveBudgetRun({ scenario: 'hard-cost-exhaust' });
|
|
62
|
+
if (hard === null) return; // budget seam absent — soft-skip the whole behavior
|
|
63
|
+
if (hard.runId) {
|
|
64
|
+
const q = await queryTestEvents(hard.runId);
|
|
65
|
+
if (q.ok) {
|
|
66
|
+
const ev = q.events.slice().sort((a, b) => a.sequence - b.sequence);
|
|
67
|
+
const reserved = seq(ev, 'budget.reserved');
|
|
68
|
+
const threshold = seq(ev, 'budget.threshold.crossed');
|
|
69
|
+
const exhausted = seq(ev, 'budget.exhausted');
|
|
70
|
+
const failed = seq(ev, 'run.failed');
|
|
71
|
+
const capBreached = ev.find((e) => e.type === 'cap.breached' && typeof e.payload.kind === 'string' && (e.payload.kind as string).startsWith('budget-'));
|
|
72
|
+
|
|
73
|
+
expect(
|
|
74
|
+
reserved >= 0 && exhausted >= 0,
|
|
75
|
+
driver.describe('budget-policy.md §C', 'a hard budget run MUST emit budget.reserved + budget.exhausted'),
|
|
76
|
+
).toBe(true);
|
|
77
|
+
// §C ordering: reserved < threshold.crossed < exhausted < run.failed.
|
|
78
|
+
if (threshold >= 0) {
|
|
79
|
+
expect(
|
|
80
|
+
reserved < threshold && threshold < exhausted,
|
|
81
|
+
driver.describe('RFC 0084 §C', 'ordering MUST be reserved < threshold.crossed < exhausted'),
|
|
82
|
+
).toBe(true);
|
|
83
|
+
const tc = ev.find((e) => e.type === 'budget.threshold.crossed');
|
|
84
|
+
expect(
|
|
85
|
+
typeof tc?.payload.percent === 'number',
|
|
86
|
+
driver.describe('run-event-payloads.schema.json#budgetThresholdCrossed', 'threshold.crossed MUST carry a numeric percent'),
|
|
87
|
+
).toBe(true);
|
|
88
|
+
}
|
|
89
|
+
// §D hard-stop: exhausted → cap.breached{budget-*} → run.failed{budget_exhausted}.
|
|
90
|
+
expect(
|
|
91
|
+
capBreached !== undefined,
|
|
92
|
+
driver.describe('RFC 0084 §D', 'exhaustion MUST emit cap.breached with a budget-* kind'),
|
|
93
|
+
).toBe(true);
|
|
94
|
+
if (capBreached) {
|
|
95
|
+
expect(
|
|
96
|
+
BUDGET_CAP_KINDS.includes(capBreached.payload.kind as string),
|
|
97
|
+
driver.describe('RFC 0084 §D', 'cap.breached.kind MUST be in the closed budget vocabulary'),
|
|
98
|
+
).toBe(true);
|
|
99
|
+
expect(
|
|
100
|
+
exhausted <= capBreached.sequence && capBreached.sequence <= failed,
|
|
101
|
+
driver.describe('RFC 0084 §D', 'ordering MUST be exhausted ≤ cap.breached ≤ run.failed'),
|
|
102
|
+
).toBe(true);
|
|
103
|
+
}
|
|
104
|
+
const failedEvt = ev.find((e) => e.type === 'run.failed');
|
|
105
|
+
expect(
|
|
106
|
+
failedEvt?.payload.error === 'budget_exhausted',
|
|
107
|
+
driver.describe('RFC 0084 §D', 'a hard-budget overrun MUST fail the run with error budget_exhausted'),
|
|
108
|
+
).toBe(true);
|
|
109
|
+
expectContentFree(ev);
|
|
110
|
+
}
|
|
111
|
+
}
|
|
112
|
+
|
|
113
|
+
// ---- Leg 2: model denied (§D model policy, fail-closed) -------------
|
|
114
|
+
const denied = await driveBudgetRun({ scenario: 'model-denied' });
|
|
115
|
+
if (denied !== null) {
|
|
116
|
+
expect(
|
|
117
|
+
denied.error === 'budget_model_denied',
|
|
118
|
+
driver.describe('RFC 0084 §D', 'a model violating the budget allow/deny list MUST be refused with budget_model_denied'),
|
|
119
|
+
).toBe(true);
|
|
120
|
+
expect(
|
|
121
|
+
denied.modelCalled !== true,
|
|
122
|
+
driver.describe('RFC 0084 §D', 'a denied model MUST be refused BEFORE the provider call (fail-closed)'),
|
|
123
|
+
).toBe(true);
|
|
124
|
+
}
|
|
125
|
+
|
|
126
|
+
// ---- Leg 3: advisory mode emits events but never stops --------------
|
|
127
|
+
if (cap?.enforce === 'advisory' || cap?.enforce === undefined) {
|
|
128
|
+
const adv = await driveBudgetRun({ scenario: 'advisory' });
|
|
129
|
+
if (adv !== null && adv.runId) {
|
|
130
|
+
const q = await queryTestEvents(adv.runId);
|
|
131
|
+
if (q.ok) {
|
|
132
|
+
const ev = q.events;
|
|
133
|
+
const hasBudgetEvents = ev.some((e) => e.type.startsWith('budget.'));
|
|
134
|
+
const stopped = ev.some(
|
|
135
|
+
(e) =>
|
|
136
|
+
(e.type === 'cap.breached' && typeof e.payload.kind === 'string' && (e.payload.kind as string).startsWith('budget-')) ||
|
|
137
|
+
(e.type === 'run.failed' && e.payload.error === 'budget_exhausted'),
|
|
138
|
+
);
|
|
139
|
+
if (hasBudgetEvents) {
|
|
140
|
+
expect(
|
|
141
|
+
!stopped,
|
|
142
|
+
driver.describe('RFC 0084 §D', 'advisory enforcement MUST emit budget.* events without stopping the run'),
|
|
143
|
+
).toBe(true);
|
|
144
|
+
}
|
|
145
|
+
expectContentFree(ev);
|
|
146
|
+
}
|
|
147
|
+
}
|
|
148
|
+
}
|
|
149
|
+
|
|
150
|
+
await resetTestSeam();
|
|
151
|
+
});
|
|
152
|
+
});
|