npm - @openwop/openwop-conformance - Versions diffs - 1.15.0 → 1.18.0 - Mend

@openwop/openwop-conformance 1.15.0 → 1.18.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16)

package/CHANGELOG.md +34 -0
package/README.md +2 -2
package/coverage.md +4 -2
package/package.json +1 -1
package/schemas/run-event-payloads.schema.json +2 -2
package/src/lib/budgetPolicy.ts +63 -0
package/src/lib/event-log-query.ts +18 -0
package/src/lib/otel-collector.ts +34 -4
package/src/scenarios/agent-deployment-lifecycle.test.ts +82 -59
package/src/scenarios/agent-eval-run.test.ts +95 -68
package/src/scenarios/agent-platform-aggregate-evidence.test.ts +68 -0
package/src/scenarios/agent-platform-profile.test.ts +5 -4
package/src/scenarios/budget-enforcement.test.ts +152 -0
package/src/scenarios/otel-collector-canary-inspection.test.ts +50 -0
package/src/scenarios/replay-observable-sequence-determinism.test.ts +35 -10
package/src/scenarios/trigger-bridge-delivery.test.ts +92 -56

package/src/scenarios/agent-deployment-lifecycle.test.ts CHANGED Viewed

@@ -42,7 +42,7 @@ import {
   DEPLOYMENT_STATES,
   DEPLOYMENT_CONTENT_FORBIDDEN,
 } from '../lib/agentDeployment.js';
-import { queryTestEvents, isEventLogSeamAvailable, resetTestSeam } from '../lib/event-log-query.js';
+import { queryTestEvents, requireEvents, isEventLogSeamAvailable, resetTestSeam } from '../lib/event-log-query.js';
 function loadSchema(name: string): Record<string, unknown> {
   return JSON.parse(readFileSync(join(SCHEMAS_DIR, name), 'utf8')) as Record<string, unknown>;
@@ -71,76 +71,99 @@ describe('agent-deployment-lifecycle (RFC 0082 §B/§E)', () => {
     const promote = await driveDeploymentTransition({ scenario: 'promote' });
     if (promote === null) return; // deployment seam unwired — soft-skip the whole behavioral suite
-    if (promote.record) {
+    // The host has ADVERTISED agents.deployment AND wired the seam — missing
+    // evidence is a FAILURE, not a soft-skip. A successful promote MUST return
+    // a runId + a schema-valid record + emit ≥1 content-free deployment.promoted.
+    expect(
+      typeof promote.runId === 'string' && (promote.runId as string).length > 0,
+      driver.describe('agent-deployment.md §E', 'a wired promote MUST return the runId'),
+    ).toBe(true);
+    expect(
+      promote.record !== undefined && promote.record !== null,
+      driver.describe('agent-deployment.md §E', 'a successful promote MUST return the deployment record'),
+    ).toBe(true);
+    expect(
+      validateRecord(promote.record),
+      driver.describe('agent-deployment.schema.json', `a promoted deployment record MUST validate (${ajv.errorsText(validateRecord.errors)})`),
+    ).toBe(true);
+    const promotedEvents = requireEvents(
+      await queryTestEvents(promote.runId as string, { type: 'deployment.promoted' }),
+      'deployment.promoted',
+    );
+    expect(
+      promotedEvents.length >= 1,
+      driver.describe('agent-deployment.md §E', 'a successful promote MUST emit at least one deployment.promoted'),
+    ).toBe(true);
+    for (const e of promotedEvents) {
+      expectContentFree(e.payload, 'deployment.promoted');
       expect(
-        validateRecord(promote.record),
-        driver.describe(
-          'agent-deployment.schema.json',
-          `a promoted deployment record MUST validate (${ajv.errorsText(validateRecord.errors)})`,
-        ),
+        typeof e.payload.toState === 'string' && DEPLOYMENT_STATES.includes(e.payload.toState as string),
+        driver.describe('run-event-payloads.schema.json#/$defs/deploymentPromoted', 'toState MUST be in the seven-state vocabulary'),
+      ).toBe(true);
+      expect(
+        typeof e.payload.toVersion === 'string' && (e.payload.toVersion as string).length > 0,
+        driver.describe('agent-deployment.md §D', 'deployment.promoted MUST carry the promoted toVersion'),
       ).toBe(true);
-    }
-    if (promote.runId) {
-      const pq = await queryTestEvents(promote.runId, { type: 'deployment.promoted' });
-      if (pq.ok) {
-        for (const e of pq.events) {
-          expectContentFree(e.payload, 'deployment.promoted');
-          expect(
-            typeof e.payload.toState === 'string' && DEPLOYMENT_STATES.includes(e.payload.toState as string),
-            driver.describe('run-event-payloads.schema.json#/$defs/deploymentPromoted', 'toState MUST be in the seven-state vocabulary'),
-          ).toBe(true);
-          expect(
-            typeof e.payload.toVersion === 'string' && (e.payload.toVersion as string).length > 0,
-            driver.describe('agent-deployment.md §D', 'deployment.promoted MUST carry the promoted toVersion'),
-          ).toBe(true);
-        }
-      }
     }
     // ---- Leg 2: fail-closed authz (§E-1; deployment-promotion-fail-closed) -
     const unauth = await driveDeploymentTransition({ scenario: 'unauthorized' });
-    if (unauth && unauth.runId) {
-      expect(
-        unauth.allowed !== true,
-        driver.describe('agent-deployment.md §E-1', 'a principal without deploy:promote MUST be denied (fail-closed)'),
-      ).toBe(true);
-      const uq = await queryTestEvents(unauth.runId, { type: 'deployment.promoted' });
-      if (uq.ok) {
-        expect(
-          uq.events.length === 0,
-          driver.describe('SECURITY invariant deployment-promotion-fail-closed', 'a denied transition MUST emit NO deployment.promoted'),
-        ).toBe(true);
-      }
-    }
+    expect(
+      unauth !== null && typeof unauth.runId === 'string' && (unauth.runId as string).length > 0,
+      driver.describe('agent-deployment.md §E-1', 'the unauthorized scenario MUST return a runId to evidence the fail-closed denial'),
+    ).toBe(true);
+    expect(
+      unauth!.allowed !== true,
+      driver.describe('agent-deployment.md §E-1', 'a principal without deploy:promote MUST be denied (fail-closed)'),
+    ).toBe(true);
+    const unauthPromoted = requireEvents(
+      await queryTestEvents(unauth!.runId as string, { type: 'deployment.promoted' }),
+      'deployment.promoted (unauthorized)',
+    );
+    expect(
+      unauthPromoted.length === 0,
+      driver.describe('SECURITY invariant deployment-promotion-fail-closed', 'a denied transition MUST emit NO deployment.promoted'),
+    ).toBe(true);
     // ---- Leg 3: eval-gate-unmet denial (§E-3) ----------------------------
     const evalUnmet = await driveDeploymentTransition({ scenario: 'eval-gate-unmet' });
-    if (evalUnmet && evalUnmet.runId) {
-      expect(
-        evalUnmet.error === 'eval_gate_unmet' || evalUnmet.allowed !== true,
-        driver.describe('agent-deployment.md §E-3', 'a promote whose eval evidence has passed:false MUST be denied (eval_gate_unmet)'),
-      ).toBe(true);
-      const eq = await queryTestEvents(evalUnmet.runId, { type: 'deployment.promoted' });
-      if (eq.ok) {
-        expect(
-          eq.events.length === 0,
-          driver.describe('agent-deployment.md §E-3', 'an unmet eval gate MUST emit NO deployment.promoted'),
-        ).toBe(true);
-      }
-    }
+    expect(
+      evalUnmet !== null && typeof evalUnmet.runId === 'string' && (evalUnmet.runId as string).length > 0,
+      driver.describe('agent-deployment.md §E-3', 'the eval-gate-unmet scenario MUST return a runId to evidence the denial'),
+    ).toBe(true);
+    expect(
+      evalUnmet!.error === 'eval_gate_unmet' || evalUnmet!.allowed !== true,
+      driver.describe('agent-deployment.md §E-3', 'a promote whose eval evidence has passed:false MUST be denied (eval_gate_unmet)'),
+    ).toBe(true);
+    const evalUnmetPromoted = requireEvents(
+      await queryTestEvents(evalUnmet!.runId as string, { type: 'deployment.promoted' }),
+      'deployment.promoted (eval-gate-unmet)',
+    );
+    expect(
+      evalUnmetPromoted.length === 0,
+      driver.describe('agent-deployment.md §E-3', 'an unmet eval gate MUST emit NO deployment.promoted'),
+    ).toBe(true);
     // ---- Leg 4: channel-resolution pin (§B) ------------------------------
     const pin = await driveDeploymentTransition({ scenario: 'channel-pin', channel: 'stable' });
-    if (pin && pin.runId) {
-      const iq = await queryTestEvents(pin.runId, { type: 'agent.invocation.started' });
-      if (iq.ok && iq.events.length > 0) {
-        const started = iq.events.sort((a, b) => a.sequence - b.sequence)[0]!;
-        expect(
-          typeof started.payload.resolvedAgentVersion === 'string' && (started.payload.resolvedAgentVersion as string).length > 0,
-          driver.describe('agent-deployment.md §B', 'a @channel-bound run MUST record resolvedAgentVersion on agent.invocation.started (the recorded fact a replay re-reads)'),
-        ).toBe(true);
-      }
-    }
+    expect(
+      pin !== null && typeof pin.runId === 'string' && (pin.runId as string).length > 0,
+      driver.describe('agent-deployment.md §B', 'the channel-pin scenario MUST return a runId'),
+    ).toBe(true);
+    const invEvents = requireEvents(
+      await queryTestEvents(pin!.runId as string, { type: 'agent.invocation.started' }),
+      'agent.invocation.started (channel-pin)',
+    );
+    expect(
+      invEvents.length >= 1,
+      driver.describe('agent-deployment.md §B', 'a @channel-bound run MUST emit agent.invocation.started'),
+    ).toBe(true);
+    const startedInv = invEvents.sort((a, b) => a.sequence - b.sequence)[0]!;
+    expect(
+      typeof startedInv.payload.resolvedAgentVersion === 'string' && (startedInv.payload.resolvedAgentVersion as string).length > 0,
+      driver.describe('agent-deployment.md §B', 'a @channel-bound run MUST record resolvedAgentVersion on agent.invocation.started (the recorded fact a replay re-reads)'),
+    ).toBe(true);
     await resetTestSeam();
   });

package/src/scenarios/agent-eval-run.test.ts CHANGED Viewed

@@ -38,7 +38,7 @@ import {
   getEvalSummary,
   EVAL_CONTENT_FORBIDDEN,
 } from '../lib/agentEval.js';
-import { queryTestEvents, isEventLogSeamAvailable, resetTestSeam } from '../lib/event-log-query.js';
+import { queryTestEvents, requireEvents, isEventLogSeamAvailable, resetTestSeam } from '../lib/event-log-query.js';
 function loadSchema(name: string): Record<string, unknown> {
   return JSON.parse(readFileSync(join(SCHEMAS_DIR, name), 'utf8')) as Record<string, unknown>;
@@ -61,83 +61,110 @@ describe('agent-eval-run (RFC 0081 §B/§C)', () => {
     const run = await driveEvalRun({ modes: ['golden'] });
     if (run === null) return; // eval-run seam unwired — soft-skip the whole behavioral suite
-    if (!run.runId) return;
-    // ---- Legs 1+2: eval.* ordering + content-free (§C) -------------------
-    const startedQ = await queryTestEvents(run.runId, { type: 'eval.started' });
-    const scoredQ = await queryTestEvents(run.runId, { type: 'eval.scored' });
-    const completedQ = await queryTestEvents(run.runId, { type: 'eval.completed' });
+    // From here the host has ADVERTISED agents.evalSuite AND wired the eval-run
+    // seam — missing evidence is a FAILURE, not a soft-skip. A host claiming the
+    // capability MUST produce the runId, the full eval.* sequence, and the
+    // normative EvalSummary, or it is advertising a capability it doesn't deliver.
+    expect(
+      typeof run.runId === 'string' && run.runId.length > 0,
+      driver.describe('agent-evaluation.md §B', 'a wired eval-run seam MUST return the projected runId'),
+    ).toBe(true);
+    const runId = run.runId as string;
-    if (startedQ.ok && scoredQ.ok && startedQ.events.length > 0) {
-      const started = startedQ.events.sort((a, b) => a.sequence - b.sequence)[0]!;
+    // ---- Legs 1+2: eval.* ordering + content-free (§C) -------------------
+    const startedQ = await queryTestEvents(runId, { type: 'eval.started' });
+    const scoredQ = await queryTestEvents(runId, { type: 'eval.scored' });
+    const completedQ = await queryTestEvents(runId, { type: 'eval.completed' });
-      // eval.started precedes every eval.scored (§C ordering).
-      for (const s of scoredQ.events) {
-        expect(
-          started.sequence < s.sequence,
-          driver.describe('agent-evaluation.md §C', 'eval.started MUST precede every eval.scored'),
-        ).toBe(true);
-      }
+    // The event-log seam MUST return the eval.* events for a wired eval run
+    // (requireEvents hard-fails if a leg's query is not ok — no vacuous pass).
+    const startedEvents = requireEvents(startedQ, 'eval.started');
+    const scoredEvents = requireEvents(scoredQ, 'eval.scored');
+    const completedEvents = requireEvents(completedQ, 'eval.completed');
-      if (completedQ.ok && completedQ.events.length > 0) {
-        const completed = completedQ.events.sort((a, b) => a.sequence - b.sequence)[completedQ.events.length - 1]!;
-        for (const s of scoredQ.events) {
-          expect(
-            s.sequence < completed.sequence,
-            driver.describe('agent-evaluation.md §C', 'every eval.scored MUST precede eval.completed'),
-          ).toBe(true);
-        }
-        // eval.scored is emitted once per task (count == eval.completed.taskCount).
-        if (typeof completed.payload.taskCount === 'number') {
-          expect(
-            scoredQ.events.length === completed.payload.taskCount,
-            driver.describe('agent-evaluation.md §C', 'one eval.scored per task (count == eval.completed.taskCount)'),
-          ).toBe(true);
-        }
-        expectContentFree(completed.payload, 'eval.completed');
-      }
+    // eval.started exactly once (FIRST); eval.completed exactly once (LAST);
+    // ≥1 eval.scored — a wired eval run MUST emit the full sequence.
+    expect(
+      startedEvents.length === 1,
+      driver.describe('agent-evaluation.md §C', 'an eval run MUST emit exactly one eval.started'),
+    ).toBe(true);
+    expect(
+      scoredEvents.length >= 1,
+      driver.describe('agent-evaluation.md §C', 'an eval run MUST emit at least one eval.scored'),
+    ).toBe(true);
+    expect(
+      completedEvents.length === 1,
+      driver.describe('agent-evaluation.md §C', 'an eval run MUST emit exactly one eval.completed'),
+    ).toBe(true);
+    const started = startedEvents[0]!;
+    const completed = completedEvents[0]!;
-      // each eval.scored content-free + score ∈ 0..1, passed boolean.
-      for (const s of scoredQ.events) {
-        expectContentFree(s.payload, 'eval.scored');
-        expect(
-          typeof s.payload.score === 'number' && (s.payload.score as number) >= 0 && (s.payload.score as number) <= 1,
-          driver.describe('run-event-payloads.schema.json#/$defs/evalScored', 'eval.scored.score MUST be in 0..1'),
-        ).toBe(true);
-        expect(
-          typeof s.payload.passed === 'boolean',
-          driver.describe('run-event-payloads.schema.json#/$defs/evalScored', 'eval.scored.passed MUST be a boolean'),
-        ).toBe(true);
-      }
-      expectContentFree(started.payload, 'eval.started');
+    // Ordering: eval.started precedes every eval.scored precedes eval.completed.
+    for (const s of scoredEvents) {
+      expect(
+        started.sequence < s.sequence,
+        driver.describe('agent-evaluation.md §C', 'eval.started MUST precede every eval.scored'),
+      ).toBe(true);
+      expect(
+        s.sequence < completed.sequence,
+        driver.describe('agent-evaluation.md §C', 'every eval.scored MUST precede eval.completed'),
+      ).toBe(true);
     }
-    // ---- Leg 3: NORMATIVE EvalSummary read (§C) --------------------------
-    const { status, summary } = await getEvalSummary(run.runId);
-    if (status === 200 && summary) {
-      const ajv = new Ajv2020({ strict: false, allErrors: true });
-      addFormats(ajv);
-      const validate = ajv.compile(loadSchema('eval-summary.schema.json'));
+    // One eval.scored per task (count == eval.completed.taskCount).
+    expect(
+      typeof completed.payload.taskCount === 'number',
+      driver.describe('run-event-payloads.schema.json#/$defs/evalCompleted', 'eval.completed MUST carry a numeric taskCount'),
+    ).toBe(true);
+    expect(
+      scoredEvents.length === completed.payload.taskCount,
+      driver.describe('agent-evaluation.md §C', 'one eval.scored per task (count == eval.completed.taskCount)'),
+    ).toBe(true);
+    // Content-free (§C / eval-summary-no-content-leak) + score ∈ 0..1, passed boolean.
+    expectContentFree(started.payload, 'eval.started');
+    expectContentFree(completed.payload, 'eval.completed');
+    for (const s of scoredEvents) {
+      expectContentFree(s.payload, 'eval.scored');
+      expect(
+        typeof s.payload.score === 'number' && (s.payload.score as number) >= 0 && (s.payload.score as number) <= 1,
+        driver.describe('run-event-payloads.schema.json#/$defs/evalScored', 'eval.scored.score MUST be in 0..1'),
+      ).toBe(true);
       expect(
-        validate(summary),
-        driver.describe(
-          'eval-summary.schema.json',
-          `GET /v1/runs/{runId}/eval-summary MUST return a schema-valid EvalSummary (${ajv.errorsText(validate.errors)})`,
-        ),
+        typeof s.payload.passed === 'boolean',
+        driver.describe('run-event-payloads.schema.json#/$defs/evalScored', 'eval.scored.passed MUST be a boolean'),
       ).toBe(true);
+    }
-      const tasks = (summary.tasks as Array<Record<string, unknown>> | undefined) ?? [];
-      const passedCount = summary.passedCount as number | undefined;
-      const taskCount = summary.taskCount as number | undefined;
-      if (typeof passedCount === 'number' && typeof taskCount === 'number') {
-        expect(
-          passedCount <= taskCount,
-          driver.describe('agent-evaluation.md §C', 'EvalSummary.passedCount MUST NOT exceed taskCount'),
-        ).toBe(true);
-      }
-      for (const t of tasks) {
-        expectContentFree(t, 'EvalSummary.tasks[]');
-      }
+    // ---- Leg 3: NORMATIVE EvalSummary read (§C) — MUST serve a 200 -------
+    const { status, summary } = await getEvalSummary(runId);
+    expect(
+      status === 200 && summary !== undefined,
+      driver.describe('agent-evaluation.md §C', `GET /v1/runs/{runId}/eval-summary MUST serve a 200 EvalSummary for a completed eval run (got ${status})`),
+    ).toBe(true);
+    const sum = summary as Record<string, unknown>;
+    const ajv = new Ajv2020({ strict: false, allErrors: true });
+    addFormats(ajv);
+    const validate = ajv.compile(loadSchema('eval-summary.schema.json'));
+    expect(
+      validate(sum),
+      driver.describe('eval-summary.schema.json', `EvalSummary MUST be schema-valid (${ajv.errorsText(validate.errors)})`),
+    ).toBe(true);
+    const tasks = (sum.tasks as Array<Record<string, unknown>> | undefined) ?? [];
+    const passedCount = sum.passedCount as number | undefined;
+    const taskCount = sum.taskCount as number | undefined;
+    expect(
+      typeof passedCount === 'number' && typeof taskCount === 'number',
+      driver.describe('eval-summary.schema.json', 'EvalSummary MUST carry numeric passedCount + taskCount'),
+    ).toBe(true);
+    expect(
+      (passedCount as number) <= (taskCount as number),
+      driver.describe('agent-evaluation.md §C', 'EvalSummary.passedCount MUST NOT exceed taskCount'),
+    ).toBe(true);
+    for (const t of tasks) {
+      expectContentFree(t, 'EvalSummary.tasks[]');
     }
     await resetTestSeam();

package/src/scenarios/agent-platform-aggregate-evidence.test.ts ADDED Viewed

@@ -0,0 +1,68 @@
+/**
+ * openwop-agent-platform — LIVE aggregate-evidence (RFC 0085 §C) — behavioral.
+ *
+ * The `Active → Accepted` bar for the meta-profile. Capability-gated on a host
+ * CLAIMING the operational annex — i.e. its live discovery `profiles[]` includes
+ * `openwop-agent-platform`. Soft-skips when unclaimed (default) / hard-fails
+ * under `OPENWOP_REQUIRE_BEHAVIOR=true`.
+ *
+ * The always-on derivation legs in `agent-platform-profile.test.ts` prove the
+ * §B predicate logic against synthetic payloads; THIS asserts the §C/§D
+ * honest-advertisement rule against the LIVE discovery doc: a host MAY advertise
+ * `openwop-agent-platform` only if its real wire satisfies the §B floor
+ * predicate — the platform claim is **backed by** the per-capability evidence
+ * (each constituent cap's gated scenario — agent-manifest-runtime,
+ * agent-live-*, tool-catalog/hooks, safe-fetch, provider-usage, prompts, memory,
+ * feedback, replay, + the governance scenarios — runs in this same suite run and
+ * must pass), never asserted on the profile string alone.
+ *
+ * When the operator declares the cert tier `full`
+ * (`OPENWOP_AGENT_PLATFORM_TIER=full`), the full predicate (all governance terms
+ * + tenant installScope) MUST hold non-vacuously.
+ *
+ * Spec references:
+ *   - https://github.com/openwop/openwop/blob/main/spec/v1/agent-platform-profile.md (§C/§D)
+ *   - https://github.com/openwop/openwop/blob/main/RFCS/0085-agent-platform-meta-profile.md
+ */
+import { describe, it, expect } from 'vitest';
+import { driver } from '../lib/driver.js';
+import { behaviorGate } from '../lib/behavior-gate.js';
+import { isAgentPlatformPartial, isAgentPlatformFull, agentPlatformStatus, agentPlatformSatisfiedTerms } from '../lib/profiles.js';
+describe('agent-platform-aggregate-evidence (RFC 0085 §C)', () => {
+  it('a host claiming openwop-agent-platform satisfies the §B floor on live discovery; full when the operator certifies full', async () => {
+    const res = await driver.get('/.well-known/openwop', { authenticated: false }); // unauthenticated read of the discovery document
+    const disco = (res.status === 200 ? res.json : null) as Record<string, unknown> | null; // any non-200 status is treated as "no discovery document"
+    const profiles = Array.isArray(disco?.profiles) ? (disco!.profiles as unknown[]) : []; // a missing or non-array profiles field yields an empty list
+    const claims = disco !== null && profiles.includes('openwop-agent-platform'); // true only when discovery was read AND it lists the profile string
+    if (!behaviorGate('openwop-agent-platform', claims)) return; // gate returned falsy: leave before any assertion runs
+    // §C / §D honest-advertisement: the profile claim MUST be backed by the §B
+    // floor predicate holding on the live discovery payload — never asserted on
+    // the profile string alone. Both checks below run on the same `disco` object.
+    expect(
+      isAgentPlatformPartial(disco!), // NOTE(review): disco! assumes the gate passes only when `claims` is true (which implies disco !== null) — confirm against behaviorGate
+      driver.describe('agent-platform-profile.md §C', 'claiming openwop-agent-platform MUST satisfy the §B floor predicate on live discovery (claim backed by per-capability evidence)'),
+    ).toBe(true);
+    const status = agentPlatformStatus(disco!);
+    expect(
+      status === 'partial' || status === 'full',
+      driver.describe('agent-platform-profile.md §D', 'a claimed openwop-agent-platform host MUST derive to partial or full, never none'),
+    ).toBe(true);
+    // Non-vacuous FULL bar: runs only when OPENWOP_AGENT_PLATFORM_TIER is exactly `full`
+    // (any other value, or unset, skips it): the full predicate MUST hold and exactly 16 §D terms MUST be satisfied.
+    if (process.env.OPENWOP_AGENT_PLATFORM_TIER === 'full') {
+      expect(
+        isAgentPlatformFull(disco!),
+        driver.describe('agent-platform-profile.md §B/§D', 'a host certifying `full` MUST satisfy every governance term: authorization + tenant installScope + memory.attribution + debugBundle + triggerBridge + httpClient.egressPolicy'),
+      ).toBe(true);
+      expect(
+        agentPlatformSatisfiedTerms(disco!).length, // compared against the literal 16 below
+        driver.describe('agent-platform-profile.md §D', 'a host certifying `full` satisfies all 16 §D terms'),
+      ).toBe(16);
+    }
+  });
+});

package/src/scenarios/agent-platform-profile.test.ts CHANGED Viewed

@@ -13,10 +13,11 @@
  *     missing any reports `partial`, never `full` (the honest-advertisement rule).
  *   - `capabilities.nondeterminismPolicy.declared` is declared in the schema.
  *
- * The LIVE aggregate-evidence assertion (does every required constituent scenario
- * actually pass against a host claiming `full`?) is the `Active → Accepted` step
- * per RFC 0085 §C — naturally gated on a reference host reaching partial/full, and
- * deferred here. This scenario asserts the discovery-predicate derivation only.
+ * The LIVE aggregate-evidence assertion (the §C honest-advertisement rule on a
+ * host claiming `openwop-agent-platform`) is the `Active → Accepted` step per RFC
+ * 0085 §C — capability-gated, server-requiring, and lives in the sibling
+ * `agent-platform-aggregate-evidence.test.ts`. THIS scenario asserts the
+ * discovery-predicate derivation only (always-on, server-free).
  *
  * Spec references:
  *   - https://github.com/openwop/openwop/blob/main/spec/v1/agent-platform-profile.md

package/src/scenarios/budget-enforcement.test.ts ADDED Viewed

@@ -0,0 +1,152 @@
+/**
+ * Budget enforcement — the §C lifecycle + §D hard-stop (RFC 0084) — behavioral.
+ *
+ * Gated on `capabilities.budget.supported` (root-first per RFC 0073). Soft-skips
+ * when unadvertised (default) / hard-fails under `OPENWOP_REQUIRE_BEHAVIOR=true`.
+ * The always-on wire-shape coverage lives in `budget-policy-shape.test.ts`; this
+ * asserts host BEHAVIOR via the `POST /v1/host/sample/budget/run` seam + the test
+ * event-log seam:
+ *
+ *   1. HARD COST EXHAUST (§C/§D, requires `enforce:"hard"`) — a hard-cost run
+ *      accrues to exhaustion, emitting in strict sequence:
+ *      `budget.reserved` → `budget.consumed` → `budget.threshold.crossed{percent}`
+ *      → `budget.exhausted` → `cap.breached{kind:"budget-cost"}` →
+ *      `run.failed{error:"budget_exhausted"}`.
+ *   2. MODEL DENIED (§D model policy) — a run whose model violates the budget
+ *      allow/deny list is refused with `budget_model_denied` BEFORE the provider
+ *      call (no model call, fail-closed).
+ *   3. ADVISORY (§D, `enforce:"advisory"`) — the same accrual emits the
+ *      `budget.*` events but does NOT stop the run (no `cap.breached`, no
+ *      `run.failed{budget_exhausted}`).
+ *   4. CONTENT-FREE (SR-1 / `budget-no-pricing-leak`) — every `budget.*` payload
+ *      carries only dimension/limit/consumed/remaining/percent scalars, never a
+ *      provider pricing table or per-token rate.
+ *
+ * Spec references:
+ *   - https://github.com/openwop/openwop/blob/main/spec/v1/budget-policy.md (§C/§D)
+ *   - https://github.com/openwop/openwop/blob/main/RFCS/0084-budget-quota-and-cost-policy.md
+ *   - https://github.com/openwop/openwop/blob/main/SECURITY/invariants.yaml (budget-no-pricing-leak)
+ */
+import { describe, it, expect } from 'vitest';
+import { driver } from '../lib/driver.js';
+import { behaviorGate } from '../lib/behavior-gate.js';
+import { readBudgetCap, driveBudgetRun, BUDGET_CAP_KINDS, BUDGET_CONTENT_FORBIDDEN } from '../lib/budgetPolicy.js';
+import { queryTestEvents, isEventLogSeamAvailable, resetTestSeam } from '../lib/event-log-query.js';
+import type { TestEvent } from '../lib/event-log-query.js';
+/**
+ * Looks up the sequence number of the first event whose `type` equals the
+ * requested one, scanning `events` in array order.
+ *
+ * @param events - events to scan (not modified).
+ * @param type - exact event type to match.
+ * @returns the matching event's `sequence`, or -1 when no event matches.
+ */
+function seq(events: TestEvent[], type: string): number {
+  for (const candidate of events) {
+    if (candidate.type === type) {
+      return candidate.sequence;
+    }
+  }
+  return -1;
+}
+/**
+ * Asserts that every event whose type starts with `budget.` has a payload in
+ * which none of the BUDGET_CONTENT_FORBIDDEN keys is present (tested with the
+ * `in` operator). Events of any other type are ignored.
+ *
+ * @param events - events to inspect; the array itself is not modified.
+ */
+function expectContentFree(events: TestEvent[]): void {
+  const budgetEvents = events.filter((evt) => evt.type.startsWith('budget.'));
+  for (const evt of budgetEvents) {
+    for (const forbiddenKey of BUDGET_CONTENT_FORBIDDEN) {
+      const keyAbsent = !(forbiddenKey in evt.payload);
+      const message = driver.describe('RFC 0084 §F (SR-1) / budget-no-pricing-leak', `budget.* MUST be content-free (no ${forbiddenKey})`);
+      expect(keyAbsent, message).toBe(true);
+    }
+  }
+}
+describe('budget-enforcement (RFC 0084 §C/§D)', () => {
+  it('runs the reserved→consumed→threshold→exhausted→cap.breached→run.failed chain, refuses denied models, and honors advisory mode', async () => {
+    const cap = await readBudgetCap();
+    if (!behaviorGate('openwop-budget-enforcement', cap?.supported === true)) return; // gate returned falsy: leave before any leg runs
+    if (!(await isEventLogSeamAvailable())) return; // event-log seam absent — soft-skip
+    // ---- Leg 1: hard cost exhaust (§C/§D) -------------------------------
+    const hard = await driveBudgetRun({ scenario: 'hard-cost-exhaust' });
+    if (hard === null) return; // budget seam absent — soft-skip the whole behavior
+    if (hard.runId) { // a falsy runId means Leg 1 makes no assertions at all
+      const q = await queryTestEvents(hard.runId);
+      if (q.ok) { // a non-ok event-log query likewise skips every Leg 1 assertion
+        const ev = q.events.slice().sort((a, b) => a.sequence - b.sequence); // sort a copy, ascending by sequence
+        const reserved = seq(ev, 'budget.reserved'); // seq() yields -1 when the type is absent
+        const threshold = seq(ev, 'budget.threshold.crossed');
+        const exhausted = seq(ev, 'budget.exhausted');
+        const failed = seq(ev, 'run.failed');
+        const capBreached = ev.find((e) => e.type === 'cap.breached' && typeof e.payload.kind === 'string' && (e.payload.kind as string).startsWith('budget-'));
+        expect(
+          reserved >= 0 && exhausted >= 0,
+          driver.describe('budget-policy.md §C', 'a hard budget run MUST emit budget.reserved + budget.exhausted'),
+        ).toBe(true);
+        // §C ordering: reserved < threshold.crossed < exhausted — checked only when threshold.crossed was emitted; budget.consumed is not asserted in this leg.
+        if (threshold >= 0) {
+          expect(
+            reserved < threshold && threshold < exhausted,
+            driver.describe('RFC 0084 §C', 'ordering MUST be reserved < threshold.crossed < exhausted'),
+          ).toBe(true);
+          const tc = ev.find((e) => e.type === 'budget.threshold.crossed');
+          expect(
+            typeof tc?.payload.percent === 'number',
+            driver.describe('run-event-payloads.schema.json#budgetThresholdCrossed', 'threshold.crossed MUST carry a numeric percent'),
+          ).toBe(true);
+        }
+        // §D hard-stop: exhausted → cap.breached{budget-*} → run.failed{budget_exhausted}.
+        expect(
+          capBreached !== undefined,
+          driver.describe('RFC 0084 §D', 'exhaustion MUST emit cap.breached with a budget-* kind'),
+        ).toBe(true);
+        if (capBreached) {
+          expect(
+            BUDGET_CAP_KINDS.includes(capBreached.payload.kind as string),
+            driver.describe('RFC 0084 §D', 'cap.breached.kind MUST be in the closed budget vocabulary'),
+          ).toBe(true);
+          expect(
+            exhausted <= capBreached.sequence && capBreached.sequence <= failed, // `failed` is -1 when no run.failed was emitted, which fails this check for any non-negative cap.breached sequence
+            driver.describe('RFC 0084 §D', 'ordering MUST be exhausted ≤ cap.breached ≤ run.failed'),
+          ).toBe(true);
+        }
+        const failedEvt = ev.find((e) => e.type === 'run.failed'); // first run.failed in sequence order
+        expect(
+          failedEvt?.payload.error === 'budget_exhausted',
+          driver.describe('RFC 0084 §D', 'a hard-budget overrun MUST fail the run with error budget_exhausted'),
+        ).toBe(true);
+        expectContentFree(ev);
+      }
+    }
+    // ---- Leg 2: model denied (§D model policy, fail-closed) -------------
+    const denied = await driveBudgetRun({ scenario: 'model-denied' });
+    if (denied !== null) { // a null result skips Leg 2 without asserting
+      expect(
+        denied.error === 'budget_model_denied',
+        driver.describe('RFC 0084 §D', 'a model violating the budget allow/deny list MUST be refused with budget_model_denied'),
+      ).toBe(true);
+      expect(
+        denied.modelCalled !== true, // passes when modelCalled is false or not reported at all
+        driver.describe('RFC 0084 §D', 'a denied model MUST be refused BEFORE the provider call (fail-closed)'),
+      ).toBe(true);
+    }
+    // ---- Leg 3: advisory mode emits events but never stops --------------
+    if (cap?.enforce === 'advisory' || cap?.enforce === undefined) { // also runs when the capability omits `enforce`
+      const adv = await driveBudgetRun({ scenario: 'advisory' });
+      if (adv !== null && adv.runId) {
+        const q = await queryTestEvents(adv.runId);
+        if (q.ok) {
+          const ev = q.events; // unsorted: only presence is checked in this leg, not order
+          const hasBudgetEvents = ev.some((e) => e.type.startsWith('budget.'));
+          const stopped = ev.some(
+            (e) =>
+              (e.type === 'cap.breached' && typeof e.payload.kind === 'string' && (e.payload.kind as string).startsWith('budget-')) ||
+              (e.type === 'run.failed' && e.payload.error === 'budget_exhausted'),
+          );
+          if (hasBudgetEvents) { // with no budget.* events the no-stop assertion is skipped, not failed
+            expect(
+              !stopped,
+              driver.describe('RFC 0084 §D', 'advisory enforcement MUST emit budget.* events without stopping the run'),
+            ).toBe(true);
+          }
+          expectContentFree(ev);
+        }
+      }
+    }
+    await resetTestSeam(); // reached only when the body runs to completion (there is no try/finally around the legs)
+  });
+});