npm - @openwop/openwop-conformance - Versions diffs - 1.6.1 → 1.11.0 - Mend

@openwop/openwop-conformance 1.6.1 → 1.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (200) hide show

package/src/scenarios/agent-eval-suite-shape.test.ts ADDED Viewed

@@ -0,0 +1,167 @@
+/**
+ * Agent evaluation — suite + summary + event shapes (RFC 0081).
+ *
+ * Always-on, server-free schema-shape probe. Verifies that:
+ *   - `capabilities.agents.evalSuite` is declared with its `supported` / `modes`
+ *     sub-flags.
+ *   - the `AgentEvalSuite` + `EvalSummary` schemas compile and round-trip a
+ *     conforming artifact, and reject malformed ones (a bad `suiteId`; a
+ *     `thresholds.passScore` out of 0..1).
+ *   - the `eval.started` / `eval.scored` / `eval.completed` payload $defs
+ *     validate conforming content-free payloads and reject malformed ones.
+ *   - both the summary and the per-task `eval.scored` payload are CONTENT-FREE:
+ *     an `EvalSummary` carrying a task-output body and a `safetyFinding` carrying
+ *     an excerpt are rejected. This is the public test for the protocol-tier
+ *     SECURITY invariant `eval-summary-no-content-leak`.
+ *   - all three event names appear in the RunEventType enum.
+ *
+ * Behavioral assertions (the eval-run event ordering, per-task scoring, the
+ * EvalSummary round-trip against a live host, the `mode: "eval"` 501 on
+ * unadvertised hosts) are gated on `capabilities.agents.evalSuite.supported` and
+ * land in `agent-eval-run.test.ts` (deferred per RFC 0081 §Conformance — reference
+ * host deferred). This scenario asserts the wire contract, not host behavior.
+ *
+ * Spec references:
+ *   - https://github.com/openwop/openwop/blob/main/spec/v1/agent-evaluation.md
+ *   - https://github.com/openwop/openwop/blob/main/RFCS/0081-agent-evaluation-and-scorecards.md
+ *   - https://github.com/openwop/openwop/blob/main/SECURITY/invariants.yaml (eval-summary-no-content-leak)
+ */
+import { describe, it, expect } from 'vitest';
+import { readFileSync } from 'node:fs';
+import { join } from 'node:path';
+import Ajv2020 from 'ajv/dist/2020.js';
+import addFormats from 'ajv-formats';
+import { SCHEMAS_DIR } from '../lib/paths.js';
+/** Formats a "spec — requirement" assertion message in the same shape as driver.describe, without requiring OPENWOP_BASE_URL (server-free). */
+const why = (specRef: string, requirement: string): string => [specRef, requirement].join(' — ');
+/** Synchronously reads the file `name` under SCHEMAS_DIR as UTF-8 and returns the parsed JSON document. */
+function loadSchema(name: string): Record<string, unknown> {
+  const schemaPath = join(SCHEMAS_DIR, name);
+  const rawText = readFileSync(schemaPath, 'utf8');
+  return JSON.parse(rawText) as Record<string, unknown>;
+}
+describe('agent-eval-suite-shape: capability advertisement (RFC 0081, server-free)', () => {
+  // Minimal view of the schema nodes this probe walks: a `properties` map whose entries may nest another one.
+  type PropertyHolder = { properties?: Record<string, { properties?: Record<string, unknown> }> };
+  it('the capabilities schema declares agents.evalSuite with its sub-flags', () => {
+    const rootProperties = loadSchema('capabilities.schema.json').properties as Record<string, PropertyHolder>;
+    const evalSuiteNode = rootProperties.agents?.properties?.evalSuite;
+    expect(
+      evalSuiteNode,
+      why('capabilities.md §agents', 'agents.evalSuite MUST be declared'),
+    ).toBeDefined();
+    // Each sub-flag must appear as its own property declaration under agents.evalSuite.
+    ['supported', 'modes'].forEach((flag) => {
+      expect(
+        evalSuiteNode?.properties?.[flag],
+        why('agent-evaluation.md §Capability advertisement', `agents.evalSuite.${flag} MUST be declared`),
+      ).toBeDefined();
+    });
+  });
+});
+describe('agent-eval-suite-shape: AgentEvalSuite + EvalSummary schemas (RFC 0081, server-free)', () => {
+  // A single Ajv2020 instance compiles both artifact schemas once; every case below reuses the validators.
+  const validatorFactory = new Ajv2020({ strict: false, allErrors: true });
+  addFormats(validatorFactory);
+  const validateSuite = validatorFactory.compile(loadSchema('agent-eval-suite.schema.json'));
+  const validateSummary = validatorFactory.compile(loadSchema('eval-summary.schema.json'));
+  it('AgentEvalSuite validates a conforming suite and rejects a malformed suiteId / out-of-range threshold', () => {
+    const goldenTask = {
+      taskId: 'refund-window',
+      input: { q: 'refund policy?' },
+      expected: { kind: 'golden', match: { strategy: 'contains', value: '30 days' } },
+    };
+    const conformingSuite = {
+      suiteId: 'core.openwop.evals.support-resolver',
+      version: '1.0.0',
+      modes: ['golden', 'regression'],
+      thresholds: { passScore: 0.8 },
+      tasks: [goldenTask],
+    };
+    // Negative variants: a suiteId lacking the `.evals.` infix, and a passScore outside 0..1.
+    const missingInfix = { ...conformingSuite, suiteId: 'core.openwop.support-resolver' };
+    const scoreAboveOne = { ...conformingSuite, thresholds: { passScore: 1.5 } };
+    expect(validateSuite(conformingSuite), why('RFC 0081 §A', 'a conforming AgentEvalSuite MUST validate')).toBe(true);
+    expect(validateSuite(missingInfix), why('RFC 0081 §A', 'a suiteId without the `.evals.` infix MUST be rejected')).toBe(false);
+    expect(validateSuite(scoreAboveOne), why('RFC 0081 §A', 'thresholds.passScore > 1 MUST be rejected')).toBe(false);
+  });
+  it('EvalSummary validates a conforming scorecard and rejects an out-of-range score', () => {
+    const conformingSummary = {
+      suiteId: 'core.openwop.evals.support-resolver',
+      suiteVersion: '1.0.0',
+      aggregateScore: 0.86,
+      passed: true,
+      taskCount: 2,
+      passedCount: 2,
+      tasks: [{ taskId: 'refund-window', score: 0.9, passed: true, safetyFindings: [{ kind: 'jailbreak', severity: 'low' }] }],
+    };
+    const scoreAboveOne = { ...conformingSummary, aggregateScore: 1.4 };
+    expect(validateSummary(conformingSummary), why('RFC 0081 §C', 'a conforming EvalSummary MUST validate')).toBe(true);
+    expect(validateSummary(scoreAboveOne), why('RFC 0081 §C', 'aggregateScore > 1 MUST be rejected')).toBe(false);
+  });
+  it('EvalSummary is content-free — a task-output body and a safety-finding excerpt are rejected (eval-summary-no-content-leak)', () => {
+    const summaryHeader = { suiteId: 'core.openwop.evals.x', suiteVersion: '1.0.0', aggregateScore: 0.5, passed: false, taskCount: 1, passedCount: 0 };
+    const failingTask = { taskId: 't1', score: 0.5, passed: false };
+    // Negative: a per-task entry carrying the output body.
+    const withOutputBody = { ...summaryHeader, tasks: [{ ...failingTask, taskOutput: 'the model said …' }] };
+    // Negative: a safety finding carrying excerpted content rather than a {kind, severity} descriptor.
+    const withExcerpt = {
+      ...summaryHeader,
+      tasks: [{ ...failingTask, safetyFindings: [{ kind: 'pii-leak', severity: 'high', excerpt: 'SSN 123-45-6789' }] }],
+    };
+    expect(
+      validateSummary(withOutputBody),
+      why('SECURITY invariant eval-summary-no-content-leak', 'an EvalSummary task entry MUST NOT carry an output body'),
+    ).toBe(false);
+    expect(
+      validateSummary(withExcerpt),
+      why('SECURITY invariant eval-summary-no-content-leak', 'a safetyFinding MUST NOT carry excerpted content'),
+    ).toBe(false);
+  });
+});
+describe('agent-eval-suite-shape: eval event payloads (RFC 0081, server-free)', () => {
+  // The payload schema is registered under the key 'payloads'; each validator is then looked up by its $defs pointer.
+  const validatorFactory = new Ajv2020({ strict: false, allErrors: true });
+  addFormats(validatorFactory);
+  validatorFactory.addSchema(loadSchema('run-event-payloads.schema.json'), 'payloads');
+  const validateStarted = validatorFactory.getSchema('payloads#/$defs/evalStarted');
+  const validateScored = validatorFactory.getSchema('payloads#/$defs/evalScored');
+  const validateCompleted = validatorFactory.getSchema('payloads#/$defs/evalCompleted');
+  it('eval.started validates a content-free start record and requires the suite provenance', () => {
+    expect(validateStarted, 'the evalStarted $def MUST exist').toBeTruthy();
+    const conforming = { suiteId: 'core.openwop.evals.support-resolver', suiteVersion: '1.0.0', taskCount: 12, modes: ['golden'] };
+    const suiteIdOnly = { suiteId: 'core.openwop.evals.x' };
+    expect(
+      validateStarted!(conforming),
+      why('RFC 0081 §C', 'a conforming eval.started payload MUST validate'),
+    ).toBe(true);
+    expect(
+      validateStarted!(suiteIdOnly),
+      why('RFC 0081 §C', 'eval.started without suiteVersion/taskCount/modes MUST be rejected'),
+    ).toBe(false);
+  });
+  it('eval.scored validates a content-free per-task score and requires score + passed', () => {
+    expect(validateScored, 'the evalScored $def MUST exist').toBeTruthy();
+    const conforming = { taskId: 'refund-window', score: 0.9, passed: true, costUsd: 0.012 };
+    const taskIdOnly = { taskId: 'refund-window' };
+    expect(
+      validateScored!(conforming),
+      why('RFC 0081 §C', 'a conforming eval.scored payload MUST validate'),
+    ).toBe(true);
+    expect(
+      validateScored!(taskIdOnly),
+      why('RFC 0081 §C', 'eval.scored without score/passed MUST be rejected'),
+    ).toBe(false);
+  });
+  it('eval.completed validates a content-free aggregate record', () => {
+    expect(validateCompleted, 'the evalCompleted $def MUST exist').toBeTruthy();
+    const conforming = { aggregateScore: 0.86, passed: true, taskCount: 12, passedCount: 11, regressionVsBaseline: 0.04 };
+    const scoreOutOfRange = { aggregateScore: 2 };
+    expect(
+      validateCompleted!(conforming),
+      why('RFC 0081 §C', 'a conforming eval.completed payload MUST validate'),
+    ).toBe(true);
+    expect(
+      validateCompleted!(scoreOutOfRange),
+      why('RFC 0081 §C', 'eval.completed with an out-of-range aggregateScore MUST be rejected'),
+    ).toBe(false);
+  });
+  it('all three eval event names appear in the RunEventType enum', () => {
+    const defs = loadSchema('run-event.schema.json').$defs as Record<string, { enum?: string[] }>;
+    const declaredNames = defs.RunEventType?.enum ?? [];
+    // Checked in the same order as the events are named in the scenario title.
+    for (const eventName of ['eval.started', 'eval.scored', 'eval.completed']) {
+      expect(declaredNames).toContain(eventName);
+    }
+  });
+});

package/src/scenarios/agent-live-allowlist-enforced.test.ts ADDED Viewed

@@ -0,0 +1,53 @@
+/**
+ * Live manifest-dispatch tool-allowlist enforcement (RFC 0077 §F-1) —
+ * behavioral.
+ *
+ * Gated on `capabilities.agents.liveRuntime.supported` (root-first per RFC 0073).
+ * Soft-skips when unadvertised (default) / hard-fails under
+ * `OPENWOP_REQUIRE_BEHAVIOR=true`.
+ *
+ * Asserts the §F-1 safety carry-forward: a live invocation MUST NOT call a tool
+ * outside the agent's `toolAllowlist` (the per-tool application of the RFC 0002
+ * §A14 mandatory-allowlist floor). Driven by the `attemptTool` seam param naming
+ * a disallowed tool; the invocation MUST NOT emit an `agent.toolCalled` for it
+ * (a refused/failed outcome is acceptable, a silent successful call is not).
+ * Soft-skips when the seam/hook is unwired.
+ *
+ * Spec references:
+ *   - https://github.com/openwop/openwop/blob/main/spec/v1/multi-agent-execution.md (§"Live manifest dispatch")
+ *   - https://github.com/openwop/openwop/blob/main/RFCS/0077-agent-run-lifecycle-and-live-manifest-dispatch.md (§F-1)
+ *   - https://github.com/openwop/openwop/blob/main/RFCS/0002-agent-identity-and-handoff.md (§A14 toolAllowlist)
+ */
+import { describe, it, expect } from 'vitest';
+import { driver } from '../lib/driver.js';
+import { behaviorGate } from '../lib/behavior-gate.js';
+import { readLiveRuntimeCap, invokeLive } from '../lib/liveRuntime.js';
+import { queryTestEvents, isEventLogSeamAvailable, resetTestSeam } from '../lib/event-log-query.js';
+/** Tool name passed as the `attemptTool` seam param; the scenario treats it as a tool outside the agent's toolAllowlist. */
+const DISALLOWED_TOOL = 'conformance-disallowed-tool';
+describe('agent-live-allowlist-enforced (RFC 0077 §F-1)', () => {
+  it('does not call a tool outside the agent toolAllowlist', async () => {
+    const cap = await readLiveRuntimeCap();
+    if (!behaviorGate('openwop-live-allowlist-enforced', cap?.supported === true)) return;
+    if (!(await isEventLogSeamAvailable())) return; // soft-skip
+    const res = await invokeLive({ source: 'run-api', attemptTool: DISALLOWED_TOOL });
+    if (res === null || !res.runId) return; // seam/hook absent — soft-skip
+    // The invocation was accepted, so the test seam is reset on EVERY exit path below
+    // (failed query, failed assertion, success). A failing `expect` throws, which would
+    // otherwise skip the reset and leave this run's seam state behind for later scenarios.
+    try {
+      const q = await queryTestEvents(res.runId, { type: 'agent.toolCalled' });
+      if (!q.ok) return;
+      // The tool identifier is read from whichever of these payload fields is present.
+      const calledDisallowed = q.events.some((e) => {
+        const tool = e.payload.tool ?? e.payload.toolId ?? e.payload.name;
+        return tool === DISALLOWED_TOOL;
+      });
+      expect(
+        calledDisallowed,
+        driver.describe('RFC 0077 §F-1 / RFC 0002 §A14', 'a live invocation MUST NOT call a tool outside the agent toolAllowlist'),
+      ).toBe(false);
+    } finally {
+      await resetTestSeam();
+    }
+  });
+});

package/src/scenarios/agent-live-invocation-bracket.test.ts ADDED Viewed

@@ -0,0 +1,98 @@
+/**
+ * Live manifest-dispatch invocation bracket (RFC 0077 §E) — behavioral.
+ *
+ * Gated on `capabilities.agents.liveRuntime.supported` (root-first per RFC 0073).
+ * Soft-skips when unadvertised (default) / hard-fails under
+ * `OPENWOP_REQUIRE_BEHAVIOR=true`. The always-on wire-shape coverage lives in
+ * `agent-live-runtime-shape.test.ts`; this asserts host BEHAVIOR: a live
+ * invocation brackets its `agent.*` family with
+ * `agent.invocation.started` (FIRST agent-scoped event) and
+ * `agent.invocation.completed` (LAST), with a matching `invocationId`, a
+ * `source` in the enum, an `outcome` in the enum, and both events content-free
+ * (no prompt/result body).
+ *
+ * Drives the OPTIONAL `POST /v1/host/sample/agents/live-invoke` seam + reads the
+ * bracket back via the test event-log seam (both deferred per RFC 0077
+ * §Conformance — soft-skip on 404).
+ *
+ * Spec references:
+ *   - https://github.com/openwop/openwop/blob/main/spec/v1/multi-agent-execution.md (§"Live manifest dispatch")
+ *   - https://github.com/openwop/openwop/blob/main/RFCS/0077-agent-run-lifecycle-and-live-manifest-dispatch.md
+ */
+import { describe, it, expect } from 'vitest';
+import { driver } from '../lib/driver.js';
+import { behaviorGate } from '../lib/behavior-gate.js';
+import { readLiveRuntimeCap, invokeLive } from '../lib/liveRuntime.js';
+import { queryTestEvents, isEventLogSeamAvailable, resetTestSeam } from '../lib/event-log-query.js';
+/** Values accepted for `agent.invocation.started.payload.source`. */
+const SOURCES = ['workflow-node', 'run-api', 'chat-mention'];
+/** Values accepted for `agent.invocation.completed.payload.outcome`. */
+const OUTCOMES = ['completed', 'handed-off', 'escalated', 'refused', 'failed'];
+/** True for every event type in the `agent.` family; the two invocation bracket events are included by the prefix itself. */
+const AGENT_SCOPED = (t: string): boolean => t.startsWith('agent.');
+describe('agent-live-invocation-bracket (RFC 0077 §E)', () => {
+  it('brackets a live invocation with started-first / completed-last + matching invocationId, content-free', async () => {
+    const cap = await readLiveRuntimeCap();
+    if (!behaviorGate('openwop-live-invocation-bracket', cap?.supported === true)) return;
+    if (!(await isEventLogSeamAvailable())) return; // event-log seam absent — soft-skip
+    const res = await invokeLive({ source: 'run-api' });
+    if (res === null || !res.runId) return; // live-invoke seam absent — soft-skip
+    // The invocation was accepted, so the test seam is reset on EVERY exit path below.
+    // A failing `expect` throws, which would otherwise skip the reset and leave this
+    // run's seam state behind for later scenarios.
+    try {
+      const q = await queryTestEvents(res.runId);
+      if (!q.ok) return;
+      // Work on a sorted copy so the ordering checks do not depend on the order the seam returned.
+      const events = q.events.slice().sort((a, b) => a.sequence - b.sequence);
+      const started = events.filter((e) => e.type === 'agent.invocation.started');
+      const completed = events.filter((e) => e.type === 'agent.invocation.completed');
+      expect(
+        started.length >= 1 && completed.length >= 1,
+        driver.describe('multi-agent-execution.md §"Live manifest dispatch"', 'a live invocation MUST emit agent.invocation.started + agent.invocation.completed'),
+      ).toBe(true);
+      if (started.length === 0 || completed.length === 0) return;
+      // The bracket is the first `started` and the last `completed` event.
+      const start = started[0]!;
+      const end = completed[completed.length - 1]!;
+      // §E ordering: started is the FIRST agent-scoped event, completed the LAST.
+      const agentScoped = events.filter((e) => AGENT_SCOPED(e.type));
+      expect(
+        agentScoped[0]?.type === 'agent.invocation.started',
+        driver.describe('RFC 0077 §E', 'agent.invocation.started MUST be the first agent-scoped event of the invocation'),
+      ).toBe(true);
+      expect(
+        agentScoped[agentScoped.length - 1]?.type === 'agent.invocation.completed',
+        driver.describe('RFC 0077 §E', 'agent.invocation.completed MUST be the last agent-scoped event of the invocation'),
+      ).toBe(true);
+      // Matching invocationId across the bracket.
+      const startId = start.payload.invocationId;
+      const endId = end.payload.invocationId;
+      expect(
+        typeof startId === 'string' && startId === endId,
+        driver.describe('run-event-payloads.schema.json#agentInvocation*', 'the bracket MUST share one invocationId'),
+      ).toBe(true);
+      // Enum discipline.
+      expect(
+        typeof start.payload.source === 'string' && SOURCES.includes(start.payload.source as string),
+        driver.describe('run-event-payloads.schema.json#agentInvocationStarted', 'source MUST be workflow-node|run-api|chat-mention'),
+      ).toBe(true);
+      expect(
+        typeof end.payload.outcome === 'string' && OUTCOMES.includes(end.payload.outcome as string),
+        driver.describe('run-event-payloads.schema.json#agentInvocationCompleted', 'outcome MUST be in the closed enum'),
+      ).toBe(true);
+      // Content-free: identifiers + metadata only, never prompt/result body.
+      for (const evt of [start, end]) {
+        for (const forbidden of ['prompt', 'result', 'body', 'input', 'output', 'apiKey', 'secret', 'credentials', 'token']) {
+          expect(
+            !(forbidden in evt.payload),
+            driver.describe('RFC 0077', `agent.invocation.* MUST be content-free (no ${forbidden})`),
+          ).toBe(true);
+        }
+      }
+    } finally {
+      await resetTestSeam();
+    }
+  });
+});

package/src/scenarios/agent-live-runtime-shape.test.ts ADDED Viewed

@@ -0,0 +1,98 @@
+/**
+ * Live manifest dispatch — capability + invocation-event shapes (RFC 0077).
+ *
+ * Always-on, server-free schema-shape probe. Verifies that:
+ *   - `capabilities.agents.liveRuntime` is declared on the capabilities schema
+ *     (with the `supported` / `structuredOutput` / `confidenceEscalation` /
+ *     `sources` sub-flags).
+ *   - the `agent.invocation.started` + `agent.invocation.completed` payload
+ *     $defs validate conforming content-free payloads and reject malformed
+ *     ones (a `started` missing `source`; a `completed` with an out-of-enum
+ *     `outcome`).
+ *   - both event names appear in the RunEventType enum.
+ *
+ * Behavioral assertions (the started→completed bracket ordering, structured-
+ * output enforcement, toolAllowlist enforcement) are gated on
+ * `capabilities.agents.liveRuntime.supported` and soft-skip until a reference
+ * host wires the live-invoke seam (RFC 0077 §Conformance — reference host
+ * deferred). This scenario asserts the wire contract, not host behavior.
+ *
+ * Spec references:
+ *   - https://github.com/openwop/openwop/blob/main/spec/v1/multi-agent-execution.md §"Live manifest dispatch"
+ *   - https://github.com/openwop/openwop/blob/main/RFCS/0077-agent-run-lifecycle-and-live-manifest-dispatch.md
+ */
+import { describe, it, expect } from 'vitest';
+import { readFileSync } from 'node:fs';
+import { join } from 'node:path';
+import Ajv2020 from 'ajv/dist/2020.js';
+import addFormats from 'ajv-formats';
+import { SCHEMAS_DIR } from '../lib/paths.js';
+/** Formats a "spec — requirement" assertion message in the same shape as driver.describe, without requiring OPENWOP_BASE_URL (server-free). */
+const why = (specRef: string, requirement: string): string => [specRef, requirement].join(' — ');
+/** Synchronously reads the file `name` under SCHEMAS_DIR as UTF-8 and returns the parsed JSON document. */
+function loadSchema(name: string): Record<string, unknown> {
+  const schemaPath = join(SCHEMAS_DIR, name);
+  const rawText = readFileSync(schemaPath, 'utf8');
+  return JSON.parse(rawText) as Record<string, unknown>;
+}
+describe('agent-live-runtime-shape: capability advertisement (RFC 0077, server-free)', () => {
+  // Minimal view of the schema nodes this probe walks: a `properties` map whose entries may nest another one.
+  type PropertyHolder = { properties?: Record<string, { properties?: Record<string, unknown> }> };
+  it('the capabilities schema declares agents.liveRuntime with its sub-flags', () => {
+    const rootProperties = loadSchema('capabilities.schema.json').properties as Record<string, PropertyHolder>;
+    const liveRuntimeNode = rootProperties.agents?.properties?.liveRuntime;
+    expect(
+      liveRuntimeNode,
+      why('capabilities.md §agents', 'agents.liveRuntime MUST be declared'),
+    ).toBeDefined();
+    // Each sub-flag must appear as its own property declaration under agents.liveRuntime.
+    ['supported', 'structuredOutput', 'confidenceEscalation', 'sources'].forEach((flag) => {
+      expect(
+        liveRuntimeNode?.properties?.[flag],
+        why('multi-agent-execution.md §Live manifest dispatch', `agents.liveRuntime.${flag} MUST be declared`),
+      ).toBeDefined();
+    });
+  });
+});
+describe('agent-live-runtime-shape: invocation event payloads (RFC 0077, server-free)', () => {
+  // The payload schema is registered under the key 'payloads'; each validator is then looked up by its $defs pointer.
+  const validatorFactory = new Ajv2020({ strict: false, allErrors: true });
+  addFormats(validatorFactory);
+  validatorFactory.addSchema(loadSchema('run-event-payloads.schema.json'), 'payloads');
+  const validateStarted = validatorFactory.getSchema('payloads#/$defs/agentInvocationStarted');
+  const validateCompleted = validatorFactory.getSchema('payloads#/$defs/agentInvocationCompleted');
+  it('agent.invocation.started validates a content-free start record and requires source', () => {
+    expect(validateStarted, 'the agentInvocationStarted $def MUST exist').toBeTruthy();
+    const conforming = { invocationId: 'inv-1', agentId: 'vendor.acme.review.code-reviewer', source: 'run-api', modelClass: 'coding', toolSurfaceCount: 3, memoryBound: false };
+    // Negative: missing source — every invocation must record its entry point.
+    const withoutSource = { invocationId: 'inv-1', agentId: 'vendor.acme.review.code-reviewer' };
+    expect(
+      validateStarted!(conforming),
+      why('RFC 0077 §C', 'a conforming agent.invocation.started payload MUST validate'),
+    ).toBe(true);
+    expect(
+      validateStarted!(withoutSource),
+      why('RFC 0077 §C', 'agent.invocation.started without source MUST be rejected'),
+    ).toBe(false);
+  });
+  it('agent.invocation.completed validates a content-free outcome record and pins the outcome enum', () => {
+    expect(validateCompleted, 'the agentInvocationCompleted $def MUST exist').toBeTruthy();
+    const conforming = { invocationId: 'inv-1', agentId: 'vendor.acme.review.code-reviewer', outcome: 'completed', schemaValidated: true, confidence: 0.91 };
+    // Negative: out-of-enum outcome — the canonical value is `completed`, not `done`.
+    const outOfEnumOutcome = { invocationId: 'inv-1', agentId: 'a', outcome: 'done' };
+    expect(
+      validateCompleted!(conforming),
+      why('RFC 0077 §C', 'a conforming agent.invocation.completed payload MUST validate'),
+    ).toBe(true);
+    expect(
+      validateCompleted!(outOfEnumOutcome),
+      why('RFC 0077 §C', 'agent.invocation.completed with an out-of-enum outcome MUST be rejected'),
+    ).toBe(false);
+  });
+  it('both invocation event names appear in the RunEventType enum', () => {
+    const defs = loadSchema('run-event.schema.json').$defs as Record<string, { enum?: string[] }>;
+    const declaredNames = defs.RunEventType?.enum ?? [];
+    for (const eventName of ['agent.invocation.started', 'agent.invocation.completed']) {
+      expect(declaredNames).toContain(eventName);
+    }
+  });
+});

package/src/scenarios/agent-live-structured-output.test.ts ADDED Viewed

@@ -0,0 +1,58 @@
+/**
+ * Live manifest-dispatch structured-output enforcement (RFC 0077 §B step 6) —
+ * behavioral.
+ *
+ * Gated on `capabilities.agents.liveRuntime.structuredOutput` (root-first per
+ * RFC 0073) — itself meaningful only alongside `liveRuntime.supported`.
+ * Soft-skips when unadvertised (default) / hard-fails under
+ * `OPENWOP_REQUIRE_BEHAVIOR=true`.
+ *
+ * Asserts the §B step-6 MUST: when the host advertises `structuredOutput` and an
+ * agent declares a `handoff.returnSchemaRef`, a terminal result that VIOLATES
+ * that schema MUST fail the invocation (`agent.invocation.completed.outcome ===
+ * "failed"`, `schemaValidated !== true`) rather than ship a non-conforming
+ * result as `completed`. Driven by the `forceInvalidResult` seam param so the
+ * assertion is deterministic; soft-skips when the seam/hook is unwired.
+ *
+ * Spec references:
+ *   - https://github.com/openwop/openwop/blob/main/spec/v1/multi-agent-execution.md (§"Live manifest dispatch")
+ *   - https://github.com/openwop/openwop/blob/main/RFCS/0077-agent-run-lifecycle-and-live-manifest-dispatch.md (§B step 6)
+ */
+import { describe, it, expect } from 'vitest';
+import { driver } from '../lib/driver.js';
+import { behaviorGate } from '../lib/behavior-gate.js';
+import { readLiveRuntimeCap, invokeLive } from '../lib/liveRuntime.js';
+import { queryTestEvents, isEventLogSeamAvailable, resetTestSeam } from '../lib/event-log-query.js';
+describe('agent-live-structured-output (RFC 0077 §B step 6)', () => {
+  it('fails the invocation on a result that violates handoff.returnSchemaRef', async () => {
+    const cap = await readLiveRuntimeCap();
+    // structuredOutput is a sub-flag of a supported liveRuntime; gate on both.
+    const advertised = cap?.supported === true && cap?.structuredOutput === true;
+    if (!behaviorGate('openwop-live-structured-output', advertised)) return;
+    if (!(await isEventLogSeamAvailable())) return; // soft-skip
+    const res = await invokeLive({
+      source: 'run-api',
+      returnSchemaRef: 'conformance-strict-handoff',
+      forceInvalidResult: true,
+    });
+    if (res === null || !res.runId) return; // seam/hook absent — soft-skip
+    // The invocation was accepted, so the test seam is reset on EVERY exit path below.
+    // A failing `expect` throws, which would otherwise skip the reset and leave this
+    // run's seam state behind for later scenarios.
+    try {
+      const q = await queryTestEvents(res.runId, { type: 'agent.invocation.completed' });
+      if (!q.ok || !q.events[0]) return;
+      // Judge the LAST completed event returned for the run.
+      const payload = q.events[q.events.length - 1]!.payload;
+      expect(
+        payload.outcome === 'failed',
+        driver.describe('RFC 0077 §B step 6', 'a result violating handoff.returnSchemaRef MUST fail the invocation (outcome "failed"), not ship as completed'),
+      ).toBe(true);
+      expect(
+        payload.schemaValidated !== true,
+        driver.describe('RFC 0077 §B step 6', 'schemaValidated MUST NOT be true for a schema-violating result'),
+      ).toBe(true);
+    } finally {
+      await resetTestSeam();
+    }
+  });
+});

package/src/scenarios/agent-loop-iteration-monotonic.test.ts ADDED Viewed

@@ -0,0 +1,33 @@
+/**
+ * agent-loop-iteration-monotonic — RFC 0061 §B. Across a multi-turn loop,
+ * `runOrchestrator.decided.iteration` increments 1, 2, 3 … exactly once per turn
+ * (1-based, monotonic) — the observable counter `maxLoopIterations` bounds.
+ *
+ * Gated on `executionModel.version >= 5` + the host agent-loop seam; soft-skips
+ * when either is absent.
+ *
+ * @see RFCS/0061-agent-loop-lifecycle.md §B
+ */
+import { describe, it, expect } from 'vitest';
+import { driver } from '../lib/driver.js';
+import { readExecutionModelCap, isVersion5, invokeAgentLoop } from '../lib/agentLoop.js';
+describe('agent-loop-iteration-monotonic (RFC 0061 §B)', () => {
+  it('iteration increments by exactly 1 per orchestrator turn, 1-based', async () => {
+    const executionModel = await readExecutionModelCap();
+    if (!isVersion5(executionModel)) return;
+    const loopResult = await invokeAgentLoop({ turns: 3 });
+    if (loopResult === null) return; // seam absent — soft-skip
+    const decisions = loopResult.decisions ?? [];
+    expect(
+      decisions.length > 0,
+      driver.describe('RFC 0061 §B', 'a multi-turn loop MUST emit one runOrchestrator.decided per turn'),
+    ).toBe(true);
+    // Compare the observed counters with 1..N as serialized JSON so the whole sequence is checked in one assertion.
+    const observed = JSON.stringify(decisions.map((decision) => decision.iteration));
+    const wanted = JSON.stringify(decisions.map((_, position) => position + 1));
+    expect(
+      observed,
+      driver.describe('RFC 0061 §B', 'iteration MUST be 1-based + monotonic, incrementing by exactly 1 per turn'),
+    ).toBe(wanted);
+  });
+});

package/src/scenarios/agent-loop-stateful-resume.test.ts ADDED Viewed

@@ -0,0 +1,28 @@
+/**
+ * agent-loop-stateful-resume — RFC 0061 §D. A loop suspended on a clarify/escalate
+ * HITL interrupt resumes at the SAME iteration — the counter does not reset or
+ * skip — with the snapshot lineage intact.
+ *
+ * Gated on `executionModel.statefulResume: true` + the host agent-loop seam;
+ * soft-skips when either is absent.
+ *
+ * @see RFCS/0061-agent-loop-lifecycle.md §D
+ */
+import { describe, it, expect } from 'vitest';
+import { driver } from '../lib/driver.js';
+import { readExecutionModelCap, invokeAgentLoop } from '../lib/agentLoop.js';
+describe('agent-loop-stateful-resume (RFC 0061 §D)', () => {
+  it('a mid-loop suspend resumes at the same iteration, counter intact', async () => {
+    const executionModel = await readExecutionModelCap();
+    const resumeAdvertised = executionModel?.statefulResume === true;
+    if (!resumeAdvertised) return;
+    // Suspend at turn 2, then resume: the resumed iteration MUST be 2, not 1 or 3.
+    const suspendTurn = 2;
+    const loopResult = await invokeAgentLoop({ turns: 4, suspendAtTurn: suspendTurn, resume: true });
+    if (loopResult === null) return; // seam absent — soft-skip
+    expect(
+      loopResult.resumedIteration,
+      driver.describe('RFC 0061 §D', 'a stateful resume MUST continue at the suspend iteration — the counter does not reset or skip'),
+    ).toBe(suspendTurn);
+  });
+});

package/src/scenarios/agent-loop-version5-shape.test.ts ADDED Viewed

@@ -0,0 +1,41 @@
+/**
+ * agent-loop-version5-shape — RFC 0061 §A/§B. The `executionModel.statefulResume`
+ * + `transcriptWindow` advertisement fields are well-formed when present, and a
+ * host advertising `version >= 5` carries a sane version ceiling.
+ *
+ * Status: ACTIVE (advertisement-shape; always runs). Behavioral coverage lives
+ * in the sibling agent-loop-*.test.ts scenarios, gated on `version >= 5` + the
+ * host agent-loop seam.
+ *
+ * @see RFCS/0061-agent-loop-lifecycle.md §A
+ * @see spec/v1/multi-agent-execution.md §"Stateful agent-loop lifecycle"
+ */
+import { describe, it, expect } from 'vitest';
+import { driver } from '../lib/driver.js';
+import { readExecutionModelCap } from '../lib/agentLoop.js';
+describe('agent-loop-version5-shape: advertisement (RFC 0061 §A)', () => {
+  it('executionModel.statefulResume/transcriptWindow are well-formed when present', async () => {
+    const em = await readExecutionModelCap();
+    if (em === null) return; // no execution model — valid
+    // Every field is optional: each check runs only when the host actually advertises the field.
+    if (em.statefulResume !== undefined) {
+      expect(
+        typeof em.statefulResume,
+        driver.describe('capabilities.schema.json §multiAgent.executionModel', 'statefulResume MUST be a boolean when present'),
+      ).toBe('boolean');
+    }
+    if (em.transcriptWindow !== undefined) {
+      // "Positive integer": Number.isInteger rejects non-numbers, NaN, ±Infinity and fractional
+      // values such as 1.5, which a bare `>= 1` comparison would let through.
+      expect(
+        Number.isInteger(em.transcriptWindow) && (em.transcriptWindow as number) >= 1,
+        driver.describe('capabilities.schema.json §multiAgent.executionModel', 'transcriptWindow MUST be a positive integer when present'),
+      ).toBe(true);
+    }
+    if (typeof em.version === 'number') {
+      expect(
+        (em.version as number) >= 1 && (em.version as number) <= 5,
+        driver.describe('capabilities.schema.json §multiAgent.executionModel', 'version MUST be within the 1–5 ladder'),
+      ).toBe(true);
+    }
+  });
+});

package/src/scenarios/agent-loop-workspace-snapshot.test.ts ADDED Viewed

@@ -0,0 +1,33 @@
+/**
+ * agent-loop-workspace-snapshot — RFC 0061 §C. A workspace PUT during turn i is
+ * invisible to turn i's snapshot and visible to turn i+1 — per-iteration
+ * snapshot immutability (writes land next turn, never retroactively).
+ *
+ * Gated on `executionModel.version >= 5` AND `host.workspace.supported` + the
+ * host agent-loop seam; soft-skips when any is absent.
+ *
+ * @see RFCS/0061-agent-loop-lifecycle.md §C
+ * @see RFCS/0059-agent-workspace.md §D — the workspace read snapshot
+ */
+import { describe, it, expect } from 'vitest';
+import { driver } from '../lib/driver.js';
+import { readExecutionModelCap, isVersion5, hasWorkspace, invokeAgentLoop } from '../lib/agentLoop.js';
+describe('agent-loop-workspace-snapshot (RFC 0061 §C)', () => {
+  it('a turn-i workspace write is invisible to turn i, visible to turn i+1', async () => {
+    const executionModel = await readExecutionModelCap();
+    if (!isVersion5(executionModel)) return;
+    const workspaceAvailable = await hasWorkspace();
+    if (!workspaceAvailable) return; // workspace optional — soft-skip
+    const loopResult = await invokeAgentLoop({ turns: 2, workspaceWriteAtTurn: 1 });
+    if (loopResult === null) return; // seam absent — soft-skip
+    // A missing visibility report falls back to an empty object, so both assertions below fail on `undefined`.
+    const visibility = loopResult.workspaceVisible ?? {};
+    expect(
+      visibility.atWriteTurn,
+      driver.describe('RFC 0061 §C', 'a workspace write during turn i MUST be invisible to turn i\'s snapshot'),
+    ).toBe(false);
+    expect(
+      visibility.atNextTurn,
+      driver.describe('RFC 0061 §C', 'a workspace write during turn i MUST be visible to turn i+1'),
+    ).toBe(true);
+  });
+});