@openwop/openwop-conformance 1.10.0 → 1.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. package/CHANGELOG.md +34 -0
  2. package/README.md +2 -2
  3. package/api/asyncapi.yaml +70 -0
  4. package/api/openapi.yaml +268 -1
  5. package/coverage.md +30 -2
  6. package/fixtures/oauth-providers/synthetic.json +38 -0
  7. package/fixtures.md +10 -0
  8. package/package.json +1 -1
  9. package/schemas/README.md +12 -0
  10. package/schemas/agent-deployment-transition.schema.json +49 -0
  11. package/schemas/agent-deployment.schema.json +54 -0
  12. package/schemas/agent-eval-suite.schema.json +140 -0
  13. package/schemas/agent-inventory-response.schema.json +25 -0
  14. package/schemas/agent-manifest.schema.json +5 -0
  15. package/schemas/agent-org-chart.schema.json +82 -0
  16. package/schemas/agent-ref.schema.json +12 -2
  17. package/schemas/agent-roster-entry.schema.json +81 -0
  18. package/schemas/agent-roster-response.schema.json +21 -0
  19. package/schemas/budget-policy.schema.json +18 -0
  20. package/schemas/capabilities.schema.json +277 -0
  21. package/schemas/credential-provenance.schema.json +18 -0
  22. package/schemas/eval-summary.schema.json +92 -0
  23. package/schemas/node-pack-manifest.schema.json +17 -0
  24. package/schemas/org-chart-responsibility-view.schema.json +26 -0
  25. package/schemas/run-event-payloads.schema.json +286 -3
  26. package/schemas/run-event.schema.json +19 -0
  27. package/schemas/tool-descriptor.schema.json +63 -0
  28. package/schemas/trigger-subscription.schema.json +26 -0
  29. package/src/lib/agentRoster.ts +76 -0
  30. package/src/lib/liveRuntime.ts +59 -0
  31. package/src/lib/profiles.ts +157 -0
  32. package/src/lib/runtimeRequires.ts +38 -0
  33. package/src/lib/safeFetch.ts +87 -0
  34. package/src/scenarios/agent-deployment-shape.test.ts +139 -0
  35. package/src/scenarios/agent-eval-suite-shape.test.ts +167 -0
  36. package/src/scenarios/agent-live-allowlist-enforced.test.ts +53 -0
  37. package/src/scenarios/agent-live-invocation-bracket.test.ts +98 -0
  38. package/src/scenarios/agent-live-runtime-shape.test.ts +98 -0
  39. package/src/scenarios/agent-live-structured-output.test.ts +58 -0
  40. package/src/scenarios/agent-org-chart-shape.test.ts +127 -0
  41. package/src/scenarios/agent-platform-profile.test.ts +158 -0
  42. package/src/scenarios/agent-roster-attribution.test.ts +179 -0
  43. package/src/scenarios/agent-roster-shape.test.ts +146 -0
  44. package/src/scenarios/budget-policy-shape.test.ts +136 -0
  45. package/src/scenarios/egress-provenance-shape.test.ts +137 -0
  46. package/src/scenarios/memory-capability-model-shape.test.ts +186 -0
  47. package/src/scenarios/oauth-authorization-code-roundtrip.test.ts +145 -0
  48. package/src/scenarios/runtime-requires-install-gate.test.ts +92 -0
  49. package/src/scenarios/runtime-requires-shape.test.ts +134 -0
  50. package/src/scenarios/safefetch-behavior.test.ts +99 -0
  51. package/src/scenarios/safefetch-live-audit.test.ts +175 -0
  52. package/src/scenarios/spec-corpus-validity.test.ts +19 -3
  53. package/src/scenarios/tool-descriptor-shape.test.ts +133 -0
  54. package/src/scenarios/trigger-bridge-shape.test.ts +135 -0
  55. package/src/scenarios/x-openwop-form-pack-manifest.test.ts +155 -0
package/coverage.md CHANGED
@@ -1,6 +1,6 @@
1
1
  # OpenWOP Conformance Coverage Map
2
2
 
3
- > **Status: Living document. Updated 2026-05-25.** This map connects the current scenario files to the protocol surfaces they protect and records the remaining gaps from the protocol deep dive. Scenario names are source-of-truth file names under `conformance/src/scenarios/`.
3
+ > **Status: Living document. Updated 2026-05-30.** This map connects the current scenario files to the protocol surfaces they protect and records the remaining gaps from the protocol deep dive. Scenario names are source-of-truth file names under `conformance/src/scenarios/`.
4
4
 
5
5
  > **Shape grade vs behavior grade.** Some optional-profile scenarios validate **capability shape** (the host's discovery advertisement is well-formed) without yet exercising **behavior** (the host actually implements the profile end-to-end). The "Current grade" column reflects shape; see §"Capability-gated scenarios: shape vs behavior" below for the dual-grade view and the `OPENWOP_REQUIRE_BEHAVIOR=true` strict-mode runner flag.
6
6
 
@@ -50,12 +50,23 @@
50
50
  | Sandbox MVP behavioral close-out (RFC 0035 §B) | **`sandbox-mvp-behavior.test.ts` (2026-05-22)** | A (10 capability-gated behavioral assertions covering 7 of 8 §B failure-mode invariants — 5 escape kinds + timeout + memory-exceeded + cross-pack-mutation isolation + capability-gate-violation + 2 well-behaved baselines; all 10 PASS against the workflow-engine's node:vm-based sandbox MVP) | Companion to the existing 8 advertisement-shape sandbox scenarios (`sandbox-no-host-fs-escape.test.ts` et al.). Exercises the canonical 4-code error catalog at `spec/v1/host-capabilities.md` §"Error codes" (`sandbox_escape_attempt` + `sandbox_capability_denied` + `sandbox_memory_exceeded` + `sandbox_timeout`) with spec-mandated `details.{escapeKind, requestedCapability, requestedBytes}` populated. Wire-shape per `spec/v1/host-sample-test-seams.md §8`. Production adopters use wasmtime/nsjail behind the same HTTP test-seam contract. |
51
51
  | RFC 0041 §B replay-divergence-at-refusal behavioral (`version: 4`) | `replay-divergence-at-refusal.test.ts` (advertisement-shape + behavioral; 3 assertions PASS against workflow-engine when the `multiAgent.executionModel.version: 4` advertisement is enabled) | A (was `it.todo` until 2026-05-23 when the executor wiring landed — see commit `1fce55a` + `bba3b4a`. Behavioral assertions cover both divergence directions: original=valid + replay=refusal AND original=refusal + replay=valid) | Closes Track #4 of the 2026-05-22 multi-agent behavioral-harness close-out. Reference workflow-engine emits `replay.divergedAtRefusal` event + fails run with `error.code: 'replay_diverged_at_refusal'` when source vs replay envelope kinds differ at the same nodeId. Gated on `OPENWOP_MULTI_AGENT_EXECUTION_MODEL_PHASE_4=true` AND `run.forkMode === 'replay'`. Path-to-Accepted for RFC 0041: non-steward host advertises `multiAgent.executionModel.version: 4` end-to-end. |
52
52
  | Agent-manifest runtime floor (RFC 0070 — `capabilities.agents.manifestRuntime`) | `agent-manifest-runtime.test.ts` | B (capability-gated; lists ≥1 installed manifest agent + dispatches one with attributed `agent.reasoned`+`agent.decided` events, plus a §F sub-threshold-escalation assertion) | RFC 0070 filed Draft 2026-05-26. Gated on `capabilities.agents.manifestRuntime.supported` + the host dispatch seam (`POST /v1/host/sample/agents/{agentId}/dispatch`); soft-skips when either is absent. The reference **workflow-engine** host advertises `manifestRuntime: { supported: true, handoffValidation: true }`, loads pack `agents[]` (RFC 0003 `installAgents`) into an AgentRegistry at boot, and dispatches end-to-end (toolAllowlist-filtered per RFC 0002 §A14, handoff-validated per RFC 0003 §D, confidence-escalating per §F) — see `apps/workflow-engine/backend/typescript/test/agent-dispatch-route.test.ts` (6 HTTP assertions, incl. the normative inventory). **RFC 0072 (`Draft`):** the scenario's inventory leg now drives the NORMATIVE `GET /v1/agents` (§A) so it runs black-box against any conformant host; the dispatch leg stays on the sample seam (soft-skips off-steward) pending the executor-integration tier. RFC 0072 §C `peerDependenciesMeta` disposition + `degraded[]` are unit-tested in `agent-loader.test.ts`. Path to `Active → Accepted` (RFC 0070): a non-steward host advertises `manifestRuntime` + serves `GET /v1/agents`. |
53
+ | Live manifest dispatch (RFC 0077 — `capabilities.agents.liveRuntime`) | `agent-live-runtime-shape.test.ts` | A (always-on, server-free shape probe) | RFC 0077 promoted Draft → Active 2026-05-29 (5 UQs resolved via MyndHyve T4 co-design). Always-on shape probe asserts `capabilities.agents.liveRuntime` (+ `supported`/`structuredOutput`/`confidenceEscalation`/`sources` sub-flags) is declared, the `agentInvocationStarted`/`agentInvocationCompleted` payloads validate conforming content-free records + reject malformed ones (`started` missing `source`; `completed` out-of-enum `outcome`), and both event names appear in the RunEventType enum. `liveRuntime` ⊃ `manifestRuntime`. **Behavioral scenarios deferred** per RFC 0077 §Conformance (reference host): the started→completed bracket ordering, `structuredOutput` enforcement, and `toolAllowlist` enforcement gate on `capabilities.agents.liveRuntime.supported` + a live-invoke seam and soft-skip until a host wires it. Path to `Accepted`: a non-steward host advertises `liveRuntime` + emits the invocation pair (net-new MyndHyve T4 work, queued behind §B). |
54
+ | Agent evaluation (RFC 0081 — `capabilities.agents.evalSuite`) | `agent-eval-suite-shape.test.ts` | A (always-on, server-free shape probe; doubles as the public test for `eval-summary-no-content-leak`) | RFC 0081 promoted Draft → Active 2026-05-30. Always-on shape probe asserts `capabilities.agents.evalSuite` (+ `supported`/`modes` sub-flags) is declared; the `AgentEvalSuite` + `EvalSummary` schemas compile + round-trip a conforming artifact and reject malformed ones (bad `suiteId` infix; `passScore` out of 0..1; out-of-range `aggregateScore`); the `eval.started`/`eval.scored`/`eval.completed` payloads validate content-free records + reject malformed ones; and all three event names appear in the RunEventType enum. The **content-free negatives** (an `EvalSummary` task entry carrying a `taskOutput` body; a `safetyFinding` carrying an `excerpt`) are the public test for protocol-tier SECURITY invariant `eval-summary-no-content-leak`. **Behavioral scenarios deferred** per RFC 0081 §Conformance (reference host): `agent-eval-run.test.ts` (the `eval.started`→per-task `eval.scored`→`eval.completed` ordering, the `EvalSummary` round-trip, the `mode: "eval"` 501 on unadvertised hosts) gates on `capabilities.agents.evalSuite.supported` + the eval-run seam and soft-skips until a host wires the eval projection. Path to `Accepted`: a host advertises `evalSuite` + runs a golden/regression suite end-to-end (the `GET /v1/runs/{runId}/eval-summary` endpoint + SDK helper land with it). |
55
+ | Memory capability model (RFC 0080 — `spec/v1/agent-memory.md` §"Memory capability model", `spec/v1/profiles.md` §`openwop-memory`) | `memory-capability-model-shape.test.ts` | A (always-on, server-free shape probe) | RFC 0080 promoted Draft → Active 2026-05-30 (4 UQs resolved via MyndHyve review). Always-on shape probe asserts the additive `capabilities.memory.{writable,search,retention}` dimensions are declared (existing `supported`/`compaction`/`distillation`/`attribution` untouched), `memory.search`/`memory.retention` validate conforming instances + reject malformed ones (`retention.ttl` non-boolean; out-of-enum `search.modes`; unknown property under `additionalProperties:false`), `agent-inventory-response` declares `memoryDegraded` + the closed-enum `degradedMemoryDimensions` (the eight §A dimension names), and `deriveProfiles` surfaces `openwop-memory` for a read/write + long-term host while withholding it from a `writable:false` host. **Behavioral scenario deferred** per RFC 0080 §Conformance: `memory-degraded-projection.test.ts` (a live `GET /v1/agents` stamping `memoryDegraded` when an agent's `memoryShape` exceeds the host's reconciled model) gates on `agents.manifestRuntime` + `memory` and soft-skips until a reference host computes it. Path to `Accepted`: a host computes the §C degraded projection + the scenario passes against it. |
56
+ | Portable tool catalog (RFC 0078 — `spec/v1/tool-catalog.md`) | `tool-descriptor-shape.test.ts` | A (always-on, server-free shape probe) | RFC 0078 promoted Draft → Active 2026-05-30 (4 UQs resolved via MyndHyve review). Always-on shape probe asserts `tool-descriptor.schema.json` round-trips a conforming `ToolDescriptor` + rejects the malformed (`safetyTier`-required, `additionalProperties:false`), enforces the §C-1/§F-4 cross-field MUST (`safetyTier:"exec"` ⇒ `source:"host-extension"`, RFC 0069 — an `exec`+`node-pack` descriptor is rejected), asserts the `capabilities.toolCatalog` `supported`/`sources`/`sessionLifecycle` shape, and validates the two content-free `tool.session.{opened,closed}` payloads (incl. the closed `outcome` enum) + their RunEventType-enum membership. **Behavioral scenarios deferred** per RFC 0078 §Conformance: `tool-catalog-projection.test.ts` (the authorization-scoped `GET /v1/tools` + `404` non-disclosure) + `tool-session-lifecycle.test.ts` (the `tool.session.*` bracket ordering) gate on `capabilities.toolCatalog.supported` + `sessionLifecycle` and soft-skip until a reference host serves the catalog. Path to `Accepted`: a host projects ≥1 tool source at `GET /v1/tools` + the projection scenario passes. |
57
+ | Credential provenance + egress policy (RFC 0079 — `spec/v1/host-capabilities.md` §"Credential provenance + egress policy") | `egress-provenance-shape.test.ts` | A (always-on, server-free shape probe; doubles as the public test for `egress-decision-no-secret-leak`) | RFC 0079 promoted Draft → Active 2026-05-30 (4 UQs resolved via MyndHyve review). Always-on shape probe asserts `credential-provenance.schema.json` round-trips a conforming `CredentialProvenance` + rejects `audiences:[]` / missing `credentialId` / unknown property, the descriptor + `egress.decided` declare NO secret-value property (the content-free **`egress-decision-no-secret-leak`** protocol-tier invariant), the `egress.decided` payload validates a content-free record + enforces the `decision` enum + required `decision`/`destination`, and `capabilities.httpClient.egressPolicy` is declared. **The behavioral audience-binding MUST-NOT (`egress-credential-audience-bound`) is reference-impl tier** at Draft→Active — a credential bound to audience A on an egress to B must be `denied`/`downgraded` (never `allowed`-with-credential), fail-closed on unevaluable provenance — and lands in the gated `egress-audience-binding.test.ts` + `egress-decision-content-free.test.ts` (soft-skip until a host wires `egressPolicy` over `safeFetch`). Path to `Accepted`: a reference host enforces §C + the binding scenario passes → `egress-credential-audience-bound` graduates protocol-tier (RFC 0035 precedent). |
58
+ | Durable trigger + channel bridge (RFC 0083 — `spec/v1/trigger-bridge.md`, `spec/v1/profiles.md` §`openwop-trigger-bridge`) | `trigger-bridge-shape.test.ts` | A (always-on, server-free shape probe) | RFC 0083 promoted Draft → Active 2026-05-30 (5 UQs resolved via MyndHyve review). Always-on shape probe asserts `trigger-subscription.schema.json` round-trips a conforming `TriggerSubscription` + rejects missing-`state`/out-of-enum-`source`/unknown-property, the four-state vocab (`active`/`paused`/`failed`/`dead-lettered`) is stable, the two content-free `trigger.{subscription.state.changed,delivery.attempted}` payloads validate + enforce the `state`/`outcome` enums + RunEventType-enum membership, `capabilities.triggerBridge` + `webhooks.durable` are declared, and `deriveProfiles` surfaces `openwop-trigger-bridge` for bridge+sink+durable-source while withholding it with no dead-letter sink. **Behavioral scenario deferred** per RFC 0083 §Conformance: `trigger-bridge-delivery.test.ts` (dedup → retry → dead-letter → trigger→run causation) is profile-gated on `openwop-trigger-bridge` and soft-skips until a reference host wires durable delivery. Path to `Accepted`: a host wires the state machine + delivery loop + the scenario passes. |
59
+ | Budget, quota + cost policy (RFC 0084 — `spec/v1/budget-policy.md`) | `budget-policy-shape.test.ts` | A (always-on, server-free shape probe; doubles as the public test for `budget-no-pricing-leak`) | RFC 0084 promoted Draft → Active 2026-05-30 (5 UQs resolved via MyndHyve review). Always-on shape probe asserts `budget-policy.schema.json` round-trips a conforming `BudgetPolicy` + enforces the §A/§E orthogonality guard (a wall-time field is rejected — that's RFC 0058's `runTimeoutMs`) + threshold/onExhaustion negatives, the four content-free `budget.{reserved,consumed,threshold.crossed,exhausted}` payloads validate + enforce the `dimension`/`scope` enums, the four `cap.breached{budget-tokens,budget-cost,budget-tool-calls,budget-retries}` kinds + the four `budget.*` RunEventType-enum entries are present, the payloads declare no pricing/credential property (the **`budget-no-pricing-leak`** protocol-tier invariant), and `capabilities.budget` + `limits.maxBudget{Tokens,CostUsd}` are declared. **Behavioral scenario deferred** per RFC 0084 §Conformance: `budget-enforcement.test.ts` (accrue → threshold → exhaust → `cap.breached{budget-cost}` → `run.failed{budget_exhausted}`; `budget_model_denied`; advisory no-stop) gates on `budget.supported` + `enforce` and soft-skips until a reference host wires accounting. Path to `Accepted`: a host wires the budget accumulator + the exhaustion stop + the scenario passes. |
60
+ | `openwop-agent-platform` meta-profile (RFC 0085 — `spec/v1/agent-platform-profile.md`, operational annex) | `agent-platform-profile.test.ts` | A (always-on, server-free derivation probe) | RFC 0085 promoted Draft → Active 2026-05-30 (5 UQs resolved via MyndHyve review). Operational annex (NOT a closed `profiles.md` catalog predicate). Always-on derivation probe asserts `isAgentPlatformPartial`/`isAgentPlatformFull`/`agentPlatformStatus` derive `none`/`partial`/`full` correctly: all-floor ⇒ partial, a missing floor flag ⇒ none, the replay-OR-`nondeterminismPolicy.declared` term, floor+governance ⇒ full, a missing governance term (tenant `installScope`) ⇒ partial-not-full (the honest-advertisement rule), and that the eval/deploy/budget platform-plus tier is advisory (a full host without them is still full); plus `capabilities.nondeterminismPolicy.declared` is declared. **Live aggregate-evidence assertion deferred** per RFC 0085 §C: when a host claims `full`, the meta-scenario must assert every required constituent scenario is in its passing set — naturally gated on a reference host reaching partial/full (Postgres is the candidate). Path to `Accepted`: ≥1 host reports `partial`/`full` backed by the aggregate + renders the badge. |
61
+ | Agent deployment lifecycle (RFC 0082 — `capabilities.agents.deployment`) | `agent-deployment-shape.test.ts` | A (always-on, server-free shape probe; doubles as the public test for `deployment-event-no-content-leak`) | RFC 0082 promoted Draft → Active 2026-05-30. Always-on shape probe asserts `capabilities.agents.deployment` (+ `supported`/`channels`/`canary`/`rollback`/`states` sub-flags) is declared; the `AgentDeployment` record compiles + round-trips and rejects malformed ones (out-of-enum `state`; `canaryPercent` out of 0..100); the **`AgentRef` `channel` XOR `version`** rule (each alone + neither validate; both rejected by the `not` clause, §A); the four `deployment.*` payloads validate content-free records + reject malformed ones; `agent.invocation.started` carries the additive recorded-fact `resolvedAgentVersion`/`resolvedChannel` (§B channel pin); and all four event names appear in the RunEventType enum. The **content-free negatives** (a `deployment.promoted` carrying a `manifestBody`; a `deployment.state.changed` carrying a `prompt`) are the public test for protocol-tier SECURITY invariant `deployment-event-no-content-leak`. The behavioral `deployment-promotion-fail-closed` invariant is `reference-impl` tier until the behavioral scenario lands (then graduates to protocol; RFC 0035 precedent). **Behavioral scenarios deferred** per RFC 0082 §Conformance (reference host): `agent-deployment-lifecycle.test.ts` (authz → approvalGate → eval-verify → `deployment.promoted`; the fail-closed denial; the §B replay re-read of `resolvedAgentVersion`) gates on `capabilities.agents.deployment.supported` + the deployment-store seam and soft-skips until a host wires it. Path to `Accepted`: a host implements the deployment store + canary router + the `POST /v1/agents/{agentId}/deployments` promotion contract (the endpoint + SDK helper land with it). |
62
+ | Standing agent roster (RFC 0086 — `capabilities.agents.roster`) | `agent-roster-shape.test.ts` | A (always-on, server-free shape probe; doubles as the public test for `roster-attribution-no-content`) | RFC 0086 promoted Draft → Active 2026-05-30. Always-on shape probe asserts `capabilities.agents.roster` (+ `supported`/`installScope`/`portfolioTriggerSources` sub-flags) is declared; the `AgentRosterEntry` record compiles + round-trips and rejects malformed ones (a non-`host:` `rosterId`; an `agentRef` carrying BOTH `version` and `channel` — the RFC 0082 §A XOR rule; a missing `rosterId`); the `roster.run.initiated` payload validates a content-free attribution record + requires its ids/persona/triggerSource; the `AgentInventoryEntry` carries the additive optional `roster` portfolio projection (§B); and `roster.run.initiated` appears in the RunEventType enum. The **content-free negatives** (a `roster.run.initiated` carrying a `body`; one carrying a `prompt`) are the public test for protocol-tier SECURITY invariant `roster-attribution-no-content`. **Behavioral scenarios deferred** per RFC 0086 §Conformance (reference host): a scheduled portfolio fire emitting `roster.run.initiated` before `agent.invocation.started`; the RFC 0083 work-item causation chain; the replay re-read; the cross-tenant `GET /v1/agents/roster/{id}` 404 — gate on `capabilities.agents.roster.supported` + the roster-store seam and soft-skip until a host wires it (the host-extension at `/v1/host/sample/roster` + board attribution, apps/workflow-engine #368, is the reference demonstration). Path to `Accepted`: a non-steward host advertises `agents.roster` + emits `roster.run.initiated`. |
63
+ | Agent org-chart (RFC 0087 — `capabilities.agents.orgChart`) | `agent-org-chart-shape.test.ts` | A (always-on, server-free shape probe; doubles as the public test for `org-position-no-authority-escalation`) | RFC 0087 promoted Draft → Active 2026-05-30. Always-on shape probe asserts `capabilities.agents.orgChart` (+ `supported`/`installScope`/`departmentNesting`/`responsibilityView` sub-flags) is declared; the `AgentOrgChart` record compiles + round-trips and rejects malformed ones (a non-`host:` member `rosterId`; a chart missing `members`). The **§B structural non-authority guarantee**: the schema **rejects** an authority-bearing field on a member (`scopes`/`canDispatch`/`permissions`/`authority` — every object is `additionalProperties:false`), and a conforming member's key set is exactly `{rosterId, departmentId, roleId, reportsTo}`. These are the public test for protocol-tier SECURITY invariant `org-position-no-authority-escalation` (an org edge confers no authority — position describes, never authorizes). NO new RunEventType (the org-chart is structure + a read, not an event surface). **Behavioral scenarios deferred** per RFC 0087 §Conformance (reference host): the live-dispatch refusal of a manager's tool over-reach; an RFC 0049 authorization decision that is invariant to org position; the cross-tenant `GET /v1/agents/org-chart` 404; the §D responsibility roll-up over live roster portfolios — gate on `capabilities.agents.orgChart.supported` + the org-store seam and soft-skip until a host wires it (the host-extension at `/v1/host/sample/org-chart`, apps/workflow-engine #371, is the reference demonstration). Path to `Accepted`: a non-steward host advertises `agents.orgChart` + passes the behavioral non-authority scenario. |
53
64
 
54
65
  ---
55
66
 
56
67
  ## Capability-gated scenarios: shape vs behavior
57
68
 
58
- Twenty-two scenario groups validate optional profiles where the host's discovery advertisement is well-formed (shape grade) but no reference host yet implements the profile end-to-end (behavior grade is `host-pending`). Default suite runs skip these with a warning; set `OPENWOP_REQUIRE_BEHAVIOR=true` to convert skips into hard failures.
69
+ Twenty-eight scenario groups validate optional profiles where the host's discovery advertisement is well-formed (shape grade) but no reference host yet implements the profile end-to-end (behavior grade is `host-pending`). Default suite runs skip these with a warning; set `OPENWOP_REQUIRE_BEHAVIOR=true` to convert skips into hard failures.
59
70
 
60
71
  | Scenario | Profile / capability | Shape grade | Behavior grade | Behavior-unlock dependency |
61
72
  |---|---|---|---|---|
@@ -81,6 +92,7 @@ Twenty-two scenario groups validate optional profiles where the host's discovery
81
92
  | `credential-payload-redaction.test.ts` | `capabilities.credentials` (RFC 0046) + `SECURITY/invariants.yaml` `credential-payload-redaction` | A (advertisement shape always; redaction MUST-NOT via optional `POST /v1/host/sample/credentials/echo` seam — canary plaintext absent from all observable surfaces) | `host-pending` | Capability-gated on `credentials.supported`; behavioral probe soft-skips on 404 when the seam is unwired, mirroring `fs-path-traversal`. |
82
93
  | `oauth-capability-shape.test.ts` | `capabilities.oauth` (RFC 0047, `host-capabilities.md` §host.oauth) | A (advertisement shape always — `supported` boolean; `grants` ⊆ {authorization_code,client_credentials,refresh_token}; every `providers[].id` non-empty) | `host-pending` | Always runs; asserts the block is absent or well-formed. No host advertises `capabilities.oauth` yet (RFC 0047 `Draft`). |
83
94
  | `oauth-connector-redaction.test.ts` | `capabilities.oauth` (RFC 0047) + `SECURITY/invariants.yaml` `credential-payload-redaction` | A (advertisement shape always; token-material redaction via optional `POST /v1/host/sample/oauth/connector-echo` seam — canary token absent from all observable surfaces; `connector.authorized` carries the ref not the token) | `host-pending` | Capability-gated on `oauth.supported`; behavioral probe soft-skips on 404. Reuses the RFC 0046 redaction invariant (OAuth tokens are stored as host.credentials entries). |
95
+ | `oauth-authorization-code-roundtrip.test.ts` | `capabilities.oauth` (RFC 0047 §C) + `SECURITY/invariants.yaml` `credential-payload-redaction` | A (capability-gated on `oauth.supported` + `grants` ∋ `authorization_code`; behavioral roundtrip via optional `POST /v1/host/sample/oauth/authorize-code-roundtrip` seam against the one canonical synthetic provider in `fixtures/oauth-providers/synthetic.json` — returns a credential REFERENCE; the authorization code / state / PKCE verifier / acquired access+refresh tokens are absent from every observable surface; `connector.authorized` carries the ref not the token) | `host-pending` | Capability+grant-gated; behavioral probe soft-skips on 404. Closes the RFC 0047 Tier-2 gap — exercises the actual authorization-code dance (shape + redaction scenarios existed; the grant itself was unexercised). |
84
96
  | `connector-manifest-validity.test.ts` | `node-pack-manifest.schema.json` §Connector (RFC 0045, `node-packs.md` §Connectors) | Server-free (schema validity of the `connector` block incl. both ConnectorAuth variants + positive/negative round-trip; §B action/trigger typeId-resolution semantics — `connector_action_unresolved` on an unknown typeId) | host-pass (server-free) | Always runs; no host needed. Behavioral idempotency-hint + rate-limit-honored scenarios deferred until a host advertises a connector. |
85
97
  | `identity-owner-shape.test.ts` | `run-snapshot.schema.json` properties.owner (RFC 0048 §C, `auth.md` §Identity claims) | Server-free (owner triple schema validity: positive `{tenant}` + full triple; negative missing-tenant + unknown-prop) | host-pass (server-free) | Always runs; no host needed. |
86
98
  | `cross-workspace-isolation.test.ts` | RFC 0048 §C/§D (`auth.md` §Identity claims, `rest-endpoints.md` `run_forbidden`) | A (owner-echo shape if a sample run is readable; §D isolation MUST-NOT via optional `POST /v1/host/sample/identity/cross-workspace-read` seam — cross-workspace read fails closed with `run_forbidden`/`not_found`) | `host-pending` | Behavioral probe soft-skips on 404; no host advertises run ownership yet (RFC 0048 `Draft`). |
@@ -88,6 +100,14 @@ Twenty-two scenario groups validate optional profiles where the host's discovery
88
100
  | `authorization-fail-closed.test.ts` | `capabilities.authorization` (RFC 0049 §C) + `SECURITY/invariants.yaml` `authorization-fail-closed` | A (advertisement `failClosed===true` always; fail-closed MUST-NOT via optional `POST /v1/host/sample/authorization/decide` seam — an unseeded-role principal resolves `allowed:false`) | `host-pending` | Capability-gated on `authorization.supported`; behavioral probe soft-skips on 404. Scope-match + denial-audited scenarios deferred to a host. |
89
101
  | `auth-saml-profile.test.ts` | `openwop-auth-saml` (RFC 0050, `auth-profiles.md` §`openwop-auth-saml`) | A+B (profile-advertisement shape always; **1-positive + 6-negative reference suite runs server-free** via the bundled synthetic IdP `conformance/src/lib/saml-idp.ts` — `alg:none`/unsigned/bad-sig/expired/not-yet-valid/wrapping; host-ACS validation opt-in via `OPENWOP_TEST_SAML_IDP_URL` + the `auth/saml/validate` seam) | host-pass (server-free reference) | Synthetic IdP bundled (`node:crypto`, no deps). Host-ACS pass is the remaining graduation gate. |
90
102
  | `auth-scim-profile.test.ts` | `openwop-auth-scim` (RFC 0050, `auth-profiles.md` §`openwop-auth-scim`) | B (profile-advertisement shape always; SCIM user/group provisioning → principal/role roundtrip opt-in via `OPENWOP_TEST_SCIM_URL` + the `auth/scim/provision` seam) | `host-pending` | Behavior opt-in (operator-supplied SCIM endpoint); deactivate ⇒ subsequent-deny assertion deferred to a host. |
103
+ | `runtime-requires-shape.test.ts` | `node-pack-manifest.schema.json` `$defs/Runtime.requires` (RFC 0076 §A, `node-packs.md` §"Runtime platform requirements") | Server-free (closed-vocabulary validation: 8 tokens validate; raw builtin `node:dns/promises` rejected → `invalid_manifest`; empty-array≡omission; `uniqueItems`) | host-pass (server-free) | Always runs; no host needed. The install-time GATE behavior is in `runtime-requires-install-gate.test.ts`. |
104
+ | `runtime-requires-install-gate.test.ts` | RFC 0076 §A install gate (no capability flag; seam `POST /v1/host/sample/packs/install-gate`) | A (install-grant; install-refuse → `pack_runtime_requirement_unmet { unmet, manifest, advice? }`; non-sandbox SHOULD-projection — all via the optional seam) | `host-pending` | Seam-gated; soft-skips on 404. First adopter: MyndHyve's install-time gate against `core.openwop.http` declaring `["net.dns","net.outbound"]`. |
105
+ | `safefetch-behavior.test.ts` | `capabilities.httpClient.safeFetch` (RFC 0076 §B, `host-capabilities.md` §host.http) + `SECURITY/invariants.yaml` `http-client-ssrf-guard` | A (SSRF block + DNS-rebinding defeat + `Connection: upgrade` refusal + tool-hooks audit-when-`prePostEvents`, via the optional `POST /v1/host/sample/http/safe-fetch` seam) | in-memory ✅ | **5/5 PASS against the in-memory reference host** (2026-05-29, `OPENWOP_REQUIRE_BEHAVIOR=true`); seam-gated, soft-skips on 404 elsewhere. Reuses the existing `http-client-ssrf-guard` invariant (no new invariant). Advertisement contract in `http-client-ssrf.test.ts`. §B → Accepted still awaits `core.openwop.http@2.0.0` consumer + non-steward adoption. |
106
+ | `safefetch-live-audit.test.ts` | `capabilities.httpClient.safeFetch` + `capabilities.toolHooks.prePostEvents` (RFC 0076 §B, `host-capabilities.md` §host.http; RFC 0064 §B) | A (the audit-when-both MUST against the **durable run event log** — production `ctx.http.safeFetch` path — via the `POST /v1/host/sample/http/safe-fetch-run` open seam + the test event-log seam; asserts a `callId`-paired `agent.toolCalled` `transport:"http"` / `agent.toolReturned` was persisted) | `host-pending` | `behaviorGate('openwop-safefetch-live-audit', …)` — both-flags-advertised ⇒ FAIL under `OPENWOP_REQUIRE_BEHAVIOR=true` if the durable pair is absent (closes the seam-vs-production gap: a production `createSafeFetch()` with no audit hooks passes `safefetch-behavior.test.ts` but fails this). Asserts the pair on a guaranteed-**blocked** metadata URL (egress-independent floor — "every invocation" incl. refused, so no vacuous pass on an egress-blocked host) + best-effort public fetch for success-path coverage. Run seam soft-skips on 404 (host-pending). No new invariant. **This is the RFC 0076 §B → Accepted bar.** |
107
+ | `agent-roster-attribution.test.ts` | `capabilities.agents.roster.supported` (RFC 0086 §B/§C, `agent-roster.md`) | A (the normative `GET /v1/agents/roster` read shape + `total==roster.length`; the §C `roster.run.initiated`-before-`agent.invocation.started` ordering + the content-free payload backing `roster-attribution-no-content`; the durable work-item `triggerSubscriptionId` (RFC 0083); the RFC 0074 cross-tenant 404 via `OPENWOP_CROSS_TENANT_ROSTER_ID` — driven through `POST /v1/host/sample/roster/fire` + the test event-log seam) | `host-pending` | `behaviorGate('openwop-roster-attribution', …)`. Normative-read leg runs black-box on any roster host; the attribution/ordering legs are seam-gated and soft-skip on 404. **This is the RFC 0086 → Accepted bar.** First adopter: MyndHyve `agents.roster`. |
108
+ | `agent-live-invocation-bracket.test.ts` | `capabilities.agents.liveRuntime.supported` (RFC 0077 §E, `multi-agent-execution.md` §"Live manifest dispatch") | A (the §E bracket — `agent.invocation.started`-first / `agent.invocation.completed`-last, matching `invocationId`, `source`/`outcome` closed enums, both content-free — via `POST /v1/host/sample/agents/live-invoke` + the test event-log seam) | `host-pending` | `behaviorGate('openwop-live-invocation-bracket', …)`. Seam-gated; soft-skips on 404. Part of the RFC 0077 → Accepted bar. First adopter: MyndHyve `agents.liveRuntime`. |
109
+ | `agent-live-structured-output.test.ts` | `capabilities.agents.liveRuntime.structuredOutput` (RFC 0077 §B step 6) | A (a terminal result violating `handoff.returnSchemaRef` fails the invocation `outcome:"failed"` + `schemaValidated != true`, not a shipped completion — via the `forceInvalidResult` seam param) | `host-pending` | `behaviorGate('openwop-live-structured-output', …)`; gated on `liveRuntime.supported` + `structuredOutput`. Seam-gated; soft-skips on 404. |
110
+ | `agent-live-allowlist-enforced.test.ts` | `capabilities.agents.liveRuntime.supported` (RFC 0077 §F-1 / RFC 0002 §A14 `toolAllowlist`) | A (a tool outside the agent `toolAllowlist` is not callable — no `agent.toolCalled` for the disallowed tool — via the `attemptTool` seam param) | `host-pending` | `behaviorGate('openwop-live-allowlist-enforced', …)`. Seam-gated; soft-skips on 404. Part of the RFC 0077 → Accepted bar. |
91
111
  | `approval-gate-events.test.ts` | `approval.granted` / `.rejected` / `.overridden` (RFC 0051 §B, `interrupt-profiles.md` §approvalGate) | Server-free (event-payload schema validity: required fields incl. mandatory `overridden.reason`; additionalProperties:false negatives) | host-pass (server-free) | Always runs; no host needed. |
92
112
  | `approval-gate-flow.test.ts` | `core.openwop.governance.approvalGate` (RFC 0051 §A) + `capabilities.authorization` (RFC 0049) | A (capability-gated on `authorization.supported`; unauthorized-principal-denied + override-audited via the `governance/approval-gate` seam) | `host-pending` | Behavioral probe soft-skips on 404. Grant/reject-loopback/quorum scenarios deferred until a governance host wires the seam. |
93
113
  | `scheduling-capability-shape.test.ts` | `capabilities.scheduling` (RFC 0052 §A, `host-capabilities.md` §host.scheduling) | A (advertisement shape always — `supported` boolean; `cron`/`delayed`/`calendar` booleans; `maxFutureHorizon` ISO-8601 duration) | `host-pending` | Always runs; asserts the block is absent or well-formed. |
@@ -103,6 +123,7 @@ Twenty-two scenario groups validate optional profiles where the host's discovery
103
123
  | `artifact-type-pack-install.test.ts` | `host.artifactTypes` (RFC 0071 Phase 1, `host-capabilities.md` §host.artifactTypes) | A (install + produce → `artifact.created { registered: true }` after schema validation; schema-violating payload rejected, via the `POST /v1/host/sample/artifacttypes/{install,produce}` seam) | MyndHyve ✅ · in-memory ✅ | Capability-gated on `host.artifactTypes.supported`. PASS against MyndHyve `workflow-runtime-00396-cuj` (production) AND the in-memory reference host (store-only seam, RFC 0075 P2-1); soft-skips on hosts that don't advertise. |
104
124
  | `artifact-type-store-without-render.test.ts` | `host.artifactTypes` (RFC 0071 §host.artifactTypes — store/render negotiation) | A (a `store:true,render:false` host stores the artifact + completes the run; never fails for lack of a renderer) | in-memory ✅ | **Exercised end-to-end (RFC 0075 P2-1):** the in-memory reference host advertises `host.artifactTypes { store:true, render:false }` and implements the produce seam — a registered, schema-valid artifact is stored and the run completes with `rendered:false`. MyndHyve (`render:true`) honestly soft-skips this path; the in-memory store-only host is the one that actually verifies the negotiation. |
105
125
  | `chat-card-pack-manifest-validation.test.ts` | `chat-card-pack-manifest.schema.json` (RFC 0071 Phase 2, `chat-card-packs.md` §"Manifest format") | Server-free (positive + 5 negatives: mixed-kind / empty `cards` / bad `cardTypeId` / missing `prompt` / non-portable `inputs[].type`; + positive `vendor.*` extension + the full portable `inputs[].type` subset incl. `multiselect`/`file`, G9) | host-pass (server-free) | Always runs; no host needed. Behavioral execution lives in the sibling scenario below. |
126
+ | `x-openwop-form-pack-manifest.test.ts` | `node-packs.md` §"`x-openwop-form` UX hints" (RFC 0066) | Server-free (positive: an annotated `configSchema` stays a valid 2020-12 schema + advisory hints don't change what it accepts; each §A annotation matches the shape; forward-compat: unknown `kind` validates; 3 negatives: missing `kind` / non-string `kind` / non-string `dependsOn`) | host-pass (server-free) | Always runs; no host needed. `x-openwop-form` is a consumer-side rendering hint — hosts don't advertise it; renderer behavior is a reference-frontend concern, out of scope here. |
106
127
  | `chat-card-pack-execution.test.ts` | `host.chat.cardPacks` (RFC 0071 Phase 2, `chat-card-packs.md` §"Card execution" / §"Trust boundary") + `SECURITY/invariants.yaml` `chat-card-input-trust-boundary` | A (output validated against the linked `outputArtifactType` → `artifact.created { registered: true }`; **card-input-derived prompt content propagates `contentTrust:"untrusted"`** — the R2 proof, via `POST /v1/host/sample/cardpacks/execute` seam) | MyndHyve ✅ | **Phase 2 `Accepted` 2026-05-27.** PASS against MyndHyve `workflow-runtime-00402-bey` (real `core.chat.cardExecute` → `ctx.aiEnvelope.generate`; `host.chat.cardPacks` + `host.aiEnvelope` advertised unconditionally on production, steward-curl-verified). Capability-gated on `host.chat.cardPacks.supported`; soft-skips on hosts that don't advertise. |
107
128
  | `kv-cross-tenant-isolation.test.ts`, `kv-atomic-increment.test.ts`, `kv-cas.test.ts` (three scenarios) | `capabilities.kvStorage` (RFC 0015, `host-kv-storage-capability.md`) + `SECURITY/invariants.yaml` `kv-cross-tenant-isolation` | A (advertisement shape always; behavioral cross-tenant `set`/`get`, 50× concurrent atomic increment convergence, CAS matching/stale-expect) | host-pass via opt-in test seam | Reference host exposes `POST /v1/host/sample/test/surface` env-gated on `OPENWOP_TEST_SEAM_ENABLED=true`; hosts that don't expose the seam soft-skip the behavioral assertions and verify advertisement shape only. |
108
129
  | `table-cross-tenant-isolation.test.ts` | `capabilities.tableStorage` (RFC 0016, `host-table-storage-capability.md`) | A (advertisement shape + behavioral cross-tenant insert/query proof) | host-pass via opt-in test seam | Same seam dependency as kv row. |
@@ -147,6 +168,13 @@ Every OpenAPI operation should have:
147
168
  | `getRunAncestry` | `cross-host-ancestry-endpoint.test.ts`, `cross-host-causation-shape.test.ts` (RFC 0040 §C); capability-gated on `capabilities.multiAgent.executionModel.crossHostCausation.ancestryEndpointSupported` | Unadvertised-host 404 path + top-level `parent: null` shape covered | Add positive multi-hop traversal once a reference host implements end-to-end cross-host composition. |
148
169
  | `listAgents` | `agent-manifest-runtime.test.ts` (RFC 0072 §A); capability-gated on `capabilities.agents.manifestRuntime.supported` | Normative `GET /v1/agents` inventory leg — lists ≥1 installed manifest agent; soft-skips (404) when unadvertised | Black-box across hosts; dispatch leg via sample seam pending the executor-integration tier. |
149
170
  | `getAgent` | `agent-manifest-runtime.test.ts` (RFC 0072 §A) + `agent-dispatch-route.test.ts` (reference host) | One manifest agent's inventory entry + 404 for unknown | Covered against the workflow-engine reference host. |
171
+ | `getEvalSummary` | `agent-eval-suite-shape.test.ts` (RFC 0081 — schema/shape of the returned `EvalSummary`); capability-gated on `capabilities.agents.evalSuite.supported` | Wire shape + the content-free invariant covered always-on; the live `GET /v1/runs/{runId}/eval-summary` round-trip is the behavioral `agent-eval-run.test.ts` deferred to `Active → Accepted` (reference host wires the eval projection). | Add the live 200/404/409 path once a host runs a `mode:"eval"` suite end-to-end. |
172
+ | `listAgentDeployments` | `agent-deployment-shape.test.ts` (RFC 0082 — the `AgentDeployment` record shape the array returns); capability-gated on `capabilities.agents.deployment.supported` | Record shape covered always-on; the live list is the behavioral `agent-deployment-lifecycle.test.ts` deferred to `Active → Accepted` (reference host wires the deployment store). | Add the live 200/404 path once a host implements the deployment store. |
173
+ | `transitionAgentDeployment` | `agent-deployment-shape.test.ts` (RFC 0082 — the `agent-deployment-transition` request + `deployment.*` event shapes) + `deployment-event-no-content-leak` public test; capability-gated on `capabilities.agents.deployment.supported` | Request/record/event shapes + content-free negatives covered always-on; the live authz→gate→eval-verify→`deployment.promoted` path is the behavioral `agent-deployment-lifecycle.test.ts` deferred to `Active → Accepted`. | Add the live fail-closed `403` / `eval_gate_unmet` / `no_active_deployment` assertions once a host wires the promotion contract. |
174
+ | `listAgentRoster` | `agent-roster-shape.test.ts` (RFC 0086 §B — the `agent-roster-response` shape); capability-gated on `capabilities.agents.roster.supported` | Response shape covered always-on; the live `GET /v1/agents/roster` 200/404 + tenant-scoping is deferred to `Active → Accepted` (reference host serves the normative `/v1/agents/roster`, vs the sample-extension `/v1/host/sample/roster`). | Add the live path once a host serves the normative roster endpoint. |
175
+ | `getAgentRosterEntry` | `agent-roster-shape.test.ts` (RFC 0086 §B — the `agent-roster-entry` shape); capability-gated on `capabilities.agents.roster.supported` | Entry shape covered always-on; the live 200/404 + cross-tenant-404 is deferred to `Active → Accepted`. | Add the live path once a host serves the normative endpoint. |
176
+ | `getAgentOrgChart` | `agent-org-chart-shape.test.ts` (RFC 0087 §C — the `agent-org-chart` shape + the `org-position-no-authority-escalation` structural test); capability-gated on `capabilities.agents.orgChart.supported` | Chart shape + the no-authority structural guarantee covered always-on; the live `GET /v1/agents/org-chart` 200/404 + tenant-scoping is deferred to `Active → Accepted`. | Add the live path once a host serves the normative endpoint. |
177
+ | `getAgentOrgChartDepartment` | `agent-org-chart-shape.test.ts` (RFC 0087 §D — the `org-chart-responsibility-view` response shape); capability-gated on `capabilities.agents.orgChart.supported` | Roll-up response shape covered always-on; the live subtree + responsibility roll-up (incl. `?recursive=false`) is deferred to `Active → Accepted`. | Add the live path once a host computes the roll-up over real roster portfolios. |
150
178
  | `streamRunEvents` | `stream-modes.test.ts`, `stream-modes-buffer.test.ts`, `stream-modes-mixed.test.ts`, `streamReconnect.test.ts` | Unsupported mode and invalid buffer assertions | Add long-running proxy timeout soak outside fast CI. |
151
179
  | `pollRunEvents` | `multi-node-ordering.test.ts`, `version-negotiation.test.ts`, redaction tests | Past-end and validation assertions | Good. Add malformed `lastSequence` if missing. |
152
180
  | `cancelRun` | `cancellation.test.ts` | Unknown/terminal idempotency cases partial | Add explicit already-terminal cancel behavior. |
@@ -0,0 +1,38 @@
1
+ {
2
+ "id": "openwop-synthetic-oauth-provider",
3
+ "description": "RFC 0047 §C — ONE canonical synthetic OAuth provider for proving the host.oauth authorization-code roundtrip end-to-end without a live IdP. A host's host.oauth implementation drives the redirect/callback exchange against `provider.authUrl` + `provider.tokenUrl` (which a conformance test double serves), feeds `exchange.authorizationCode` + `exchange.state` + `exchange.redirectUri`, and MUST receive `exchange.tokenResponse`. The canary token/refresh values let the paired scenario assert RFC 0047 §C.2 redaction (token material never crosses a run-visible surface) and §C (the authorization code, redirect URI, state, and PKCE code verifier never appear on any run-visible surface). One provider is sufficient: all real providers differ ONLY in authUrl/tokenUrl, with no provider-specific grant/exchange quirks on the wire — so a single parameterizable provider exercises the whole roundtrip. A provider-specific quirk fixture is added only if one ever materializes.",
4
+ "provider": {
5
+ "id": "synthetic",
6
+ "authUrl": "https://oauth.synthetic.openwop.test/authorize",
7
+ "tokenUrl": "https://oauth.synthetic.openwop.test/token",
8
+ "scopes": ["openwop.read", "openwop.write"]
9
+ },
10
+ "grant": "authorization_code",
11
+ "exchange": {
12
+ "authorizationCode": "openwop-synthetic-auth-code-1f4b9e",
13
+ "redirectUri": "https://host.example/openwop/oauth/callback",
14
+ "state": "openwop-synthetic-state-7c2a8d",
15
+ "codeVerifier": "openwop-synthetic-pkce-verifier-3e9f1b2c5a7d4e8f0a1b2c3d4e5f6a7b",
16
+ "tokenResponse": {
17
+ "access_token": "OPENWOP_OAUTH_TOKEN_CANARY_9d4c1f7a",
18
+ "token_type": "Bearer",
19
+ "expires_in": 3600,
20
+ "refresh_token": "OPENWOP_OAUTH_REFRESH_CANARY_2b8e6a3f",
21
+ "scope": "openwop.read openwop.write"
22
+ }
23
+ },
24
+ "expectations": {
25
+ "credentialReferenceReturned": true,
26
+ "connectorAuthorizedEvent": {
27
+ "carries": ["provider", "credentialRef", "scopes"],
28
+ "mustNotCarry": ["access_token", "refresh_token", "code", "state", "redirectUri", "codeVerifier"]
29
+ },
30
+ "redactedValuesNeverOnAnyRunSurface": [
31
+ "OPENWOP_OAUTH_TOKEN_CANARY_9d4c1f7a",
32
+ "OPENWOP_OAUTH_REFRESH_CANARY_2b8e6a3f",
33
+ "openwop-synthetic-auth-code-1f4b9e",
34
+ "openwop-synthetic-state-7c2a8d",
35
+ "openwop-synthetic-pkce-verifier-3e9f1b2c5a7d4e8f0a1b2c3d4e5f6a7b"
36
+ ]
37
+ }
38
+ }
package/fixtures.md CHANGED
@@ -469,6 +469,16 @@ Pack-manifest fixtures are exercised by the server-free `fixtures-valid.test.ts`
469
469
 
470
470
  ---
471
471
 
472
+ ## OAuth provider fixtures
473
+
474
+ The `fixtures/oauth-providers/` sub-directory holds synthetic OAuth provider definitions used to prove the RFC 0047 `host.oauth` authorization-code roundtrip end-to-end **without a live IdP**. They are NOT `WorkflowDefinition`s and are NOT seeded as workflows — they parameterize the behavioral roundtrip scenario, which drives the host's `POST /v1/host/sample/oauth/authorize-code-roundtrip` seam against the provider's `authUrl`/`tokenUrl` (served by a conformance test double).
475
+
476
+ | Fixture | `provider.id` | Purpose |
477
+ |---|---|---|
478
+ | `synthetic` | `synthetic` | The ONE canonical synthetic provider. Real providers differ only in `authUrl`/`tokenUrl` with no provider-specific grant/exchange quirks on the wire, so a single parameterizable provider exercises the whole authorization-code dance. Carries a canned `exchange` (authorization code, state, PKCE verifier, redirect URI, and a canary `tokenResponse`) so the paired `oauth-authorization-code-roundtrip.test.ts` can assert RFC 0047 §C + §C.2 redaction — none of those values may appear on a run-visible surface. A provider-specific quirk fixture is added only if one ever materializes. |
479
+
480
+ ---
481
+
472
482
  ## Prompt-template fixtures
473
483
 
474
484
  The `fixtures/prompt-templates/` sub-directory holds canonical PromptTemplate documents (per RFC 0027 §A) used as schema-level proof points (validated server-free against `../schemas/prompt-template.schema.json`). They are NOT seeded into a workflow store. They exist so the `prompt-template-shape` scenario has stable positive fixtures, the secret-redaction + trust-marker conformance scenarios have known fixture templateIds to compose against (when a host advertises `capabilities.prompts.supported: true` + `observability: "full"`), and follow-up RFCs (RFC 0028 prompt packs, RFC 0029 resolution chain) can reference a stable shared fixture set.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@openwop/openwop-conformance",
3
- "version": "1.10.0",
3
+ "version": "1.11.0",
4
4
  "description": "Production-ready black-box conformance suite for OpenWOP v1.0 compliant servers.",
5
5
  "repository": {
6
6
  "type": "git",
package/schemas/README.md CHANGED
@@ -5,8 +5,16 @@
5
5
  | Schema | Source spec | Coverage |
6
6
  |---|---|---|
7
7
  | `agent-inventory-response.schema.json` | `node-packs.md` + RFC 0072 | RFC 0072 §A — read projection of installed manifest agents (`GET /v1/agents` body + `$defs.AgentInventoryEntry`); no system prompt / handoff schemas / credentials (SR-1) |
8
+ | `agent-deployment.schema.json` | `agent-deployment.md` (RFC 0082) | Per-(agentId, version) deployment record — the seven-state lifecycle (draft/test/staged/active/paused/deprecated/rolled-back) + canaryPercent + rollbackPointer + channels[]; host-runtime state distinct from the immutable manifest and the registry's published tags |
9
+ | `agent-deployment-transition.schema.json` | `agent-deployment.md` §E (RFC 0082) | The `POST /v1/agents/{agentId}/deployments` request body — a state-transition request (promote/pause/deprecate/rollback/adjust-canary + toState/channel/canaryPercent/evalRunId), authorized fail-closed + gate-enforced before emitting the matching `deployment.*` event |
10
+ | `agent-eval-suite.schema.json` | `agent-evaluation.md` (RFC 0081) | Portable agent evaluation suite — tasks + golden/rubric `expected` + deterministic fixtures + allowed model classes + pass/fail thresholds, pack-distributed via `evalSuiteRef` |
8
11
  | `agent-manifest.schema.json` | `node-packs.md` + agent-pack RFCs | Agent manifest entries distributed alongside node-pack manifests |
12
+ | `eval-summary.schema.json` | `agent-evaluation.md` (RFC 0081) | The content-free eval-run scorecard — aggregate + per-task scores/cost/latency/safety-findings + regression delta; served by `GET /v1/runs/{runId}/eval-summary` (SECURITY invariant `eval-summary-no-content-leak`) |
9
13
  | `agent-ref.schema.json` | `agent-memory.md` + agent-identity RFC | Multi-Agent Shift Phase 1 — slim runtime AgentRef projection carried on `RunSnapshot.agent` / `runOrchestrator`, `WorkflowNode.agent?`, and `agent.*` event payloads |
14
+ | `agent-roster-entry.schema.json` | `agent-roster.md` (RFC 0086) | Standing agent INSTANCE — a named, tenant-scoped `host:<id>` agent (the "digital-twin employee") that references a manifest/deployment (`agentRef`) and owns a `workflows[]` portfolio; the discovery shape behind `GET /v1/agents/roster` + the `roster` inventory projection |
15
+ | `agent-org-chart.schema.json` | `agent-org-chart.md` (RFC 0087) | Tenant-scoped, DESCRIPTIVE grouping of roster members into departments + roles with acyclic `reportsTo` edges; carries NO authority field (every object `additionalProperties:false`) per the `org-position-no-authority-escalation` invariant; the discovery shape behind `GET /v1/agents/org-chart` |
16
+ | `agent-roster-response.schema.json` | `agent-roster.md` (RFC 0086 §B) | Response body for `GET /v1/agents/roster` — `{ roster: AgentRosterEntry[], total }`, tenant-scoped |
17
+ | `org-chart-responsibility-view.schema.json` | `agent-org-chart.md` (RFC 0087 §D) | Response body for `GET /v1/agents/org-chart/{departmentId}` — the department subtree + the responsibility roll-up (union of member portfolios) |
10
18
  | `ai-envelope.schema.json` | `ai-envelope.md` | FINAL v1.1 — inbound LLM-emission envelope. Top-level shape (`type` / `schemaVersion` / `envelopeId` / `correlationId` / `payload` / `meta` / `partial`). Per-kind payload schemas under `envelopes/`. Distinct from `RunEventDoc` (outbound) and `error-envelope.schema.json` (host HTTP errors). |
11
19
  | `artifact-type-pack-manifest.schema.json` | `artifact-type-packs.md` + RFC 0071 | DRAFT — manifest for `kind: "artifact-type"` registry packs. Peer to `node-pack-manifest.schema.json` (RFC 0003), `workflow-chain-pack-manifest.schema.json` (RFC 0013), and `prompt-pack-manifest.schema.json` (RFC 0028); disjoint via the `kind` discriminator. Distributes typed artifact definitions (schema + advisory rendering hint + lifecycle + export-format hints) via the same signed-tarball + Ed25519 + SRI pipeline. |
12
20
  | `envelopes/clarification.request.schema.json` | `ai-envelope.md` §"Universal kinds" | FINAL v1.1 — payload for the universal `clarification.request` kind; engine lifts to `kind: "clarification"` `InterruptPayload`. |
@@ -48,7 +56,11 @@
48
56
  | `run-options.schema.json` | `run-options.md` | Per-run input overlay (configurable + tags + metadata) on `POST /v1/runs` |
49
57
  | `run-orchestrator-decided-event.schema.json` | orchestrator RFC + `observability.md` | Event payload for orchestrator decisions |
50
58
  | `run-snapshot.schema.json` | `rest-endpoints.md` §RunSnapshot | Projected run state from `GET /v1/runs/{runId}` |
59
+ | `credential-provenance.schema.json` | `host-capabilities.md` §"Credential provenance + egress policy" (RFC 0079) | Metadata about a host-issued credential at the tool/egress boundary — `credentialId`/`issuer`/`audiences`(+scopes/expiry/redaction/audit-correlation). Secret-free (SR-1); the §C audience-binding MUST is evaluated against `audiences`. |
51
60
  | `security-advisory.schema.json` | `registry-operations.md` + INCIDENT-RESPONSE runbook | Registry-owned CVE advisory record at `registry/security/advisories.json`. One entry per disclosed vulnerability — id, severity, affected pack-name + SemVer range, optional fixedIn/advisoryUrl/credits. Enforced by `check-advisories.mjs` in `.github/workflows/registry-publish.yml`. |
61
+ | `trigger-subscription.schema.json` | `trigger-bridge.md` (RFC 0083) | Durable inbound-trigger subscription record — `subscriptionId`/`source`/`state` (active/paused/failed/dead-lettered) + `dedupEnabled`/`retryPolicy` + the webhooks.md register keys. Backs the `openwop-trigger-bridge` profile; content-free of inbound payloads (SR-1). |
62
+ | `budget-policy.schema.json` | `budget-policy.md` (RFC 0084) | The reserved `budget` run-options shape — `maxTokens`/`maxCostUsd`/`maxToolCalls`/`maxRetries`/`modelAllow[]`/`modelDeny[]`/`thresholdPercent`/`onExhaustion`. Enforceable per-run spend governance; wall-time/iterations delegated to RFC 0058 (`additionalProperties:false`). Content-free events; no pricing on the wire (`budget-no-pricing-leak`). |
63
+ | `tool-descriptor.schema.json` | `tool-catalog.md` (RFC 0078) | Portable read-only description of one tool unifying the five tool surfaces (node-pack/workflow/mcp/connector/host-extension) — stable `toolId`, source, I/O schemas, auth/egress/approval requirements, replay policy, and `safetyTier` (`exec` ⇒ `host-extension`, RFC 0069). Returned by `GET /v1/tools`; secret-free (SR-1). |
52
64
  | `suspend-request.schema.json` | `interrupt.md` | `InterruptPayload` with 8 `kind` discriminators (approval, clarification, external-event, custom, conversation.start, conversation.exchange, conversation.close, low-confidence) |
53
65
  | `workflow-chain-pack-manifest.schema.json` | `workflow-chain-packs.md` + RFC 0013 | Manifest for workflow-chain packs (`kind: "workflow-chain"`) — pre-configured DAG fragments expanded inline at workflow-author time. Peer to `node-pack-manifest.schema.json`; disjoint via the `kind` discriminator. |
54
66
  | `workflow-definition.schema.json` | `channels-and-reducers.md` + `node-packs.md` | DAG of nodes + edges + triggers + variables + channels |
@@ -0,0 +1,49 @@
1
+ {
2
+ "$schema": "https://json-schema.org/draft/2020-12/schema",
3
+ "$id": "https://openwop.dev/spec/v1/agent-deployment-transition.schema.json",
4
+ "title": "AgentDeploymentTransition",
5
+ "description": "RFC 0082 §E. The request body of POST /v1/agents/{agentId}/deployments — a deployment state-transition request. The host authorizes it fail-closed against the RFC 0049 `deploy:*` scope, runs any configured RFC 0051 approvalGate, enforces the RFC 0081 eval evidence when the gate requires it, and on success emits the matching `deployment.*` event and returns the updated `agent-deployment.schema.json` record. Carries no manifest body or credential material (SR-1).",
6
+ "type": "object",
7
+ "additionalProperties": false,
8
+ "required": ["version", "transition"],
9
+ "properties": {
10
+ "version": {
11
+ "type": "string",
12
+ "maxLength": 64,
13
+ "description": "The concrete agent-definition version the transition targets (matches `AgentManifest.version`)."
14
+ },
15
+ "transition": {
16
+ "type": "string",
17
+ "enum": ["promote", "pause", "deprecate", "rollback", "adjust-canary"],
18
+ "description": "The lifecycle operation. `promote`: advance toward production (draft→test→staged→active) into `toState`. `pause`: active→paused. `deprecate`: active→deprecated. `rollback`: active→rolled-back, restoring a prior version. `adjust-canary`: change the active version's `canaryPercent`. Legal transitions + the seven-state machine are normative in `agent-deployment.md` §C."
19
+ },
20
+ "toState": {
21
+ "type": "string",
22
+ "enum": ["draft", "test", "staged", "active", "paused", "deprecated", "rolled-back"],
23
+ "description": "MAY. The target lifecycle state for a `promote` transition. A host MUST reject a transition into a state it does not advertise in `capabilities.agents.deployment.states`."
24
+ },
25
+ "channel": {
26
+ "type": "string",
27
+ "minLength": 1,
28
+ "maxLength": 64,
29
+ "description": "MAY. The named deployment channel this transition targets (e.g. `stable`). The version becomes resolvable by `AgentRef.channel` = this value (RFC 0082 §A/§B)."
30
+ },
31
+ "canaryPercent": {
32
+ "type": "integer",
33
+ "minimum": 0,
34
+ "maximum": 100,
35
+ "description": "MAY. For `promote` to `active` or `adjust-canary`, the share of channel traffic this version takes. A host advertising `capabilities.agents.deployment.canary: false` MUST reject any value < 100."
36
+ },
37
+ "evalRunId": {
38
+ "type": "string",
39
+ "minLength": 1,
40
+ "description": "MAY. The RFC 0081 eval run whose `EvalSummary.passed` the gate verifies before emitting `deployment.promoted` (the §E evidence). Required when the target `approvalGate` is configured with `requiredEval`; an unmet eval gate is rejected with `eval_gate_unmet`."
41
+ },
42
+ "reason": {
43
+ "type": "string",
44
+ "minLength": 1,
45
+ "maxLength": 512,
46
+ "description": "MAY. A short redaction-safe reason label (NOT free-text content), surfaced on `deployment.rolled-back.reason`. Same posture as `run.dead_lettered.reason`."
47
+ }
48
+ }
49
+ }
@@ -0,0 +1,54 @@
1
+ {
2
+ "$schema": "https://json-schema.org/draft/2020-12/schema",
3
+ "$id": "https://openwop.dev/spec/v1/agent-deployment.schema.json",
4
+ "title": "AgentDeployment",
5
+ "description": "RFC 0082 §C. A per-(agentId, version) deployment record: the host-runtime lifecycle state of one version of a manifest agent (RFC 0003/0070), distinct from the immutable AgentManifest (which is the pack-distribution descriptor) and from the registry's published semver tags (which version EXISTS, not which version SERVES). Holds the deployment `state`, the named `channels` that resolve to this version, the canary traffic share, the rollback pointer, and the provenance (`evalRunId`/`approvalGateId`) of the last transition. Content-free of any manifest body or credential material (SR-1). The record itself is host-internal; this schema is the canonical wire shape a host exposes via GET /v1/agents/{agentId}/deployments (the endpoint lands at Active → Accepted per RFC 0082 §Conformance).",
6
+ "type": "object",
7
+ "additionalProperties": false,
8
+ "required": ["agentId", "version", "state"],
9
+ "properties": {
10
+ "agentId": {
11
+ "type": "string",
12
+ "minLength": 3,
13
+ "maxLength": 256,
14
+ "description": "The deployed agent's id (matches `AgentManifest.agentId` / `AgentRef.agentId`)."
15
+ },
16
+ "version": {
17
+ "type": "string",
18
+ "maxLength": 64,
19
+ "description": "The concrete agent-definition version this record governs (matches `AgentManifest.version`). A deployment record is per-(agentId, version); a `@channel` reference resolves to exactly one such version per the RFC 0082 §B pin."
20
+ },
21
+ "state": {
22
+ "type": "string",
23
+ "enum": ["draft", "test", "staged", "active", "paused", "deprecated", "rolled-back"],
24
+ "description": "RFC 0082 §C lifecycle state. `draft`: authored, not yet evaluated. `test`: undergoing eval (RFC 0081). `staged`: eval-passed, awaiting production promotion. `active`: serving (optionally at `canaryPercent < 100`). `paused`: temporarily withdrawn, recoverable. `deprecated`: sunset — no new traffic, existing pins honored. `rolled-back`: superseded; `rollbackPointer` names the version that replaced it. Legal transitions: draft→test→staged→active (promotion); active↔paused (operational); active→deprecated (terminal); active→rolled-back (recovery, with `rollbackPointer`)."
25
+ },
26
+ "canaryPercent": {
27
+ "type": "integer",
28
+ "minimum": 0,
29
+ "maximum": 100,
30
+ "description": "MAY. For an `active` version, the share (0–100) of channel traffic the §B pin draw assigns to THIS version; the remainder goes to the prior `active` version on the same channel. Absent ⇒ 100 (full traffic). A host advertising `capabilities.agents.deployment.canary: false` MUST reject any value < 100. The per-run draw outcome is the recorded-fact `resolvedAgentVersion` (RFC 0082 §B) — never re-rolled on replay."
31
+ },
32
+ "rollbackPointer": {
33
+ "type": "string",
34
+ "maxLength": 64,
35
+ "description": "MAY. For a `rolled-back` version, the version that was restored to `active` in its place (the recovery target). Absent for non-rolled-back records."
36
+ },
37
+ "channels": {
38
+ "type": "array",
39
+ "uniqueItems": true,
40
+ "items": { "type": "string", "minLength": 1 },
41
+ "description": "MAY. The named deployment channels (e.g. `stable`, `canary`) that resolve to this version. A version MAY be on more than one channel (RFC 0082 UQ#4 — a promoted-to-`stable` version is also resolvable by the reserved `latest` channel = highest active semver). Channel names are host-advertised in `capabilities.agents.deployment.channels`."
42
+ },
43
+ "evalRunId": {
44
+ "type": "string",
45
+ "minLength": 1,
46
+ "description": "MAY. The RFC 0081 eval run whose `EvalSummary.passed` gated the last promotion into this state (the §E evidence). Content-free reference, not the eval body."
47
+ },
48
+ "approvalGateId": {
49
+ "type": "string",
50
+ "minLength": 1,
51
+ "description": "MAY. The RFC 0051 `approvalGate` that authorized the last transition (the §E human gate). Content-free reference."
52
+ }
53
+ }
54
+ }
@@ -0,0 +1,140 @@
1
+ {
2
+ "$schema": "https://json-schema.org/draft/2020-12/schema",
3
+ "$id": "https://openwop.dev/spec/v1/agent-eval-suite.schema.json",
4
+ "title": "AgentEvalSuite",
5
+ "description": "RFC 0081 §A. A portable, host-agnostic evaluation suite for a manifest agent (RFC 0003/0070): the tasks, the expected outputs or rubrics, the deterministic tool/memory fixtures, the allowed model classes, and the pass/fail thresholds that answer \"is this agent good enough to deploy?\". Distributed inside a pack tarball and referenced by URI exactly like `systemPromptRef` / `handoff.*SchemaRef` (RFC 0003 §C/§D) — NOT embedded in `AgentManifest`. A host advertising `capabilities.agents.evalSuite.supported: true` executes a suite as an eval run (a `mode: \"eval\"` projection over `POST /v1/runs`, RFC 0081 §B) and terminates with an `eval-summary.schema.json` scorecard. The suite carries NO secret material and NO host-internal identifiers (it is authored offline and shipped in a signed pack).",
6
+ "type": "object",
7
+ "additionalProperties": false,
8
+ "required": ["suiteId", "version", "modes", "tasks"],
9
+ "properties": {
10
+ "suiteId": {
11
+ "type": "string",
12
+ "pattern": "^[a-z0-9.-]+\\.evals\\.[a-z0-9-]+$",
13
+ "description": "Globally unique suite identifier in the `<scope>.<org>.evals.<name>` convention (e.g. `core.openwop.evals.support-resolver`), mirroring the pack `<scope>.<author>.<pack>` namespace (RFC 0003). The `.evals.` infix distinguishes a suite from an agent/pack id."
14
+ },
15
+ "version": {
16
+ "type": "string",
17
+ "pattern": "^[0-9]+\\.[0-9]+\\.[0-9]+$",
18
+ "description": "SemVer of the suite. A suite version is pinned on an eval run (carried on `eval.started.suiteVersion` and `eval-summary.suiteVersion`) so a regression comparison (§D `regression` mode) is between like versions."
19
+ },
20
+ "targetAgentId": {
21
+ "type": "string",
22
+ "minLength": 1,
23
+ "description": "MAY. The `AgentManifest.id` this suite is authored for. Absent ⇒ the suite is agent-agnostic and MAY be pointed at any `agentId` at run time (the run request carries the `agentId`). When present, a host SHOULD reject an eval run whose target `agentId` differs, unless the caller explicitly overrides."
24
+ },
25
+ "modes": {
26
+ "type": "array",
27
+ "minItems": 1,
28
+ "uniqueItems": true,
29
+ "items": { "type": "string", "enum": ["golden", "rubric", "adversarial", "regression", "live-shadow"] },
30
+ "description": "The eval modes this suite exercises (RFC 0081 §D closed vocabulary). `golden`: exact / contains / json-match against each task's `expected`. `rubric`: a host-chosen judge scores against weighted criteria (nondeterministic — a recorded-fact score). `adversarial`: tasks probe for unsafe / jailbreak behavior; `safetyFindings` is the primary output. `regression`: re-run against a new agent/model/prompt version and diff scores vs a `baselineRunId` (composes RFC 0054 `:diff`). `live-shadow`: run against LIVE tools/memory instead of `fixtures` — the only mode that bypasses fixture injection; explicitly nondeterministic. A run MUST request only modes the suite declares here AND the host advertises (`capabilities.agents.evalSuite.modes`); an unadvertised mode is rejected at run-create with `400 validation_error`."
31
+ },
32
+ "allowedModels": {
33
+ "type": "array",
34
+ "uniqueItems": true,
35
+ "items": { "type": "string", "enum": ["reasoning", "writing", "coding", "research", "classification", "general"] },
36
+ "description": "MAY. The `AgentManifest.modelClass` values (RFC 0002) the suite is valid for. Absent ⇒ valid for any class. A host SHOULD record the `evaluatedModelClass` on the summary so a score is interpreted against the model it was produced with."
37
+ },
38
+ "thresholds": {
39
+ "type": "object",
40
+ "additionalProperties": false,
41
+ "description": "MAY. The pass/fail bar for the suite. A task's or the aggregate `passed` flag is computed against these. Absent ⇒ the host's default bar (the summary still carries raw scores).",
42
+ "properties": {
43
+ "passScore": {
44
+ "type": "number",
45
+ "minimum": 0,
46
+ "maximum": 1,
47
+ "description": "The minimum aggregate score (0.0–1.0) for `EvalSummary.passed: true`."
48
+ },
49
+ "maxCostUsd": {
50
+ "type": "number",
51
+ "minimum": 0,
52
+ "description": "MAY. The maximum total cost (summed from RFC 0026 `provider.usage`) for a passing run. A run that exceeds it MUST NOT report `passed: true` even if `passScore` is met."
53
+ },
54
+ "maxP95LatencyMs": {
55
+ "type": "integer",
56
+ "minimum": 0,
57
+ "description": "MAY. The maximum p95 per-task latency for a passing run."
58
+ }
59
+ }
60
+ },
61
+ "tasks": {
62
+ "type": "array",
63
+ "minItems": 1,
64
+ "description": "The eval tasks. Each is executed as one child agent invocation (the RFC 0077 `agent.invocation.*` bracket + the existing `agent.*` / `provider.usage` events), scored, and reported via a per-task `eval.scored` event + an `EvalSummary` entry.",
65
+ "items": {
66
+ "type": "object",
67
+ "additionalProperties": false,
68
+ "required": ["taskId", "input", "expected"],
69
+ "properties": {
70
+ "taskId": {
71
+ "type": "string",
72
+ "pattern": "^[a-z0-9][a-z0-9-]*$",
73
+ "description": "Suite-unique task identifier (kebab-case). Carried verbatim on `eval.scored.taskId` and the per-task summary entry."
74
+ },
75
+ "input": {
76
+ "description": "The run input for the task, validated against the agent's input schema by the host. An opaque object/value — content is task-defined."
77
+ },
78
+ "expected": {
79
+ "type": "object",
80
+ "additionalProperties": false,
81
+ "required": ["kind"],
82
+ "description": "How the task is scored. `golden`: deterministic match against `match`. `rubric`: a judge scores against weighted `rubric` criteria.",
83
+ "properties": {
84
+ "kind": { "type": "string", "enum": ["golden", "rubric"], "description": "Scoring mode for this task. A suite declaring a `modes` entry other than `golden`/`rubric` (e.g. `adversarial`) still scores each task via one of these two `kind`s." },
85
+ "match": {
86
+ "type": "object",
87
+ "additionalProperties": false,
88
+ "description": "Present when `kind: \"golden\"`. The deterministic expectation.",
89
+ "properties": {
90
+ "strategy": { "type": "string", "enum": ["exact", "contains", "json-match"], "description": "`exact`: stringified output equals `value`. `contains`: output contains `value`. `json-match`: output JSON-deep-equals `value` (key order / whitespace insensitive)." },
91
+ "value": { "description": "The expected value for the strategy. Opaque." }
92
+ },
93
+ "required": ["strategy", "value"]
94
+ },
95
+ "rubric": {
96
+ "type": "array",
97
+ "minItems": 1,
98
+ "description": "Present when `kind: \"rubric\"`. Weighted criteria a judge scores the output against; the task score is the weighted sum of met criteria, normalized to 0.0–1.0. Judge selection + scoring is host-internal (nondeterministic — the score is a recorded fact).",
99
+ "items": {
100
+ "type": "object",
101
+ "additionalProperties": false,
102
+ "required": ["criterion", "weight"],
103
+ "properties": {
104
+ "criterion": { "type": "string", "minLength": 1, "description": "A human-readable scoring criterion (e.g. \"cites the 30-day refund window\")." },
105
+ "weight": { "type": "number", "minimum": 0, "maximum": 1, "description": "Relative weight of this criterion (criteria weights SHOULD sum to 1.0 across the task)." }
106
+ }
107
+ }
108
+ }
109
+ }
110
+ },
111
+ "fixtures": {
112
+ "type": "object",
113
+ "additionalProperties": false,
114
+ "description": "MAY. Deterministic substitutes for live tool/memory I/O so a `golden`/`regression` eval is reproducible. When present, the eval host MUST inject `toolResponses` in place of live tool calls and seed `memorySeed` before the invocation. The `live-shadow` mode is the explicit exception — it ignores `fixtures` and runs against live tools/memory.",
115
+ "properties": {
116
+ "toolResponses": {
117
+ "type": "array",
118
+ "description": "Canned tool results keyed by tool invocation, injected in place of live tool calls.",
119
+ "items": {
120
+ "type": "object",
121
+ "additionalProperties": false,
122
+ "required": ["tool"],
123
+ "properties": {
124
+ "tool": { "type": "string", "minLength": 1, "description": "The `<scope>:<tool-id>` (RFC 0077 `toolAllowlist` / RFC 0078) the response stands in for." },
125
+ "response": { "description": "The canned result the host returns for that tool. Opaque." }
126
+ }
127
+ }
128
+ },
129
+ "memorySeed": {
130
+ "type": "array",
131
+ "description": "Memory entries seeded into the agent's read snapshot before the invocation (RFC 0004 `MemoryAdapter` shape). Tenant-scoped + SR-1-redacted on the host side exactly like any memory write.",
132
+ "items": { "type": "object" }
133
+ }
134
+ }
135
+ }
136
+ }
137
+ }
138
+ }
139
+ }
140
+ }
@@ -83,6 +83,31 @@
83
83
  "type": "array",
84
84
  "items": { "type": "string" },
85
85
  "description": "Capability keys this agent declared as `peerDependenciesMeta.optional` that this host does NOT satisfy, and which are therefore inert for this installation (RFC 0072 §C). Absent or empty means the agent runs at full declared capability here."
86
+ },
87
+ "memoryDegraded": {
88
+ "type": "boolean",
89
+ "description": "RFC 0080 §C. `true` when this agent's `memoryShape` declares a memory dimension the host's reconciled memory model (RFC 0080 §A) does NOT satisfy — the agent MAY still dispatch at the RFC 0070 floor, but the degradation MUST be observable here (a silent satisfied-looking entry for an unsatisfiable `memoryShape` is non-conformant). Absent ⇒ memory fully satisfied, or an older host that does not compute the projection (consumers treat absence as not-degraded/unknown and MAY probe)."
90
+ },
91
+ "degradedMemoryDimensions": {
92
+ "type": "array",
93
+ "items": { "type": "string", "enum": ["read", "write", "search", "long-term-durability", "compaction", "attribution", "replay-snapshot", "retention"] },
94
+ "uniqueItems": true,
95
+ "description": "RFC 0080 §C. The RFC 0080 §A dimension names (NOT the `memoryShape` keys) this host cannot satisfy for this agent. Present (non-empty) iff `memoryDegraded: true`; OPTIONAL/absent when `memoryDegraded` is false or absent. The dimension name `long-term-durability` is deliberately distinct from the `agents.memoryBackends` *value* `long-term` (a backend id) to avoid wire ambiguity. The `memoryShape`→§A mapping: `longTerm`⇒`long-term-durability`, `scratchpad`/`conversation`⇒`write`+`read` as applicable."
96
+ },
97
+ "roster": {
98
+ "type": "array",
99
+ "uniqueItems": true,
100
+ "description": "RFC 0086 §B. OPTIONAL/additive. The standing roster INSTANCES of this manifest agent visible to the caller (tenant-scoped per RFC 0074), each with its persona + owned workflow portfolio — so a single GET /v1/agents call surfaces responsibilities without a second round-trip. Present only when the host advertises `capabilities.agents.roster.supported: true`; absent ⇒ no roster surface (today's default).",
101
+ "items": {
102
+ "type": "object",
103
+ "additionalProperties": false,
104
+ "required": ["rosterId", "persona", "workflows"],
105
+ "properties": {
106
+ "rosterId": { "type": "string", "minLength": 1, "description": "The roster entry's `host:<id>` instance id (agent-roster-entry.schema.json)." },
107
+ "persona": { "type": "string", "minLength": 1, "description": "The instance's human display name." },
108
+ "workflows": { "type": "array", "items": { "type": "string", "minLength": 1 }, "uniqueItems": true, "description": "The workflow ids this instance owns by role (its portfolio)." }
109
+ }
110
+ }
86
111
  }
87
112
  }
88
113
  }
@@ -41,6 +41,11 @@
41
41
  "description": "URI-reference to a prompt file inside the pack tarball (e.g., `prompts/supervisor.md`). Mutually exclusive with `systemPrompt`. Useful when the prompt body is large enough that inlining would bloat the manifest.",
42
42
  "minLength": 1
43
43
  },
44
+ "evalSuiteRef": {
45
+ "type": "string",
46
+ "description": "RFC 0081 §A. MAY. URI-reference to an `agent-eval-suite.schema.json` file inside the pack tarball (e.g., `evals/support-resolver.json`), resolved at install exactly like `systemPromptRef` / `handoff.*SchemaRef`. Declares the agent's portable evaluation suite (golden/rubric/adversarial/regression/live-shadow tasks + thresholds) so a host advertising `capabilities.agents.evalSuite.supported: true` can run it as a `mode: \"eval\"` run. Absent ⇒ the agent ships no suite (a suite MAY still be authored independently and pointed at this `agentId` at run time). Does NOT embed the suite in the manifest — the suite evolves on its own cadence and is often authored by a different role.",
47
+ "minLength": 1
48
+ },
44
49
  "toolAllowlist": {
45
50
  "type": "array",
46
51
  "items": { "type": "string", "minLength": 1 },