npm - @openwop/openwop-conformance - Versions diffs - 1.5.0 → 1.6.1 - Mend

@openwop/openwop-conformance 1.5.0 → 1.6.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (72) hide show

package/CHANGELOG.md +27 -0
package/README.md +2 -2
package/api/asyncapi.yaml +25 -4
package/api/openapi.yaml +371 -0
package/coverage.md +31 -4
package/fixtures/conformance-phase4-nondet-tool.json +53 -0
package/fixtures/conformance-phase4-replay-divergence.json +40 -0
package/fixtures.md +5 -3
package/package.json +1 -1
package/schemas/README.md +4 -0
package/schemas/annotation-create.schema.json +37 -0
package/schemas/annotation.schema.json +56 -0
package/schemas/capabilities.schema.json +191 -3
package/schemas/credential-reference.schema.json +21 -0
package/schemas/node-pack-manifest.schema.json +112 -1
package/schemas/run-diff-response.schema.json +64 -0
package/schemas/run-event-payloads.schema.json +104 -2
package/schemas/run-event.schema.json +8 -1
package/schemas/run-snapshot.schema.json +11 -0
package/src/lib/behavior-gate.ts +51 -0
package/src/lib/driver.ts +13 -1
package/src/lib/feedback.ts +31 -0
package/src/lib/saml-idp.ts +179 -0
package/src/scenarios/approval-gate-events.test.ts +61 -0
package/src/scenarios/approval-gate-flow.test.ts +68 -0
package/src/scenarios/auth-saml-profile.test.ts +119 -0
package/src/scenarios/auth-scim-profile.test.ts +65 -0
package/src/scenarios/authorization-fail-closed.test.ts +80 -0
package/src/scenarios/authorization-roles-shape.test.ts +83 -0
package/src/scenarios/connector-manifest-validity.test.ts +142 -0
package/src/scenarios/credential-payload-redaction.test.ts +93 -0
package/src/scenarios/credentials-capability-shape.test.ts +90 -0
package/src/scenarios/cross-engine-append-behavior.test.ts +204 -0
package/src/scenarios/cross-host-traceparent-propagation.test.ts +13 -6
package/src/scenarios/cross-workspace-isolation.test.ts +72 -0
package/src/scenarios/deadletter-capability-shape.test.ts +59 -0
package/src/scenarios/deadletter-retry-exhaustion.test.ts +62 -0
package/src/scenarios/experimental-tier-shape.test.ts +192 -0
package/src/scenarios/feedback-capability-shape.test.ts +35 -0
package/src/scenarios/feedback-correction-redaction.test.ts +35 -0
package/src/scenarios/feedback-cross-tenant-isolation.test.ts +37 -0
package/src/scenarios/feedback-fork-not-copied.test.ts +40 -0
package/src/scenarios/feedback-on-terminal-run.test.ts +32 -0
package/src/scenarios/feedback-record-and-list.test.ts +32 -0
package/src/scenarios/feedback-unsupported-501.test.ts +32 -0
package/src/scenarios/identity-owner-shape.test.ts +64 -0
package/src/scenarios/multi-agent-confidence-escalation.test.ts +13 -12
package/src/scenarios/multi-agent-memory-lifecycle.test.ts +87 -12
package/src/scenarios/multi-region-idempotency-behavior.test.ts +203 -0
package/src/scenarios/oauth-capability-shape.test.ts +97 -0
package/src/scenarios/oauth-connector-redaction.test.ts +91 -0
package/src/scenarios/pack-registry-isolation.test.ts +108 -0
package/src/scenarios/pack-registry-publish.test.ts +1 -1
package/src/scenarios/prompt-mutation-workspace-membership-enforced.test.ts +126 -0
package/src/scenarios/prompt-read-workspace-membership-enforced.test.ts +183 -0
package/src/scenarios/redaction.test.ts +4 -1
package/src/scenarios/replay-divergence-at-refusal.test.ts +187 -7
package/src/scenarios/replay-observable-sequence-determinism.test.ts +20 -6
package/src/scenarios/run-diff.test.ts +143 -0
package/src/scenarios/sandbox-capability-gate-respected.test.ts +7 -1
package/src/scenarios/sandbox-memory-cap.test.ts +7 -5
package/src/scenarios/sandbox-mvp-behavior.test.ts +280 -0
package/src/scenarios/sandbox-no-cross-pack-mutation.test.ts +7 -1
package/src/scenarios/sandbox-no-host-env-leak.test.ts +5 -1
package/src/scenarios/sandbox-no-host-fs-escape.test.ts +9 -1
package/src/scenarios/sandbox-no-host-process-escape.test.ts +5 -1
package/src/scenarios/sandbox-no-network-escape.test.ts +5 -1
package/src/scenarios/sandbox-timeout-cap.test.ts +7 -5
package/src/scenarios/scheduling-capability-shape.test.ts +81 -0
package/src/scenarios/scheduling-cron-fires-once.test.ts +66 -0
package/src/scenarios/secret-leakage-otel-attribute.test.ts +241 -0
package/src/scenarios/spec-corpus-validity.test.ts +6 -3

package/coverage.md CHANGED Viewed

@@ -1,6 +1,6 @@
 # OpenWOP Conformance Coverage Map
-> **Status: Living document. Updated 2026-05-22.** This map connects the current scenario files to the protocol surfaces they protect and records the remaining gaps from the protocol deep dive. Scenario names are source-of-truth file names under `conformance/src/scenarios/`.
+> **Status: Living document. Updated 2026-05-25.** This map connects the current scenario files to the protocol surfaces they protect and records the remaining gaps from the protocol deep dive. Scenario names are source-of-truth file names under `conformance/src/scenarios/`.
 > **Shape grade vs behavior grade.** Some optional-profile scenarios validate **capability shape** (the host's discovery advertisement is well-formed) without yet exercising **behavior** (the host actually implements the profile end-to-end). The "Current grade" column reflects shape; see §"Capability-gated scenarios: shape vs behavior" below for the dual-grade view and the `OPENWOP_REQUIRE_BEHAVIOR=true` strict-mode runner flag.
@@ -40,10 +40,14 @@
 | Envelope variant discrimination + model capabilities (RFC 0031 — `spec/v1/ai-envelope.md` §"Variant payload discrimination (normative)", `spec/v1/host-capabilities.md` §"Model-capability declarations", `spec/v1/node-packs.md` §"Model-capability declarations on NodeModules") | `envelope-variant-discriminator-static.test.ts`, `model-capability-substituted.test.ts`, `model-capability-insufficient.test.ts`, `node-module-required-capabilities-shape.test.ts` | B+ (discriminator-static + advertisement-shape always-on; 14 live behavioral assertions across substitution + insufficient + authoring-convention, capability-gated) | RFC 0031 promoted Draft → Active 2026-05-20. `envelope-variant-discriminator-static` (always-on) walks every `schemas/envelopes/*.schema.json` asserting no `oneOf` at any nesting depth (Gemini silently drops `oneOf`, producing looser-than-declared schemas — a silent correctness bug) AND every `anyOf` branch declares a single-string-`enum` discriminator in `required` per RFC 0031 §A. `model-capability-substituted` (capability-gated on `capabilities.modelCapabilities.supported` + `substitutionSupported`) carries advertisement-shape check on the `advertised: string[]` pattern (each identifier matches the spec-reserved set OR `^x-host-<host>-<key>$` per RFC 0031 §C) + 4 live behavioral assertions covering substitution emission + SECURITY invariant `model-capability-substituted-no-credential-disclosure`'s all-or-nothing `"[REDACTED]"` redaction option. `model-capability-insufficient` (capability-gated on `modelCapabilities.supported`) carries 6 live behavioral assertions covering refusal emission paths + the no-recursive-fallback constraint (RFC 0031 §"Unresolved questions" #3 — `fallbackAttempted: true` when the declared fallback itself fails; NO chaining). `node-module-required-capabilities-shape` (SHOULD-tier authoring convention check) carries 4 live assertions for the `core.ai.*` typeId-pattern recommendation. Path to `Accepted`: reference host implements `executor/modelCapabilityGate.ts` end-to-end + advertises `capabilities.modelCapabilities: { supported: true, advertised: [...], substitutionSupported: true }` (the live behavioral assertions soft-skip cleanly on hosts that haven't wired the executor yet). |
 | Envelope-reliability run-event vocabulary (RFC 0032 — `spec/v1/ai-envelope.md` §"Envelope-reliability events" + line-448 scope clarification, `spec/v1/observability.md` §"Envelope-reliability events (RFC 0032)") | `envelope-retry-attempted.test.ts`, `envelope-retry-exhausted.test.ts`, `envelope-refusal-shape.test.ts`, `envelope-truncated.test.ts`, `envelope-nl-to-format-engaged.test.ts`, `envelope-recovery-applied.test.ts` | B (1 shared advertisement-shape probe with MUST-events enforcement; 34 live behavioral assertions across the six events, all capability- + fixture-gated) | RFC 0032 promoted Draft → Active 2026-05-20. Carries the central `ai-envelope.md` line-448 scope clarification (per-kind routing events forbidden; cross-kind operational events permitted via RFC). `envelope-retry-attempted` carries the shared advertisement-shape probe: when `capabilities.envelopes.reliability.supported: true`, the host MUST list both `envelope.retry.exhausted` AND `envelope.refusal` in `events[]` (the two MUST-tier events per RFC 0032 §C); `maxRetryAttempts` MUST be in `[1, 16]`. The six scenarios collectively carry 34 live behavioral assertions (drained 2026-05-19 via the conformance `mock` provider + `POST /v1/host/sample/test/mock-ai/program` seam): retry on schema-violation + retry on truncation + retry-exhausted terminal failure + provider refusal (no-retry MUST per RFC 0032 §B.3 + RFC 0033 §D) + truncation cut-off + NL-to-Format escalation (Tam et al. mitigation per arXiv 2408.02442) + lenient-parsing recovery + SECURITY invariants `envelope-refusal-no-prompt-leak` (BYOK + prompt-content redaction on `refusalText`) and `envelope-recovery-no-content-leak` (no pre-recovery substrings in the event payload). Path to `Accepted`: reference host implements `executor/envelopeReliability.ts` end-to-end + advertises `capabilities.envelopes.reliability: { supported: true, events: [...], maxRetryAttempts: <n> }` (the behavioral assertions already pass against the reference host's end-to-end emission path under `OPENWOP_ENVELOPE_RELIABILITY_END_TO_END=true`; the no-flag default still soft-skips). |
 | Envelope-completion retry routing (RFC 0033 — `spec/v1/ai-envelope.md` §"Envelope-completion criteria", `spec/v1/observability.md` §"Envelope-completion retry routing (RFC 0033)") | `envelope-completion-distinguishes-truncation.test.ts`, `envelope-truncation-cap-exhaustion.test.ts` | B− (1 advertisement-shape probe on `completion.{distinguishesTruncation, truncationBudgetMultiplier}`; 9 live behavioral assertions across the two retry paths + the DoS-bound assertion) | RFC 0033 promoted Draft → Active 2026-05-20. Closes `spec/v1/ai-envelope.md` §"Open spec gaps" E5 (refusal-mode + retry-policy interaction). Reuses RFC 0032's event vocabulary; introduces NO new event types. `envelope-completion-distinguishes-truncation` (capability-gated on `completion.distinguishesTruncation: true`) carries 5 live behavioral assertions covering both retry paths — truncation MUST increase output budget (RECOMMENDED 2× per `truncationBudgetMultiplier`) WITHOUT a corrective fragment; schema-violation MUST add a corrective fragment WITHOUT a budget change. `envelope-truncation-cap-exhaustion` carries 4 live behavioral assertions covering the DoS-bound assertion (truncation retries count against `Capabilities.limits.schemaRounds`; exhaustion → `envelope.retry.exhausted { finalReason: "truncation" }` + `cap.breached { kind: "schema" }` + node fails with NEW error code `envelope_truncation_unrecoverable` per RFC 0033 §F). All 9 assertions are fixture- + capability-gated against the conformance `mock` provider via `POST /v1/host/sample/test/mock-ai/program`. Path to `Accepted`: reference host implements the truncation-vs-schema-violation retry-routing branch end-to-end (`executor/envelopeReliability.ts` + `stop_reason` inspection in `aiProviders/aiProvidersHost.ts`) + advertises `capabilities.envelopes.reliability.completion.distinguishesTruncation: true`. |
-| Multi-agent execution model + handoff state machine (RFC 0037 Phase 1 — `spec/v1/multi-agent-execution.md`) | `multi-agent-handoff-state-machine.test.ts` | B (1 advertisement-shape probe + 1 behavioral 4-event causation-chain assertion against the parent+child fixture pair) | RFC 0037 Phase 1 filed Draft → promoted Active 2026-05-21 after spec + schema + scenario landed atomically. Advertisement-shape probe asserts `capabilities.multiAgent.executionModel.{supported, version ∈ [1,4]}` when present. Behavioral assertion drives the `conformance-multi-agent-handoff` parent + `conformance-multi-agent-handoff-child` fixture pair: runs the supervisor → next-worker → child completed loop and asserts the 4 `core.workflowChain.event` records appear in the exact phase sequence `dispatch.began → dispatch.succeeded → child.completed → output.harvested` with each event's `causationId === prior.eventId` and `dispatch.began.causationId === runOrchestrator.decided.eventId`, plus `output.harvested.harvestedKeys === ['parentResult']` (proves the spec §"Transition events" table on real wire). Reference workflow-engine advertises + emits end-to-end when `OPENWOP_MULTI_AGENT_EXECUTION_MODEL=true`; the no-flag default soft-skips honestly. Path to `Accepted`: non-steward host advertises + the behavioral assertion passes against it. |
-| Multi-agent Phase 2 confidence-floor escalation (RFC 0039 — `spec/v1/multi-agent-execution.md` §"Confidence escalation") | `multi-agent-confidence-escalation.test.ts` | B (1 advertisement-shape probe on `confidenceEscalationFloor` + 1 behavioral assertion against the low-confidence fixture) | RFC 0039 Phase 2 filed Draft → promoted Active 2026-05-22 after the confidence-floor half landed end-to-end. Advertisement-shape probe asserts `capabilities.multiAgent.executionModel.confidenceEscalationFloor` (when present) is a number in `[0.5, 1.0]`; values below the spec floor are non-conformant. Behavioral assertion drives the `conformance-multi-agent-confidence-escalation` fixture (supervisor `mockDispatchPlan` carries one decision with `confidence: 0.3`) and asserts: parent reaches `waiting-clarification` (NOT `completed` because no dispatch fired); exactly ONE `core.workflowChain.confidence-escalated` event with `payload.confidence === 0.3`, `payload.floor ∈ [0.5, 1.0]`, `payload.escalationKind ∈ {clarify, escalate}`; causationId chains back to the `runOrchestrator.decided` event; ZERO `core.workflowChain.event` records (the load-bearing distinction from Phase 1 — confidence floor MUST fire BEFORE any dispatch.began). Reference workflow-engine advertises `version: 2` + `confidenceEscalationFloor: 0.5` when both `OPENWOP_MULTI_AGENT_EXECUTION_MODEL=true` AND `OPENWOP_MULTI_AGENT_EXECUTION_MODEL_PHASE_2=true` are set; floor tunable via `OPENWOP_MULTI_AGENT_CONFIDENCE_FLOOR`. Path to `Accepted`: non-steward host advertises `version: 2` + the behavioral assertion passes against it. Memory-lifecycle half of RFC 0039 (MAE-2/3) remains explicit follow-up: `crossChildMemoryConcurrency` capability field is schema-landed but the host's MemoryAdapter doesn't yet implement either contract. |
+| Multi-agent execution model + handoff state machine (RFC 0037 — `spec/v1/multi-agent-execution.md`, `version: 1`) | `multi-agent-handoff-state-machine.test.ts` | B (1 advertisement-shape probe + 1 behavioral 4-event causation-chain assertion against the parent+child fixture pair) | RFC 0037 filed Draft → promoted Active 2026-05-21 after spec + schema + scenario landed atomically. Advertisement-shape probe asserts `capabilities.multiAgent.executionModel.{supported, version ∈ [1,4]}` when present. Behavioral assertion drives the `conformance-multi-agent-handoff` parent + `conformance-multi-agent-handoff-child` fixture pair: runs the supervisor → next-worker → child completed loop and asserts the 4 `core.workflowChain.event` records appear in the exact phase sequence `dispatch.began → dispatch.succeeded → child.completed → output.harvested` with each event's `causationId === prior.eventId` and `dispatch.began.causationId === runOrchestrator.decided.eventId`, plus `output.harvested.harvestedKeys === ['parentResult']` (proves the spec §"Transition events" table on real wire). Reference workflow-engine advertises + emits end-to-end when `OPENWOP_MULTI_AGENT_EXECUTION_MODEL=true`; the no-flag default soft-skips honestly. Path to `Accepted`: non-steward host advertises + the behavioral assertion passes against it. |
+| Multi-agent confidence-floor escalation (RFC 0039 — `spec/v1/multi-agent-execution.md` §"Confidence escalation", `version: 2`) | `multi-agent-confidence-escalation.test.ts` | B (1 advertisement-shape probe on `confidenceEscalationFloor` + 1 behavioral assertion against the low-confidence fixture) | RFC 0039 filed Draft → promoted Active 2026-05-22 after the confidence-floor half landed end-to-end. Advertisement-shape probe asserts `capabilities.multiAgent.executionModel.confidenceEscalationFloor` (when present) is a number in `[0.5, 1.0]`; values below the spec floor are non-conformant. Behavioral assertion drives the `conformance-multi-agent-confidence-escalation` fixture (supervisor `mockDispatchPlan` carries one decision with `confidence: 0.3`) and asserts: parent reaches `waiting-clarification` (NOT `completed` because no dispatch fired); exactly ONE `core.workflowChain.confidence-escalated` event with `payload.confidence === 0.3`, `payload.floor ∈ [0.5, 1.0]`, `payload.escalationKind ∈ {clarify, escalate}`; causationId chains back to the `runOrchestrator.decided` event; ZERO `core.workflowChain.event` records (the load-bearing distinction from `version: 1` — confidence floor MUST fire BEFORE any dispatch.began). Reference workflow-engine advertises `version: 2` + `confidenceEscalationFloor: 0.5` when both `OPENWOP_MULTI_AGENT_EXECUTION_MODEL=true` AND `OPENWOP_MULTI_AGENT_EXECUTION_MODEL_PHASE_2=true` are set; floor tunable via `OPENWOP_MULTI_AGENT_CONFIDENCE_FLOOR`. Path to `Accepted`: non-steward host advertises `version: 2` + the behavioral assertion passes against it. Memory-lifecycle half of RFC 0039 (MAE-2/3) remains explicit follow-up: `crossChildMemoryConcurrency` capability field is schema-landed but the host's MemoryAdapter doesn't yet implement either contract. |
 | Sandbox execution contract (RFC 0035 — `spec/v1/host-capabilities.md` §"Sandbox execution contract") | `sandbox-no-host-fs-escape.test.ts`, `sandbox-no-host-env-leak.test.ts`, `sandbox-no-network-escape.test.ts`, `sandbox-no-host-process-escape.test.ts`, `sandbox-memory-cap.test.ts`, `sandbox-timeout-cap.test.ts`, `sandbox-capability-gate-respected.test.ts`, `sandbox-no-cross-pack-mutation.test.ts` | C+ (advertisement-shape probes always-on; 8 capability-gated behavioral stubs scaffolded; soft-skip on hosts that don't advertise `capabilities.sandbox.supported`) | RFC 0035 promoted Draft → Active 2026-05-21. 8 scenarios, one per `node-pack-sandbox-*` invariant in `SECURITY/invariants.yaml`. Behavioral assertions remain stubbed with `expect(true).toBe(true)` + docstring expected-wire-shape pending the synthetic `vendor.openwop.misbehaving-sandbox` pack + a first sandbox-executing reference host. Path to `Accepted`: first sandbox-executing host advertises + implements the 8 failure-mode invariants + the 8 scenarios pass; at that point the 8 `node-pack-sandbox-*` SECURITY rows graduate from `reference-impl` → `protocol` tier per RFC 0035 §"Acceptance criteria." |
-| Multi-region idempotency + cross-engine append-ordering (RFC 0036 — `spec/v1/idempotency.md` §"`multiRegion` sub-block", `spec/v1/replay.md` §"Cross-region replay") | `multi-region-idempotency.test.ts`, `cross-engine-append-ordering.test.ts` | C+ (2 categorical-shape probes always-on + 1 granular `multiRegion` shape probe + 1 `crossEngineOrdering` shape probe; behavioral assertions deferred to simulator landing per RFC 0036 §C) | RFC 0036 promoted Draft → Active 2026-05-21. The existing `multi-region-idempotency.test.ts` covers the categorical `capabilities.idempotency.crossRegion ∈ {single-region, best-effort, strict}` claim plus the matching operator-tier metric names; a third describe block added 2026-05-21 covers the granular `capabilities.idempotency.multiRegion.{supported, replicationLagBoundMs, partitionRecoveryStrategy}` advertisement shape (`replicationLagBoundMs ∈ [0, 60000]`; `partitionRecoveryStrategy ∈ {last-writer-wins, first-writer-wins}` OR `^x-host-<host>-<key>$`). NEW `cross-engine-append-ordering.test.ts` covers `capabilities.eventLog.crossEngineOrdering.{supported, orderingModel ∈ {lamport, vector-clock, global-sequencer}}` shape. Behavioral two-engine-append-then-cross-read assertion deferred until the Postgres reference host's multi-region simulator lands per RFC 0036 §C. Path to `Accepted`: simulator + behavioral conformance pass against the reference host; non-steward host advertises the same. |
+| Multi-region idempotency + cross-engine append-ordering (RFC 0036 — `spec/v1/idempotency.md` §"`multiRegion` sub-block", `spec/v1/replay.md` §"Cross-region replay") | `multi-region-idempotency.test.ts`, `cross-engine-append-ordering.test.ts`, **`multi-region-idempotency-behavior.test.ts` (2026-05-22)**, **`cross-engine-append-behavior.test.ts` (2026-05-22)** | A (2 categorical-shape probes always-on + 1 granular `multiRegion` shape probe + 1 `crossEngineOrdering` shape probe + 6 multi-region behavioral assertions + 4 cross-engine Lamport-ordering behavioral assertions; all 10 behavioral assertions PASS against the reference workflow-engine when `OPENWOP_TEST_MULTI_REGION_SIMULATOR=true` + `OPENWOP_TEST_CROSS_ENGINE_HARNESS=true` are set) | RFC 0036 §B + §C behavioral close-out landed 2026-05-22 via the new workflow-engine test seams (`POST /v1/host/sample/test/multi-region/simulate-partition` + `POST/GET /v1/host/sample/test/cross-engine/{append,read,reset}`) — see `spec/v1/host-sample-test-seams.md` §6 + §7. The new `multi-region-idempotency-behavior.test.ts` exercises the canonical lex-min convergence rule + order-invariance + 400-on-mismatch; the new `cross-engine-append-behavior.test.ts` exercises Lamport-clock monotonicity + per-engine order preservation + read-determinism. Path to `Accepted`: non-steward host advertises matching capabilities + the behavioral assertions pass against it. |
+| Secret-leakage telemetry / debug-bundle export (RFC 0034 §B — `spec/v1/host-capabilities.md` §"OTel collector test seam") | **`secret-leakage-otel-attribute.test.ts` (2026-05-22)** | A− (3 capability-gated probes — OTel span scrape + debug-bundle scrape + advertisement-shape; soft-skips honestly until host advertises `capabilities.observability.testSeams.{otelScrape, debugBundleExport}` AND `capabilities.secrets.supported` AND `OPENWOP_CANARY_SECRET_VALUE` env is set) | Broadens the existing protocol-tier `secret-leakage-otel-attribute` + `secret-leakage-debug-bundle-otel` SECURITY invariants from envelope-acceptor-narrow (already covered by `envelope-reasoning-secret-redaction.test.ts`) to executor-side-broad. Drives the existing `openwop-smoke-byok-roundtrip` fixture; scrapes both seams after run completion; hard-fails if the BYOK canary plaintext appears in any OTel span attribute or debug-bundle field. |
+| Experimental capability tier (RFC 0042 — `schemas/capabilities.schema.json` §`multiAgent.executionModel.tier`) | **`experimental-tier-shape.test.ts` (2026-05-22)** | A (6 server-free + helper-routing assertions across §A schema discipline + §D experimentalGate routing; always-on for hosts that advertise tier='experimental' on any capability sub-block; helper-level behavioral probes for the `experimentalGate()` routing under both default + OPENWOP_REQUIRE_EXPERIMENTAL modes) | RFC 0042 (Draft) lands the audit's "Active RFC → carve-out" pattern. Schema diff lands on `multiAgent.executionModel` with optional `tier ∈ {stable, experimental}` + `experimentalUntil` (ISO-8601 sunset) + `if/then` conditional enforcing §B sunset MUST mechanically. New `experimentalGate()` helper in `conformance/src/lib/behavior-gate.ts` routes scenarios under default mode + `OPENWOP_REQUIRE_EXPERIMENTAL=true` strict-mode. |
+| Sandbox MVP behavioral close-out (RFC 0035 §B) | **`sandbox-mvp-behavior.test.ts` (2026-05-22)** | A (10 capability-gated behavioral assertions covering 7 of 8 §B failure-mode invariants — 5 escape kinds + timeout + memory-exceeded + cross-pack-mutation isolation + capability-gate-violation + 2 well-behaved baselines; all 10 PASS against the workflow-engine's node:vm-based sandbox MVP) | Companion to the existing 8 advertisement-shape sandbox scenarios (`sandbox-no-host-fs-escape.test.ts` et al.). Exercises the canonical 4-code error catalog at `spec/v1/host-capabilities.md` §"Error codes" (`sandbox_escape_attempt` + `sandbox_capability_denied` + `sandbox_memory_exceeded` + `sandbox_timeout`) with spec-mandated `details.{escapeKind, requestedCapability, requestedBytes}` populated. Wire-shape per `spec/v1/host-sample-test-seams.md` §8. Production adopters use wasmtime/nsjail behind the same HTTP test-seam contract. |
+| RFC 0041 §B replay-divergence-at-refusal behavioral (`version: 4`) | `replay-divergence-at-refusal.test.ts` (advertisement-shape + behavioral; 3 assertions PASS against workflow-engine when the `multiAgent.executionModel.version: 4` advertisement is enabled) | A (was `it.todo` until 2026-05-23 when the executor wiring landed — see commits `1fce55a` + `bba3b4a`. Behavioral assertions cover both divergence directions: original=valid + replay=refusal AND original=refusal + replay=valid) | Closes Track #4 of the 2026-05-22 multi-agent behavioral-harness close-out. Reference workflow-engine emits `replay.divergedAtRefusal` event + fails run with `error.code: 'replay_diverged_at_refusal'` when source vs replay envelope kinds differ at the same nodeId. Gated on `OPENWOP_MULTI_AGENT_EXECUTION_MODEL_PHASE_4=true` AND `run.forkMode === 'replay'`. Path to `Accepted` for RFC 0041: non-steward host advertises `multiAgent.executionModel.version: 4` end-to-end. |
 ---
@@ -71,6 +75,23 @@ Twenty-two scenario groups validate optional profiles where the host's discovery
 | `replay-retention-expiry.test.ts` | `openwop-replay-fork` (`replay.md` §"Retention and garbage collection") | B (capability shape always; 410/422 envelope on expired-range fork gated on `OPENWOP_TEST_EXPIRED_REPLAY_RUN_ID`; details.{sourceRunId, fromSeq, retentionBoundary} soft-checks per spec SHOULD) | `host-pending` | Reference host advertises `replay.supported: true` + operator produces a known-expired run id (no standardized force-expire endpoint per RFC 0009 Q#1). |
 | `discovery.test.ts` — auth-scoped subtests (3 of them) | `openwop-discovery-auth-scoped` (`capabilities-change-detection.md` §"Scoped capability views", RFC 0011) | B (capability shape + mode/endpointPath typing always; required-field-preservation in authenticated view always; authorization-oracle probe gated on `OPENWOP_TEST_UNAUTHORIZED_API_KEY`) | `host-pending` | Reference host advertises `capabilities.discovery.authScoped.supported: true` + serves an authenticated capability view that satisfies the base schema + a tenant-scoped key pair for the oracle probe. |
 | `fs-path-traversal.test.ts` | `capabilities.fs` (RFC 0014, `host-fs-capability.md`) | A (advertisement shape + two path-escape probes asserting `path_outside_sandbox`) | host-pass (workflow-engine reference) | Reference host advertises `capabilities.fs.supported: true` with sandboxRoot under `<dataDir>/host-fs`. |
+| `credentials-capability-shape.test.ts` | `capabilities.credentials` (RFC 0046, `host-capabilities.md` §host.credentials) | A (advertisement shape always — `supported` boolean; `scopes` ⊆ {user,workspace,tenant}; `rotation` ∈ {none,two-key-overlap}) | `host-pending` | Always runs; asserts the block is absent or well-formed. No host advertises `capabilities.credentials` yet (RFC 0046 `Draft`). |
+| `credential-payload-redaction.test.ts` | `capabilities.credentials` (RFC 0046) + `SECURITY/invariants.yaml` `credential-payload-redaction` | A (advertisement shape always; redaction MUST-NOT via optional `POST /v1/host/sample/credentials/echo` seam — canary plaintext absent from all observable surfaces) | `host-pending` | Capability-gated on `credentials.supported`; behavioral probe soft-skips on 404 when the seam is unwired, mirroring `fs-path-traversal`. |
+| `oauth-capability-shape.test.ts` | `capabilities.oauth` (RFC 0047, `host-capabilities.md` §host.oauth) | A (advertisement shape always — `supported` boolean; `grants` ⊆ {authorization_code,client_credentials,refresh_token}; every `providers[].id` non-empty) | `host-pending` | Always runs; asserts the block is absent or well-formed. No host advertises `capabilities.oauth` yet (RFC 0047 `Draft`). |
+| `oauth-connector-redaction.test.ts` | `capabilities.oauth` (RFC 0047) + `SECURITY/invariants.yaml` `credential-payload-redaction` | A (advertisement shape always; token-material redaction via optional `POST /v1/host/sample/oauth/connector-echo` seam — canary token absent from all observable surfaces; `connector.authorized` carries the ref not the token) | `host-pending` | Capability-gated on `oauth.supported`; behavioral probe soft-skips on 404. Reuses the RFC 0046 redaction invariant (OAuth tokens are stored as host.credentials entries). |
+| `connector-manifest-validity.test.ts` | `node-pack-manifest.schema.json` §Connector (RFC 0045, `node-packs.md` §Connectors) | Server-free (schema validity of the `connector` block incl. both ConnectorAuth variants + positive/negative round-trip; §B action/trigger typeId-resolution semantics — `connector_action_unresolved` on an unknown typeId) | host-pass (server-free) | Always runs; no host needed. Behavioral idempotency-hint + rate-limit-honored scenarios deferred until a host advertises a connector. |
+| `identity-owner-shape.test.ts` | `run-snapshot.schema.json` properties.owner (RFC 0048 §C, `auth.md` §Identity claims) | Server-free (owner triple schema validity: positive `{tenant}` + full triple; negative missing-tenant + unknown-prop) | host-pass (server-free) | Always runs; no host needed. |
+| `cross-workspace-isolation.test.ts` | RFC 0048 §C/§D (`auth.md` §Identity claims, `rest-endpoints.md` `run_forbidden`) | A (owner-echo shape if a sample run is readable; §D isolation MUST-NOT via optional `POST /v1/host/sample/identity/cross-workspace-read` seam — cross-workspace read fails closed with `run_forbidden`/`not_found`) | `host-pending` | Behavioral probe soft-skips on 404; no host advertises run ownership yet (RFC 0048 `Draft`). |
+| `authorization-roles-shape.test.ts` | `capabilities.authorization` (RFC 0049 §A, `auth.md` §"Role-based authorization") | A (advertisement shape always — `supported` boolean; `failClosed` const true; every `roles[].role` non-empty + `scopes` array) | `host-pending` | Always runs; asserts the block is absent or well-formed. |
+| `authorization-fail-closed.test.ts` | `capabilities.authorization` (RFC 0049 §C) + `SECURITY/invariants.yaml` `authorization-fail-closed` | A (advertisement `failClosed===true` always; fail-closed MUST-NOT via optional `POST /v1/host/sample/authorization/decide` seam — an unseeded-role principal resolves `allowed:false`) | `host-pending` | Capability-gated on `authorization.supported`; behavioral probe soft-skips on 404. Scope-match + denial-audited scenarios deferred to a host. |
+| `auth-saml-profile.test.ts` | `openwop-auth-saml` (RFC 0050, `auth-profiles.md` §`openwop-auth-saml`) | A+B (profile-advertisement shape always; **1-positive + 6-negative reference suite runs server-free** via the bundled synthetic IdP `conformance/src/lib/saml-idp.ts` — `alg:none`/unsigned/bad-sig/expired/not-yet-valid/wrapping; host-ACS validation opt-in via `OPENWOP_TEST_SAML_IDP_URL` + the `auth/saml/validate` seam) | host-pass (server-free reference) | Synthetic IdP bundled (`node:crypto`, no deps). Host-ACS pass is the remaining graduation gate. |
+| `auth-scim-profile.test.ts` | `openwop-auth-scim` (RFC 0050, `auth-profiles.md` §`openwop-auth-scim`) | B (profile-advertisement shape always; SCIM user/group provisioning → principal/role roundtrip opt-in via `OPENWOP_TEST_SCIM_URL` + the `auth/scim/provision` seam) | `host-pending` | Behavior opt-in (operator-supplied SCIM endpoint); deactivate ⇒ subsequent-deny assertion deferred to a host. |
+| `approval-gate-events.test.ts` | `approval.granted` / `.rejected` / `.overridden` (RFC 0051 §B, `interrupt-profiles.md` §approvalGate) | Server-free (event-payload schema validity: required fields incl. mandatory `overridden.reason`; additionalProperties:false negatives) | host-pass (server-free) | Always runs; no host needed. |
+| `approval-gate-flow.test.ts` | `core.openwop.governance.approvalGate` (RFC 0051 §A) + `capabilities.authorization` (RFC 0049) | A (capability-gated on `authorization.supported`; unauthorized-principal-denied + override-audited via the `governance/approval-gate` seam) | `host-pending` | Behavioral probe soft-skips on 404. Grant/reject-loopback/quorum scenarios deferred until a governance host wires the seam. |
+| `scheduling-capability-shape.test.ts` | `capabilities.scheduling` (RFC 0052 §A, `host-capabilities.md` §host.scheduling) | A (advertisement shape always — `supported` boolean; `cron`/`delayed`/`calendar` booleans; `maxFutureHorizon` ISO-8601 duration) | `host-pending` | Always runs; asserts the block is absent or well-formed. |
+| `scheduling-cron-fires-once.test.ts` | `capabilities.scheduling` (RFC 0052 §B) | A (once-per-tick + missed-tick MUST-NOT via optional `POST /v1/host/sample/scheduling/tick` seam — single tick fires exactly one run; missed window never floods) | `host-pending` | Capability-gated on `scheduling.supported` + `cron`; soft-skips on 404. Delayed-horizon + calendar scenarios deferred. |
+| `deadletter-capability-shape.test.ts` | `capabilities.deadLetter` (RFC 0053 §A, `host-capabilities.md` §host.deadLetter) | A (advertisement shape always — `supported` boolean; `retentionDays` integer ≥ 1) | `host-pending` | Always runs; asserts the block is absent or well-formed. |
+| `deadletter-retry-exhaustion.test.ts` | `capabilities.deadLetter` (RFC 0053 §C) + `run.dead_lettered` event | A (retry-exhaustion → `run.dead_lettered` with `attempts` + dead-lettered run fork-eligible, via optional `POST /v1/host/sample/deadletter/exhaust` seam) | `host-pending` | Capability-gated on `deadLetter.supported`; soft-skips on 404. Retention-purge scenario deferred (needs clock seam). |
 | `kv-cross-tenant-isolation.test.ts`, `kv-atomic-increment.test.ts`, `kv-cas.test.ts` (three scenarios) | `capabilities.kvStorage` (RFC 0015, `host-kv-storage-capability.md`) + `SECURITY/invariants.yaml` `kv-cross-tenant-isolation` | A (advertisement shape always; behavioral cross-tenant `set`/`get`, 50× concurrent atomic increment convergence, CAS matching/stale-expect) | host-pass via opt-in test seam | Reference host exposes `POST /v1/host/sample/test/surface` env-gated on `OPENWOP_TEST_SEAM_ENABLED=true`; hosts that don't expose the seam soft-skip the behavioral assertions and verify advertisement shape only. |
 | `table-cross-tenant-isolation.test.ts` | `capabilities.tableStorage` (RFC 0016, `host-table-storage-capability.md`) | A (advertisement shape + behavioral cross-tenant insert/query proof) | host-pass via opt-in test seam | Same seam dependency as kv row. |
 | `queue-cross-tenant-isolation.test.ts` | `capabilities.queueBus` (RFC 0017, `host-queue-bus-capability.md`) + `SECURITY/invariants.yaml` `queue-cross-tenant-isolation` | A (advertisement shape + behavioral cross-tenant publish/consume proof) | host-pass via opt-in test seam | Same seam dependency as kv row. |
@@ -80,6 +101,8 @@ Twenty-two scenario groups validate optional profiles where the host's discovery
 | `prompt-end-to-end-events.test.ts`, `prompt-resolution-chain-node-wins.test.ts`, `prompt-resolution-chain-fallback-cascade.test.ts` (three scenarios) | `prompts-supported` profile — gates on `capabilities.prompts.supported: true` (RFC 0027 + RFC 0029, `spec/v1/prompts.md`) | A (advertisement shape always + end-to-end resolve + emit during real workflow dispatch; resolution chain Layers 1, 3, 4 exercised) | host-pass (workflow-engine reference) | Reference host advertises `capabilities.prompts.supported: true` since RFC 0027 ref-impl landed; dispatch wiring in `bootstrap/nodes.ts` walks the resolution chain and emits `agent.promptResolved` + `prompt.composed` per spec/v1/prompts.md §"Composition + observability". |
 | `prompt-pack-install.test.ts`, `prompt-list-and-fetch.test.ts`, `prompt-render-deterministic.test.ts` (three scenarios) | `prompts-endpoints` profile — gates on `capabilities.prompts.endpointsSupported: true` (RFC 0028 §A, `spec/v1/prompts.md` §"Discovery & distribution") | A (advertisement shape always + list/get/render contract + pack-source provenance stamps + ETag honoring when supported) | host-pass (workflow-engine reference) | Reference host serves the six `/v1/prompts*` routes via `routes/prompts.ts` against the in-memory `PromptStore`. Pack-install existence claim opt-in via `OPENWOP_TEST_PROMPT_PACK_INSTALLED=true` (the in-tree `vendor.openwop.prompt-sample` pack auto-installs via `promptPackLoader.ts`). |
 | `prompt-mutable-lifecycle.test.ts` | `prompts-mutable` profile — gates on `capabilities.prompts.mutableLibrary: true` (RFC 0028 §C) | A (advertisement shape + CRUD lifecycle + pack/host source 403-on-mutation) | host-pass (workflow-engine reference) | Reference host advertises `mutableLibrary: true`; user-source templates accepted, pack + host-built-in templates return 403 on POST/PUT/DELETE. |
+| `prompt-mutation-workspace-membership-enforced.test.ts` | `prompts-mutable` profile — gates on `capabilities.prompts.mutableLibrary: true` (RFC 0028 Tier-2 follow-up, post-promotion) + `SECURITY/invariants.yaml` `prompt-mutation-workspace-membership-enforced` | A (advertisement gate + cross-workspace write refusal — drives `POST /v1/prompts` with a random non-member `workspaceId`, asserts any 4xx/5xx; on 403 specifically, additionally pins canonical `error === "workspace_membership_required"` envelope per `rest-endpoints.md` §"Common error codes"; other refusal codes unconstrained) | capability-gated (no reference-host membership backend yet; soft-skips cleanly until a host wires the workspace-member resolver) | Filed 2026-05-25 in response to a MyndHyve self-disclosed Admin-SDK-bypasses-DB-rules vulnerability on revision `00207-vzq`. T1 canonicalization same-day (2026-05-25) added the 403-envelope check. Operator override via `OPENWOP_TEST_NONMEMBER_WORKSPACE_ID`. |
+| `prompt-read-workspace-membership-enforced.test.ts` | `prompts-supported` profile — gates on `capabilities.prompts.supported: true` (broader than `mutableLibrary` per MyndHyve relay Option B: read-only hosts that expose `?workspaceId=` reads are NOT exempt from the symmetric authz invariant) + `SECURITY/invariants.yaml` `prompt-read-workspace-membership-enforced` | A (advertisement gate + cross-workspace read refusal — drives `GET /v1/prompts?workspaceId=<random-non-member>`, interprets response: 4xx PASS with canonical envelope check on 403; 200 with empty `templates[]` PASS as correct null result; 200 with non-empty `templates[]` FAIL as cross-tenant leak; 200 without `templates[]` field SKIP via response-shape detection — host doesn't expose workspace-scoped reads) | capability-gated (no reference-host workspace-scoped read backend yet; soft-skips cleanly on the response-shape detection) | T2 sister scenario filed 2026-05-25 alongside T1; same threat model as the write scenario but probes the read path. Read paths are NOT exempt from cross-tenant authz — a `GET ?workspaceId=<not-mine>` that returns another workspace's templates is a data leak with the same blast radius as a cross-tenant write. Uses response-shape detection (rather than a new capability field) to self-skip hosts without workspace-scoped reads. Operator override via `OPENWOP_TEST_NONMEMBER_WORKSPACE_ID`. |
 | `prompt-resolution-chain-agent-intrinsic.test.ts` | `prompts-agent-bindings` profile — gates on `capabilities.prompts.agentBindings: true` (RFC 0029 §A Layer 2) | A (advertisement shape + Layer 2 agent intrinsic / overrides / library-default precedence over Layers 3-4) | host-pass (workflow-engine reference) | Reference host advertises `agentBindings: true` so Layer 2 sub-layers (agent-intrinsic / agent-overrides / agent-library-default) walk per RFC 0029 §B. |
 | `prompt-composed-secret-redaction.test.ts`, `prompt-composed-trust-marker.test.ts` (two scenarios) | `prompts-observability-full` profile — gates on `prompts.supported + observability: "full"` (RFC 0027 §E + RFC 0020 §D) + `SECURITY/invariants.yaml` `prompt-composed-secret-redaction` + `prompt-composed-trust-marker` | A (advertisement shape + `[REDACTED:<credentialRef>]` markers for secret-source bindings + `<UNTRUSTED>...</UNTRUSTED>` wrapping + `contentTrust: "untrusted"` propagation) | host-pass (workflow-engine reference) | Reference host advertises `observability: "full"` (sourced from `host/promptHostConfig.ts`). Composition pipeline in `host/promptCompose.ts` enforces SR-1 carry-forward + untrusted-content marker per `SECURITY/threat-model-secret-leakage.md` §SR-1. |
@@ -118,6 +141,9 @@ Every OpenAPI operation should have:
 | `pauseRun` | `pause-resume.test.ts` covers direct route behavior for running → paused, idempotent re-pause, terminal conflict, and pause-during-suspend race | Conflict and race paths covered with `details.runStatus`; endpoint is no longer coverage-missing | Add explicit immediate-vs-drain-current-node policy assertion when a host advertises both drain policies. |
 | `resumeRun` | `pause-resume.test.ts` covers direct route behavior for paused → running and non-paused conflict | Conflict path covered with `details.runStatus`; endpoint is no longer coverage-missing | Good. |
 | `forkRun` | `replay-fork.test.ts`, `replayDeterminism.test.ts` | Negative `fromSeq`, past-end, unknown source, invalid overlay | Add arbitrary-event fork and retention-expired source. |
+| `createAnnotation` | `feedback-record-and-list.test.ts`, `feedback-on-terminal-run.test.ts`, `feedback-correction-redaction.test.ts` (RFC 0056); gated on `capabilities.feedback.supported`, soft-skip on `501` | `feedback-unsupported-501.test.ts` (501 when unadvertised), `feedback-cross-tenant-isolation.test.ts`, `feedback-fork-not-copied.test.ts` | Capability-gated; full cross-tenant proof needs a multi-tenant auth seam (soft-skips, like `kv-cross-tenant-isolation`). |
+| `listAnnotations` | `feedback-record-and-list.test.ts`, `feedback-cross-tenant-isolation.test.ts` (RFC 0056) | `feedback-correction-redaction.test.ts` (redacted listing), `feedback-fork-not-copied.test.ts` (fork list empty) | Gated on `capabilities.feedback.supported`. |
+| `diffRun` | `run-diff.test.ts` (RFC 0054); soft-skips on 404 when the endpoint is unimplemented | Self-diff `divergedAtSeq: null`/empty (determinism floor), two-fixture divergence with `eventDiffs[0].seq === divergedAtSeq`, response-shape + `stateDiff` redaction-safety, `400` (missing `against`) + `404` (nonexistent `against`) | Add a bespoke deterministically-divergent fork fixture for `divergedAtSeq === N`-at-a-chosen-seq; full cross-principal `403` needs a multi-principal harness. |
 | `resolveInterruptByRun` | `interrupt-approval.test.ts`, `interrupt-clarification.test.ts`, `approval-payload.test.ts`, `interruptRace.test.ts` | Invalid action, unknown node, race cases | Add auth-required and quorum profile scenarios. |
 | `inspectInterruptByToken` | `interrupt-token-matrix.test.ts` (CF-3, 2026-05-15) covers malformed + unknown token paths | Negative paths covered | Add explicit expired-token case when a host advertises a TTL seam. |
 | `resolveInterruptByToken` | `interrupt-token-matrix.test.ts` covers replay (already-resolved) + unknown token; `interrupt-external-event-correlation.test.ts` covers positive path | Replay path + unknown-token path covered with explicit assertions | Add wrong-action case once the host advertises a typed allowed-actions vocabulary in the interrupt manifest. |
@@ -130,6 +156,7 @@ Every OpenAPI operation should have:
 | `updatePromptTemplate` | `prompt-mutable-lifecycle.test.ts` covers positive update + non-monotonic-version conflict + pack-sourced-readonly 403 against the reference workflow-engine | Positive update + 403 readonly-source + 409 conflict covered | Add `501` not-mutable-library negative for hosts that advertise `mutableLibrary: false`. |
 | `deletePromptTemplate` | `prompt-mutable-lifecycle.test.ts` covers positive delete + pack-sourced-readonly 403 against the reference workflow-engine | Positive delete + 403 readonly-source covered | Add `501` not-mutable-library negative + `404` unknown-template scenarios. |
 | `renderPromptTemplate` | `prompt-render-deterministic.test.ts` exercises `POST /v1/prompts:render` end-to-end against the reference workflow-engine; deterministic-hash invariant verified across `:render` + `prompt.composed` event paths. `prompt-composed-secret-redaction.test.ts` + `prompt-composed-trust-marker.test.ts` exercise the shared compose pipeline via the `/v1/host/sample/prompt/compose` seam | Deterministic render + composition redaction + trust-marker invariants covered | Add `400 prompt_variable_unresolved` matrix for missing variables across all four PromptKinds. |
+| `putTestPackTarball`, `getTestPackTarball`, `deleteTestPackVersion`, `getTestPackSignature` | `pack-registry-publish.test.ts` covers the 19-code publish error catalog through the RFC 0025 `/v1/packs-test/*` mirror namespace, gated on `capabilities.packs.testMode.supported: true` (RFC 0025 §A). 26 scenarios soft-skip when the advertisement is absent; when present, the suite exercises URL/scope, body-shape, tarball-extraction, manifest-contents, integrity, auth/conflict, unpublish-window, and signature-endpoint pairing. | Soft-skip on advertisement absence; behavioral on advertisement presence | Add real-tarball-builder fixtures so the manifest_mismatch / pack_integrity_failure / unsupported_runtime branches assert against a meaningful gzip+tar payload (currently soft-skipped with explanatory comments). |
 ---

package/fixtures/conformance-phase4-nondet-tool.json ADDED Viewed

@@ -0,0 +1,53 @@
+{
+  "id": "conformance-phase4-nondet-tool",
+  "name": "Conformance: RFC 0041 §C observable-output-sequence determinism (Phase 4)",
+  "version": "1.0",
+  "description": "Two-node workflow exercising a nondeterministic tool followed by a structured-output node, used by `replay-observable-sequence-determinism.test.ts` to verify RFC 0041 §C — across original + replay runs of the same workflow against the same engine, the observable RunEventDoc sequence prefix MUST be identical up to and including the nondeterministic-tool node's `node.completed` event. The host's replay path MUST replay the original event log entries (rather than re-executing the tool) for nodes whose `core.tool.*` config carries `nondeterministic: true`. Phase 4 hosts advertising `multiAgent.executionModel.replayDeterminism.supported: true` MUST honor this contract; non-Phase-4 hosts MAY re-execute the tool freely (and consequently observe sequence drift the conformance scenario will not assert against).",
+  "nodes": [
+    {
+      "id": "nondet-tool",
+      "typeId": "core.noop",
+      "name": "Nondeterministic tool (proxied via core.noop for sample-grade)",
+      "position": { "x": 0, "y": 0 },
+      "config": {
+        "nondeterministic": true,
+        "phase4Probe": true
+      },
+      "inputs": {}
+    },
+    {
+      "id": "structured-call",
+      "typeId": "core.ai.structuredOutput",
+      "name": "Structured output via mock provider (consumes nondet-tool result)",
+      "position": { "x": 200, "y": 0 },
+      "config": {
+        "provider": "mock",
+        "model": "mock-mini",
+        "outputSchema": {
+          "type": "object",
+          "required": ["valid"],
+          "properties": { "valid": { "type": "boolean" } }
+        }
+      },
+      "inputs": {
+        "messages": {
+          "type": "static",
+          "value": [
+            { "role": "user", "content": "Please emit a valid envelope." }
+          ]
+        }
+      }
+    }
+  ],
+  "edges": [
+    { "id": "e1", "sourceNodeId": "nondet-tool", "targetNodeId": "structured-call" }
+  ],
+  "triggers": [
+    { "id": "manual", "type": "manual", "enabled": true }
+  ],
+  "variables": [],
+  "metadata": {
+    "tags": ["conformance", "rfc-0041", "phase-4", "observable-sequence-determinism", "multi-agent-execution"]
+  },
+  "settings": { "timeout": 30000 }
+}

package/fixtures/conformance-phase4-replay-divergence.json ADDED Viewed

@@ -0,0 +1,40 @@
+{
+  "id": "conformance-phase4-replay-divergence",
+  "name": "Conformance: RFC 0041 §B replay-divergence-at-refusal (Phase 4)",
+  "version": "1.0",
+  "description": "Single `core.ai.structuredOutput` node against the conformance `mock` provider. Conformance scenario `replay-divergence-at-refusal.test.ts` pre-seeds the mock with a two-entry program via `POST /v1/host/sample/test/mock-ai/program` keyed on the structured-call nodeId: entry [0] returns a valid envelope (consumed by the original run); entry [1] returns `stopReason: 'safety'` + `refusalText` (consumed by the `:fork mode: replay` run). Phase 4 hosts advertising `multiAgent.executionModel.replayDeterminism.refusalDivergenceEmission: true` MUST detect the divergence at replay time, emit a `replay.divergedAtRefusal` event with `originalEnvelopeKind: 'valid'` + `replayEnvelopeKind: 'refusal'`, and fail the replay with HTTP `422` + `error.code: 'replay_diverged_at_refusal'` per `spec/v1/rest-endpoints.md §\"Common error codes\"`. Silent substitution of the refusal for the original envelope is non-conformant.",
+  "nodes": [
+    {
+      "id": "structured-call",
+      "typeId": "core.ai.structuredOutput",
+      "name": "Structured output via mock provider (Phase 4 replay-divergence probe)",
+      "position": { "x": 0, "y": 0 },
+      "config": {
+        "provider": "mock",
+        "model": "mock-mini",
+        "outputSchema": {
+          "type": "object",
+          "required": ["valid"],
+          "properties": { "valid": { "type": "boolean" } }
+        }
+      },
+      "inputs": {
+        "messages": {
+          "type": "static",
+          "value": [
+            { "role": "user", "content": "Please emit a valid envelope." }
+          ]
+        }
+      }
+    }
+  ],
+  "edges": [],
+  "triggers": [
+    { "id": "manual", "type": "manual", "enabled": true }
+  ],
+  "variables": [],
+  "metadata": {
+    "tags": ["conformance", "rfc-0041", "phase-4", "replay-divergence", "multi-agent-execution"]
+  },
+  "settings": { "timeout": 30000 }
+}

package/fixtures.md CHANGED Viewed

@@ -84,9 +84,9 @@ All fixtures MUST advertise:
 | Dispatch Per-Worker Mapping Override | `conformance-dispatch-per-worker-override` | RFC 0022 §A / HVMAP-1c-override — parent with BOTH a default `inputMapping` (`{ input: 'defaultX' }`) AND `perWorkerInputMappings.child-b: { input: 'sharedVar' }`. Verifies `effectiveInputMapping` precedence per §A: child-a receives the default, child-b receives the override. Reuses `conformance-dispatch-cross-worker-handoff-child-a` + `-child-b`. | `completed` | ≤ 30s |
 | Dispatch deterministic-fail child | `conformance-dispatch-deterministic-fail-child` | RFC 0022 §B / HVMAP-1b-failed — child workflow that ALWAYS terminates `failed` via `core.fail`. Used by `conformance-dispatch-output-mapping` to verify the parent's `outputMapping` is SKIPPED when the child fails terminally. | `failed` | ≤ 5s |
 | Dispatch cancellable child | `conformance-dispatch-cancellable-child` | RFC 0022 §B / HVMAP-1b-cancelled — child workflow with a long `core.delay` so the test cancels it externally via `POST /v1/runs/{childRunId}/cancel`. Verifies the parent's `outputMapping` is SKIPPED when the child terminates `cancelled`. | `cancelled` | ≤ 60s |
-| Multi-Agent Handoff (parent) | `conformance-multi-agent-handoff` | RFC 0037 Phase 1 — exercises the planner→worker handoff state machine. Supervisor decides one `next-worker`, dispatch spawns the child, harvests outputMapping. Conformance reads the event log for the 4 `core.workflowChain.event` transition records in causation-chained order (`dispatch.began → dispatch.succeeded → child.completed → output.harvested`). Capability-gated on `capabilities.multiAgent.executionModel.supported`. | `completed` | ≤ 30s |
-| Multi-Agent Handoff (child) | `conformance-multi-agent-handoff-child` | RFC 0037 Phase 1 — child for `conformance-multi-agent-handoff`. Declares `childOutcome.defaultValue='handoff-complete'`; the parent's outputMapping harvests it onto `parentResult`, triggering the `output.harvested` transition event. | `completed` | ≤ 5s |
-| Multi-Agent Confidence Escalation | `conformance-multi-agent-confidence-escalation` | RFC 0039 §A — exercises the Phase 2 confidence-floor escalation contract. Supervisor's `mockDispatchPlan` carries ONE decision with `confidence: 0.3` (below the 0.5 spec floor). The host MUST emit `core.workflowChain.confidence-escalated` AND suspend with a clarification interrupt BEFORE any dispatch.began fires; conformance asserts zero `core.workflowChain.event` records (no dispatch). Capability-gated on `capabilities.multiAgent.executionModel.version >= 2`. | `waiting-clarification` | ≤ 30s |
+| Multi-Agent Handoff (parent) | `conformance-multi-agent-handoff` | RFC 0037 (`version: 1`) — exercises the planner→worker handoff state machine. Supervisor decides one `next-worker`, dispatch spawns the child, harvests outputMapping. Conformance reads the event log for the 4 `core.workflowChain.event` transition records in causation-chained order (`dispatch.began → dispatch.succeeded → child.completed → output.harvested`). Capability-gated on `capabilities.multiAgent.executionModel.supported`. | `completed` | ≤ 30s |
+| Multi-Agent Handoff (child) | `conformance-multi-agent-handoff-child` | RFC 0037 (`version: 1`) — child for `conformance-multi-agent-handoff`. Declares `childOutcome.defaultValue='handoff-complete'`; the parent's outputMapping harvests it onto `parentResult`, triggering the `output.harvested` transition event. | `completed` | ≤ 5s |
+| Multi-Agent Confidence Escalation | `conformance-multi-agent-confidence-escalation` | RFC 0039 §A (`version: 2`) — exercises the confidence-floor escalation contract. Supervisor's `mockDispatchPlan` carries ONE decision with `confidence: 0.3` (below the 0.5 spec floor). The host MUST emit `core.workflowChain.confidence-escalated` AND suspend with a clarification interrupt BEFORE any dispatch.began fires; conformance asserts zero `core.workflowChain.event` records (no dispatch). Capability-gated on `capabilities.multiAgent.executionModel.version >= 2`. | `waiting-clarification` | ≤ 30s |
 | Agent Memory Round-Trip | `conformance-agent-memory-roundtrip` | Phase 3 — `MemoryAdapter.list/get` write → read | `completed` | ≤ 15s |
 | Agent Memory Cross-Tenant | `conformance-agent-memory-cross-tenant` | Phase 3 / CTI-1 — cross-tenant probe MUST return `[]` / `null` | `completed` | ≤ 10s |
 | Agent Memory Redaction | `conformance-agent-memory-redaction` | Phase 3 / SR-1 — BYOK plaintext surfaces as `[REDACTED:<id>]` on read | `completed` | ≤ 15s |
@@ -113,6 +113,8 @@ All fixtures MUST advertise:
 | Envelope Refusal | `conformance-envelope-refusal` | RFC 0032 §B.3 + RFC 0033 §D + §F end-to-end refusal — mock provider returns `stopReason: 'safety'` with `refusalText`. Host MUST emit exactly one `envelope.refusal` event, NOT retry (RFC 0033 §D), fail with `error.code: 'envelope_refused_by_provider'`, AND keep refusalText off `RunSnapshot.error.message` (SECURITY invariant `envelope-refusal-no-prompt-leak`). | `failed` (`error.code='envelope_refused_by_provider'`) | ≤ 10s |
 | Envelope Recovery Applied | `conformance-envelope-recovery-applied` | RFC 0032 §B.6 lenient-parse — mock returns a markdown-fenced JSON envelope (```json\\n...\\n```). Host's `dispatchStructured()` lenient-parse fallback (`tryLenientParse()`) strips the fence, emits exactly one `envelope.recovery.applied` with `path: 'markdown-fence'`, and accepts the parsed value WITHOUT counting against the retry budget per RFC 0033 §D. | `completed` | ≤ 10s |
 | Envelope NL-to-Format Engaged | `conformance-envelope-nl-to-format-engaged` | RFC 0032 §B.5 NL-to-Format fallback — mock returns natural-language prose on the first 3 attempts (exhausting the retry budget); the host detects the NL shape after exhaustion, emits exactly one `envelope.nlToFormat.engaged { originalEnvelopeType, fallbackCalls: 1 }`, then fires ONE additional dispatch with a corrective coercion fragment. The 4th program entry returns valid JSON; the schema validates; the run terminates `completed`. | `completed` | ≤ 10s |
+| Phase 4 Replay Divergence | `conformance-phase4-replay-divergence` | RFC 0041 §B — single `core.ai.structuredOutput` node against mock provider. Conformance scenario pre-seeds a 2-entry program via the existing mock-AI program seam: entry [0] returns a valid envelope (original run consumes); entry [1] returns `stopReason: 'safety'` + `refusalText` (`:fork mode: replay` consumes). Phase 4 hosts advertising `multiAgent.executionModel.replayDeterminism.refusalDivergenceEmission: true` MUST emit `replay.divergedAtRefusal` + fail replay with `error.code: 'replay_diverged_at_refusal'`. Silent substitution is non-conformant. Pairs with `replay-divergence-at-refusal.test.ts`. | original: `completed`; replay: `failed` (`error.code='replay_diverged_at_refusal'`) | ≤ 10s |
+| Phase 4 Nondeterministic Tool | `conformance-phase4-nondet-tool` | RFC 0041 §C — two-node workflow (`core.noop` proxied as a nondeterministic tool → `core.ai.structuredOutput`). Used by `replay-observable-sequence-determinism.test.ts` to verify that across original + replay runs, the observable `RunEventDoc` sequence prefix is identical up to and including the nondeterministic-tool node's `node.completed` event. The host's replay path MUST replay the original event log entries (rather than re-executing the tool) for nodes whose `core.tool.*` config carries `nondeterministic: true`. Phase 4 hosts advertising `multiAgent.executionModel.replayDeterminism.supported: true` MUST honor this contract. | original + replay: `completed`; observable prefixes equal up to the nondet boundary | ≤ 10s |
 The `messages`-mode stream fixture (AI token streaming) is covered by the deterministic mock-provider surface in `spec/v1/run-options.md`. Hosts that do not advertise `Capabilities.testing.mockProviders` treat those scenarios as skip-equivalent.

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@openwop/openwop-conformance",
-  "version": "1.5.0",
+  "version": "1.6.1",
   "description": "Production-ready black-box conformance suite for OpenWOP v1.0 compliant servers.",
   "repository": {
     "type": "git",

package/schemas/README.md CHANGED Viewed

@@ -11,12 +11,15 @@
 | `envelopes/schema.request.schema.json` | `ai-envelope.md` §"Universal kinds" | FINAL v1.1 — LLM asks the engine for a kind's JSON Schema. Counts against `Capabilities.limits.schemaRounds`. |
 | `envelopes/schema.response.schema.json` | `ai-envelope.md` §"Universal kinds" | FINAL v1.1 — side-channel ack for `schema.request`. Never surfaces to users. |
 | `envelopes/error.schema.json` | `ai-envelope.md` §"Universal kinds" | FINAL v1.1 — LLM's deliberate error report. Distinct from `error-envelope.schema.json` (host HTTP errors). |
+| `annotation.schema.json` | `RFCS/0056` + `observability.md` | RFC 0056 (`Draft`) — a non-blocking human/agent quality signal (rating / correction / label / flag) attached to a run, event, or node. A side-resource (not a replayable run-event-log entry); response of `POST/GET /v1/runs/{runId}/annotations` + payload of the `run.annotated` SSE notification. |
+| `annotation-create.schema.json` | `RFCS/0056` | RFC 0056 (`Draft`) — request body for `POST /v1/runs/{runId}/annotations` (host assigns `annotationId`/`createdAt`/`actor`; binds `target.runId` to the path). |
 | `audit-verify-result.schema.json` | `auth-profiles.md` §`openwop-audit-log-integrity` | Response payload from `GET /v1/audit/verify` — chain-validity verdict + checkpoints + anomalies |
 | `capabilities.schema.json` | `capabilities.md` | `/.well-known/openwop` response — protocolVersion + supportedEnvelopes + schemaVersions + limits + optional v1 discovery surface |
 | `channel-written-payload.schema.json` | `channels-and-reducers.md` §Channel write event | Payload of the `channel.written` RunEvent — write input + reducer name |
 | `conversation-event.schema.json` | `channels-and-reducers.md` + conversation RFC | Multi-turn conversation event shape for orchestrator-driven HITL flows |
 | `conversation-turn.schema.json` | `channels-and-reducers.md` + conversation RFC | Conversation turn shape for user/agent/system messages |
 | `core-conformance-mock-agent-config.schema.json` | `node-packs.md` + RFC 0023 | Config shape for the conformance-only `core.conformance.mock-agent` typeId — drives `agent.*` event emission on cue (`mockReasoning` / `mockToolCalls` / `mockHandoff` / `mockDecision` / `mockConfidence`). Hosts MUST refuse this typeId for production tenants unless `capabilities.conformance.mockAgent` is advertised. |
+| `credential-reference.schema.json` | `host-capabilities.md` §host.credentials + RFC 0046 | Opaque `{ ref, scope }` handle to a host-stored credential — the only credential artifact on the wire; never carries key material |
 | `debug-bundle.schema.json` | `debug-bundle.md` | Portable run diagnostic export from `GET /v1/runs/{runId}/debug-bundle` |
 | `dispatch-config.schema.json` | `node-packs.md` + dispatch RFC | Configuration shape for `core.dispatch` / sub-workflow routing |
 | `error-envelope.schema.json` | `rest-endpoints.md` + `auth.md` | Canonical `{error, message, details?}` shape returned on every non-2xx |
@@ -31,6 +34,7 @@
 | `registry-version-manifest.schema.json` | `registry-operations.md` | Registry-augmented version manifest served at `GET /v1/packs/{name}/-/{version}.json`. Extends the bare pack-manifest contract with registry-side metadata (integrity hash, signing-block polymorphism, lifecycle flags). Enforced by the `Validate version manifests against registry-version-manifest schema` step in `.github/workflows/registry-publish.yml`. |
 | `orchestrator-decision.schema.json` | `node-packs.md` + orchestrator RFC | Decision output shape for orchestrator routing nodes |
 | `run-ancestry-response.schema.json` | `multi-agent-execution.md` + RFC 0040 | Response body for `GET /v1/runs/{runId}/ancestry` — names the run's immediate parent in the cross-host composition chain (or `parent: null` for top-level runs). Capability-gated on `capabilities.multiAgent.executionModel.crossHostCausation.ancestryEndpointSupported`. |
+| `run-diff-response.schema.json` | `rest-endpoints.md` + RFC 0054 | Response body for `GET /v1/runs/{runId}:diff?against={otherRunId}` — deterministic, replay-aware structured diff of two runs (`divergedAtSeq` + `eventDiffs[]` + `stateDiff`). |
 | `run-event-payloads.schema.json` | `run-event.schema.json` §RunEventType | Per-RunEventType payload contracts, indexed by `$defs.<typeId>` for opt-in strict validation |
 | `run-event.schema.json` | `version-negotiation.md` + `RunEventDoc` | Event log envelope + event type enum |
 | `run-options.schema.json` | `run-options.md` | Per-run input overlay (configurable + tags + metadata) on `POST /v1/runs` |

package/schemas/annotation-create.schema.json ADDED Viewed

@@ -0,0 +1,37 @@
+{
+  "$schema": "https://json-schema.org/draft/2020-12/schema",
+  "$id": "https://openwop.dev/spec/v1/annotation-create.schema.json",
+  "title": "AnnotationCreate",
+  "description": "RFC 0056. Request body for `POST /v1/runs/{runId}/annotations`. The host assigns `annotationId` + `createdAt`, derives `actor.principalRef` from the authenticated principal, and binds `target.runId` to the path `runId` — so the client supplies only the target anchor (event/node), the signal, and an optional note. The response is a full `annotation.schema.json`.",
+  "type": "object",
+  "required": ["signal"],
+  "properties": {
+    "target": {
+      "type": "object",
+      "description": "Optional finer-grained anchor within the run. `runId` is taken from the path and MUST NOT be supplied here.",
+      "properties": {
+        "eventId": { "type": "string", "description": "Anchor the annotation to one RunEvent." },
+        "nodeId": { "type": "string", "description": "Anchor the annotation to one node." }
+      },
+      "additionalProperties": false
+    },
+    "signal": {
+      "type": "object",
+      "required": ["kind"],
+      "properties": {
+        "kind": { "type": "string", "enum": ["rating", "correction", "label", "flag"] },
+        "rating": { "type": "integer", "minimum": 1, "maximum": 5, "description": "Required when `kind` is `rating`." },
+        "label": { "type": "string", "description": "Required when `kind` is `label`." },
+        "correction": { "type": "string", "description": "Corrected text/value; required when `kind` is `correction`. Untrusted user content." }
+      },
+      "additionalProperties": false,
+      "allOf": [
+        { "if": { "properties": { "kind": { "const": "rating" } } }, "then": { "required": ["rating"] } },
+        { "if": { "properties": { "kind": { "const": "label" } } }, "then": { "required": ["label"] } },
+        { "if": { "properties": { "kind": { "const": "correction" } } }, "then": { "required": ["correction"] } }
+      ]
+    },
+    "note": { "type": "string", "description": "Optional free-text reviewer note. Untrusted user content." }
+  },
+  "additionalProperties": false
+}

package/schemas/annotation.schema.json ADDED Viewed

@@ -0,0 +1,56 @@
+{
+  "$schema": "https://json-schema.org/draft/2020-12/schema",
+  "$id": "https://openwop.dev/spec/v1/annotation.schema.json",
+  "title": "Annotation",
+  "description": "RFC 0056. A non-blocking human/agent quality signal attached to a run, event, or node — a side-resource, NOT a replayable run-event-log entry. Recorded via `POST /v1/runs/{runId}/annotations`, listed via `GET`, and surfaced live via the `run.annotated` SSE notification. `signal.correction` and `note` are untrusted user content (SECURITY invariant `annotation-content-redaction`).",
+  "type": "object",
+  "required": ["annotationId", "target", "signal", "actor", "createdAt"],
+  "properties": {
+    "annotationId": {
+      "type": "string",
+      "minLength": 1,
+      "description": "Host-assigned unique identifier for this annotation."
+    },
+    "target": {
+      "type": "object",
+      "required": ["runId"],
+      "properties": {
+        "runId": { "type": "string", "minLength": 1 },
+        "eventId": { "type": "string", "description": "Optional — anchors the annotation to one RunEvent." },
+        "nodeId": { "type": "string", "description": "Optional — anchors the annotation to one node." }
+      },
+      "additionalProperties": false
+    },
+    "signal": {
+      "type": "object",
+      "required": ["kind"],
+      "properties": {
+        "kind": { "type": "string", "enum": ["rating", "correction", "label", "flag"] },
+        "rating": { "type": "integer", "minimum": 1, "maximum": 5, "description": "Required iff `kind` is `rating`." },
+        "label": { "type": "string", "description": "Required iff `kind` is `label`." },
+        "correction": { "type": "string", "description": "Corrected text/value. Required iff `kind` is `correction`. Untrusted user content." }
+      },
+      "additionalProperties": false,
+      "allOf": [
+        { "if": { "properties": { "kind": { "const": "rating" } } }, "then": { "required": ["rating"] } },
+        { "if": { "properties": { "kind": { "const": "label" } } }, "then": { "required": ["label"] } },
+        { "if": { "properties": { "kind": { "const": "correction" } } }, "then": { "required": ["correction"] } }
+      ]
+    },
+    "actor": {
+      "type": "object",
+      "required": ["principalRef"],
+      "properties": {
+        "principalRef": {
+          "type": "string",
+          "minLength": 1,
+          "description": "Opaque principal identifier — a principal per RFC 0048 (Draft, referenced non-normatively) or an AgentRef per RFC 0002 when a supervisor agent annotates. String-typed so RFC 0056 does not depend on RFC 0048 reaching Accepted."
+        }
+      },
+      "additionalProperties": false
+    },
+    "note": { "type": "string", "description": "Optional free-text reviewer note. Untrusted user content." },
+    "createdAt": { "type": "string", "format": "date-time", "description": "ISO 8601 timestamp at which the host recorded the annotation." }
+  },
+  "additionalProperties": false
+}