@openwop/openwop-conformance 1.1.0 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (97) hide show
  1. package/CHANGELOG.md +25 -0
  2. package/README.md +2 -2
  3. package/coverage.md +29 -17
  4. package/fixtures/conformance-agent-low-confidence.json +7 -4
  5. package/fixtures/conformance-agent-pack-handoff-schema-validation.json +30 -0
  6. package/fixtures/conformance-agent-reasoning.json +23 -4
  7. package/fixtures/conformance-dispatch-cross-worker-handoff-child-a.json +27 -0
  8. package/fixtures/conformance-dispatch-cross-worker-handoff-child-b.json +25 -0
  9. package/fixtures/conformance-dispatch-cross-worker-handoff.json +60 -0
  10. package/fixtures/conformance-dispatch-input-mapping-child.json +25 -0
  11. package/fixtures/conformance-dispatch-input-mapping.json +49 -0
  12. package/fixtures/conformance-dispatch-output-mapping-child.json +27 -0
  13. package/fixtures/conformance-dispatch-output-mapping.json +49 -0
  14. package/fixtures/conformance-subworkflow-input-mapping-child.json +27 -0
  15. package/fixtures/conformance-subworkflow-input-mapping.json +33 -0
  16. package/fixtures.md +12 -2
  17. package/package.json +1 -1
  18. package/schemas/README.md +7 -0
  19. package/schemas/agent-ref.schema.json +1 -1
  20. package/schemas/ai-envelope.schema.json +106 -0
  21. package/schemas/capabilities.schema.json +300 -3
  22. package/schemas/core-conformance-mock-agent-config.schema.json +147 -0
  23. package/schemas/dispatch-config.schema.json +26 -0
  24. package/schemas/envelopes/clarification.request.schema.json +43 -0
  25. package/schemas/envelopes/error.schema.json +26 -0
  26. package/schemas/envelopes/schema.request.schema.json +22 -0
  27. package/schemas/envelopes/schema.response.schema.json +22 -0
  28. package/schemas/node-pack-manifest.schema.json +5 -0
  29. package/schemas/pack-lockfile.schema.json +16 -0
  30. package/schemas/run-event-payloads.schema.json +18 -2
  31. package/schemas/run-event.schema.json +2 -1
  32. package/schemas/workflow-chain-pack-manifest.schema.json +226 -0
  33. package/src/lib/behavior-gate.ts +44 -5
  34. package/src/lib/env.ts +27 -0
  35. package/src/lib/webhook-receiver.ts +137 -0
  36. package/src/lib/workflow-chain-expansion.ts +213 -0
  37. package/src/scenarios/agentPackCatalog.test.ts +216 -0
  38. package/src/scenarios/agentPackHandoffSchemaValidation.test.ts +146 -0
  39. package/src/scenarios/agentReasoningEvents.test.ts +58 -7
  40. package/src/scenarios/agents-run-tool-allowlist.test.ts +182 -0
  41. package/src/scenarios/ai-envelope-shape.test.ts +362 -0
  42. package/src/scenarios/aiEnvelope.capBreached.test.ts +173 -0
  43. package/src/scenarios/aiEnvelope.contractRefusal.test.ts +150 -0
  44. package/src/scenarios/aiEnvelope.correlationReplay.test.ts +69 -0
  45. package/src/scenarios/aiEnvelope.redaction.test.ts +73 -0
  46. package/src/scenarios/aiEnvelope.schemaDrift.test.ts +87 -0
  47. package/src/scenarios/aiEnvelope.trustBoundaryPropagation.test.ts +143 -0
  48. package/src/scenarios/aiEnvelope.universalKinds.test.ts +176 -0
  49. package/src/scenarios/append-ordering.test.ts +44 -0
  50. package/src/scenarios/artifact-auth.test.ts +58 -0
  51. package/src/scenarios/blob-cross-tenant-isolation.test.ts +66 -0
  52. package/src/scenarios/blob-presign-expiry.test.ts +66 -0
  53. package/src/scenarios/blob-roundtrip.test.ts +48 -0
  54. package/src/scenarios/cache-cross-tenant-isolation.test.ts +61 -0
  55. package/src/scenarios/cache-ttl-expiry.test.ts +47 -0
  56. package/src/scenarios/dispatch-cross-worker-handoff.test.ts +98 -0
  57. package/src/scenarios/dispatch-input-mapping.test.ts +94 -0
  58. package/src/scenarios/dispatch-output-mapping.test.ts +65 -0
  59. package/src/scenarios/fs-path-traversal.test.ts +124 -0
  60. package/src/scenarios/idempotency-key-determinism.test.ts +230 -0
  61. package/src/scenarios/interrupt-token-matrix.test.ts +126 -0
  62. package/src/scenarios/kv-atomic-increment.test.ts +74 -0
  63. package/src/scenarios/kv-cas.test.ts +75 -0
  64. package/src/scenarios/kv-cross-tenant-isolation.test.ts +85 -0
  65. package/src/scenarios/kv-ttl-expiry.test.ts +47 -0
  66. package/src/scenarios/mcp-server-elicitation-bridge.test.ts +92 -0
  67. package/src/scenarios/mcp-server-prompt-roundtrip.test.ts +80 -0
  68. package/src/scenarios/mcp-server-resource-roundtrip.test.ts +82 -0
  69. package/src/scenarios/mcp-server-sampling-bridge.test.ts +84 -0
  70. package/src/scenarios/mcp-server-tool-roundtrip.test.ts +107 -0
  71. package/src/scenarios/mcp-server-untrusted-args.test.ts +105 -0
  72. package/src/scenarios/mcp-tool-roundtrip.test.ts +13 -6
  73. package/src/scenarios/memory-compaction-event-emitted.test.ts +121 -0
  74. package/src/scenarios/memory-compaction-provenance-tag.test.ts +116 -0
  75. package/src/scenarios/memory-compaction-sr1-carry-forward.test.ts +127 -0
  76. package/src/scenarios/multi-region-idempotency.test.ts +39 -4
  77. package/src/scenarios/otel-trace-propagation-subworkflow.test.ts +139 -0
  78. package/src/scenarios/pause-resume.test.ts +43 -0
  79. package/src/scenarios/queue-ack-nack-dlq.test.ts +67 -0
  80. package/src/scenarios/queue-cross-tenant-isolation.test.ts +66 -0
  81. package/src/scenarios/queue-publish-consume-roundtrip.test.ts +48 -0
  82. package/src/scenarios/registry-public.test.ts +91 -0
  83. package/src/scenarios/search-bm25-roundtrip.test.ts +47 -0
  84. package/src/scenarios/spec-corpus-validity.test.ts +28 -7
  85. package/src/scenarios/sql-injection-rejection.test.ts +84 -0
  86. package/src/scenarios/sql-transaction-atomicity.test.ts +66 -0
  87. package/src/scenarios/stream-subscribe-from-beginning.test.ts +66 -0
  88. package/src/scenarios/subworkflow-input-mapping.test.ts +100 -0
  89. package/src/scenarios/table-cross-tenant-isolation.test.ts +65 -0
  90. package/src/scenarios/table-cursor-pagination.test.ts +47 -0
  91. package/src/scenarios/table-schema-enforcement.test.ts +47 -0
  92. package/src/scenarios/vector-knn-roundtrip.test.ts +48 -0
  93. package/src/scenarios/webhook-receiver-adversarial.test.ts +210 -0
  94. package/src/scenarios/workflow-chain-expansion.test.ts +366 -0
  95. package/src/scenarios/workflow-chain-pack-manifest-validation.test.ts +232 -0
  96. package/src/scenarios/workflow-chain-pack-signature-verification.test.ts +138 -0
  97. package/src/scenarios/workflow-chain-unresolvable-typeid.test.ts +170 -0
@@ -0,0 +1,216 @@
1
+ /**
2
+ * Multi-Agent Shift — `core.openwop.agents.{deep-research, react, supervisor}`
3
+ * pack-catalog evidence.
4
+ *
5
+ * The three reference agent packs published 2026-05-17 are registry-signed
6
+ * (keyId `openwop-team-1`) but had no in-tree conformance scenarios
7
+ * proving their `agents[]` manifests are reachable via the host pack
8
+ * surface AND that each manifest's contents match the contract documented
9
+ * in `RFCS/0003-agent-packs.md` + `schemas/agent-manifest.schema.json`.
10
+ *
11
+ * This file closes that gap. Three test groups, one per pack. Each group:
12
+ * 1. Skips when the host doesn't advertise `capabilities.agents.supported`
13
+ * OR doesn't expose a pack-listing endpoint (`/v1/packs` returning
14
+ * 404/501 → soft-skip).
15
+ * 2. Locates the pack by name in the host's pack list.
16
+ * 3. Validates the pack's `agents[]` entry against the AgentManifest
17
+ * contract: required fields, agentId namespace pattern, modelClass
18
+ * enum, toolAllowlist format, handoff schema refs.
19
+ *
20
+ * Behavioral assertions (the agent actually researches / reacts / supervises)
21
+ * require an LLM + real agentRuntime host and live outside the public
22
+ * conformance suite. The advertisement-shape + manifest-validity coverage
23
+ * here is the wire-level guarantee a third-party host MUST satisfy to
24
+ * claim "I ship the reference agent packs."
25
+ *
26
+ * @see RFCS/0003-agent-packs.md
27
+ * @see schemas/agent-manifest.schema.json
28
+ * @see packs/core.openwop.agents.{deep-research,react,supervisor}/pack.json
29
+ */
30
+
31
+ import { describe, it, expect } from 'vitest';
32
+ import { driver } from '../lib/driver.js';
33
+ import { isAgentSupported } from '../lib/multi-agent-capabilities.js';
34
+
35
/** Schema references an agent declares for its task (inbound) and return (outbound) payloads. */
interface AgentHandoffRefs {
  taskSchemaRef?: string;
  returnSchemaRef?: string;
}

/**
 * One `agents[]` entry as read from a pack in the host's pack listing.
 * Every field is optional here because the listing is unvalidated host
 * output; the scenario's assertions decide what is actually required.
 */
interface AgentManifestEntry {
  agentId?: string;
  persona?: string;
  modelClass?: string;
  systemPrompt?: string;
  systemPromptRef?: string;
  toolAllowlist?: string[];
  memoryShape?: Record<string, unknown>;
  handoff?: AgentHandoffRefs;
}

/** One pack entry in the host's pack listing. */
interface PackEntry {
  name?: string;
  version?: string;
  agents?: AgentManifestEntry[];
}

/** Response body shape read from `GET /v1/packs`. */
interface PackList {
  packs?: PackEntry[];
}
51
+
52
// Namespace-tier pattern for AgentManifest.agentId (per the original note, taken
// from schemas/agent-manifest.schema.json): a tier prefix, then two or more
// dot-separated segments.
const AGENT_ID_PATTERN = /^(core|vendor|community|private|local)\.[a-z][a-z0-9_-]*(\.[a-z][a-zA-Z0-9_-]*)+$/;

// modelClass values this scenario accepts.
const VALID_MODEL_CLASSES = new Set<string>([
  'reasoning',
  'tool-using',
  'chat',
  'code',
  'vision',
  'multimodal',
  'embedding',
  'classification',
  'retrieval',
  'research',
  'delegate',
]);

// Prefixes a toolAllowlist entry may start with.
const VALID_TOOL_SCOPES: string[] = [
  'openwop:',
  'mcp:',
  'vendor.',
  'community.',
  'private.',
  'local.',
  'host:',
];
59
+
60
/**
 * Looks up a pack by exact `name` in the host's `GET /v1/packs` listing.
 *
 * Resolves to `null` (the callers' soft-skip signal) when:
 *   - the endpoint answers with anything other than 200 (this includes the
 *     404/501 "no pack-listing endpoint" case),
 *   - the response body is not an object carrying a `packs` array, or
 *   - no listed pack has the requested name.
 *
 * @param name - Exact pack name to search for.
 * @returns The matching pack entry, or `null` when unavailable.
 */
async function findPack(name: string): Promise<NonNullable<PackList['packs']>[number] | null> {
  const res = await driver.get('/v1/packs');
  if (res.status !== 200) return null;

  // The body is unvalidated host output: guard the object itself before
  // touching `.packs`, so an empty/odd 200 body soft-skips instead of throwing.
  const body = res.json as PackList | null | undefined;
  if (typeof body !== 'object' || body === null || !Array.isArray(body.packs)) return null;

  // `p?.` tolerates null holes in a malformed listing.
  return body.packs.find((p) => p?.name === name) ?? null;
}
70
+
71
/**
 * Asserts that one `agents[]` entry from a host pack listing passes the
 * AgentManifest checks this scenario enforces: required string fields, the
 * agentId namespace pattern, a recognized modelClass, exactly one of
 * `systemPrompt` / `systemPromptRef`, and scope-prefixed toolAllowlist entries.
 *
 * @param agent - The manifest entry to check (unvalidated host output).
 * @param expectations - Optional per-pack expectations:
 *   `agentIdEndsWith` — text that must appear in `agentId`;
 *   `modelClass` — exact modelClass the agent must declare;
 *   `minTools` — minimum number of toolAllowlist entries.
 * @throws Whatever `expect` throws when any assertion fails.
 */
function assertAgentManifestShape(
  agent: NonNullable<NonNullable<PackList['packs']>[number]['agents']>[number],
  expectations: { agentIdEndsWith?: string; modelClass?: string; minTools?: number },
): void {
  // Required: agentId, persona, modelClass.
  expect(typeof agent.agentId, 'AgentManifest.agentId MUST be a string').toBe('string');
  expect(typeof agent.persona, 'AgentManifest.persona MUST be a string').toBe('string');
  expect(typeof agent.modelClass, 'AgentManifest.modelClass MUST be a string').toBe('string');

  // agentId pattern (RFCS/0003 §A namespace tiers).
  expect(
    AGENT_ID_PATTERN.test(agent.agentId ?? ''),
    driver.describe(
      'schemas/agent-manifest.schema.json §agentId',
      `agentId "${agent.agentId}" MUST match the namespace-tier pattern`,
    ),
  ).toBe(true);

  // modelClass enum check (loose — the schema declares an enum but
  // hosts MAY extend with research/delegate per the reference packs).
  if (agent.modelClass !== undefined) {
    expect(
      VALID_MODEL_CLASSES.has(agent.modelClass),
      `AgentManifest.modelClass "${agent.modelClass}" SHOULD be a recognized class`,
    ).toBe(true);
  }

  // systemPrompt XOR systemPromptRef.
  const hasInline = typeof agent.systemPrompt === 'string' && agent.systemPrompt.length > 0;
  const hasRef = typeof agent.systemPromptRef === 'string' && agent.systemPromptRef.length > 0;
  expect(
    hasInline !== hasRef,
    'AgentManifest MUST have exactly one of systemPrompt | systemPromptRef',
  ).toBe(true);

  // toolAllowlist: optional, but when present each entry MUST start with a
  // recognized scope. An absent (or non-array) allowlist counts as empty.
  const tools: unknown[] = Array.isArray(agent.toolAllowlist) ? agent.toolAllowlist : [];
  for (const tool of tools) {
    expect(
      // The typeof guard makes a non-string entry fail this assertion with its
      // message instead of throwing a TypeError from `startsWith`.
      typeof tool === 'string' && VALID_TOOL_SCOPES.some((scope) => tool.startsWith(scope)),
      `toolAllowlist entry "${String(tool)}" MUST start with a recognized scope`,
    ).toBe(true);
  }

  // Enforce the minimum even when toolAllowlist is missing — a required
  // minimum must not be skipped just because the host omitted the field.
  if (expectations.minTools !== undefined) {
    expect(
      tools.length,
      `agent's toolAllowlist MUST have at least ${expectations.minTools} entries`,
    ).toBeGreaterThanOrEqual(expectations.minTools);
  }

  // Per-pack expectations.
  if (expectations.agentIdEndsWith !== undefined) {
    // NOTE(review): despite the option's name this is a substring check, not
    // a suffix check — confirm against the reference packs' agentIds before
    // tightening it.
    expect(agent.agentId ?? '').toContain(expectations.agentIdEndsWith);
  }
  if (expectations.modelClass !== undefined) {
    expect(agent.modelClass).toBe(expectations.modelClass);
  }
}
130
+
131
// Shared gate for every suite in this file: skip when `isAgentSupported()` is false.
const SKIP = !isAgentSupported();

describe.skipIf(SKIP)('core.openwop.agents.deep-research — pack catalog evidence', () => {
  it('host pack-list includes deep-research with a well-formed AgentManifest', async () => {
    const pack = await findPack('core.openwop.agents.deep-research');
    // Soft-skip: the host has no usable /v1/packs listing or doesn't carry this pack.
    if (pack === null) return;

    expect(pack.version, 'pack version MUST be present').toBeDefined();

    const shipsExactlyOne = Array.isArray(pack.agents) && pack.agents.length === 1;
    expect(shipsExactlyOne, 'deep-research ships exactly one agent').toBe(true);

    // Safe after the assertion above: there is exactly one entry.
    const agent = pack.agents![0]!;
    assertAgentManifestShape(agent, {
      agentIdEndsWith: 'deep-research',
      modelClass: 'research',
      minTools: 1,
    });

    // Domain-specific: deep-research uses long-term memory + RAG retrievers.
    const allowlist = agent.toolAllowlist ?? [];
    const allowsRetrieval = allowlist.some((t) => t.includes('rag') || t.includes('retriever'));
    expect(allowsRetrieval, 'deep-research SHOULD allow at least one rag/retriever tool').toBe(true);

    expect(
      agent.memoryShape?.longTerm,
      'deep-research MUST request longTerm memory (it persists facts across runs)',
    ).toBe(true);
  });
});
156
+
157
describe.skipIf(SKIP)('core.openwop.agents.react — pack catalog evidence', () => {
  it('host pack-list includes react with a well-formed AgentManifest', async () => {
    const pack = await findPack('core.openwop.agents.react');
    // Soft-skip when the pack (or the listing) is unavailable.
    if (pack === null) return;

    expect(pack.version).toBeDefined();

    const shipsAtLeastOne = Array.isArray(pack.agents) && pack.agents.length >= 1;
    expect(shipsAtLeastOne, 'react ships at least one agent').toBe(true);

    // Only the first listed agent is inspected.
    const agent = pack.agents![0]!;
    assertAgentManifestShape(agent, { agentIdEndsWith: 'react' });

    // ReAct pattern requires handoff schemas (task + return).
    const { handoff } = agent;
    expect(handoff, 'react AgentManifest MUST include a handoff block').toBeDefined();
    expect(typeof handoff?.taskSchemaRef, 'handoff.taskSchemaRef MUST be a string').toBe('string');
    expect(typeof handoff?.returnSchemaRef, 'handoff.returnSchemaRef MUST be a string').toBe('string');
  });
});
173
+
174
describe.skipIf(SKIP)('core.openwop.agents.supervisor — pack catalog evidence', () => {
  it('host pack-list includes supervisor with a well-formed AgentManifest', async () => {
    const pack = await findPack('core.openwop.agents.supervisor');
    // Soft-skip when the pack (or the listing) is unavailable.
    if (pack === null) return;

    expect(pack.version).toBeDefined();

    const shipsAtLeastOne = Array.isArray(pack.agents) && pack.agents.length >= 1;
    expect(shipsAtLeastOne, 'supervisor ships at least one agent').toBe(true);

    // Only the first listed agent is inspected.
    const agent = pack.agents![0]!;
    assertAgentManifestShape(agent, { agentIdEndsWith: 'supervisor' });

    // Supervisor pattern delegates to crew members; its modelClass should
    // be `delegate` or `reasoning` (it makes orchestration decisions).
    const mc = agent.modelClass;
    const isOrchestratorClass = mc === 'delegate' || mc === 'reasoning';
    expect(
      isOrchestratorClass,
      `supervisor SHOULD have modelClass=delegate|reasoning, got "${mc}"`,
    ).toBe(true);

    // Supervisor needs handoff schemas to dispatch work.
    expect(agent.handoff, 'supervisor MUST include handoff schemas').toBeDefined();
  });
});
194
+
195
describe.skipIf(SKIP)('agent-pack catalog summary', () => {
  it('all three 2026-05-17 reference agent packs are catalog-reachable', async () => {
    const names = [
      'core.openwop.agents.deep-research',
      'core.openwop.agents.react',
      'core.openwop.agents.supervisor',
    ];

    // Look the packs up one at a time, in declaration order.
    let reachable = 0;
    for (const packName of names) {
      const entry = await findPack(packName);
      if (entry !== null) reachable += 1;
    }

    // Either none are present (host doesn't ship these — skip) OR all are
    // present (host ships the full reference batch). Half-shipping is a
    // configuration error worth flagging.
    if (reachable === 0) return;
    expect(
      reachable,
      'host SHOULD ship the reference agent packs as a coherent batch (none, or all three)',
    ).toBe(names.length);
  });
});
@@ -0,0 +1,146 @@
1
+ /**
2
+ * Multi-Agent Shift Phase 2 — handoff-schema validation at dispatch (HV-1).
3
+ * Normative reference: RFCS/0003-agent-packs.md §D (handoff schema resolution)
4
+ *
5
+ * Verifies that when an agent's manifest carries `handoff.taskSchemaRef`, the
6
+ * host MUST validate inbound dispatch payloads against the referenced JSON
7
+ * Schema (resolved at install time per RFC 0003 §D) BEFORE dispatching the
8
+ * agent. Invalid payloads MUST be rejected with a structured error envelope
9
+ * — the agent MUST NOT see the malformed payload.
10
+ *
11
+ * Symmetric assertion on `handoff.returnSchemaRef`: when an agent returns a
12
+ * payload that fails return-schema validation, the host MUST reject before
13
+ * persistence and surface a structured error rather than silently storing
14
+ * an off-contract result.
15
+ *
16
+ * Capability-gated: skips when host doesn't advertise
17
+ * `capabilities.agents.supported: true` AND `capabilities.agents.dispatch: true`.
18
+ * Fixture-gated: requires `conformance-agent-pack-handoff-schema-validation`.
19
+ *
20
+ * @see RFCS/0003-agent-packs.md §D
21
+ * @see schemas/agent-manifest.schema.json #/properties/handoff
22
+ * @see packs/core.openwop.agent-examples/agents[structured-fixture]
23
+ */
24
+
25
+ import { describe, it, expect } from 'vitest';
26
+ import { driver } from '../lib/driver.js';
27
+ import { isFixtureAdvertised } from '../lib/fixtures.js';
28
+ import { isAgentSupported } from '../lib/multi-agent-capabilities.js';
29
+
30
const FIXTURE = 'conformance-agent-pack-handoff-schema-validation';
const SKIP = !isAgentSupported() || !isFixtureAdvertised(FIXTURE);

// Polling budget shared by every test: 40 attempts, 100 ms apart.
const POLL_ATTEMPTS = 40;
const POLL_INTERVAL_MS = 100;

/** Minimal view of an entry in a run's event log, as read by these tests. */
type RunEvent = { type: string; payload?: Record<string, unknown> };

/** Creates a run of the fixture workflow, asserts the 201, and returns the new runId. */
async function startFixtureRun(inputs: Record<string, unknown>): Promise<string> {
  const create = await driver.post('/v1/runs', { workflowId: FIXTURE, inputs });
  expect(create.status).toBe(201);
  return (create.json as { runId: string }).runId;
}

/**
 * Polls the run until its status is one of `terminal`.
 * Returns that status, or `undefined` when the polling budget runs out.
 */
async function pollRunStatus(runId: string, terminal: readonly string[]): Promise<string | undefined> {
  for (let attempt = 0; attempt < POLL_ATTEMPTS; attempt++) {
    const res = await driver.get(`/v1/runs/${encodeURIComponent(runId)}`);
    // Null-safe read: an empty or non-object body just means "not terminal yet".
    const status = (res.json as { status?: unknown } | null | undefined)?.status;
    if (typeof status === 'string' && terminal.includes(status)) return status;
    await new Promise((r) => setTimeout(r, POLL_INTERVAL_MS));
  }
  return undefined;
}

/** Fetches the run's event list; a missing body or `events` field yields an empty list. */
async function fetchRunEvents(runId: string): Promise<RunEvent[]> {
  const events = await driver.get(`/v1/runs/${encodeURIComponent(runId)}/events`);
  return (events.json as { events?: RunEvent[] } | null | undefined)?.events ?? [];
}

/** True when the event's `payload.error` is an object whose `code` is one of `codes`. */
function hasErrorCode(event: RunEvent, codes: readonly string[]): boolean {
  const error = event.payload?.error;
  if (typeof error !== 'object' || error === null) return false;
  const code = (error as Record<string, unknown>).code;
  return typeof code === 'string' && codes.includes(code);
}

describe.skipIf(SKIP)('agentPackHandoffSchemaValidation: handoff schema enforcement at dispatch', () => {
  it('valid task payload that matches taskSchemaRef is dispatched and completes', async () => {
    // The fixture workflow dispatches `core.openwop.agent-examples.structured-fixture`
    // with a VALID task payload matching schemas/structured-fixture.task.schema.json
    // (`{ text: string, extractionFields: string[], language?: string }`).
    const runId = await startFixtureRun({
      scenario: 'valid-task',
      text: 'Acme Corp invoiced $1,200 on 2026-04-15 for Q2 consulting.',
      extractionFields: ['vendor', 'amount', 'date'],
    });

    // `waiting-approval` also ends the poll here, so a run parked on approval
    // fails the assertion promptly rather than exhausting the polling budget.
    const status = await pollRunStatus(runId, ['completed', 'failed', 'waiting-approval']);
    expect(status, 'HV-1a: valid task payload should NOT be rejected by handoff-schema validation').toBe('completed');
  });

  it('invalid task payload (missing required field) is rejected before dispatch with structured error', async () => {
    const runId = await startFixtureRun({
      scenario: 'invalid-task',
      // intentionally missing required `extractionFields`
      text: 'Some input text',
    });

    const status = await pollRunStatus(runId, ['completed', 'failed']);
    expect(status, 'HV-1b: invalid task payload MUST cause the run to fail rather than silently dispatch off-contract').toBe('failed');

    const list = await fetchRunEvents(runId);
    const validationFailure = list.find(
      (e) =>
        e.type === 'node.failed' &&
        hasErrorCode(e, ['handoff_task_schema_violation', 'agent_dispatch_validation_failed']),
    );
    expect(
      validationFailure,
      'HV-1b: failure event payload MUST carry a recognizable handoff-validation error code',
    ).toBeDefined();
  });

  it('agent return payload that fails returnSchemaRef is rejected before persistence', async () => {
    // The fixture's `mock-return-violation` scenario causes the agent runtime
    // to emit a return payload that violates schemas/structured-fixture.return.schema.json
    // (e.g., omits the required `extracted` field while not declaring `error`).
    const runId = await startFixtureRun({ scenario: 'mock-return-violation' });

    const status = await pollRunStatus(runId, ['completed', 'failed']);
    // Hosts MAY surface return-schema violations as either a failed run OR a
    // run that completes with a flagged error envelope, but the persisted
    // result MUST NOT carry an off-schema body. Tolerate both outcomes here;
    // the strict assertion is that downstream readers can detect the violation.
    expect(['completed', 'failed']).toContain(status);

    const list = await fetchRunEvents(runId);
    const returnViolation = list.find(
      (e) =>
        (e.type === 'node.failed' || e.type === 'agent.error') &&
        hasErrorCode(e, ['handoff_return_schema_violation', 'agent_return_validation_failed']),
    );
    expect(
      returnViolation,
      'HV-1c: off-schema return payload MUST surface a structured violation event before persistence',
    ).toBeDefined();
  });
});
@@ -47,26 +47,77 @@ describe.skipIf(SKIP)('agentReasoningEvents: agent.* event family emission', ()
47
47
 
48
48
  const events = await driver.get(`/v1/runs/${encodeURIComponent(runId)}/events`);
49
49
  expect(events.status).toBe(200);
50
- const list = (events.json as { events?: Array<{ type: string; payload?: Record<string, unknown> }> })
51
- .events ?? [];
50
+ const list = (events.json as {
51
+ events?: Array<{
52
+ type: string;
53
+ eventId?: string;
54
+ causationId?: string;
55
+ payload?: Record<string, unknown>;
56
+ }>;
57
+ }).events ?? [];
52
58
 
53
59
  const agentEvents = list.filter((e) => REASONING_EVENT_TYPES.has(e.type));
54
60
  expect(agentEvents.length).toBeGreaterThan(0);
55
61
 
56
- // Every agent.* event payload MUST carry `agentId` (per RFC 0002 §C).
62
+ // Every agent.* event payload MUST identify the agent. Per
63
+ // `run-event-payloads.schema.json` §`agent*` shapes, four of the
64
+ // five events (`reasoned`, `toolCalled`, `toolReturned`, `decided`)
65
+ // carry `agentId`; `agent.handoff` carries `fromAgentId` + `toAgentId`
66
+ // instead. Allow either shape.
57
67
  for (const ev of agentEvents) {
58
- expect(typeof ev.payload?.agentId).toBe('string');
59
- expect((ev.payload!.agentId as string).length).toBeGreaterThanOrEqual(3);
68
+ const p = (ev.payload ?? {}) as Record<string, unknown>;
69
+ if (ev.type === 'agent.handoff') {
70
+ expect(typeof p.fromAgentId).toBe('string');
71
+ expect(typeof p.toAgentId).toBe('string');
72
+ expect((p.fromAgentId as string).length).toBeGreaterThanOrEqual(3);
73
+ expect((p.toAgentId as string).length).toBeGreaterThanOrEqual(3);
74
+ } else {
75
+ expect(typeof p.agentId).toBe('string');
76
+ expect((p.agentId as string).length).toBeGreaterThanOrEqual(3);
77
+ }
60
78
  }
61
79
 
62
- // agent.toolCalled / agent.toolReturned MUST share a `callId` correlation.
80
+ // agent.toolCalled / agent.toolReturned pairing has two normative
81
+ // requirements per RFC 0002 §B (`agentToolReturned`):
82
+ //
83
+ // 1. callId correlation. The pair shares a host-minted `callId`
84
+ // on their payloads; readers correlate request → response by
85
+ // this id even when the events arrive interleaved with other
86
+ // agent.* activity.
87
+ //
88
+ // 2. causationId === eventId of the paired agent.toolCalled.
89
+ // RFC 0002 §B states "`causationId` MUST equal the `eventId`
90
+ // of the corresponding `agent.toolCalled`." This is stricter
91
+ // than callId-pairing alone — it threads the event-log identity
92
+ // through the correlation chain so replay-determinism guarantees
93
+ // (`spec/v1/replay.md` §"Determinism with non-deterministic
94
+ // agents") survive event-id reuse and out-of-order delivery.
95
+ // Hosts whose `appendEvent` surface doesn't return the eventId
96
+ // synchronously need to extend it so the node can thread the
97
+ // paired eventId through.
63
98
  const calls = agentEvents.filter((e) => e.type === 'agent.toolCalled');
64
99
  const returns = agentEvents.filter((e) => e.type === 'agent.toolReturned');
65
100
  for (const ret of returns) {
66
101
  const callId = ret.payload?.callId as string | undefined;
67
102
  if (callId === undefined) continue;
68
103
  const matched = calls.find((c) => c.payload?.callId === callId);
69
- expect(matched, `agent.toolReturned.callId=${callId} MUST pair with a prior agent.toolCalled`).toBeDefined();
104
+ expect(
105
+ matched,
106
+ `agent.toolReturned.callId=${callId} MUST pair with a prior agent.toolCalled`,
107
+ ).toBeDefined();
108
+
109
+ // Strict causationId chain — only assert when the host actually
110
+ // surfaces eventId on the matched toolCalled event. Hosts that
111
+ // omit eventId from their `/events` projection skip this check
112
+ // (and SHOULD add it — RFC 0002 §B's chain integrity depends on
113
+ // it).
114
+ const matchedEventId = matched?.eventId;
115
+ if (typeof matchedEventId === 'string' && matchedEventId.length > 0) {
116
+ expect(
117
+ ret.causationId,
118
+ `agent.toolReturned (callId=${callId}) MUST carry causationId === paired agent.toolCalled.eventId per RFC 0002 §B`,
119
+ ).toBe(matchedEventId);
120
+ }
70
121
  }
71
122
  });
72
123
  });
@@ -0,0 +1,182 @@
1
+ /**
2
+ * core.openwop.agents.run — tool-allowlist enforcement contract
3
+ *
4
+ * Closes `OPENWOP-AUDIT-2026-003`: the 1.0.0 pack invoked workflow-supplied
5
+ * `tool.handler` as raw JS in its fallback loop, breaking the spec's
6
+ * `prompt-injection-tool-allowlist` invariant (`threat-model-prompt-injection.md`
7
+ * §"Authority bypass"). 1.0.1 refuses function-typed handlers outright; this
8
+ * scenario locks the refusal in as a CI gate so a future pack reimplementation
9
+ * cannot silently regress.
10
+ *
11
+ * Server-free. Loads the pack via dynamic import and asserts:
12
+ *
13
+ * 1. `tools[]` entries with `typeof handler === 'function'` are rejected
14
+ * with `INVALID_TOOL_DECLARATION` BEFORE any LLM call. The defect path.
15
+ * 2. `tools[]` entries missing a `name` are rejected (declaration discipline).
16
+ * 3. `tools[]` entries missing a `kind` discriminator are rejected (the host
17
+ * cannot resolve an unkinded tool through its connector registry).
18
+ * 4. Tool-driven runs (`tools.length > 0`) WITHOUT `ctx.agentRuntime` refuse
19
+ * with `HOST_CAPABILITY_MISSING` — the inline fallback that invoked raw
20
+ * handlers was removed in 1.0.1; there is no longer a host-less path for
21
+ * tool dispatch.
22
+ * 5. Tool-less runs (`tools.length === 0`) succeed via `ctx.callAIWithTools`
23
+ * with no tool dispatch (safe path preserved across the fix).
24
+ * 6. The preferred `ctx.agentRuntime.run` path threads through unchanged.
25
+ *
26
+ * Skip-conditions: soft-skips when `packs/core.openwop.agents/index.mjs` is not
27
+ * present (published-conformance-package context where pack source isn't shipped).
28
+ *
29
+ * @see SECURITY/internal-pre-audit-findings.json#OPENWOP-AUDIT-2026-003
30
+ * @see SECURITY/threat-model-prompt-injection.md §"Authority bypass" + §"prompt-injection-tool-allowlist"
31
+ * @see SECURITY/invariants.yaml#agents-run-no-raw-handler
32
+ * @see packs/core.openwop.agents/index.mjs (1.0.1)
33
+ */
34
+
35
+ import { describe, it, expect, beforeAll } from 'vitest';
36
+ import { existsSync } from 'node:fs';
37
+ import { dirname, resolve } from 'node:path';
38
+ import { fileURLToPath } from 'node:url';
39
+
40
// ES modules have no built-in __dirname; derive it from this module's URL.
const __dirname = dirname(fileURLToPath(import.meta.url));
// Path to the agents pack entry point, three directories above this file.
const PACK_PATH = resolve(__dirname, '..', '..', '..', 'packs', 'core.openwop.agents', 'index.mjs');
42
+
43
/** Workflow-supplied inputs read by the tests when invoking `agentRun`. */
interface AgentRunInputs {
  userPrompt?: string;
  tools?: unknown[];
  memory?: unknown;
  outputParser?: unknown;
}

/**
 * The slice of the node execution context these tests hand to `agentRun`.
 * All members are optional so each test can supply only what it exercises.
 */
interface AgentRunCtx {
  config?: Record<string, unknown>;
  inputs?: AgentRunInputs;
  agentRuntime?: { run?: (...args: unknown[]) => Promise<unknown> };
  callAIWithTools?: (
    ...args: unknown[]
  ) => Promise<{ text?: string; usage?: unknown; toolCalls?: unknown[] }>;
  emit?: (...args: unknown[]) => Promise<void>;
}

/** Signature the pack's exported `agentRun` is expected to have. */
type AgentRunFn = (
  ctx: AgentRunCtx,
) => Promise<{ status: 'success'; outputs: Record<string, unknown> }>;
57
+
58
/**
 * Runs `fn` and asserts that it fails with an `Error` whose `code` property
 * equals `expectedCode`.
 *
 * @param fn - Thunk expected to throw or reject.
 * @param expectedCode - Value the thrown error's `code` must equal.
 * @param description - Label attached to both assertion messages.
 */
async function expectRejection(fn: () => Promise<unknown>, expectedCode: string, description: string): Promise<void> {
  // Calling `fn` inside an async wrapper captures a synchronous throw as well
  // as a rejected promise; a normal completion leaves `failure` undefined.
  const failure: unknown = await (async () => fn())().then(
    () => undefined,
    (reason: unknown) => reason,
  );

  expect(failure, description).toBeInstanceOf(Error);
  const { code } = failure as Error & { code?: string };
  expect(code, `${description} → code`).toBe(expectedCode);
}
68
+
69
describe('category: core.openwop.agents.run — tool-allowlist enforcement (OPENWOP-AUDIT-2026-003)', () => {
  // Resolved at collection time (existsSync is synchronous) so the
  // pack-dependent tests can be reported as SKIPPED when the pack source is
  // not bundled, instead of returning early and being counted as passed.
  const packAvailable = existsSync(PACK_PATH);
  const itWithPack = it.skipIf(!packAvailable);

  let agentRun: AgentRunFn;

  beforeAll(async () => {
    if (!packAvailable) return;
    const mod = (await import(PACK_PATH)) as { agentRun?: AgentRunFn };
    if (typeof mod.agentRun !== 'function') {
      throw new Error(`expected packs/core.openwop.agents/index.mjs to export agentRun; got ${typeof mod.agentRun}`);
    }
    agentRun = mod.agentRun;
  });

  // Always runs, so the suite has a visible result (and a warning in the
  // log) even when every pack-dependent test below is skipped.
  it('skips cleanly when pack source is not bundled', () => {
    if (!packAvailable) {
      console.warn('[agents-run-tool-allowlist] pack source not present; skipping');
    }
    expect(typeof packAvailable).toBe('boolean');
  });

  itWithPack('rejects function-typed tool.handler (the defect path)', async () => {
    // The 1.0.0 defect: a workflow author could supply executable JS via
    // tools[].handler and the pack would await it directly with ctx. Closed
    // in 1.0.1 — the validator throws INVALID_TOOL_DECLARATION at the run
    // boundary, BEFORE any LLM call.
    await expectRejection(
      () => agentRun({
        config: {},
        inputs: {
          userPrompt: 'x',
          tools: [{ name: 'evil', kind: 'function', handler: () => 'rce' }],
        },
      }),
      'INVALID_TOOL_DECLARATION',
      'function-typed handler MUST be refused',
    );
  });

  itWithPack('rejects tool declaration missing a name', async () => {
    await expectRejection(
      () => agentRun({ config: {}, inputs: { userPrompt: 'x', tools: [{ kind: 'workflow' }] } }),
      'INVALID_TOOL_DECLARATION',
      'unnamed tool MUST be refused',
    );
  });

  itWithPack('rejects tool declaration missing a kind discriminator', async () => {
    await expectRejection(
      () => agentRun({ config: {}, inputs: { userPrompt: 'x', tools: [{ name: 't1' }] } }),
      'INVALID_TOOL_DECLARATION',
      'unkinded tool MUST be refused — host cannot resolve through its registry',
    );
  });

  itWithPack('rejects tool-driven runs when host does not provide agentRuntime', async () => {
    // Tool dispatch MUST go through a host-resolved runtime — the 1.0.0
    // inline-handler fallback is gone.
    await expectRejection(
      () => agentRun({
        config: {},
        inputs: { userPrompt: 'x', tools: [{ name: 't1', kind: 'workflow' }] },
      }),
      'HOST_CAPABILITY_MISSING',
      'tools[] with no agentRuntime MUST refuse',
    );
  });

  itWithPack('tool-less run succeeds via callAIWithTools (safe fallback preserved)', async () => {
    // Sentinel: stays 'never-called' if the pack never reaches callAIWithTools.
    let toolsSeen: unknown = 'never-called';
    const ctx: AgentRunCtx = {
      config: {},
      inputs: { userPrompt: 'hi', tools: [] },
      callAIWithTools: async (args: unknown) => {
        toolsSeen = (args as { tools?: unknown[] }).tools;
        return { text: 'hello back', usage: { input_tokens: 1, output_tokens: 1 } };
      },
    };
    const result = await agentRun(ctx);
    expect(result.outputs.result).toBe('hello back');
    expect(result.outputs.finishReason).toBe('complete');
    expect(toolsSeen, 'tool-less fallback MUST pass an empty tools array — no LLM-driven dispatch').toEqual([]);
  });

  itWithPack('agentRuntime.run path threads through unchanged when host provides it', async () => {
    let receivedTools: unknown;
    const ctx: AgentRunCtx = {
      config: {},
      inputs: {
        userPrompt: 'x',
        tools: [{ name: 't1', kind: 'workflow', ref: 'vendor.acme.demo' }],
      },
      agentRuntime: {
        run: async (req: unknown) => {
          receivedTools = (req as { tools?: unknown[] }).tools;
          return { result: 'from-runtime', toolCalls: [{ name: 't1' }] };
        },
      },
    };
    const result = await agentRun(ctx);
    expect(result.outputs.result).toBe('from-runtime');
    expect(receivedTools, 'host MUST receive the validated tools array').toEqual([
      { name: 't1', kind: 'workflow', ref: 'vendor.acme.demo' },
    ]);
  });
});