@purista/harness 1.2.6 → 1.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73)
  1. package/README.md +6 -0
  2. package/dist/agents/index.d.ts +7 -1
  3. package/dist/agents/index.js +56 -38
  4. package/dist/errors/catalog.d.ts +18 -2
  5. package/dist/errors/catalog.js +10 -0
  6. package/dist/eval/index.d.ts +3 -3
  7. package/dist/eval/index.js +15 -1
  8. package/dist/harness/defineHarness.d.ts +91 -1
  9. package/dist/harness/defineHarness.js +110 -1
  10. package/dist/index.d.ts +37 -17
  11. package/dist/index.js +30 -16
  12. package/dist/local/index.d.ts +36 -0
  13. package/dist/local/index.js +24 -0
  14. package/dist/local/local-sandbox.d.ts +25 -0
  15. package/dist/local/local-sandbox.js +368 -0
  16. package/dist/local/local-workspace.d.ts +56 -0
  17. package/dist/local/local-workspace.js +496 -0
  18. package/dist/local/ref-hash.d.ts +6 -0
  19. package/dist/local/ref-hash.js +9 -0
  20. package/dist/local/sqlite-storage.d.ts +106 -0
  21. package/dist/local/sqlite-storage.js +680 -0
  22. package/dist/models/adapter-utils.d.ts +52 -0
  23. package/dist/models/adapter-utils.js +81 -0
  24. package/dist/models/registry.js +28 -37
  25. package/dist/models/stream-pump.d.ts +16 -0
  26. package/dist/models/stream-pump.js +77 -0
  27. package/dist/ports/base-model-provider.d.ts +7 -1
  28. package/dist/ports/base-model-provider.js +384 -87
  29. package/dist/ports/capabilities.d.ts +16 -2
  30. package/dist/ports/context-checkpoints.d.ts +63 -0
  31. package/dist/ports/context-checkpoints.js +33 -0
  32. package/dist/ports/index.d.ts +1 -0
  33. package/dist/ports/index.js +1 -0
  34. package/dist/ports/model-provider.d.ts +94 -0
  35. package/dist/runtime/durable.d.ts +11 -0
  36. package/dist/runtime/durable.js +15 -2
  37. package/dist/runtime/sessionDurable.js +47 -21
  38. package/dist/sessions/index.d.ts +17 -6
  39. package/dist/sessions/index.js +337 -81
  40. package/dist/skills/index.d.ts +0 -2
  41. package/dist/skills/index.js +0 -8
  42. package/dist/state/in-memory.js +6 -6
  43. package/dist/telemetry/shim.js +2 -6
  44. package/dist/telemetry/span-attrs.d.ts +9 -0
  45. package/dist/telemetry/span-attrs.js +27 -0
  46. package/dist/testing/durableWorkspaceStoreContract.js +69 -0
  47. package/dist/testing/fakeLogger.d.ts +29 -0
  48. package/dist/testing/fakeLogger.js +47 -0
  49. package/dist/testing/fakeSandbox.d.ts +27 -0
  50. package/dist/testing/fakeSandbox.js +153 -0
  51. package/dist/testing/fakeStateStore.d.ts +36 -0
  52. package/dist/testing/fakeStateStore.js +66 -0
  53. package/dist/testing/index.d.ts +10 -4
  54. package/dist/testing/index.js +14 -4
  55. package/dist/testing/loggerContract.d.ts +9 -0
  56. package/dist/testing/loggerContract.js +62 -0
  57. package/dist/testing/modelProviderContract.d.ts +12 -0
  58. package/dist/testing/modelProviderContract.js +222 -0
  59. package/dist/testing/recordEvents.d.ts +3 -0
  60. package/dist/testing/recordEvents.js +8 -0
  61. package/dist/testing/stateStoreContract.js +27 -0
  62. package/dist/tools/index.js +26 -1
  63. package/dist/tools/mcp/http.d.ts +2 -0
  64. package/dist/tools/mcp/http.js +34 -21
  65. package/dist/tools/mcp/runner.d.ts +4 -0
  66. package/dist/tools/mcp/runner.js +75 -21
  67. package/dist/tools/mcp/stdio.d.ts +7 -1
  68. package/dist/tools/mcp/stdio.js +102 -23
  69. package/dist/version.d.ts +1 -1
  70. package/dist/version.js +1 -1
  71. package/dist/workspace/in-memory.d.ts +1 -0
  72. package/dist/workspace/in-memory.js +47 -12
  73. package/package.json +2 -1
package/README.md CHANGED
@@ -15,6 +15,12 @@ Telemetry defaults to dual GenAI and OpenInference attributes with no content
15
15
  capture. `InvokeOptions.traceparent` and `tracestate` accept inbound W3C Trace
16
16
  Context so application traces can parent harness run spans.
17
17
 
18
+ Workflows can orchestrate typed child agents with `ctx.agents.<id>(input)`.
19
+ Child-agent calls are disabled until a workflow declares `delegation` or the
20
+ harness opts in with `defaults.delegation.enabled: true`. Opted-in workflows get
21
+ bounded fan-out, agent allowlists, per-agent model alias overrides, and
22
+ lineage-rich run events.
23
+
18
24
  See [Evaluating Prompts](https://github.com/puristajs/harness/blob/main/docs/guides/evaluating-prompts.md)
19
25
  for the execution model, scorer limits, and privacy behavior.
20
26
 
package/dist/agents/index.d.ts CHANGED
@@ -1,7 +1,7 @@
1
1
  import type { Logger } from '../logger/index.js';
2
2
  import type { JsonValue } from '../models/json.js';
3
3
  import type { Message } from '../models/state.js';
4
- import type { AgentDefinition, ResolvedSkill, RunEvent, ToolsConfig } from '../harness/defineHarness.js';
4
+ import type { AgentDefinition, ContextCheckpoints, ResolvedSkill, RunEvent, ToolsConfig } from '../harness/defineHarness.js';
5
5
  import type { MemoryFacade } from '../ports/memory.js';
6
6
  import type { SandboxSession } from '../sandbox/index.js';
7
7
  import { type TelemetryShim } from '../telemetry/index.js';
@@ -12,15 +12,19 @@ export declare function runDefaultAgent(args: {
12
12
  runId: string;
13
13
  sessionId: string;
14
14
  workflowId?: string;
15
+ delegationCallId?: string;
16
+ delegationDepth?: number;
15
17
  input: unknown;
16
18
  history: Message[];
17
19
  agent: AgentDefinition<any>;
20
+ modelAlias?: string;
18
21
  models: Record<string, any>;
19
22
  skills: Record<string, ResolvedSkill>;
20
23
  customTools: ToolsConfig;
21
24
  mcpRegistry?: McpRunnerRegistry;
22
25
  session: SandboxSession;
23
26
  memory: MemoryFacade;
27
+ checkpoints: ContextCheckpoints;
24
28
  mountedSkills: Set<string>;
25
29
  historyWindow?: number;
26
30
  maxSteps: number;
@@ -35,3 +39,5 @@ export declare function runDefaultAgent(args: {
35
39
  output: JsonValue;
36
40
  emitted: Message[];
37
41
  }>;
42
+ /** Runs `fn` over `items` with bounded concurrency, preserving input order. */
43
+ export declare function runLimited<T, R>(items: readonly T[], limit: number, fn: (item: T) => Promise<R>): Promise<R[]>;
package/dist/agents/index.js CHANGED
@@ -7,6 +7,7 @@ import { BUILTIN_ALIAS_TO_CANONICAL, getBuiltinToolSpecs, invokeBuiltinTool } fr
7
7
  import { getMcpToolSpecs, invokeMcpTool, isMcpToolDefinition } from '../tools/mcp/runner.js';
8
8
  import { ulid } from '../ulid/index.js';
9
9
  import { abortError, withAbortSignal } from '../runtime/abort.js';
10
+ import { metadataSpanAttrs } from '../telemetry/span-attrs.js';
10
11
  function stringifyInput(input) { return typeof input === 'string' ? input : JSON.stringify(input); }
11
12
  function isReadonlyBuiltin(name) { return ['read', 'list', 'glob', 'grep'].includes(name); }
12
13
  async function checkPermission(agentId, runId, sessionId, def, toolName, input) {
@@ -93,6 +94,8 @@ export async function runDefaultAgent(args) {
93
94
  'harness.session.id': args.sessionId,
94
95
  'harness.run.id': args.runId,
95
96
  ...(args.workflowId ? { 'harness.workflow.id': args.workflowId } : {}),
97
+ ...(args.delegationCallId ? { 'harness.agent.delegation_call_id': args.delegationCallId } : {}),
98
+ ...(args.delegationDepth !== undefined ? { 'harness.agent.delegation_depth': args.delegationDepth } : {}),
96
99
  'harness.agent.id': args.agentId,
97
100
  'gen_ai.operation.name': 'invoke_agent',
98
101
  'openinference.span.kind': 'AGENT',
@@ -100,33 +103,24 @@ export async function runDefaultAgent(args) {
100
103
  'metadata.agent_id': args.agentId,
101
104
  [ATTR_GEN_AI_AGENT_NAME]: args.agentId,
102
105
  [ATTR_GEN_AI_AGENT_ID]: args.agentId,
103
- 'harness.agent.model': args.agent.model,
106
+ 'harness.agent.model': args.modelAlias ?? args.agent.model,
107
+ ...(args.modelAlias && args.modelAlias !== args.agent.model ? { 'harness.agent.default_model': args.agent.model } : {}),
104
108
  'harness.agent.has_handler': args.agent.handler !== undefined,
105
109
  ...metadataSpanAttrs(args.metadata)
106
110
  };
107
111
  const metrics = createMetrics(args.telemetry, agentAttrs);
108
- const execute = () => runDefaultAgentInner({ ...args, metrics });
109
- return args.telemetry.span(`invoke_agent ${args.agentId}`, agentAttrs, execute);
110
- }
111
- function metadataSpanAttrs(metadata) {
112
- const attrs = {};
113
- for (const [key, value] of Object.entries(metadata ?? {})) {
114
- if (!/^[a-zA-Z][a-zA-Z0-9_.-]{0,63}$/.test(key))
115
- continue;
116
- if (typeof value === 'string') {
117
- if (value.length <= 256)
118
- attrs[`harness.metadata.${key}`] = value;
119
- continue;
120
- }
121
- if (typeof value === 'number' && Number.isFinite(value)) {
122
- attrs[`harness.metadata.${key}`] = value;
123
- continue;
112
+ // Spec 08 §9: the harness tracks activated skill names per run when the
113
+ // `read` tool loads `/skills/<name>/SKILL.md`. Only the count is emitted —
114
+ // skill names stay out of telemetry.
115
+ const activatedSkills = new Set();
116
+ return args.telemetry.span(`invoke_agent ${args.agentId}`, agentAttrs, async (span) => {
117
+ try {
118
+ return await runDefaultAgentInner({ ...args, metrics, activatedSkills });
124
119
  }
125
- if (typeof value === 'boolean') {
126
- attrs[`harness.metadata.${key}`] = value;
120
+ finally {
121
+ span.setAttribute('harness.agent.skills_activated', activatedSkills.size);
127
122
  }
128
- }
129
- return attrs;
123
+ });
130
124
  }
131
125
  async function runDefaultAgentInner(args) {
132
126
  if (args.signal.aborted)
@@ -134,12 +128,12 @@ async function runDefaultAgentInner(args) {
134
128
  const inputSchema = args.agent.input ?? z.string();
135
129
  const outputSchema = args.agent.output ?? z.string();
136
130
  const parsedInput = parseAgentSchema(inputSchema, args.input, 'agent_input');
137
- const model = args.models[args.agent.model];
131
+ const selectedModelAlias = args.modelAlias ?? args.agent.model;
132
+ const model = args.models[selectedModelAlias];
138
133
  if (!model)
139
- throw new ValidationError('Unknown model alias', { where: 'agent_input', issues: { model: args.agent.model } });
134
+ throw new ValidationError('Unknown model alias', { where: 'agent_input', issues: { model: selectedModelAlias } });
140
135
  const skillIds = args.agent.skills ?? [];
141
136
  await mountSkillsOnce(args.session, args.mountedSkills, args.skills, skillIds);
142
- const activatedSkills = new Set();
143
137
  if (args.agent.handler) {
144
138
  const handler = args.agent.handler;
145
139
  const output = await withAbortSignal(args.signal, 'run', 'Run was cancelled.', () => handler({
@@ -150,6 +144,7 @@ async function runDefaultAgentInner(args) {
150
144
  sessionId: args.sessionId,
151
145
  history: { list: async () => args.history },
152
146
  memory: args.memory,
147
+ checkpoints: args.checkpoints,
153
148
  metadata: args.metadata ?? {},
154
149
  metrics: args.metrics
155
150
  }));
@@ -157,7 +152,7 @@ async function runDefaultAgentInner(args) {
157
152
  return { output: validated, emitted: [{ id: `msg_${ulid()}_a`, sessionId: args.sessionId, runId: args.runId, role: 'assistant', content: JSON.stringify(validated), timestamp: new Date().toISOString() }] };
158
153
  }
159
154
  const baseInstructions = typeof args.agent.instructions === 'function'
160
- ? args.agent.instructions({ input: parsedInput, runId: args.runId, sessionId: args.sessionId, history: { list: async () => args.history }, memory: args.memory, metadata: args.metadata ?? {}, metrics: args.metrics })
155
+ ? args.agent.instructions({ input: parsedInput, runId: args.runId, sessionId: args.sessionId, history: { list: async () => args.history }, memory: args.memory, checkpoints: args.checkpoints, metadata: args.metadata ?? {}, metrics: args.metrics })
161
156
  : args.agent.instructions;
162
157
  const instructions = `${baseInstructions}${buildSkillIndex(args.skills, skillIds)}`;
163
158
  const enabledBuiltins = args.agent.builtinTools === false ? [] : args.agent.builtinTools?.slice() ?? ['bash', 'read', 'write', 'edit', 'glob', 'grep', 'list'];
@@ -193,7 +188,13 @@ async function runDefaultAgentInner(args) {
193
188
  const emitted = [];
194
189
  const maxSteps = Math.min(args.agent.maxSteps ?? args.maxSteps, 64);
195
190
  let steps = 0;
196
- await args.emitEvent?.({ type: 'agent.started', runId: args.runId, agentId: args.agentId, at: new Date().toISOString() });
191
+ const agentEventMeta = {
192
+ ...(args.workflowId ? { workflowId: args.workflowId } : {}),
193
+ ...(args.delegationCallId ? { delegationCallId: args.delegationCallId } : {}),
194
+ ...(args.delegationDepth !== undefined ? { delegationDepth: args.delegationDepth } : {}),
195
+ modelAlias: selectedModelAlias
196
+ };
197
+ await args.emitEvent?.({ type: 'agent.started', runId: args.runId, agentId: args.agentId, at: new Date().toISOString(), ...agentEventMeta });
197
198
  try {
198
199
  while (true) {
199
200
  if (args.signal.aborted)
@@ -217,12 +218,20 @@ async function runDefaultAgentInner(args) {
217
218
  // Emit one usage-bearing model event per model round-trip (including
218
219
  // tool-call steps) so run-summary modelCalls and tokenTotals are accurate
219
220
  // for multi-step runs.
220
- await args.emitEvent?.({ type: 'model.object', runId: args.runId, agentId: args.agentId, object: (response.object ?? null), usage: response.usage });
221
+ await args.emitEvent?.({
222
+ type: 'model.object',
223
+ runId: args.runId,
224
+ agentId: args.agentId,
225
+ ...(args.workflowId ? { workflowId: args.workflowId } : {}),
226
+ modelAlias: selectedModelAlias,
227
+ object: (response.object ?? null),
228
+ usage: response.usage
229
+ });
221
230
  const toolCalls = (response.toolCalls ?? []);
222
231
  if (toolCalls.length === 0) {
223
232
  const validated = parseAgentSchema(outputSchema, response.object, 'agent_output');
224
233
  emitted.push({ id: `msg_${ulid()}_a`, sessionId: args.sessionId, runId: args.runId, role: 'assistant', content: JSON.stringify(validated), timestamp: new Date().toISOString() });
225
- await args.emitEvent?.({ type: 'agent.finished', runId: args.runId, agentId: args.agentId, at: new Date().toISOString(), output: validated });
234
+ await args.emitEvent?.({ type: 'agent.finished', runId: args.runId, agentId: args.agentId, at: new Date().toISOString(), output: validated, ...agentEventMeta });
226
235
  return { output: validated, emitted };
227
236
  }
228
237
  const assistantMsg = {
@@ -238,8 +247,7 @@ async function runDefaultAgentInner(args) {
238
247
  });
239
248
  const outcomes = await runLimited(toolCalls, args.maxParallelToolCalls, (call) => executeToolCall({
240
249
  ...args,
241
- enabledCustomTools,
242
- activatedSkills
250
+ enabledCustomTools
243
251
  }, call));
244
252
  for (const outcome of outcomes) {
245
253
  emitted.push(outcome.emitted);
@@ -250,11 +258,12 @@ async function runDefaultAgentInner(args) {
250
258
  }
251
259
  catch (error) {
252
260
  // Pair every agent.started with an agent.finished, even on error/cancel/budget.
253
- await args.emitEvent?.({ type: 'agent.finished', runId: args.runId, agentId: args.agentId, at: new Date().toISOString(), error: serializeError(error) });
261
+ await args.emitEvent?.({ type: 'agent.finished', runId: args.runId, agentId: args.agentId, at: new Date().toISOString(), error: serializeError(error), ...agentEventMeta });
254
262
  throw error;
255
263
  }
256
264
  }
257
- async function runLimited(items, limit, fn) {
265
+ /** Runs `fn` over `items` with bounded concurrency, preserving input order. */
266
+ export async function runLimited(items, limit, fn) {
258
267
  const concurrency = Math.max(1, Math.min(limit, items.length));
259
268
  const results = new Array(items.length);
260
269
  let next = 0;
@@ -262,10 +271,10 @@ async function runLimited(items, limit, fn) {
262
271
  while (true) {
263
272
  const index = next;
264
273
  next += 1;
265
- const item = items[index];
266
- if (item === undefined)
274
+ // Index-based termination: an `undefined` element must not truncate the batch.
275
+ if (index >= items.length)
267
276
  return;
268
- results[index] = await fn(item);
277
+ results[index] = await fn(items[index]);
269
278
  }
270
279
  }
271
280
  await Promise.all(Array.from({ length: concurrency }, () => worker()));
@@ -332,9 +341,18 @@ async function executeToolCall(args, call) {
332
341
  catch (error) {
333
342
  const failure = normalizeToolFailure(canonical, error, toolKind);
334
343
  if (failure instanceof OperationCancelledError) {
335
- if (args.signal.aborted)
336
- throw new OperationCancelledError('Run was cancelled.', { scope: 'run' }, args.signal.reason ?? failure);
337
- throw failure;
344
+ const cancellation = args.signal.aborted
345
+ ? new OperationCancelledError('Run was cancelled.', { scope: 'run' }, args.signal.reason ?? failure)
346
+ : failure;
347
+ // Pair tool.started with a best-effort tool.finished even on cancellation,
348
+ // matching the deliberate started/finished pairing policy above.
349
+ try {
350
+ await args.emitEvent?.({ type: 'tool.finished', runId: args.runId, agentId: args.agentId, toolId: canonical, callId: call.id, error: serializeError(cancellation) });
351
+ }
352
+ catch {
353
+ // Best-effort: never mask the cancellation with an emit failure.
354
+ }
355
+ throw cancellation;
338
356
  }
339
357
  result = { error: serializeError(failure) };
340
358
  }
package/dist/errors/catalog.d.ts CHANGED
@@ -62,7 +62,12 @@ export declare class ModelError extends HarnessError {
62
62
  model: string;
63
63
  method: string;
64
64
  status?: number;
65
- reason?: 'http_error' | 'network' | 'unstructured_response' | 'malformed_response' | 'context_length_exceeded' | 'embedding_count_mismatch' | 'rerank_result_mismatch';
65
+ reason?: 'http_error' | 'network' | 'rate_limited' | 'provider_unavailable' | 'unstructured_response' | 'malformed_response' | 'context_length_exceeded' | 'embedding_count_mismatch' | 'rerank_result_mismatch';
66
+ retryKind?: 'none' | 'active' | 'deferred';
67
+ retryAfterMs?: number;
68
+ retryAttempt?: number;
69
+ retryMaxAttempts?: number;
70
+ rateLimit?: unknown;
66
71
  providerCode?: string;
67
72
  providerType?: string;
68
73
  providerParam?: string;
@@ -98,6 +103,7 @@ export declare class ToolNotFoundError extends HarnessError {
98
103
  export declare class SkillNotFoundError extends HarnessError {
99
104
  constructor(message: string, meta: {
100
105
  skill_id: string;
106
+ agent_id?: string;
101
107
  }, cause?: unknown);
102
108
  }
103
109
  /** Skill manifest/frontmatter/config validation failure. */
@@ -124,6 +130,16 @@ export declare class AgentLoopBudgetError extends HarnessError {
124
130
  limit: number;
125
131
  }, cause?: unknown);
126
132
  }
133
+ /** Workflow child-agent delegation was denied or exceeded a configured budget. */
134
+ export declare class DelegationPolicyError extends HarnessError {
135
+ constructor(message: string, meta: {
136
+ workflow_id: string;
137
+ agent_id: string;
138
+ reason: 'delegation_disabled' | 'agent_not_allowed' | 'max_child_agent_calls_exceeded' | 'max_parallel_child_agent_calls_exceeded' | 'max_delegation_depth_exceeded' | 'model_alias_not_allowed';
139
+ limit?: number;
140
+ model_alias?: string;
141
+ }, cause?: unknown);
142
+ }
127
143
  /** Session attempted to invoke unknown workflow id. */
128
144
  export declare class WorkflowNotFoundError extends HarnessError {
129
145
  constructor(message: string, meta: {
@@ -146,7 +162,7 @@ export declare class SessionBusyError extends HarnessError {
146
162
  /** State backend operation failed. */
147
163
  export declare class StateError extends HarnessError {
148
164
  constructor(message: string, meta: {
149
- op: 'getSession' | 'upsertSession' | 'closeSession' | 'appendMessages' | 'listMessages' | 'clearMessages' | 'createRun' | 'finishRun' | 'getRun' | 'listRuns' | 'appendEvents' | 'listEvents' | 'memory.get' | 'memory.set' | 'memory.delete' | 'memory.list' | 'memory.search';
165
+ op: 'getSession' | 'upsertSession' | 'closeSession' | 'appendMessages' | 'listMessages' | 'clearMessages' | 'replaceMessages' | 'createRun' | 'finishRun' | 'getRun' | 'listRuns' | 'appendEvents' | 'listEvents' | 'contextCheckpointWrite' | 'contextCheckpointRead' | 'contextCheckpointList' | 'contextCheckpointDelete' | 'memory.get' | 'memory.set' | 'memory.delete' | 'memory.list' | 'memory.search';
150
166
  reason?: 'duplicate_message_id' | string;
151
167
  adapter?: 'memory' | string;
152
168
  memory_provider?: string;
package/dist/errors/catalog.js CHANGED
@@ -33,7 +33,11 @@ export class SandboxNoExecutorError extends HarnessError {
33
33
  export class ModelError extends HarnessError {
34
34
  constructor(message, meta, cause) {
35
35
  const retriable = meta.reason === 'network'
36
+ || meta.reason === 'rate_limited'
37
+ || meta.reason === 'provider_unavailable'
36
38
  || meta.status === 429
39
+ || meta.status === 408
40
+ || meta.status === 409
37
41
  || (typeof meta.status === 'number' && meta.status >= 500);
38
42
  super({ code: 'MODEL_ERROR', category: 'model', retriable, message, meta, cause });
39
43
  }
@@ -87,6 +91,12 @@ export class AgentLoopBudgetError extends HarnessError {
87
91
  super({ code: 'AGENT_LOOP_BUDGET_EXCEEDED', category: 'validation', retriable: false, message, meta, cause });
88
92
  }
89
93
  }
94
+ /** Workflow child-agent delegation was denied or exceeded a configured budget. */
95
+ export class DelegationPolicyError extends HarnessError {
96
+ constructor(message, meta, cause) {
97
+ super({ code: 'DELEGATION_POLICY_ERROR', category: 'validation', retriable: false, message, meta, cause });
98
+ }
99
+ }
90
100
  /** Session attempted to invoke unknown workflow id. */
91
101
  export class WorkflowNotFoundError extends HarnessError {
92
102
  constructor(message, meta, cause) {
package/dist/eval/index.d.ts CHANGED
@@ -28,7 +28,7 @@ export interface ScorerResult {
28
28
  passed: boolean;
29
29
  evidence?: JsonValue;
30
30
  }
31
- export interface PromptCandidate<I = unknown> {
31
+ export interface PromptCandidate {
32
32
  id: string;
33
33
  prompt: string;
34
34
  metadata?: Record<string, JsonValue>;
@@ -47,10 +47,10 @@ export interface CandidateScore {
47
47
  scorerCount: number;
48
48
  }
49
49
  export interface EvaluatePromptCandidatesInput<I = unknown> {
50
- candidates: PromptCandidate<I>[];
50
+ candidates: PromptCandidate[];
51
51
  items: EvaluationItem<I>[];
52
52
  scorer: (target: ScorerTarget, signal: AbortSignal) => Promise<ScorerResult>;
53
- runCandidate: (candidate: PromptCandidate<I>, item: EvaluationItem<I>, signal: AbortSignal) => Promise<unknown>;
53
+ runCandidate: (candidate: PromptCandidate, item: EvaluationItem<I>, signal: AbortSignal) => Promise<unknown>;
54
54
  signal: AbortSignal;
55
55
  }
56
56
  export declare function evaluateDeterministicScorer(definition: DeterministicScorerDefinition, target: ScorerTarget): ScorerResult;
package/dist/eval/index.js CHANGED
@@ -167,8 +167,22 @@ function matchesType(value, type) {
167
167
  function isRecord(value) {
168
168
  return value !== null && typeof value === 'object' && !Array.isArray(value);
169
169
  }
170
+ /** Structural deep equality, insensitive to object key order. */
170
171
  function deepEqual(a, b) {
171
- return JSON.stringify(a) === JSON.stringify(b);
172
+ if (Object.is(a, b))
173
+ return true;
174
+ if (Array.isArray(a) || Array.isArray(b)) {
175
+ if (!Array.isArray(a) || !Array.isArray(b) || a.length !== b.length)
176
+ return false;
177
+ return a.every((entry, index) => deepEqual(entry, b[index]));
178
+ }
179
+ if (isRecord(a) && isRecord(b)) {
180
+ const keysA = Object.keys(a);
181
+ if (keysA.length !== Object.keys(b).length)
182
+ return false;
183
+ return keysA.every((key) => Object.hasOwn(b, key) && deepEqual(a[key], b[key]));
184
+ }
185
+ return false;
172
186
  }
173
187
  function toJsonValue(value) {
174
188
  if (value === null || typeof value === 'string' || typeof value === 'number' || typeof value === 'boolean')
package/dist/harness/defineHarness.d.ts CHANGED
@@ -6,6 +6,8 @@ import type { Metrics, TelemetryShim } from '../telemetry/index.js';
6
6
  import type { HarnessAdapterContext } from '../ports/harness-context.js';
7
7
  import type { MemoryAdapter, MemoryFacade, SessionMemory } from '../ports/memory.js';
8
8
  import type { DurableWorkspaceStore } from '../ports/workspace.js';
9
+ import type { ContextCheckpointStore } from '../ports/context-checkpoints.js';
10
+ import type { ContextCheckpoint, ContextCheckpointQuery } from '../ports/context-checkpoints.js';
9
11
  import type { JsonValue } from '../models/json.js';
10
12
  import type { Message } from '../models/state.js';
11
13
  import type { RunStatus } from '../models/state.js';
@@ -43,6 +45,32 @@ export interface HarnessDefaults {
43
45
  * `undefined` keeps all history, `0` keeps only system messages.
44
46
  */
45
47
  historyWindow?: number;
48
+ /** Default workflow child-agent delegation budgets. */
49
+ delegation?: DelegationDefaults;
50
+ }
51
+ /** Workflow child-agent delegation defaults. Delegation is disabled unless explicitly enabled. */
52
+ export interface DelegationDefaults {
53
+ /**
54
+ * Enable workflow child-agent calls for workflows that do not declare their
55
+ * own `delegation` policy. Default: `false`.
56
+ */
57
+ enabled?: boolean;
58
+ /**
59
+ * Maximum child-agent calls one workflow run may start. Default: `32`.
60
+ * Set per workflow with `workflow.delegation.maxChildAgentCalls`.
61
+ */
62
+ maxChildAgentCalls?: number;
63
+ /**
64
+ * Maximum child-agent calls active at the same time inside one workflow run.
65
+ * Default: `8`.
66
+ */
67
+ maxParallelChildAgentCalls?: number;
68
+ /**
69
+ * Maximum local delegation depth. Default: `1`.
70
+ * Current harness workflows invoke leaf agents, so `1` allows normal
71
+ * workflow-to-agent calls and `0` disables child-agent delegation.
72
+ */
73
+ maxDepth?: number;
46
74
  }
47
75
  /** Top-level harness options passed to {@link defineHarness}. */
48
76
  export interface HarnessOptions {
@@ -333,21 +361,43 @@ export interface AgentContextMinimal<S extends BuilderState, I> {
333
361
  runId: string;
334
362
  history: ConversationHistory;
335
363
  memory: MemoryFacade;
364
+ checkpoints: ContextCheckpoints;
336
365
  metadata: Readonly<Record<string, JsonValue>>;
337
366
  metrics: Metrics;
338
367
  }
368
+ /** Run-bound facade for explicit long-horizon context checkpoints. */
369
+ export interface ContextCheckpoints {
370
+ write(input: {
371
+ sequence: number;
372
+ kind: ContextCheckpoint['kind'];
373
+ payload: JsonValue;
374
+ metadata?: Record<string, JsonValue>;
375
+ }): Promise<void>;
376
+ list(query?: Omit<ContextCheckpointQuery, 'runId' | 'sessionId' | 'workflowId' | 'agentId' | 'signal'>): Promise<readonly ContextCheckpoint[]>;
377
+ read(ref: {
378
+ sequence: number;
379
+ kind: ContextCheckpoint['kind'];
380
+ }): Promise<ContextCheckpoint | undefined>;
381
+ delete(ref: {
382
+ sequence: number;
383
+ kind: ContextCheckpoint['kind'];
384
+ }): Promise<void>;
385
+ }
339
386
  /** Full context passed to workflow handlers. */
340
387
  export interface WorkflowContext<S extends BuilderState, I, O> {
341
388
  input: I;
342
389
  agents: {
343
- [K in keyof NonNullable<S['agents']>]: (input: AgentInput<S, K>, opts?: InvokeOptions) => Promise<AgentOutput<S, K>>;
390
+ [K in keyof NonNullable<S['agents']>]: (input: AgentInput<S, K>, opts?: WorkflowAgentInvokeOptions<S, K>) => Promise<AgentOutput<S, K>>;
344
391
  };
345
392
  models: ModelHandles<S>;
393
+ /** Harness logger scoped for workflow handler code (spec 10 `WorkflowContext`). */
394
+ log: Logger;
346
395
  signal: AbortSignal;
347
396
  runId: string;
348
397
  sessionId: string;
349
398
  metadata: Readonly<Record<string, JsonValue>>;
350
399
  memory: MemoryFacade;
400
+ checkpoints: ContextCheckpoints;
351
401
  metrics: Metrics;
352
402
  /**
353
403
  * Runs `fn` as a durable step. Under a durable invocation the output is
@@ -357,6 +407,15 @@ export interface WorkflowContext<S extends BuilderState, I, O> {
357
407
  step<T extends JsonValue>(stepId: string, fn: () => Promise<T>): Promise<T>;
358
408
  output?: O;
359
409
  }
410
+ /** Invoke options accepted by workflow-local child-agent calls. */
411
+ export type WorkflowAgentInvokeOptions<S extends BuilderState, K extends keyof NonNullable<S['agents']>> = InvokeOptions & {
412
+ /**
413
+ * Optional model alias override for this child-agent call.
414
+ * The alias must exist on the harness model registry and be allowed by the
415
+ * workflow delegation policy.
416
+ */
417
+ model?: keyof NonNullable<S['models']> & string;
418
+ };
360
419
  /** Full context passed to custom agent handlers. */
361
420
  export interface AgentContext<S extends BuilderState, I, O> extends AgentContextMinimal<S, I> {
362
421
  models: ModelHandles<S>;
@@ -381,6 +440,7 @@ export interface AgentDefinition<S extends BuilderState, I extends z.ZodTypeAny
381
440
  export interface WorkflowDefinition<S extends BuilderState, I extends z.ZodTypeAny = z.ZodTypeAny, O extends z.ZodTypeAny = z.ZodTypeAny> {
382
441
  input?: I;
383
442
  output?: O;
443
+ delegation?: WorkflowDelegationPolicy<S>;
384
444
  handler: (ctx: WorkflowContext<S, z.infer<I>, z.infer<O>>) => Promise<z.infer<O>>;
385
445
  }
386
446
  type AgentSchemaFields = {
@@ -423,8 +483,26 @@ type WorkflowSchemaFields = {
423
483
  type WorkflowDefinitionResolved<S extends BuilderState, I extends z.ZodTypeAny, O extends z.ZodTypeAny> = {
424
484
  input?: I;
425
485
  output?: O;
486
+ delegation?: WorkflowDelegationPolicy<S>;
426
487
  handler: (ctx: WorkflowContext<S, z.infer<I>, z.infer<O>>) => Promise<z.infer<O>>;
427
488
  };
489
+ /** Policy for workflow-local child-agent delegation through `ctx.agents`. */
490
+ export interface WorkflowDelegationPolicy<S extends BuilderState = BuilderState> {
491
+ /** Enable or disable child-agent calls for this workflow. A policy object without this field enables delegation. */
492
+ enabled?: boolean;
493
+ /** Child agent ids this workflow may call. Omit to allow all registered agents. */
494
+ agents?: readonly (keyof NonNullable<S['agents']> & string)[];
495
+ /** Per-run child-agent call limit. Overrides `defaults.delegation.maxChildAgentCalls`. */
496
+ maxChildAgentCalls?: number;
497
+ /** Per-run active child-agent call limit. Overrides `defaults.delegation.maxParallelChildAgentCalls`. */
498
+ maxParallelChildAgentCalls?: number;
499
+ /** Maximum local delegation depth. Overrides `defaults.delegation.maxDepth`. */
500
+ maxDepth?: number;
501
+ /** Model aliases allowed for every child-agent call in this workflow, including calls running on the agent's default `model`. */
502
+ modelAliases?: readonly (keyof NonNullable<S['models']> & string)[];
503
+ /** Per-child-agent model alias allowlists. These replace `modelAliases` for the named agent. */
504
+ agentModelAliases?: Partial<Record<keyof NonNullable<S['agents']> & string, readonly (keyof NonNullable<S['models']> & string)[]>>;
505
+ }
428
506
  type WorkflowDefinitionFor<S extends BuilderState, D> = D extends {
429
507
  input: infer I extends z.ZodTypeAny;
430
508
  output: infer O extends z.ZodTypeAny;
@@ -576,11 +654,21 @@ export type RunEvent = {
576
654
  runId: string;
577
655
  agentId: string;
578
656
  at: string;
657
+ workflowId?: string;
658
+ parentAgentId?: string;
659
+ delegationCallId?: string;
660
+ delegationDepth?: number;
661
+ modelAlias?: string;
579
662
  } | {
580
663
  type: 'agent.finished';
581
664
  runId: string;
582
665
  agentId: string;
583
666
  at: string;
667
+ workflowId?: string;
668
+ parentAgentId?: string;
669
+ delegationCallId?: string;
670
+ delegationDepth?: number;
671
+ modelAlias?: string;
584
672
  output?: JsonValue;
585
673
  error?: SerializedError;
586
674
  } | {
@@ -657,6 +745,7 @@ export interface HarnessBuilder<S extends BuilderState = {}> {
657
745
  memory(adapter: MemoryAdapter): HarnessBuilder<S>;
658
746
  runtime(runtime: DurableRuntimeAdapter): HarnessBuilder<S>;
659
747
  workspaceStore(store: DurableWorkspaceStore): HarnessBuilder<S>;
748
+ checkpoints(store: ContextCheckpointStore): HarnessBuilder<S>;
660
749
  requires(capabilities: readonly AdapterCapability[]): HarnessBuilder<S>;
661
750
  defaults(defaults: HarnessDefaults): HarnessBuilder<S>;
662
751
  models<const M extends ModelsConfig>(models: M): HarnessBuilder<S & {
@@ -745,6 +834,7 @@ export interface HarnessBuilder<S extends BuilderState = {}> {
745
834
  * summarize_ticket: {
746
835
  * input: z.object({ ticket: z.string() }),
747
836
  * output: z.string(),
837
+ * delegation: { agents: ['summarize'] },
748
838
  * handler: (ctx) => ctx.agents.summarize(ctx.input.ticket)
749
839
  * }
750
840
  * })