@purista/harness 1.0.0 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -3,6 +3,21 @@
3
3
  Self-hosted enterprise agent harness for typed tools, agents, workflows, state,
4
4
  sandboxing, streaming, and OpenTelemetry instrumentation.
5
5
 
6
+ The core package also exports provider-neutral eval helpers:
7
+
8
+ - `evaluatePromptCandidates(...)` compares prompt candidates against a fixed
9
+ item set using a deterministic or custom scorer.
10
+ - `evaluateDeterministicScorer(...)` runs JSON Pointer based deterministic
11
+ scorer definitions without provider calls. It is exported from the main
12
+ package and re-exported from `@purista/harness/testing`.
13
+
14
+ Telemetry defaults to dual GenAI and OpenInference attributes with no content
15
+ capture. `InvokeOptions.traceparent` and `tracestate` accept inbound W3C Trace
16
+ Context so application traces can parent harness run spans.
17
+
18
+ See [Evaluating Prompts](https://github.com/puristajs/harness/blob/main/docs/guides/evaluating-prompts.md)
19
+ for the execution model, scorer limits, and privacy behavior.
20
+
6
21
  ## Install
7
22
 
8
23
  ```bash
@@ -1,9 +1,10 @@
1
1
  import type { Logger } from '../logger/index.js';
2
2
  import type { JsonValue } from '../models/json.js';
3
3
  import type { Message } from '../models/state.js';
4
- import type { AgentDefinition, ResolvedSkill, RunEvent, SessionMemory, ToolsConfig } from '../harness/defineHarness.js';
4
+ import type { AgentDefinition, ResolvedSkill, RunEvent, ToolsConfig } from '../harness/defineHarness.js';
5
+ import type { MemoryFacade } from '../ports/memory.js';
5
6
  import type { SandboxSession } from '../sandbox/index.js';
6
- import type { TelemetryShim } from '../telemetry/index.js';
7
+ import { type TelemetryShim } from '../telemetry/index.js';
7
8
  import { type McpRunnerRegistry } from '../tools/mcp/runner.js';
8
9
  export declare function runDefaultAgent(args: {
9
10
  harnessName: string;
@@ -19,7 +20,7 @@ export declare function runDefaultAgent(args: {
19
20
  customTools: ToolsConfig;
20
21
  mcpRegistry?: McpRunnerRegistry;
21
22
  session: SandboxSession;
22
- memory: SessionMemory;
23
+ memory: MemoryFacade;
23
24
  mountedSkills: Set<string>;
24
25
  historyWindow?: number;
25
26
  maxSteps: number;
@@ -28,6 +29,7 @@ export declare function runDefaultAgent(args: {
28
29
  logger: Logger;
29
30
  telemetry: TelemetryShim;
30
31
  emitEvent?: (event: RunEvent) => Promise<void>;
32
+ metadata?: Readonly<Record<string, JsonValue>>;
31
33
  }): Promise<{
32
34
  output: JsonValue;
33
35
  emitted: Message[];
@@ -1,6 +1,7 @@
1
1
  import { z } from 'zod';
2
2
  import { ATTR_GEN_AI_AGENT_ID, ATTR_GEN_AI_AGENT_NAME, ATTR_GEN_AI_TOOL_CALL_ID, ATTR_GEN_AI_TOOL_NAME, ATTR_GEN_AI_TOOL_TYPE } from '@opentelemetry/semantic-conventions/incubating';
3
3
  import { AgentLoopBudgetError, HarnessError, OperationCancelledError, OperationTimeoutError, PermissionDeniedError, ToolError, ToolNotFoundError, ValidationError, serializeError } from '../errors/index.js';
4
+ import { createMetrics } from '../telemetry/index.js';
4
5
  import { buildSkillIndex, mountSkillsOnce } from '../skills/index.js';
5
6
  import { BUILTIN_ALIAS_TO_CANONICAL, getBuiltinToolSpecs, invokeBuiltinTool } from '../tools/index.js';
6
7
  import { getMcpToolSpecs, invokeMcpTool, isMcpToolDefinition } from '../tools/mcp/runner.js';
@@ -31,14 +32,40 @@ export async function runDefaultAgent(args) {
31
32
  'harness.run.id': args.runId,
32
33
  ...(args.workflowId ? { 'harness.workflow.id': args.workflowId } : {}),
33
34
  'harness.agent.id': args.agentId,
35
+ 'gen_ai.operation.name': 'invoke_agent',
36
+ 'openinference.span.kind': 'AGENT',
37
+ 'metadata.agent_name': args.agentId,
38
+ 'metadata.agent_id': args.agentId,
34
39
  [ATTR_GEN_AI_AGENT_NAME]: args.agentId,
35
- [ATTR_GEN_AI_AGENT_ID]: args.runId,
40
+ [ATTR_GEN_AI_AGENT_ID]: args.agentId,
36
41
  'harness.agent.model': args.agent.model,
37
- 'harness.agent.has_handler': args.agent.handler !== undefined
42
+ 'harness.agent.has_handler': args.agent.handler !== undefined,
43
+ ...metadataSpanAttrs(args.metadata)
38
44
  };
39
- const execute = () => runDefaultAgentInner(args);
45
+ const metrics = createMetrics(args.telemetry, agentAttrs);
46
+ const execute = () => runDefaultAgentInner({ ...args, metrics });
40
47
  return args.telemetry.span(`invoke_agent ${args.agentId}`, agentAttrs, execute);
41
48
  }
49
+ function metadataSpanAttrs(metadata) {
50
+ const attrs = {};
51
+ for (const [key, value] of Object.entries(metadata ?? {})) {
52
+ if (!/^[a-zA-Z][a-zA-Z0-9_.-]{0,63}$/.test(key))
53
+ continue;
54
+ if (typeof value === 'string') {
55
+ if (value.length <= 256)
56
+ attrs[`harness.metadata.${key}`] = value;
57
+ continue;
58
+ }
59
+ if (typeof value === 'number' && Number.isFinite(value)) {
60
+ attrs[`harness.metadata.${key}`] = value;
61
+ continue;
62
+ }
63
+ if (typeof value === 'boolean') {
64
+ attrs[`harness.metadata.${key}`] = value;
65
+ }
66
+ }
67
+ return attrs;
68
+ }
42
69
  async function runDefaultAgentInner(args) {
43
70
  args.signal.throwIfAborted();
44
71
  const inputSchema = args.agent.input ?? z.string();
@@ -57,13 +84,15 @@ async function runDefaultAgentInner(args) {
57
84
  runId: args.runId,
58
85
  sessionId: args.sessionId,
59
86
  history: { list: async () => args.history },
60
- memory: args.memory
87
+ memory: args.memory,
88
+ metadata: args.metadata ?? {},
89
+ metrics: args.metrics
61
90
  });
62
91
  const validated = parseAgentSchema(outputSchema, output, 'agent_output');
63
92
  return { output: validated, emitted: [{ id: `msg_${Date.now()}_a`, sessionId: args.sessionId, runId: args.runId, role: 'assistant', content: JSON.stringify(validated), timestamp: new Date().toISOString() }] };
64
93
  }
65
94
  const baseInstructions = typeof args.agent.instructions === 'function'
66
- ? args.agent.instructions({ input: parsedInput, runId: args.runId, sessionId: args.sessionId, history: { list: async () => args.history }, memory: args.memory })
95
+ ? args.agent.instructions({ input: parsedInput, runId: args.runId, sessionId: args.sessionId, history: { list: async () => args.history }, memory: args.memory, metadata: args.metadata ?? {}, metrics: args.metrics })
67
96
  : args.agent.instructions;
68
97
  const instructions = `${baseInstructions}${buildSkillIndex(args.skills, skillIds)}`;
69
98
  const enabledBuiltins = args.agent.builtinTools === false ? [] : args.agent.builtinTools?.slice() ?? ['bash', 'read', 'write', 'edit', 'glob', 'grep', 'list'];
@@ -117,6 +146,7 @@ async function runDefaultAgentInner(args) {
117
146
  if (toolCalls.length === 0) {
118
147
  const validated = parseAgentSchema(outputSchema, response.object, 'agent_output');
119
148
  emitted.push({ id: `msg_${Date.now()}_a`, sessionId: args.sessionId, runId: args.runId, role: 'assistant', content: JSON.stringify(validated), timestamp: new Date().toISOString() });
149
+ await args.emitEvent?.({ type: 'model.object', runId: args.runId, agentId: args.agentId, object: validated, usage: response.usage });
120
150
  await args.emitEvent?.({ type: 'agent.finished', runId: args.runId, agentId: args.agentId, at: new Date().toISOString(), output: validated });
121
151
  return { output: validated, emitted };
122
152
  }
@@ -159,7 +189,25 @@ async function runDefaultAgentInner(args) {
159
189
  }
160
190
  const tsTool = tool;
161
191
  const parsed = tsTool.input.parse(input);
162
- const out = await withToolSignal(args.signal, args.toolTimeoutMs, (signal) => tsTool.handler({ signal, sandbox: withSandboxTelemetry(args, canonical), logger: args.logger, telemetry: args.telemetry, runId: args.runId, sessionId: args.sessionId, agentId: args.agentId, toolId: canonical }, parsed));
192
+ const out = await withToolSignal(args.signal, args.toolTimeoutMs, (signal) => tsTool.handler({
193
+ signal,
194
+ sandbox: withSandboxTelemetry(args, canonical),
195
+ logger: args.logger,
196
+ telemetry: args.telemetry,
197
+ metrics: createMetrics(args.telemetry, {
198
+ 'harness.name': args.harnessName,
199
+ 'harness.session.id': args.sessionId,
200
+ 'harness.run.id': args.runId,
201
+ ...(args.workflowId ? { 'harness.workflow.id': args.workflowId } : {}),
202
+ 'harness.agent.id': args.agentId,
203
+ 'harness.tool.id': canonical
204
+ }),
205
+ memory: args.memory,
206
+ runId: args.runId,
207
+ sessionId: args.sessionId,
208
+ agentId: args.agentId,
209
+ toolId: canonical
210
+ }, parsed));
163
211
  return { output: tsTool.output.parse(out) };
164
212
  });
165
213
  }
@@ -226,6 +274,10 @@ async function withToolSpan(args, toolId, callId, toolKind, mcpAttrs, fn) {
226
274
  ...(args.workflowId ? { 'harness.workflow.id': args.workflowId } : {}),
227
275
  'harness.agent.id': args.agentId,
228
276
  'harness.tool.id': toolId,
277
+ 'gen_ai.operation.name': 'execute_tool',
278
+ 'openinference.span.kind': 'TOOL',
279
+ 'tool.name': toolId,
280
+ 'tool.call.id': callId,
229
281
  [ATTR_GEN_AI_TOOL_NAME]: toolId,
230
282
  [ATTR_GEN_AI_TOOL_CALL_ID]: callId,
231
283
  [ATTR_GEN_AI_TOOL_TYPE]: toolKind,
@@ -22,9 +22,14 @@ export declare class ValidationError extends HarnessError {
22
22
  /** Model provider response shape is invalid. */ | 'model_response'
23
23
  /** Session memory key is invalid. */ | 'memory_key'
24
24
  /** Session memory value is invalid or non-serializable. */ | 'memory_value'
25
+ /** Session memory scope is invalid or unsupported. */ | 'memory_scope'
26
+ /** Session memory options are invalid or unsupported. */ | 'memory_write_options'
27
+ /** Session memory listing options are invalid. */ | 'memory_list_options'
28
+ /** Session memory search query is invalid. */ | 'memory_search_query'
25
29
  /** Message envelope validation failed. */ | 'message'
26
30
  /** Session history shape validation failed. */ | 'session_history'
27
- /** Invocation options are invalid. */ | 'invoke_options';
31
+ /** Invocation options are invalid. */ | 'invoke_options'
32
+ /** Evaluation helper input is invalid. */ | 'eval_input';
28
33
  issues: unknown;
29
34
  }, cause?: unknown);
30
35
  }
@@ -139,21 +144,23 @@ export declare class SessionBusyError extends HarnessError {
139
144
  /** State backend operation failed. */
140
145
  export declare class StateError extends HarnessError {
141
146
  constructor(message: string, meta: {
142
- op: 'getSession' | 'upsertSession' | 'closeSession' | 'appendMessages' | 'listMessages' | 'clearMessages' | 'createRun' | 'finishRun' | 'getRun' | 'listRuns' | 'appendEvents' | 'listEvents';
147
+ op: 'getSession' | 'upsertSession' | 'closeSession' | 'appendMessages' | 'listMessages' | 'clearMessages' | 'createRun' | 'finishRun' | 'getRun' | 'listRuns' | 'appendEvents' | 'listEvents' | 'memory.get' | 'memory.set' | 'memory.delete' | 'memory.list' | 'memory.search';
143
148
  reason?: 'duplicate_message_id' | string;
149
+ adapter?: 'memory' | string;
150
+ memory_provider?: string;
144
151
  }, cause?: unknown);
145
152
  }
146
153
  /** Timed execution budget expired. */
147
154
  export declare class OperationTimeoutError extends HarnessError {
148
155
  constructor(message: string, meta: {
149
- scope: 'run' | 'model' | 'tool' | 'sandbox_run';
156
+ scope: 'run' | 'model' | 'tool' | 'sandbox_run' | 'memory';
150
157
  timeout_ms: number;
151
158
  }, cause?: unknown);
152
159
  }
153
160
  /** Operation cancelled by abort signal or explicit cancellation path. */
154
161
  export declare class OperationCancelledError extends HarnessError {
155
162
  constructor(message: string, meta: {
156
- scope: 'run' | 'workflow' | 'agent' | 'model' | 'tool' | 'sandbox';
163
+ scope: 'run' | 'workflow' | 'agent' | 'model' | 'tool' | 'sandbox' | 'memory';
157
164
  }, cause?: unknown);
158
165
  }
159
166
  /** MCP transport/protocol failure. */
@@ -0,0 +1,57 @@
1
+ import type { JsonValue } from '../models/json.js';
2
+ export type DeterministicScorerDefinition = {
3
+ type: 'regex';
4
+ path: string;
5
+ pattern: string;
6
+ flags?: 'i' | 'm' | 'im';
7
+ } | {
8
+ type: 'json-schema';
9
+ schema: JsonValue;
10
+ } | {
11
+ type: 'contains';
12
+ path: string;
13
+ value: string;
14
+ caseInsensitive?: boolean;
15
+ } | {
16
+ type: 'attribute-equality';
17
+ leftPath: string;
18
+ rightPath: string;
19
+ };
20
+ export interface ScorerTarget {
21
+ input: unknown;
22
+ output: unknown;
23
+ expected?: unknown;
24
+ context?: unknown[];
25
+ }
26
+ export interface ScorerResult {
27
+ score: number;
28
+ passed: boolean;
29
+ evidence?: JsonValue;
30
+ }
31
+ export interface PromptCandidate<I = unknown> {
32
+ id: string;
33
+ prompt: string;
34
+ metadata?: Record<string, JsonValue>;
35
+ }
36
+ export interface EvaluationItem<I = unknown> {
37
+ id: string;
38
+ input: I;
39
+ expected?: unknown;
40
+ context?: unknown[];
41
+ }
42
+ export interface CandidateScore {
43
+ candidateId: string;
44
+ meanScore: number;
45
+ passRate: number;
46
+ itemCount: number;
47
+ scorerCount: number;
48
+ }
49
+ export interface EvaluatePromptCandidatesInput<I = unknown> {
50
+ candidates: PromptCandidate<I>[];
51
+ items: EvaluationItem<I>[];
52
+ scorer: (target: ScorerTarget, signal: AbortSignal) => Promise<ScorerResult>;
53
+ runCandidate: (candidate: PromptCandidate<I>, item: EvaluationItem<I>, signal: AbortSignal) => Promise<unknown>;
54
+ signal: AbortSignal;
55
+ }
56
+ export declare function evaluateDeterministicScorer(definition: DeterministicScorerDefinition, target: ScorerTarget): ScorerResult;
57
+ export declare function evaluatePromptCandidates<I = unknown>(input: EvaluatePromptCandidatesInput<I>): Promise<CandidateScore[]>;
@@ -0,0 +1,181 @@
1
+ import { ValidationError } from '../errors/index.js';
2
/**
 * Scores `target.output` against one deterministic scorer definition.
 *
 * - `regex`: tests the stringified value at `path` against `pattern`/`flags`.
 * - `contains`: substring check on the stringified value at `path`,
 *   optionally case-insensitive.
 * - `attribute-equality`: compares the values at `leftPath` and `rightPath`.
 * - `json-schema`: validates the whole output against `schema`.
 *
 * A pointer that does not resolve produces a failing result carrying
 * `missing_pointer` evidence. A definition whose `type` is none of the above
 * falls through and yields `undefined`.
 *
 * @param definition Scorer definition to apply.
 * @param target Scorer target; only `target.output` is inspected.
 * @returns `{ score, passed, evidence? }` with `score` being 0 or 1.
 */
export function evaluateDeterministicScorer(definition, target) {
    const kind = definition.type;
    if (kind === 'regex') {
        const located = readPointer(target.output, definition.path);
        if (!located.found) {
            return missingPointer(definition.path);
        }
        const matcher = new RegExp(definition.pattern, definition.flags ?? '');
        return binary(matcher.test(String(located.value)));
    }
    if (kind === 'contains') {
        const located = readPointer(target.output, definition.path);
        if (!located.found) {
            return missingPointer(definition.path);
        }
        const text = String(located.value);
        const wanted = definition.value;
        if (definition.caseInsensitive) {
            return binary(text.toLocaleLowerCase().includes(wanted.toLocaleLowerCase()));
        }
        return binary(text.includes(wanted));
    }
    if (kind === 'attribute-equality') {
        const lhs = readPointer(target.output, definition.leftPath);
        if (!lhs.found) {
            return missingPointer(definition.leftPath);
        }
        const rhs = readPointer(target.output, definition.rightPath);
        if (!rhs.found) {
            return missingPointer(definition.rightPath);
        }
        if (deepEqual(lhs.value, rhs.value)) {
            return binary(true);
        }
        // On mismatch both sides are reported so the failure is diagnosable.
        return { score: 0, passed: false, evidence: { left: toJsonValue(lhs.value), right: toJsonValue(rhs.value) } };
    }
    if (kind === 'json-schema') {
        const verdict = validateJsonSchema(definition.schema, target.output);
        if (verdict.passed) {
            return binary(true);
        }
        return { score: 0, passed: false, evidence: { reason: 'schema_validation_failed', issues: verdict.issues } };
    }
}
39
/**
 * Runs every prompt candidate against every evaluation item, scores each
 * output, and returns one aggregate per candidate.
 *
 * Candidates and items are processed strictly one after another; the abort
 * signal is checked before each candidate and before each item.
 *
 * @param input Candidates, items, `runCandidate`, `scorer` and `signal`.
 * @returns Aggregates sorted by mean score (descending), then pass rate
 *   (descending), then candidate id (ascending, `localeCompare`).
 * @throws ValidationError (`where: 'eval_input'`) when `candidates` or
 *   `items` is empty.
 */
export async function evaluatePromptCandidates(input) {
    if (input.candidates.length === 0) {
        throw new ValidationError('At least one prompt candidate is required.', { where: 'eval_input', issues: { candidates: 'empty' } });
    }
    if (input.items.length === 0) {
        throw new ValidationError('At least one evaluation item is required.', { where: 'eval_input', issues: { items: 'empty' } });
    }
    const ranking = [];
    for (const candidate of input.candidates) {
        input.signal.throwIfAborted();
        const tally = { scoreSum: 0, passCount: 0, evaluations: 0 };
        for (const item of input.items) {
            input.signal.throwIfAborted();
            const produced = await input.runCandidate(candidate, item, input.signal);
            // `expected` and `context` are only set on the target when the item defines them.
            const scorerTarget = { input: item.input, output: produced };
            if (item.expected !== undefined) {
                scorerTarget.expected = item.expected;
            }
            if (item.context !== undefined) {
                scorerTarget.context = item.context;
            }
            const verdict = await input.scorer(scorerTarget, input.signal);
            tally.scoreSum += verdict.score;
            if (verdict.passed) {
                tally.passCount += 1;
            }
            tally.evaluations += 1;
        }
        ranking.push({
            candidateId: candidate.id,
            meanScore: tally.scoreSum / tally.evaluations,
            passRate: tally.passCount / tally.evaluations,
            itemCount: input.items.length,
            scorerCount: tally.evaluations
        });
    }
    const compareScores = (first, second) => {
        if (first.meanScore !== second.meanScore) {
            return second.meanScore - first.meanScore;
        }
        if (first.passRate !== second.passRate) {
            return second.passRate - first.passRate;
        }
        return first.candidateId.localeCompare(second.candidateId);
    };
    return ranking.sort(compareScores);
}
84
/** Builds a pass/fail scorer result: score 1 when `passed` is truthy, otherwise 0. */
function binary(passed) {
    const score = passed ? 1 : 0;
    return { score, passed };
}
87
/** Builds the failing scorer result reported when `path` does not resolve. */
function missingPointer(path) {
    const evidence = { reason: 'missing_pointer', path };
    return { score: 0, passed: false, evidence };
}
90
/**
 * Resolves a JSON Pointer (RFC 6901) against `value`.
 *
 * The empty pointer selects `value` itself. Any other pointer must start
 * with `/`; each reference token is unescaped (`~1` -> `/`, `~0` -> `~`) and
 * followed into arrays (by index) or objects (by own property).
 *
 * Array tokens must be canonical indices ("0" or digits without a leading
 * zero), and object tokens must name an own property. Inherited names such
 * as `constructor` or `toString` are reported as not found.
 *
 * @param value Document to read from.
 * @param pointer JSON Pointer string.
 * @returns `{ found: true, value }` when the pointer resolves, otherwise
 *   `{ found: false }`.
 */
function readPointer(value, pointer) {
    if (pointer === '')
        return { found: true, value };
    if (!pointer.startsWith('/'))
        return { found: false };
    let current = value;
    for (const rawPart of pointer.slice(1).split('/')) {
        // RFC 6901 section 4: decode "~1" before "~0" so "~01" yields "~1", not "/".
        const part = rawPart.replace(/~1/g, '/').replace(/~0/g, '~');
        if (Array.isArray(current)) {
            // Number() alone would also accept "", " 1", "01", "1e0" and "0x1",
            // none of which are valid array indices in a JSON Pointer.
            if (!/^(?:0|[1-9][0-9]*)$/.test(part))
                return { found: false };
            const index = Number(part);
            if (index >= current.length)
                return { found: false };
            current = current[index];
            continue;
        }
        const isObject = current !== null && typeof current === 'object';
        // Own-property check: `in` would also match names on the prototype chain.
        if (!isObject || !Object.prototype.hasOwnProperty.call(current, part))
            return { found: false };
        current = current[part];
    }
    return { found: true, value: current };
}
111
/**
 * Validates `value` against `schema`, starting at the document root.
 *
 * @returns `{ passed, issues }` where `passed` is true only when no issue
 *   was recorded.
 */
function validateJsonSchema(schema, value) {
    const collected = [];
    validateSchemaAt(schema, value, '', collected);
    const passed = collected.length === 0;
    return { passed, issues: collected };
}
116
/**
 * Validates `value` against a small JSON Schema subset and appends any
 * problems to `issues`.
 *
 * Keywords handled: `const`, `enum`, `type`, `required`, `properties`
 * (recursively) and `additionalProperties: false`. Every other keyword is
 * ignored, and a schema that is not an object accepts anything. The first
 * failing `const`, `enum` or `type` check ends validation of that node.
 *
 * Property presence is decided by own properties only, so names inherited
 * from `Object.prototype` (for example `toString` or `constructor`) neither
 * satisfy `required` nor hide an undeclared additional property.
 *
 * @param schema Schema node to apply.
 * @param value Value under validation.
 * @param path JSON-Pointer-like location of `value`, used in issue records.
 * @param issues Output array; issue objects are pushed onto it.
 */
function validateSchemaAt(schema, value, path, issues) {
    if (!isRecord(schema))
        return;
    const owns = (target, key) => Object.prototype.hasOwnProperty.call(target, key);
    if ('const' in schema && !deepEqual(value, schema['const'])) {
        issues.push({ path, reason: 'const', expected: toJsonValue(schema['const']), actual: toJsonValue(value) });
        return;
    }
    if (Array.isArray(schema['enum']) && !schema['enum'].some((entry) => deepEqual(entry, value))) {
        issues.push({ path, reason: 'enum', actual: toJsonValue(value) });
        return;
    }
    const type = typeof schema['type'] === 'string' ? schema['type'] : undefined;
    if (type && !matchesType(value, type)) {
        issues.push({ path, reason: 'type', expected: type, actual: typeof value });
        return;
    }
    // A `properties` keyword implies an object even when `type` is omitted.
    if (type === 'object' || schema['properties']) {
        if (!isRecord(value)) {
            issues.push({ path, reason: 'type', expected: 'object', actual: typeof value });
            return;
        }
        const required = Array.isArray(schema['required']) ? schema['required'].filter((entry) => typeof entry === 'string') : [];
        for (const key of required) {
            if (!owns(value, key))
                issues.push({ path: `${path}/${key}`, reason: 'required' });
        }
        const properties = isRecord(schema['properties']) ? schema['properties'] : {};
        for (const [key, childSchema] of Object.entries(properties)) {
            // Absent optional properties are skipped; `required` covers mandatory ones.
            if (owns(value, key))
                validateSchemaAt(childSchema, value[key], `${path}/${key}`, issues);
        }
        if (schema['additionalProperties'] === false) {
            for (const key of Object.keys(value)) {
                if (!owns(properties, key))
                    issues.push({ path: `${path}/${key}`, reason: 'additional_properties' });
            }
        }
    }
}
155
/**
 * Reports whether `value` satisfies the JSON Schema primitive `type`.
 *
 * `number` requires a finite number and `integer` uses `Number.isInteger`.
 * Unrecognized type names are treated as matching.
 */
function matchesType(value, type) {
    const predicates = new Map([
        ['object', () => isRecord(value)],
        ['array', () => Array.isArray(value)],
        ['string', () => typeof value === 'string'],
        ['number', () => typeof value === 'number' && Number.isFinite(value)],
        ['integer', () => Number.isInteger(value)],
        ['boolean', () => typeof value === 'boolean'],
        ['null', () => value === null]
    ]);
    const predicate = predicates.get(type);
    if (predicate === undefined) {
        return true;
    }
    return predicate();
}
167
/** True for any non-null object that is not an array. */
function isRecord(value) {
    if (value === null || Array.isArray(value)) {
        return false;
    }
    return typeof value === 'object';
}
170
/**
 * Structural equality for JSON-like values.
 *
 * Both sides are serialized with `JSON.stringify` after sorting object keys,
 * so two objects with the same members compare equal regardless of key
 * order. Array order stays significant. All other `JSON.stringify`
 * semantics still apply: `undefined` members are omitted and `toJSON` is
 * honored.
 *
 * @returns `true` when both values serialize to the same canonical JSON.
 */
function deepEqual(a, b) {
    const canonical = (input) => JSON.stringify(input, (_key, entry) => {
        if (entry === null || typeof entry !== 'object' || Array.isArray(entry))
            return entry;
        // Object.fromEntries defines own data properties, so a "__proto__" key survives.
        return Object.fromEntries(Object.keys(entry).sort().map((key) => [key, entry[key]]));
    });
    return canonical(a) === canonical(b);
}
173
/**
 * Converts an arbitrary value into a JSON-compatible one for use as evidence.
 *
 * `null`, strings, numbers and booleans pass through unchanged; arrays and
 * non-array objects are converted member by member; anything else (for
 * example `undefined` or a function) becomes `String(value)`.
 */
function toJsonValue(value) {
    const kind = typeof value;
    const isPrimitive = value === null || kind === 'string' || kind === 'number' || kind === 'boolean';
    if (isPrimitive) {
        return value;
    }
    if (Array.isArray(value)) {
        return value.map((element) => toJsonValue(element));
    }
    if (!isRecord(value)) {
        return String(value);
    }
    const pairs = Object.entries(value).map(([name, member]) => [name, toJsonValue(member)]);
    return Object.fromEntries(pairs);
}
@@ -2,10 +2,12 @@ import { z } from 'zod';
2
2
  import { type Logger } from '../logger/index.js';
3
3
  import type { ModelAlias, ModelCapability, TokenUsage } from '../ports/model-provider.js';
4
4
  import type { StateStore } from '../ports/state.js';
5
- import type { TelemetryShim } from '../telemetry/index.js';
5
+ import type { Metrics, TelemetryShim } from '../telemetry/index.js';
6
6
  import type { HarnessAdapterContext } from '../ports/harness-context.js';
7
+ import type { MemoryAdapter, MemoryFacade, SessionMemory } from '../ports/memory.js';
7
8
  import type { JsonValue } from '../models/json.js';
8
9
  import type { Message } from '../models/state.js';
10
+ import type { RunStatus } from '../models/state.js';
9
11
  import type { HarnessError } from '../errors/harness-error.js';
10
12
  import { type Sandbox } from '../sandbox/index.js';
11
13
  import type { ModelHandle } from '../models/registry.js';
@@ -13,12 +15,13 @@ import { type AdapterCapability, type DurableRuntimeAdapter, type HarnessInspect
13
15
  /** Stable harness version string for diagnostics and generated documentation. */
14
16
  export declare const HARNESS_VERSION = "0.0.0";
15
17
  /** OpenTelemetry capture controls used by the harness. */
18
+ export type TelemetryFlavor = 'dual' | 'gen_ai_only' | 'openinference_only';
19
+ export type ContentCaptureMode = 'NO_CONTENT' | 'SPAN_ONLY' | 'EVENT_ONLY' | 'SPAN_AND_EVENT';
16
20
  export interface TelemetryOptions {
17
- /**
18
- * When `true`, emitted telemetry may include full prompt/message content.
19
- * The default is `false` to avoid accidental sensitive-content capture.
20
- */
21
- captureContent?: boolean;
21
+ /** Backend emission shape. */
22
+ flavor?: TelemetryFlavor;
23
+ /** Span/event content capture mode. */
24
+ contentCaptureMode?: ContentCaptureMode;
22
25
  }
23
26
  /** Default harness budgets and execution behavior. */
24
27
  export interface HarnessDefaults {
@@ -51,6 +54,12 @@ export interface InvokeOptions {
51
54
  timeoutMs?: number;
52
55
  /** Optional history-window override for this call only. */
53
56
  historyWindow?: number;
57
+ /** Optional W3C Trace Context parent. */
58
+ traceparent?: string;
59
+ /** Optional W3C Trace Context state. */
60
+ tracestate?: string;
61
+ /** Scalar metadata exposed to handlers and telemetry sanitizers. */
62
+ metadata?: Record<string, JsonValue>;
54
63
  }
55
64
  /** Canonical built-in tool names provided by the harness. */
56
65
  export type BuiltinToolName = 'bash' | 'read' | 'write' | 'edit' | 'glob' | 'grep' | 'list';
@@ -102,17 +111,6 @@ export interface ResolvedSkill {
102
111
  /** Absolute directory mounted into `/skills/<name>`. */
103
112
  directory: string;
104
113
  }
105
- /** Sandbox-backed per-session memory facade. */
106
- export interface SessionMemory {
107
- /** Reads `/memory/<key>.json` and returns the parsed JSON value if present. */
108
- read<T = JsonValue>(key: string): Promise<T | undefined>;
109
- /** Writes JSON-serializable data to `/memory/<key>.json`. */
110
- write(key: string, value: JsonValue): Promise<void>;
111
- /** Deletes `/memory/<key>.json` if it exists. */
112
- delete(key: string): Promise<void>;
113
- /** Lists known memory keys without the `.json` suffix. */
114
- list(): Promise<string[]>;
115
- }
116
114
  /** Conversation history accessor for a single session thread. */
117
115
  export interface ConversationHistory {
118
116
  /** Returns persisted conversation messages for the session. */
@@ -127,6 +125,8 @@ export interface ToolHandlerContext {
127
125
  sandbox: import('../sandbox/index.js').SandboxSession;
128
126
  logger: Logger;
129
127
  telemetry: TelemetryShim;
128
+ metrics: Metrics;
129
+ memory: MemoryFacade;
130
130
  runId: string;
131
131
  sessionId: string;
132
132
  agentId: string;
@@ -258,7 +258,9 @@ export interface AgentContextMinimal<S extends BuilderState, I> {
258
258
  sessionId: string;
259
259
  runId: string;
260
260
  history: ConversationHistory;
261
- memory: SessionMemory;
261
+ memory: MemoryFacade;
262
+ metadata: Readonly<Record<string, JsonValue>>;
263
+ metrics: Metrics;
262
264
  }
263
265
  /** Full context passed to workflow handlers. */
264
266
  export interface WorkflowContext<S extends BuilderState, I, O> {
@@ -270,6 +272,9 @@ export interface WorkflowContext<S extends BuilderState, I, O> {
270
272
  signal: AbortSignal;
271
273
  runId: string;
272
274
  sessionId: string;
275
+ metadata: Readonly<Record<string, JsonValue>>;
276
+ memory: MemoryFacade;
277
+ metrics: Metrics;
273
278
  output?: O;
274
279
  }
275
280
  /** Full context passed to custom agent handlers. */
@@ -442,6 +447,7 @@ export interface Session<S extends BuilderState> {
442
447
  };
443
448
  memory: SessionMemory;
444
449
  history: ConversationHistory;
450
+ getRunSummary(runId: string): Promise<RunSummary | undefined>;
445
451
  clearHistory(): Promise<void>;
446
452
  replaceHistory(messages: ReadonlyArray<Omit<Message, 'id' | 'timestamp'>>): Promise<void>;
447
453
  close(): Promise<void>;
@@ -454,6 +460,18 @@ export interface SerializedError {
454
460
  message: string;
455
461
  meta?: Record<string, unknown>;
456
462
  }
463
+ export interface RunSummary {
464
+ runId: string;
465
+ sessionId: string;
466
+ status: RunStatus;
467
+ startedAt: string;
468
+ finishedAt?: string;
469
+ tokenTotals: TokenUsage;
470
+ modelCalls: number;
471
+ toolCalls: number;
472
+ agentCalls: number;
473
+ error?: SerializedError;
474
+ }
457
475
  /** Harness streaming events emitted from `session.workflows.<id>.stream(...)`. */
458
476
  export type RunEvent = {
459
477
  type: 'run.started';
@@ -512,6 +530,7 @@ export type RunEvent = {
512
530
  runId: string;
513
531
  agentId?: string;
514
532
  object: JsonValue;
533
+ usage?: TokenUsage;
515
534
  } | {
516
535
  type: 'model.embedding.completed';
517
536
  runId: string;
@@ -538,6 +557,7 @@ export interface HarnessBuilder<S extends BuilderState = {}> {
538
557
  logger(logger: Logger): HarnessBuilder<S>;
539
558
  state(store: StateStore): HarnessBuilder<S>;
540
559
  sandbox(sandbox?: Sandbox<any>): HarnessBuilder<S>;
560
+ memory(adapter: MemoryAdapter): HarnessBuilder<S>;
541
561
  runtime(runtime: DurableRuntimeAdapter): HarnessBuilder<S>;
542
562
  requires(capabilities: readonly AdapterCapability[]): HarnessBuilder<S>;
543
563
  defaults(defaults: HarnessDefaults): HarnessBuilder<S>;
@@ -1,5 +1,7 @@
1
1
  import { z } from 'zod';
2
2
  import { JsonLogger } from '../logger/index.js';
3
+ import { sandboxMemory } from '../memory/sandbox/index.js';
4
+ import { validateMemoryAdapter } from '../ports/memory.js';
3
5
  import { InMemoryStateStore } from '../state/in-memory.js';
4
6
  import { HarnessConfigError } from '../errors/catalog.js';
5
7
  import { autoDetectSandbox } from '../sandbox/index.js';
@@ -26,6 +28,13 @@ class Builder {
26
28
  sandbox(sandbox = autoDetectSandbox()) {
27
29
  return this.clone({ sandbox });
28
30
  }
31
+ memory(memory) {
32
+ if (this.configured.memory) {
33
+ throw new HarnessConfigError('Memory adapter is already configured.', { reason: 'duplicate_adapter', path: 'memory' });
34
+ }
35
+ validateMemoryAdapter(memory);
36
+ return this.clone({ memory });
37
+ }
29
38
  runtime(runtime) {
30
39
  return this.clone({ runtime });
31
40
  }
@@ -68,7 +77,9 @@ class Builder {
68
77
  throw new HarnessConfigError('At least one model alias is required.', { reason: 'missing_models', path: 'models' });
69
78
  }
70
79
  const sandbox = this.configured.sandbox ?? autoDetectSandbox();
71
- const inspection = this.resolveInspection(this.options.name ?? 'agent-harness', sandbox, models);
80
+ const memory = this.configured.memory ?? sandboxMemory();
81
+ validateMemoryAdapter(memory);
82
+ const inspection = this.resolveInspection(this.options.name ?? 'agent-harness', sandbox, memory, models);
72
83
  const missing = missingCapabilities(inspection.requiredCapabilities, inspection.capabilities);
73
84
  if (missing.length > 0) {
74
85
  throw new HarnessConfigError('Required adapter capabilities are not available.', {
@@ -83,6 +94,7 @@ class Builder {
83
94
  ...(this.configured.telemetry ? { telemetry: this.configured.telemetry } : {}),
84
95
  state: this.configured.state ?? new InMemoryStateStore(),
85
96
  sandbox,
97
+ memory,
86
98
  defaults: {
87
99
  agentMaxIterations: this.configured.defaults?.agentMaxIterations ?? 16,
88
100
  runTimeoutMs: this.configured.defaults?.runTimeoutMs ?? 600_000,
@@ -103,7 +115,7 @@ class Builder {
103
115
  clone(patch) {
104
116
  return new Builder(this.options, { ...this.configured, ...patch });
105
117
  }
106
- resolveInspection(name, sandbox, models) {
118
+ resolveInspection(name, sandbox, memory, models) {
107
119
  const adapters = [];
108
120
  const sandboxCapabilities = hasAdapterCapabilities(sandbox) ? uniqueCapabilities(sandbox.capabilities) : [];
109
121
  adapters.push({
@@ -111,6 +123,15 @@ class Builder {
111
123
  id: getAdapterId(sandbox, 'sandbox'),
112
124
  capabilities: sandboxCapabilities
113
125
  });
126
+ adapters.push({
127
+ kind: 'memory',
128
+ id: memory.info.id,
129
+ capabilities: uniqueCapabilities(memory.info.capabilities),
130
+ metadata: {
131
+ packageName: memory.info.packageName,
132
+ ...(memory.info.version ? { version: memory.info.version } : {})
133
+ }
134
+ });
114
135
  if (this.configured.runtime) {
115
136
  adapters.push({
116
137
  kind: 'runtime',
package/dist/index.d.ts CHANGED
@@ -9,6 +9,8 @@ export * from './state/in-memory.js';
9
9
  export * from './models/json.js';
10
10
  export type { SessionRecord, Message, RunRecord, PersistedRunEvent, RunStatus } from './models/state.js';
11
11
  export * from './models/registry.js';
12
+ export * from './eval/index.js';
13
+ export * from './memory/sandbox/index.js';
12
14
  export * from './sandbox/index.js';
13
15
  export * from './tools/mcp/index.js';
14
16
  export * from './harness/defineHarness.js';
package/dist/index.js CHANGED
@@ -7,6 +7,8 @@ export { createDurableWorkflowContext, DurableStepError, DurableRunLeaseError, D
7
7
  export * from './state/in-memory.js';
8
8
  export * from './models/json.js';
9
9
  export * from './models/registry.js';
10
+ export * from './eval/index.js';
11
+ export * from './memory/sandbox/index.js';
10
12
  export * from './sandbox/index.js';
11
13
  export * from './tools/mcp/index.js';
12
14
  export * from './harness/defineHarness.js';