@purista/harness 1.0.0 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +15 -0
- package/dist/agents/index.d.ts +5 -3
- package/dist/agents/index.js +58 -6
- package/dist/errors/catalog.d.ts +11 -4
- package/dist/eval/index.d.ts +57 -0
- package/dist/eval/index.js +181 -0
- package/dist/harness/defineHarness.d.ts +38 -18
- package/dist/harness/defineHarness.js +23 -2
- package/dist/index.d.ts +2 -0
- package/dist/index.js +2 -0
- package/dist/memory/sandbox/index.d.ts +17 -0
- package/dist/memory/sandbox/index.js +122 -0
- package/dist/models/registry.js +32 -7
- package/dist/ports/capabilities.d.ts +24 -2
- package/dist/ports/harness-context.d.ts +4 -1
- package/dist/ports/index.d.ts +1 -0
- package/dist/ports/index.js +1 -0
- package/dist/ports/memory/facade.d.ts +5 -0
- package/dist/ports/memory/facade.js +123 -0
- package/dist/ports/memory/telemetry.d.ts +16 -0
- package/dist/ports/memory/telemetry.js +77 -0
- package/dist/ports/memory/types.d.ts +204 -0
- package/dist/ports/memory/types.js +1 -0
- package/dist/ports/memory/validation.d.ts +19 -0
- package/dist/ports/memory/validation.js +160 -0
- package/dist/ports/memory.d.ts +3 -0
- package/dist/ports/memory.js +3 -0
- package/dist/sessions/index.d.ts +2 -0
- package/dist/sessions/index.js +275 -68
- package/dist/telemetry/shim.d.ts +20 -0
- package/dist/telemetry/shim.js +28 -0
- package/dist/testing/fakeMemoryAdapter.d.ts +16 -0
- package/dist/testing/fakeMemoryAdapter.js +110 -0
- package/dist/testing/index.d.ts +3 -0
- package/dist/testing/index.js +2 -0
- package/package.json +8 -3
package/README.md
CHANGED
|
@@ -3,6 +3,21 @@
|
|
|
3
3
|
Self-hosted enterprise agent harness for typed tools, agents, workflows, state,
|
|
4
4
|
sandboxing, streaming, and OpenTelemetry instrumentation.
|
|
5
5
|
|
|
6
|
+
The core package also exports provider-neutral eval helpers:
|
|
7
|
+
|
|
8
|
+
- `evaluatePromptCandidates(...)` compares prompt candidates against a fixed
|
|
9
|
+
item set and deterministic or custom scorers.
|
|
10
|
+
- `evaluateDeterministicScorer(...)` runs JSON Pointer based deterministic
|
|
11
|
+
scorer definitions without provider calls. It is exported from the main
|
|
12
|
+
package and re-exported from `@purista/harness/testing`.
|
|
13
|
+
|
|
14
|
+
Telemetry defaults to dual GenAI and OpenInference attributes with no content
|
|
15
|
+
capture. `InvokeOptions.traceparent` and `tracestate` accept inbound W3C Trace
|
|
16
|
+
Context so application traces can parent harness run spans.
|
|
17
|
+
|
|
18
|
+
See [Evaluating Prompts](https://github.com/puristajs/harness/blob/main/docs/guides/evaluating-prompts.md)
|
|
19
|
+
for the execution model, scorer limits, and privacy behavior.
|
|
20
|
+
|
|
6
21
|
## Install
|
|
7
22
|
|
|
8
23
|
```bash
|
package/dist/agents/index.d.ts
CHANGED
|
@@ -1,9 +1,10 @@
|
|
|
1
1
|
import type { Logger } from '../logger/index.js';
|
|
2
2
|
import type { JsonValue } from '../models/json.js';
|
|
3
3
|
import type { Message } from '../models/state.js';
|
|
4
|
-
import type { AgentDefinition, ResolvedSkill, RunEvent,
|
|
4
|
+
import type { AgentDefinition, ResolvedSkill, RunEvent, ToolsConfig } from '../harness/defineHarness.js';
|
|
5
|
+
import type { MemoryFacade } from '../ports/memory.js';
|
|
5
6
|
import type { SandboxSession } from '../sandbox/index.js';
|
|
6
|
-
import type
|
|
7
|
+
import { type TelemetryShim } from '../telemetry/index.js';
|
|
7
8
|
import { type McpRunnerRegistry } from '../tools/mcp/runner.js';
|
|
8
9
|
export declare function runDefaultAgent(args: {
|
|
9
10
|
harnessName: string;
|
|
@@ -19,7 +20,7 @@ export declare function runDefaultAgent(args: {
|
|
|
19
20
|
customTools: ToolsConfig;
|
|
20
21
|
mcpRegistry?: McpRunnerRegistry;
|
|
21
22
|
session: SandboxSession;
|
|
22
|
-
memory:
|
|
23
|
+
memory: MemoryFacade;
|
|
23
24
|
mountedSkills: Set<string>;
|
|
24
25
|
historyWindow?: number;
|
|
25
26
|
maxSteps: number;
|
|
@@ -28,6 +29,7 @@ export declare function runDefaultAgent(args: {
|
|
|
28
29
|
logger: Logger;
|
|
29
30
|
telemetry: TelemetryShim;
|
|
30
31
|
emitEvent?: (event: RunEvent) => Promise<void>;
|
|
32
|
+
metadata?: Readonly<Record<string, JsonValue>>;
|
|
31
33
|
}): Promise<{
|
|
32
34
|
output: JsonValue;
|
|
33
35
|
emitted: Message[];
|
package/dist/agents/index.js
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
import { z } from 'zod';
|
|
2
2
|
import { ATTR_GEN_AI_AGENT_ID, ATTR_GEN_AI_AGENT_NAME, ATTR_GEN_AI_TOOL_CALL_ID, ATTR_GEN_AI_TOOL_NAME, ATTR_GEN_AI_TOOL_TYPE } from '@opentelemetry/semantic-conventions/incubating';
|
|
3
3
|
import { AgentLoopBudgetError, HarnessError, OperationCancelledError, OperationTimeoutError, PermissionDeniedError, ToolError, ToolNotFoundError, ValidationError, serializeError } from '../errors/index.js';
|
|
4
|
+
import { createMetrics } from '../telemetry/index.js';
|
|
4
5
|
import { buildSkillIndex, mountSkillsOnce } from '../skills/index.js';
|
|
5
6
|
import { BUILTIN_ALIAS_TO_CANONICAL, getBuiltinToolSpecs, invokeBuiltinTool } from '../tools/index.js';
|
|
6
7
|
import { getMcpToolSpecs, invokeMcpTool, isMcpToolDefinition } from '../tools/mcp/runner.js';
|
|
@@ -31,14 +32,40 @@ export async function runDefaultAgent(args) {
|
|
|
31
32
|
'harness.run.id': args.runId,
|
|
32
33
|
...(args.workflowId ? { 'harness.workflow.id': args.workflowId } : {}),
|
|
33
34
|
'harness.agent.id': args.agentId,
|
|
35
|
+
'gen_ai.operation.name': 'invoke_agent',
|
|
36
|
+
'openinference.span.kind': 'AGENT',
|
|
37
|
+
'metadata.agent_name': args.agentId,
|
|
38
|
+
'metadata.agent_id': args.agentId,
|
|
34
39
|
[ATTR_GEN_AI_AGENT_NAME]: args.agentId,
|
|
35
|
-
[ATTR_GEN_AI_AGENT_ID]: args.
|
|
40
|
+
[ATTR_GEN_AI_AGENT_ID]: args.agentId,
|
|
36
41
|
'harness.agent.model': args.agent.model,
|
|
37
|
-
'harness.agent.has_handler': args.agent.handler !== undefined
|
|
42
|
+
'harness.agent.has_handler': args.agent.handler !== undefined,
|
|
43
|
+
...metadataSpanAttrs(args.metadata)
|
|
38
44
|
};
|
|
39
|
-
const
|
|
45
|
+
const metrics = createMetrics(args.telemetry, agentAttrs);
|
|
46
|
+
const execute = () => runDefaultAgentInner({ ...args, metrics });
|
|
40
47
|
return args.telemetry.span(`invoke_agent ${args.agentId}`, agentAttrs, execute);
|
|
41
48
|
}
|
|
49
|
+
function metadataSpanAttrs(metadata) {
|
|
50
|
+
const attrs = {};
|
|
51
|
+
for (const [key, value] of Object.entries(metadata ?? {})) {
|
|
52
|
+
if (!/^[a-zA-Z][a-zA-Z0-9_.-]{0,63}$/.test(key))
|
|
53
|
+
continue;
|
|
54
|
+
if (typeof value === 'string') {
|
|
55
|
+
if (value.length <= 256)
|
|
56
|
+
attrs[`harness.metadata.${key}`] = value;
|
|
57
|
+
continue;
|
|
58
|
+
}
|
|
59
|
+
if (typeof value === 'number' && Number.isFinite(value)) {
|
|
60
|
+
attrs[`harness.metadata.${key}`] = value;
|
|
61
|
+
continue;
|
|
62
|
+
}
|
|
63
|
+
if (typeof value === 'boolean') {
|
|
64
|
+
attrs[`harness.metadata.${key}`] = value;
|
|
65
|
+
}
|
|
66
|
+
}
|
|
67
|
+
return attrs;
|
|
68
|
+
}
|
|
42
69
|
async function runDefaultAgentInner(args) {
|
|
43
70
|
args.signal.throwIfAborted();
|
|
44
71
|
const inputSchema = args.agent.input ?? z.string();
|
|
@@ -57,13 +84,15 @@ async function runDefaultAgentInner(args) {
|
|
|
57
84
|
runId: args.runId,
|
|
58
85
|
sessionId: args.sessionId,
|
|
59
86
|
history: { list: async () => args.history },
|
|
60
|
-
memory: args.memory
|
|
87
|
+
memory: args.memory,
|
|
88
|
+
metadata: args.metadata ?? {},
|
|
89
|
+
metrics: args.metrics
|
|
61
90
|
});
|
|
62
91
|
const validated = parseAgentSchema(outputSchema, output, 'agent_output');
|
|
63
92
|
return { output: validated, emitted: [{ id: `msg_${Date.now()}_a`, sessionId: args.sessionId, runId: args.runId, role: 'assistant', content: JSON.stringify(validated), timestamp: new Date().toISOString() }] };
|
|
64
93
|
}
|
|
65
94
|
const baseInstructions = typeof args.agent.instructions === 'function'
|
|
66
|
-
? args.agent.instructions({ input: parsedInput, runId: args.runId, sessionId: args.sessionId, history: { list: async () => args.history }, memory: args.memory })
|
|
95
|
+
? args.agent.instructions({ input: parsedInput, runId: args.runId, sessionId: args.sessionId, history: { list: async () => args.history }, memory: args.memory, metadata: args.metadata ?? {}, metrics: args.metrics })
|
|
67
96
|
: args.agent.instructions;
|
|
68
97
|
const instructions = `${baseInstructions}${buildSkillIndex(args.skills, skillIds)}`;
|
|
69
98
|
const enabledBuiltins = args.agent.builtinTools === false ? [] : args.agent.builtinTools?.slice() ?? ['bash', 'read', 'write', 'edit', 'glob', 'grep', 'list'];
|
|
@@ -117,6 +146,7 @@ async function runDefaultAgentInner(args) {
|
|
|
117
146
|
if (toolCalls.length === 0) {
|
|
118
147
|
const validated = parseAgentSchema(outputSchema, response.object, 'agent_output');
|
|
119
148
|
emitted.push({ id: `msg_${Date.now()}_a`, sessionId: args.sessionId, runId: args.runId, role: 'assistant', content: JSON.stringify(validated), timestamp: new Date().toISOString() });
|
|
149
|
+
await args.emitEvent?.({ type: 'model.object', runId: args.runId, agentId: args.agentId, object: validated, usage: response.usage });
|
|
120
150
|
await args.emitEvent?.({ type: 'agent.finished', runId: args.runId, agentId: args.agentId, at: new Date().toISOString(), output: validated });
|
|
121
151
|
return { output: validated, emitted };
|
|
122
152
|
}
|
|
@@ -159,7 +189,25 @@ async function runDefaultAgentInner(args) {
|
|
|
159
189
|
}
|
|
160
190
|
const tsTool = tool;
|
|
161
191
|
const parsed = tsTool.input.parse(input);
|
|
162
|
-
const out = await withToolSignal(args.signal, args.toolTimeoutMs, (signal) => tsTool.handler({
|
|
192
|
+
const out = await withToolSignal(args.signal, args.toolTimeoutMs, (signal) => tsTool.handler({
|
|
193
|
+
signal,
|
|
194
|
+
sandbox: withSandboxTelemetry(args, canonical),
|
|
195
|
+
logger: args.logger,
|
|
196
|
+
telemetry: args.telemetry,
|
|
197
|
+
metrics: createMetrics(args.telemetry, {
|
|
198
|
+
'harness.name': args.harnessName,
|
|
199
|
+
'harness.session.id': args.sessionId,
|
|
200
|
+
'harness.run.id': args.runId,
|
|
201
|
+
...(args.workflowId ? { 'harness.workflow.id': args.workflowId } : {}),
|
|
202
|
+
'harness.agent.id': args.agentId,
|
|
203
|
+
'harness.tool.id': canonical
|
|
204
|
+
}),
|
|
205
|
+
memory: args.memory,
|
|
206
|
+
runId: args.runId,
|
|
207
|
+
sessionId: args.sessionId,
|
|
208
|
+
agentId: args.agentId,
|
|
209
|
+
toolId: canonical
|
|
210
|
+
}, parsed));
|
|
163
211
|
return { output: tsTool.output.parse(out) };
|
|
164
212
|
});
|
|
165
213
|
}
|
|
@@ -226,6 +274,10 @@ async function withToolSpan(args, toolId, callId, toolKind, mcpAttrs, fn) {
|
|
|
226
274
|
...(args.workflowId ? { 'harness.workflow.id': args.workflowId } : {}),
|
|
227
275
|
'harness.agent.id': args.agentId,
|
|
228
276
|
'harness.tool.id': toolId,
|
|
277
|
+
'gen_ai.operation.name': 'execute_tool',
|
|
278
|
+
'openinference.span.kind': 'TOOL',
|
|
279
|
+
'tool.name': toolId,
|
|
280
|
+
'tool.call.id': callId,
|
|
229
281
|
[ATTR_GEN_AI_TOOL_NAME]: toolId,
|
|
230
282
|
[ATTR_GEN_AI_TOOL_CALL_ID]: callId,
|
|
231
283
|
[ATTR_GEN_AI_TOOL_TYPE]: toolKind,
|
package/dist/errors/catalog.d.ts
CHANGED
|
@@ -22,9 +22,14 @@ export declare class ValidationError extends HarnessError {
|
|
|
22
22
|
/** Model provider response shape is invalid. */ | 'model_response'
|
|
23
23
|
/** Session memory key is invalid. */ | 'memory_key'
|
|
24
24
|
/** Session memory value is invalid or non-serializable. */ | 'memory_value'
|
|
25
|
+
/** Session memory scope is invalid or unsupported. */ | 'memory_scope'
|
|
26
|
+
/** Session memory options are invalid or unsupported. */ | 'memory_write_options'
|
|
27
|
+
/** Session memory listing options are invalid. */ | 'memory_list_options'
|
|
28
|
+
/** Session memory search query is invalid. */ | 'memory_search_query'
|
|
25
29
|
/** Message envelope validation failed. */ | 'message'
|
|
26
30
|
/** Session history shape validation failed. */ | 'session_history'
|
|
27
|
-
/** Invocation options are invalid. */ | 'invoke_options'
|
|
31
|
+
/** Invocation options are invalid. */ | 'invoke_options'
|
|
32
|
+
/** Evaluation helper input is invalid. */ | 'eval_input';
|
|
28
33
|
issues: unknown;
|
|
29
34
|
}, cause?: unknown);
|
|
30
35
|
}
|
|
@@ -139,21 +144,23 @@ export declare class SessionBusyError extends HarnessError {
|
|
|
139
144
|
/** State backend operation failed. */
|
|
140
145
|
export declare class StateError extends HarnessError {
|
|
141
146
|
constructor(message: string, meta: {
|
|
142
|
-
op: 'getSession' | 'upsertSession' | 'closeSession' | 'appendMessages' | 'listMessages' | 'clearMessages' | 'createRun' | 'finishRun' | 'getRun' | 'listRuns' | 'appendEvents' | 'listEvents';
|
|
147
|
+
op: 'getSession' | 'upsertSession' | 'closeSession' | 'appendMessages' | 'listMessages' | 'clearMessages' | 'createRun' | 'finishRun' | 'getRun' | 'listRuns' | 'appendEvents' | 'listEvents' | 'memory.get' | 'memory.set' | 'memory.delete' | 'memory.list' | 'memory.search';
|
|
143
148
|
reason?: 'duplicate_message_id' | string;
|
|
149
|
+
adapter?: 'memory' | string;
|
|
150
|
+
memory_provider?: string;
|
|
144
151
|
}, cause?: unknown);
|
|
145
152
|
}
|
|
146
153
|
/** Timed execution budget expired. */
|
|
147
154
|
export declare class OperationTimeoutError extends HarnessError {
|
|
148
155
|
constructor(message: string, meta: {
|
|
149
|
-
scope: 'run' | 'model' | 'tool' | 'sandbox_run';
|
|
156
|
+
scope: 'run' | 'model' | 'tool' | 'sandbox_run' | 'memory';
|
|
150
157
|
timeout_ms: number;
|
|
151
158
|
}, cause?: unknown);
|
|
152
159
|
}
|
|
153
160
|
/** Operation cancelled by abort signal or explicit cancellation path. */
|
|
154
161
|
export declare class OperationCancelledError extends HarnessError {
|
|
155
162
|
constructor(message: string, meta: {
|
|
156
|
-
scope: 'run' | 'workflow' | 'agent' | 'model' | 'tool' | 'sandbox';
|
|
163
|
+
scope: 'run' | 'workflow' | 'agent' | 'model' | 'tool' | 'sandbox' | 'memory';
|
|
157
164
|
}, cause?: unknown);
|
|
158
165
|
}
|
|
159
166
|
/** MCP transport/protocol failure. */
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
import type { JsonValue } from '../models/json.js';
|
|
2
|
+
export type DeterministicScorerDefinition = {
|
|
3
|
+
type: 'regex';
|
|
4
|
+
path: string;
|
|
5
|
+
pattern: string;
|
|
6
|
+
flags?: 'i' | 'm' | 'im';
|
|
7
|
+
} | {
|
|
8
|
+
type: 'json-schema';
|
|
9
|
+
schema: JsonValue;
|
|
10
|
+
} | {
|
|
11
|
+
type: 'contains';
|
|
12
|
+
path: string;
|
|
13
|
+
value: string;
|
|
14
|
+
caseInsensitive?: boolean;
|
|
15
|
+
} | {
|
|
16
|
+
type: 'attribute-equality';
|
|
17
|
+
leftPath: string;
|
|
18
|
+
rightPath: string;
|
|
19
|
+
};
|
|
20
|
+
export interface ScorerTarget {
|
|
21
|
+
input: unknown;
|
|
22
|
+
output: unknown;
|
|
23
|
+
expected?: unknown;
|
|
24
|
+
context?: unknown[];
|
|
25
|
+
}
|
|
26
|
+
export interface ScorerResult {
|
|
27
|
+
score: number;
|
|
28
|
+
passed: boolean;
|
|
29
|
+
evidence?: JsonValue;
|
|
30
|
+
}
|
|
31
|
+
export interface PromptCandidate<I = unknown> {
|
|
32
|
+
id: string;
|
|
33
|
+
prompt: string;
|
|
34
|
+
metadata?: Record<string, JsonValue>;
|
|
35
|
+
}
|
|
36
|
+
export interface EvaluationItem<I = unknown> {
|
|
37
|
+
id: string;
|
|
38
|
+
input: I;
|
|
39
|
+
expected?: unknown;
|
|
40
|
+
context?: unknown[];
|
|
41
|
+
}
|
|
42
|
+
export interface CandidateScore {
|
|
43
|
+
candidateId: string;
|
|
44
|
+
meanScore: number;
|
|
45
|
+
passRate: number;
|
|
46
|
+
itemCount: number;
|
|
47
|
+
scorerCount: number;
|
|
48
|
+
}
|
|
49
|
+
export interface EvaluatePromptCandidatesInput<I = unknown> {
|
|
50
|
+
candidates: PromptCandidate<I>[];
|
|
51
|
+
items: EvaluationItem<I>[];
|
|
52
|
+
scorer: (target: ScorerTarget, signal: AbortSignal) => Promise<ScorerResult>;
|
|
53
|
+
runCandidate: (candidate: PromptCandidate<I>, item: EvaluationItem<I>, signal: AbortSignal) => Promise<unknown>;
|
|
54
|
+
signal: AbortSignal;
|
|
55
|
+
}
|
|
56
|
+
export declare function evaluateDeterministicScorer(definition: DeterministicScorerDefinition, target: ScorerTarget): ScorerResult;
|
|
57
|
+
export declare function evaluatePromptCandidates<I = unknown>(input: EvaluatePromptCandidatesInput<I>): Promise<CandidateScore[]>;
|
|
@@ -0,0 +1,181 @@
|
|
|
1
|
+
import { ValidationError } from '../errors/index.js';
|
|
2
|
+
export function evaluateDeterministicScorer(definition, target) {
|
|
3
|
+
switch (definition.type) {
|
|
4
|
+
case 'regex': {
|
|
5
|
+
const selected = readPointer(target.output, definition.path);
|
|
6
|
+
if (!selected.found)
|
|
7
|
+
return missingPointer(definition.path);
|
|
8
|
+
return binary(new RegExp(definition.pattern, definition.flags ?? '').test(String(selected.value)));
|
|
9
|
+
}
|
|
10
|
+
case 'contains': {
|
|
11
|
+
const selected = readPointer(target.output, definition.path);
|
|
12
|
+
if (!selected.found)
|
|
13
|
+
return missingPointer(definition.path);
|
|
14
|
+
const haystack = String(selected.value);
|
|
15
|
+
const needle = definition.value;
|
|
16
|
+
return binary(definition.caseInsensitive
|
|
17
|
+
? haystack.toLocaleLowerCase().includes(needle.toLocaleLowerCase())
|
|
18
|
+
: haystack.includes(needle));
|
|
19
|
+
}
|
|
20
|
+
case 'attribute-equality': {
|
|
21
|
+
const left = readPointer(target.output, definition.leftPath);
|
|
22
|
+
if (!left.found)
|
|
23
|
+
return missingPointer(definition.leftPath);
|
|
24
|
+
const right = readPointer(target.output, definition.rightPath);
|
|
25
|
+
if (!right.found)
|
|
26
|
+
return missingPointer(definition.rightPath);
|
|
27
|
+
return deepEqual(left.value, right.value)
|
|
28
|
+
? binary(true)
|
|
29
|
+
: { score: 0, passed: false, evidence: { left: toJsonValue(left.value), right: toJsonValue(right.value) } };
|
|
30
|
+
}
|
|
31
|
+
case 'json-schema': {
|
|
32
|
+
const result = validateJsonSchema(definition.schema, target.output);
|
|
33
|
+
return result.passed
|
|
34
|
+
? binary(true)
|
|
35
|
+
: { score: 0, passed: false, evidence: { reason: 'schema_validation_failed', issues: result.issues } };
|
|
36
|
+
}
|
|
37
|
+
}
|
|
38
|
+
}
|
|
39
|
+
export async function evaluatePromptCandidates(input) {
|
|
40
|
+
if (input.candidates.length === 0) {
|
|
41
|
+
throw new ValidationError('At least one prompt candidate is required.', { where: 'eval_input', issues: { candidates: 'empty' } });
|
|
42
|
+
}
|
|
43
|
+
if (input.items.length === 0) {
|
|
44
|
+
throw new ValidationError('At least one evaluation item is required.', { where: 'eval_input', issues: { items: 'empty' } });
|
|
45
|
+
}
|
|
46
|
+
const scores = [];
|
|
47
|
+
for (const candidate of input.candidates) {
|
|
48
|
+
input.signal.throwIfAborted();
|
|
49
|
+
let total = 0;
|
|
50
|
+
let passed = 0;
|
|
51
|
+
let scorerCount = 0;
|
|
52
|
+
for (const item of input.items) {
|
|
53
|
+
input.signal.throwIfAborted();
|
|
54
|
+
const output = await input.runCandidate(candidate, item, input.signal);
|
|
55
|
+
const target = {
|
|
56
|
+
input: item.input,
|
|
57
|
+
output
|
|
58
|
+
};
|
|
59
|
+
if (item.expected !== undefined)
|
|
60
|
+
target.expected = item.expected;
|
|
61
|
+
if (item.context !== undefined)
|
|
62
|
+
target.context = item.context;
|
|
63
|
+
const result = await input.scorer(target, input.signal);
|
|
64
|
+
total += result.score;
|
|
65
|
+
passed += result.passed ? 1 : 0;
|
|
66
|
+
scorerCount += 1;
|
|
67
|
+
}
|
|
68
|
+
scores.push({
|
|
69
|
+
candidateId: candidate.id,
|
|
70
|
+
meanScore: total / scorerCount,
|
|
71
|
+
passRate: passed / scorerCount,
|
|
72
|
+
itemCount: input.items.length,
|
|
73
|
+
scorerCount
|
|
74
|
+
});
|
|
75
|
+
}
|
|
76
|
+
return scores.sort((a, b) => {
|
|
77
|
+
if (a.meanScore !== b.meanScore)
|
|
78
|
+
return b.meanScore - a.meanScore;
|
|
79
|
+
if (a.passRate !== b.passRate)
|
|
80
|
+
return b.passRate - a.passRate;
|
|
81
|
+
return a.candidateId.localeCompare(b.candidateId);
|
|
82
|
+
});
|
|
83
|
+
}
|
|
84
|
+
function binary(passed) {
|
|
85
|
+
return { score: passed ? 1 : 0, passed };
|
|
86
|
+
}
|
|
87
|
+
function missingPointer(path) {
|
|
88
|
+
return { score: 0, passed: false, evidence: { reason: 'missing_pointer', path } };
|
|
89
|
+
}
|
|
90
|
+
function readPointer(value, pointer) {
|
|
91
|
+
if (pointer === '')
|
|
92
|
+
return { found: true, value };
|
|
93
|
+
if (!pointer.startsWith('/'))
|
|
94
|
+
return { found: false };
|
|
95
|
+
let current = value;
|
|
96
|
+
for (const rawPart of pointer.slice(1).split('/')) {
|
|
97
|
+
const part = rawPart.replace(/~1/g, '/').replace(/~0/g, '~');
|
|
98
|
+
if (Array.isArray(current)) {
|
|
99
|
+
const index = Number(part);
|
|
100
|
+
if (!Number.isInteger(index) || index < 0 || index >= current.length)
|
|
101
|
+
return { found: false };
|
|
102
|
+
current = current[index];
|
|
103
|
+
continue;
|
|
104
|
+
}
|
|
105
|
+
if (!isRecord(current) || !(part in current))
|
|
106
|
+
return { found: false };
|
|
107
|
+
current = current[part];
|
|
108
|
+
}
|
|
109
|
+
return { found: true, value: current };
|
|
110
|
+
}
|
|
111
|
+
function validateJsonSchema(schema, value) {
|
|
112
|
+
const issues = [];
|
|
113
|
+
validateSchemaAt(schema, value, '', issues);
|
|
114
|
+
return { passed: issues.length === 0, issues };
|
|
115
|
+
}
|
|
116
|
+
function validateSchemaAt(schema, value, path, issues) {
|
|
117
|
+
if (!isRecord(schema))
|
|
118
|
+
return;
|
|
119
|
+
if ('const' in schema && !deepEqual(value, schema['const'])) {
|
|
120
|
+
issues.push({ path, reason: 'const', expected: toJsonValue(schema['const']), actual: toJsonValue(value) });
|
|
121
|
+
return;
|
|
122
|
+
}
|
|
123
|
+
if (Array.isArray(schema['enum']) && !schema['enum'].some((entry) => deepEqual(entry, value))) {
|
|
124
|
+
issues.push({ path, reason: 'enum', actual: toJsonValue(value) });
|
|
125
|
+
return;
|
|
126
|
+
}
|
|
127
|
+
const type = typeof schema['type'] === 'string' ? schema['type'] : undefined;
|
|
128
|
+
if (type && !matchesType(value, type)) {
|
|
129
|
+
issues.push({ path, reason: 'type', expected: type, actual: typeof value });
|
|
130
|
+
return;
|
|
131
|
+
}
|
|
132
|
+
if (type === 'object' || schema['properties']) {
|
|
133
|
+
if (!isRecord(value)) {
|
|
134
|
+
issues.push({ path, reason: 'type', expected: 'object', actual: typeof value });
|
|
135
|
+
return;
|
|
136
|
+
}
|
|
137
|
+
const required = Array.isArray(schema['required']) ? schema['required'].filter((entry) => typeof entry === 'string') : [];
|
|
138
|
+
for (const key of required) {
|
|
139
|
+
if (!(key in value))
|
|
140
|
+
issues.push({ path: `${path}/${key}`, reason: 'required' });
|
|
141
|
+
}
|
|
142
|
+
const properties = isRecord(schema['properties']) ? schema['properties'] : {};
|
|
143
|
+
for (const [key, childSchema] of Object.entries(properties)) {
|
|
144
|
+
if (key in value)
|
|
145
|
+
validateSchemaAt(childSchema, value[key], `${path}/${key}`, issues);
|
|
146
|
+
}
|
|
147
|
+
if (schema['additionalProperties'] === false) {
|
|
148
|
+
for (const key of Object.keys(value)) {
|
|
149
|
+
if (!(key in properties))
|
|
150
|
+
issues.push({ path: `${path}/${key}`, reason: 'additional_properties' });
|
|
151
|
+
}
|
|
152
|
+
}
|
|
153
|
+
}
|
|
154
|
+
}
|
|
155
|
+
function matchesType(value, type) {
|
|
156
|
+
switch (type) {
|
|
157
|
+
case 'object': return isRecord(value);
|
|
158
|
+
case 'array': return Array.isArray(value);
|
|
159
|
+
case 'string': return typeof value === 'string';
|
|
160
|
+
case 'number': return typeof value === 'number' && Number.isFinite(value);
|
|
161
|
+
case 'integer': return Number.isInteger(value);
|
|
162
|
+
case 'boolean': return typeof value === 'boolean';
|
|
163
|
+
case 'null': return value === null;
|
|
164
|
+
default: return true;
|
|
165
|
+
}
|
|
166
|
+
}
|
|
167
|
+
function isRecord(value) {
|
|
168
|
+
return value !== null && typeof value === 'object' && !Array.isArray(value);
|
|
169
|
+
}
|
|
170
|
+
function deepEqual(a, b) {
|
|
171
|
+
return JSON.stringify(a) === JSON.stringify(b);
|
|
172
|
+
}
|
|
173
|
+
function toJsonValue(value) {
|
|
174
|
+
if (value === null || typeof value === 'string' || typeof value === 'number' || typeof value === 'boolean')
|
|
175
|
+
return value;
|
|
176
|
+
if (Array.isArray(value))
|
|
177
|
+
return value.map((entry) => toJsonValue(entry));
|
|
178
|
+
if (isRecord(value))
|
|
179
|
+
return Object.fromEntries(Object.entries(value).map(([key, entry]) => [key, toJsonValue(entry)]));
|
|
180
|
+
return String(value);
|
|
181
|
+
}
|
|
@@ -2,10 +2,12 @@ import { z } from 'zod';
|
|
|
2
2
|
import { type Logger } from '../logger/index.js';
|
|
3
3
|
import type { ModelAlias, ModelCapability, TokenUsage } from '../ports/model-provider.js';
|
|
4
4
|
import type { StateStore } from '../ports/state.js';
|
|
5
|
-
import type { TelemetryShim } from '../telemetry/index.js';
|
|
5
|
+
import type { Metrics, TelemetryShim } from '../telemetry/index.js';
|
|
6
6
|
import type { HarnessAdapterContext } from '../ports/harness-context.js';
|
|
7
|
+
import type { MemoryAdapter, MemoryFacade, SessionMemory } from '../ports/memory.js';
|
|
7
8
|
import type { JsonValue } from '../models/json.js';
|
|
8
9
|
import type { Message } from '../models/state.js';
|
|
10
|
+
import type { RunStatus } from '../models/state.js';
|
|
9
11
|
import type { HarnessError } from '../errors/harness-error.js';
|
|
10
12
|
import { type Sandbox } from '../sandbox/index.js';
|
|
11
13
|
import type { ModelHandle } from '../models/registry.js';
|
|
@@ -13,12 +15,13 @@ import { type AdapterCapability, type DurableRuntimeAdapter, type HarnessInspect
|
|
|
13
15
|
/** Stable harness version string for diagnostics and generated documentation. */
|
|
14
16
|
export declare const HARNESS_VERSION = "0.0.0";
|
|
15
17
|
/** OpenTelemetry capture controls used by the harness. */
|
|
18
|
+
export type TelemetryFlavor = 'dual' | 'gen_ai_only' | 'openinference_only';
|
|
19
|
+
export type ContentCaptureMode = 'NO_CONTENT' | 'SPAN_ONLY' | 'EVENT_ONLY' | 'SPAN_AND_EVENT';
|
|
16
20
|
export interface TelemetryOptions {
|
|
17
|
-
/**
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
captureContent?: boolean;
|
|
21
|
+
/** Backend emission shape. */
|
|
22
|
+
flavor?: TelemetryFlavor;
|
|
23
|
+
/** Span/event content capture mode. */
|
|
24
|
+
contentCaptureMode?: ContentCaptureMode;
|
|
22
25
|
}
|
|
23
26
|
/** Default harness budgets and execution behavior. */
|
|
24
27
|
export interface HarnessDefaults {
|
|
@@ -51,6 +54,12 @@ export interface InvokeOptions {
|
|
|
51
54
|
timeoutMs?: number;
|
|
52
55
|
/** Optional history-window override for this call only. */
|
|
53
56
|
historyWindow?: number;
|
|
57
|
+
/** Optional W3C Trace Context parent. */
|
|
58
|
+
traceparent?: string;
|
|
59
|
+
/** Optional W3C Trace Context state. */
|
|
60
|
+
tracestate?: string;
|
|
61
|
+
/** Scalar metadata exposed to handlers and telemetry sanitizers. */
|
|
62
|
+
metadata?: Record<string, JsonValue>;
|
|
54
63
|
}
|
|
55
64
|
/** Canonical built-in tool names provided by the harness. */
|
|
56
65
|
export type BuiltinToolName = 'bash' | 'read' | 'write' | 'edit' | 'glob' | 'grep' | 'list';
|
|
@@ -102,17 +111,6 @@ export interface ResolvedSkill {
|
|
|
102
111
|
/** Absolute directory mounted into `/skills/<name>`. */
|
|
103
112
|
directory: string;
|
|
104
113
|
}
|
|
105
|
-
/** Sandbox-backed per-session memory facade. */
|
|
106
|
-
export interface SessionMemory {
|
|
107
|
-
/** Reads `/memory/<key>.json` and returns the parsed JSON value if present. */
|
|
108
|
-
read<T = JsonValue>(key: string): Promise<T | undefined>;
|
|
109
|
-
/** Writes JSON-serializable data to `/memory/<key>.json`. */
|
|
110
|
-
write(key: string, value: JsonValue): Promise<void>;
|
|
111
|
-
/** Deletes `/memory/<key>.json` if it exists. */
|
|
112
|
-
delete(key: string): Promise<void>;
|
|
113
|
-
/** Lists known memory keys without the `.json` suffix. */
|
|
114
|
-
list(): Promise<string[]>;
|
|
115
|
-
}
|
|
116
114
|
/** Conversation history accessor for a single session thread. */
|
|
117
115
|
export interface ConversationHistory {
|
|
118
116
|
/** Returns persisted conversation messages for the session. */
|
|
@@ -127,6 +125,8 @@ export interface ToolHandlerContext {
|
|
|
127
125
|
sandbox: import('../sandbox/index.js').SandboxSession;
|
|
128
126
|
logger: Logger;
|
|
129
127
|
telemetry: TelemetryShim;
|
|
128
|
+
metrics: Metrics;
|
|
129
|
+
memory: MemoryFacade;
|
|
130
130
|
runId: string;
|
|
131
131
|
sessionId: string;
|
|
132
132
|
agentId: string;
|
|
@@ -258,7 +258,9 @@ export interface AgentContextMinimal<S extends BuilderState, I> {
|
|
|
258
258
|
sessionId: string;
|
|
259
259
|
runId: string;
|
|
260
260
|
history: ConversationHistory;
|
|
261
|
-
memory:
|
|
261
|
+
memory: MemoryFacade;
|
|
262
|
+
metadata: Readonly<Record<string, JsonValue>>;
|
|
263
|
+
metrics: Metrics;
|
|
262
264
|
}
|
|
263
265
|
/** Full context passed to workflow handlers. */
|
|
264
266
|
export interface WorkflowContext<S extends BuilderState, I, O> {
|
|
@@ -270,6 +272,9 @@ export interface WorkflowContext<S extends BuilderState, I, O> {
|
|
|
270
272
|
signal: AbortSignal;
|
|
271
273
|
runId: string;
|
|
272
274
|
sessionId: string;
|
|
275
|
+
metadata: Readonly<Record<string, JsonValue>>;
|
|
276
|
+
memory: MemoryFacade;
|
|
277
|
+
metrics: Metrics;
|
|
273
278
|
output?: O;
|
|
274
279
|
}
|
|
275
280
|
/** Full context passed to custom agent handlers. */
|
|
@@ -442,6 +447,7 @@ export interface Session<S extends BuilderState> {
|
|
|
442
447
|
};
|
|
443
448
|
memory: SessionMemory;
|
|
444
449
|
history: ConversationHistory;
|
|
450
|
+
getRunSummary(runId: string): Promise<RunSummary | undefined>;
|
|
445
451
|
clearHistory(): Promise<void>;
|
|
446
452
|
replaceHistory(messages: ReadonlyArray<Omit<Message, 'id' | 'timestamp'>>): Promise<void>;
|
|
447
453
|
close(): Promise<void>;
|
|
@@ -454,6 +460,18 @@ export interface SerializedError {
|
|
|
454
460
|
message: string;
|
|
455
461
|
meta?: Record<string, unknown>;
|
|
456
462
|
}
|
|
463
|
+
export interface RunSummary {
|
|
464
|
+
runId: string;
|
|
465
|
+
sessionId: string;
|
|
466
|
+
status: RunStatus;
|
|
467
|
+
startedAt: string;
|
|
468
|
+
finishedAt?: string;
|
|
469
|
+
tokenTotals: TokenUsage;
|
|
470
|
+
modelCalls: number;
|
|
471
|
+
toolCalls: number;
|
|
472
|
+
agentCalls: number;
|
|
473
|
+
error?: SerializedError;
|
|
474
|
+
}
|
|
457
475
|
/** Harness streaming events emitted from `session.workflows.<id>.stream(...)`. */
|
|
458
476
|
export type RunEvent = {
|
|
459
477
|
type: 'run.started';
|
|
@@ -512,6 +530,7 @@ export type RunEvent = {
|
|
|
512
530
|
runId: string;
|
|
513
531
|
agentId?: string;
|
|
514
532
|
object: JsonValue;
|
|
533
|
+
usage?: TokenUsage;
|
|
515
534
|
} | {
|
|
516
535
|
type: 'model.embedding.completed';
|
|
517
536
|
runId: string;
|
|
@@ -538,6 +557,7 @@ export interface HarnessBuilder<S extends BuilderState = {}> {
|
|
|
538
557
|
logger(logger: Logger): HarnessBuilder<S>;
|
|
539
558
|
state(store: StateStore): HarnessBuilder<S>;
|
|
540
559
|
sandbox(sandbox?: Sandbox<any>): HarnessBuilder<S>;
|
|
560
|
+
memory(adapter: MemoryAdapter): HarnessBuilder<S>;
|
|
541
561
|
runtime(runtime: DurableRuntimeAdapter): HarnessBuilder<S>;
|
|
542
562
|
requires(capabilities: readonly AdapterCapability[]): HarnessBuilder<S>;
|
|
543
563
|
defaults(defaults: HarnessDefaults): HarnessBuilder<S>;
|
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
import { z } from 'zod';
|
|
2
2
|
import { JsonLogger } from '../logger/index.js';
|
|
3
|
+
import { sandboxMemory } from '../memory/sandbox/index.js';
|
|
4
|
+
import { validateMemoryAdapter } from '../ports/memory.js';
|
|
3
5
|
import { InMemoryStateStore } from '../state/in-memory.js';
|
|
4
6
|
import { HarnessConfigError } from '../errors/catalog.js';
|
|
5
7
|
import { autoDetectSandbox } from '../sandbox/index.js';
|
|
@@ -26,6 +28,13 @@ class Builder {
|
|
|
26
28
|
sandbox(sandbox = autoDetectSandbox()) {
|
|
27
29
|
return this.clone({ sandbox });
|
|
28
30
|
}
|
|
31
|
+
memory(memory) {
|
|
32
|
+
if (this.configured.memory) {
|
|
33
|
+
throw new HarnessConfigError('Memory adapter is already configured.', { reason: 'duplicate_adapter', path: 'memory' });
|
|
34
|
+
}
|
|
35
|
+
validateMemoryAdapter(memory);
|
|
36
|
+
return this.clone({ memory });
|
|
37
|
+
}
|
|
29
38
|
runtime(runtime) {
|
|
30
39
|
return this.clone({ runtime });
|
|
31
40
|
}
|
|
@@ -68,7 +77,9 @@ class Builder {
|
|
|
68
77
|
throw new HarnessConfigError('At least one model alias is required.', { reason: 'missing_models', path: 'models' });
|
|
69
78
|
}
|
|
70
79
|
const sandbox = this.configured.sandbox ?? autoDetectSandbox();
|
|
71
|
-
const
|
|
80
|
+
const memory = this.configured.memory ?? sandboxMemory();
|
|
81
|
+
validateMemoryAdapter(memory);
|
|
82
|
+
const inspection = this.resolveInspection(this.options.name ?? 'agent-harness', sandbox, memory, models);
|
|
72
83
|
const missing = missingCapabilities(inspection.requiredCapabilities, inspection.capabilities);
|
|
73
84
|
if (missing.length > 0) {
|
|
74
85
|
throw new HarnessConfigError('Required adapter capabilities are not available.', {
|
|
@@ -83,6 +94,7 @@ class Builder {
|
|
|
83
94
|
...(this.configured.telemetry ? { telemetry: this.configured.telemetry } : {}),
|
|
84
95
|
state: this.configured.state ?? new InMemoryStateStore(),
|
|
85
96
|
sandbox,
|
|
97
|
+
memory,
|
|
86
98
|
defaults: {
|
|
87
99
|
agentMaxIterations: this.configured.defaults?.agentMaxIterations ?? 16,
|
|
88
100
|
runTimeoutMs: this.configured.defaults?.runTimeoutMs ?? 600_000,
|
|
@@ -103,7 +115,7 @@ class Builder {
|
|
|
103
115
|
clone(patch) {
|
|
104
116
|
return new Builder(this.options, { ...this.configured, ...patch });
|
|
105
117
|
}
|
|
106
|
-
resolveInspection(name, sandbox, models) {
|
|
118
|
+
resolveInspection(name, sandbox, memory, models) {
|
|
107
119
|
const adapters = [];
|
|
108
120
|
const sandboxCapabilities = hasAdapterCapabilities(sandbox) ? uniqueCapabilities(sandbox.capabilities) : [];
|
|
109
121
|
adapters.push({
|
|
@@ -111,6 +123,15 @@ class Builder {
|
|
|
111
123
|
id: getAdapterId(sandbox, 'sandbox'),
|
|
112
124
|
capabilities: sandboxCapabilities
|
|
113
125
|
});
|
|
126
|
+
adapters.push({
|
|
127
|
+
kind: 'memory',
|
|
128
|
+
id: memory.info.id,
|
|
129
|
+
capabilities: uniqueCapabilities(memory.info.capabilities),
|
|
130
|
+
metadata: {
|
|
131
|
+
packageName: memory.info.packageName,
|
|
132
|
+
...(memory.info.version ? { version: memory.info.version } : {})
|
|
133
|
+
}
|
|
134
|
+
});
|
|
114
135
|
if (this.configured.runtime) {
|
|
115
136
|
adapters.push({
|
|
116
137
|
kind: 'runtime',
|
package/dist/index.d.ts
CHANGED
|
@@ -9,6 +9,8 @@ export * from './state/in-memory.js';
|
|
|
9
9
|
export * from './models/json.js';
|
|
10
10
|
export type { SessionRecord, Message, RunRecord, PersistedRunEvent, RunStatus } from './models/state.js';
|
|
11
11
|
export * from './models/registry.js';
|
|
12
|
+
export * from './eval/index.js';
|
|
13
|
+
export * from './memory/sandbox/index.js';
|
|
12
14
|
export * from './sandbox/index.js';
|
|
13
15
|
export * from './tools/mcp/index.js';
|
|
14
16
|
export * from './harness/defineHarness.js';
|
package/dist/index.js
CHANGED
|
@@ -7,6 +7,8 @@ export { createDurableWorkflowContext, DurableStepError, DurableRunLeaseError, D
|
|
|
7
7
|
export * from './state/in-memory.js';
|
|
8
8
|
export * from './models/json.js';
|
|
9
9
|
export * from './models/registry.js';
|
|
10
|
+
export * from './eval/index.js';
|
|
11
|
+
export * from './memory/sandbox/index.js';
|
|
10
12
|
export * from './sandbox/index.js';
|
|
11
13
|
export * from './tools/mcp/index.js';
|
|
12
14
|
export * from './harness/defineHarness.js';
|