npm - @purista/harness - Versions diffs - 1.0.0 → 1.2.0 - Mend

@purista/harness 1.0.0 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (50)

package/README.md +15 -0
package/dist/agents/index.d.ts +5 -3
package/dist/agents/index.js +84 -8
package/dist/errors/catalog.d.ts +45 -5
package/dist/errors/catalog.js +19 -0
package/dist/errors/harness-error.d.ts +2 -0
package/dist/eval/index.d.ts +57 -0
package/dist/eval/index.js +181 -0
package/dist/harness/defineHarness.d.ts +96 -20
package/dist/harness/defineHarness.js +59 -2
package/dist/index.d.ts +4 -0
package/dist/index.js +4 -0
package/dist/memory/sandbox/index.d.ts +17 -0
package/dist/memory/sandbox/index.js +122 -0
package/dist/models/registry.js +32 -7
package/dist/ports/capabilities.d.ts +46 -2
package/dist/ports/harness-context.d.ts +4 -1
package/dist/ports/index.d.ts +2 -0
package/dist/ports/index.js +2 -0
package/dist/ports/memory/facade.d.ts +5 -0
package/dist/ports/memory/facade.js +123 -0
package/dist/ports/memory/telemetry.d.ts +16 -0
package/dist/ports/memory/telemetry.js +77 -0
package/dist/ports/memory/types.d.ts +204 -0
package/dist/ports/memory/types.js +1 -0
package/dist/ports/memory/validation.d.ts +19 -0
package/dist/ports/memory/validation.js +160 -0
package/dist/ports/memory.d.ts +3 -0
package/dist/ports/memory.js +3 -0
package/dist/ports/workspace.d.ts +177 -0
package/dist/ports/workspace.js +32 -0
package/dist/runtime/durable.d.ts +3 -0
package/dist/runtime/durable.js +2 -1
package/dist/sessions/index.d.ts +2 -0
package/dist/sessions/index.js +275 -68
package/dist/skills/index.d.ts +2 -1
package/dist/skills/index.js +263 -35
package/dist/telemetry/shim.d.ts +20 -0
package/dist/telemetry/shim.js +28 -0
package/dist/testing/durableWorkspaceStoreContract.d.ts +3 -0
package/dist/testing/durableWorkspaceStoreContract.js +41 -0
package/dist/testing/fakeMemoryAdapter.d.ts +16 -0
package/dist/testing/fakeMemoryAdapter.js +110 -0
package/dist/testing/index.d.ts +5 -0
package/dist/testing/index.js +4 -0
package/dist/workspace/in-memory.d.ts +35 -0
package/dist/workspace/in-memory.js +142 -0
package/dist/workspace/index.d.ts +1 -0
package/dist/workspace/index.js +1 -0
package/package.json +12 -6

package/README.md CHANGED

@@ -3,6 +3,21 @@
 Self-hosted enterprise agent harness for typed tools, agents, workflows, state,
 sandboxing, streaming, and OpenTelemetry instrumentation.
+The core package also exports provider-neutral eval helpers:
+- `evaluatePromptCandidates(...)` compares prompt candidates against a fixed
+  item set and deterministic or custom scorers.
+- `evaluateDeterministicScorer(...)` runs JSON Pointer based deterministic
+  scorer definitions without provider calls. It is exported from the main
+  package and re-exported from `@purista/harness/testing`.
+Telemetry defaults to dual GenAI and OpenInference attributes with no content
+capture. `InvokeOptions.traceparent` and `tracestate` accept inbound W3C Trace
+Context so application traces can parent harness run spans.
+See [Evaluating Prompts](https://github.com/puristajs/harness/blob/main/docs/guides/evaluating-prompts.md)
+for the execution model, scorer limits, and privacy behavior.
 ## Install
 ```bash

package/dist/agents/index.d.ts CHANGED

@@ -1,9 +1,10 @@
 import type { Logger } from '../logger/index.js';
 import type { JsonValue } from '../models/json.js';
 import type { Message } from '../models/state.js';
-import type { AgentDefinition, ResolvedSkill, RunEvent, SessionMemory, ToolsConfig } from '../harness/defineHarness.js';
+import type { AgentDefinition, ResolvedSkill, RunEvent, ToolsConfig } from '../harness/defineHarness.js';
+import type { MemoryFacade } from '../ports/memory.js';
 import type { SandboxSession } from '../sandbox/index.js';
-import type { TelemetryShim } from '../telemetry/index.js';
+import { type TelemetryShim } from '../telemetry/index.js';
 import { type McpRunnerRegistry } from '../tools/mcp/runner.js';
 export declare function runDefaultAgent(args: {
     harnessName: string;
@@ -19,7 +20,7 @@ export declare function runDefaultAgent(args: {
     customTools: ToolsConfig;
     mcpRegistry?: McpRunnerRegistry;
     session: SandboxSession;
-    memory: SessionMemory;
+    memory: MemoryFacade;
     mountedSkills: Set<string>;
     historyWindow?: number;
     maxSteps: number;
@@ -28,6 +29,7 @@ export declare function runDefaultAgent(args: {
     logger: Logger;
     telemetry: TelemetryShim;
     emitEvent?: (event: RunEvent) => Promise<void>;
+    metadata?: Readonly<Record<string, JsonValue>>;
 }): Promise<{
     output: JsonValue;
     emitted: Message[];

package/dist/agents/index.js CHANGED

@@ -1,6 +1,7 @@
 import { z } from 'zod';
 import { ATTR_GEN_AI_AGENT_ID, ATTR_GEN_AI_AGENT_NAME, ATTR_GEN_AI_TOOL_CALL_ID, ATTR_GEN_AI_TOOL_NAME, ATTR_GEN_AI_TOOL_TYPE } from '@opentelemetry/semantic-conventions/incubating';
-import { AgentLoopBudgetError, HarnessError, OperationCancelledError, OperationTimeoutError, PermissionDeniedError, ToolError, ToolNotFoundError, ValidationError, serializeError } from '../errors/index.js';
+import { AgentLoopBudgetError, HarnessConfigError, HarnessError, OperationCancelledError, OperationTimeoutError, PermissionDeniedError, ToolError, ToolNotFoundError, ValidationError, serializeError } from '../errors/index.js';
+import { createMetrics } from '../telemetry/index.js';
 import { buildSkillIndex, mountSkillsOnce } from '../skills/index.js';
 import { BUILTIN_ALIAS_TO_CANONICAL, getBuiltinToolSpecs, invokeBuiltinTool } from '../tools/index.js';
 import { getMcpToolSpecs, invokeMcpTool, isMcpToolDefinition } from '../tools/mcp/runner.js';
@@ -31,14 +32,40 @@ export async function runDefaultAgent(args) {
         'harness.run.id': args.runId,
         ...(args.workflowId ? { 'harness.workflow.id': args.workflowId } : {}),
         'harness.agent.id': args.agentId,
+        'gen_ai.operation.name': 'invoke_agent',
+        'openinference.span.kind': 'AGENT',
+        'metadata.agent_name': args.agentId,
+        'metadata.agent_id': args.agentId,
         [ATTR_GEN_AI_AGENT_NAME]: args.agentId,
-        [ATTR_GEN_AI_AGENT_ID]: args.runId,
+        [ATTR_GEN_AI_AGENT_ID]: args.agentId,
         'harness.agent.model': args.agent.model,
-        'harness.agent.has_handler': args.agent.handler !== undefined
+        'harness.agent.has_handler': args.agent.handler !== undefined,
+        ...metadataSpanAttrs(args.metadata)
     };
-    const execute = () => runDefaultAgentInner(args);
+    const metrics = createMetrics(args.telemetry, agentAttrs);
+    const execute = () => runDefaultAgentInner({ ...args, metrics });
     return args.telemetry.span(`invoke_agent ${args.agentId}`, agentAttrs, execute);
 }
+function metadataSpanAttrs(metadata) {
+    const attrs = {};
+    for (const [key, value] of Object.entries(metadata ?? {})) {
+        if (!/^[a-zA-Z][a-zA-Z0-9_.-]{0,63}$/.test(key))
+            continue;
+        if (typeof value === 'string') {
+            if (value.length <= 256)
+                attrs[`harness.metadata.${key}`] = value;
+            continue;
+        }
+        if (typeof value === 'number' && Number.isFinite(value)) {
+            attrs[`harness.metadata.${key}`] = value;
+            continue;
+        }
+        if (typeof value === 'boolean') {
+            attrs[`harness.metadata.${key}`] = value;
+        }
+    }
+    return attrs;
+}
 async function runDefaultAgentInner(args) {
     args.signal.throwIfAborted();
     const inputSchema = args.agent.input ?? z.string();
@@ -49,6 +76,7 @@ async function runDefaultAgentInner(args) {
         throw new ValidationError('Unknown model alias', { where: 'agent_input', issues: { model: args.agent.model } });
     const skillIds = args.agent.skills ?? [];
     await mountSkillsOnce(args.session, args.mountedSkills, args.skills, skillIds);
+    const activatedSkills = new Set();
     if (args.agent.handler) {
         const output = await args.agent.handler({
             input: parsedInput,
@@ -57,16 +85,25 @@ async function runDefaultAgentInner(args) {
             runId: args.runId,
             sessionId: args.sessionId,
             history: { list: async () => args.history },
-            memory: args.memory
+            memory: args.memory,
+            metadata: args.metadata ?? {},
+            metrics: args.metrics
         });
         const validated = parseAgentSchema(outputSchema, output, 'agent_output');
         return { output: validated, emitted: [{ id: `msg_${Date.now()}_a`, sessionId: args.sessionId, runId: args.runId, role: 'assistant', content: JSON.stringify(validated), timestamp: new Date().toISOString() }] };
     }
     const baseInstructions = typeof args.agent.instructions === 'function'
-        ? args.agent.instructions({ input: parsedInput, runId: args.runId, sessionId: args.sessionId, history: { list: async () => args.history }, memory: args.memory })
+        ? args.agent.instructions({ input: parsedInput, runId: args.runId, sessionId: args.sessionId, history: { list: async () => args.history }, memory: args.memory, metadata: args.metadata ?? {}, metrics: args.metrics })
         : args.agent.instructions;
     const instructions = `${baseInstructions}${buildSkillIndex(args.skills, skillIds)}`;
     const enabledBuiltins = args.agent.builtinTools === false ? [] : args.agent.builtinTools?.slice() ?? ['bash', 'read', 'write', 'edit', 'glob', 'grep', 'list'];
+    if (skillIds.length > 0 && !enabledBuiltins.includes('read')) {
+        throw new HarnessConfigError('Agents with skills require the read built-in tool for skill activation.', {
+            reason: 'skill_read_tool_missing',
+            path: `agents.${args.agentId}.builtinTools`,
+            id: args.agentId
+        });
+    }
     const builtinSpecs = getBuiltinToolSpecs(enabledBuiltins, args.session);
     const enabledCustomTools = new Set((args.agent.tools ?? []));
     const tsCustomSpecs = Object.entries(args.customTools)
@@ -117,6 +154,7 @@ async function runDefaultAgentInner(args) {
         if (toolCalls.length === 0) {
             const validated = parseAgentSchema(outputSchema, response.object, 'agent_output');
             emitted.push({ id: `msg_${Date.now()}_a`, sessionId: args.sessionId, runId: args.runId, role: 'assistant', content: JSON.stringify(validated), timestamp: new Date().toISOString() });
+            await args.emitEvent?.({ type: 'model.object', runId: args.runId, agentId: args.agentId, object: validated, usage: response.usage });
             await args.emitEvent?.({ type: 'agent.finished', runId: args.runId, agentId: args.agentId, at: new Date().toISOString(), output: validated });
             return { output: validated, emitted };
         }
@@ -141,7 +179,10 @@ async function runDefaultAgentInner(args) {
                         throw new PermissionDeniedError('Permission denied.', { tool_name: canonical, agent_id: args.agentId, reason: 'hook_deny' });
                     }
                     if (canonical in BUILTIN_ALIAS_TO_CANONICAL) {
-                        return { output: await withToolSignal(args.signal, args.toolTimeoutMs, (signal) => invokeBuiltinTool(canonical, input, withSandboxTelemetry(args, canonical), signal)) };
+                        const output = await withToolSignal(args.signal, args.toolTimeoutMs, (signal) => invokeBuiltinTool(canonical, input, withSandboxTelemetry(args, canonical), signal));
+                        if (canonical === 'read')
+                            markSkillActivation(input, args.skills, activatedSkills);
+                        return { output };
                     }
                     if (!enabledCustomTools.has(canonical)) {
                         throw new ToolNotFoundError('Tool is not allowed for this agent.', { tool_id: canonical, where: 'agent_allowlist' });
@@ -159,7 +200,25 @@ async function runDefaultAgentInner(args) {
                     }
                     const tsTool = tool;
                     const parsed = tsTool.input.parse(input);
-                    const out = await withToolSignal(args.signal, args.toolTimeoutMs, (signal) => tsTool.handler({ signal, sandbox: withSandboxTelemetry(args, canonical), logger: args.logger, telemetry: args.telemetry, runId: args.runId, sessionId: args.sessionId, agentId: args.agentId, toolId: canonical }, parsed));
+                    const out = await withToolSignal(args.signal, args.toolTimeoutMs, (signal) => tsTool.handler({
+                        signal,
+                        sandbox: withSandboxTelemetry(args, canonical),
+                        logger: args.logger,
+                        telemetry: args.telemetry,
+                        metrics: createMetrics(args.telemetry, {
+                            'harness.name': args.harnessName,
+                            'harness.session.id': args.sessionId,
+                            'harness.run.id': args.runId,
+                            ...(args.workflowId ? { 'harness.workflow.id': args.workflowId } : {}),
+                            'harness.agent.id': args.agentId,
+                            'harness.tool.id': canonical
+                        }),
+                        memory: args.memory,
+                        runId: args.runId,
+                        sessionId: args.sessionId,
+                        agentId: args.agentId,
+                        toolId: canonical
+                    }, parsed));
                     return { output: tsTool.output.parse(out) };
                 });
             }
@@ -182,6 +241,19 @@ async function runDefaultAgentInner(args) {
         steps += 1;
     }
 }
+function markSkillActivation(input, skills, activated) {
+    if (!input || typeof input !== 'object')
+        return;
+    const readPath = input.path;
+    if (typeof readPath !== 'string')
+        return;
+    for (const skill of Object.values(skills)) {
+        if (readPath === `${skill.mountPath}/SKILL.md`) {
+            activated.add(skill.name);
+            return;
+        }
+    }
+}
 async function withToolSignal(parent, timeoutMs, fn) {
     parent.throwIfAborted();
     const controller = new AbortController();
@@ -226,6 +298,10 @@ async function withToolSpan(args, toolId, callId, toolKind, mcpAttrs, fn) {
         ...(args.workflowId ? { 'harness.workflow.id': args.workflowId } : {}),
         'harness.agent.id': args.agentId,
         'harness.tool.id': toolId,
+        'gen_ai.operation.name': 'execute_tool',
+        'openinference.span.kind': 'TOOL',
+        'tool.name': toolId,
+        'tool.call.id': callId,
         [ATTR_GEN_AI_TOOL_NAME]: toolId,
         [ATTR_GEN_AI_TOOL_CALL_ID]: callId,
         [ATTR_GEN_AI_TOOL_TYPE]: toolKind,

package/dist/errors/catalog.d.ts CHANGED

@@ -22,9 +22,14 @@ export declare class ValidationError extends HarnessError {
         /** Model provider response shape is invalid. */  | 'model_response'
         /** Session memory key is invalid. */  | 'memory_key'
         /** Session memory value is invalid or non-serializable. */  | 'memory_value'
+        /** Session memory scope is invalid or unsupported. */  | 'memory_scope'
+        /** Session memory options are invalid or unsupported. */  | 'memory_write_options'
+        /** Session memory listing options are invalid. */  | 'memory_list_options'
+        /** Session memory search query is invalid. */  | 'memory_search_query'
         /** Message envelope validation failed. */  | 'message'
         /** Session history shape validation failed. */  | 'session_history'
-        /** Invocation options are invalid. */  | 'invoke_options';
+        /** Invocation options are invalid. */  | 'invoke_options'
+        /** Evaluation helper input is invalid. */  | 'eval_input';
         issues: unknown;
     }, cause?: unknown);
 }
@@ -99,8 +104,9 @@ export declare class SkillNotFoundError extends HarnessError {
 export declare class SkillManifestError extends HarnessError {
     constructor(message: string, meta: {
         directory: string;
-        reason: 'missing_skill_md' | 'invalid_frontmatter' | 'name_mismatch' | 'directory_missing' | 'reserved_name';
+        reason: 'missing_skill_md' | 'invalid_frontmatter' | 'missing_description' | 'invalid_name' | 'name_mismatch' | 'directory_missing' | 'collision_shadowed' | 'untrusted_project_skill' | 'scan_limit_reached' | 'reserved_name';
         skill_id?: string;
+        source?: string;
     }, cause?: unknown);
 }
 /** Workflow referenced an unknown agent id. */
@@ -139,21 +145,55 @@ export declare class SessionBusyError extends HarnessError {
 /** State backend operation failed. */
 export declare class StateError extends HarnessError {
     constructor(message: string, meta: {
-        op: 'getSession' | 'upsertSession' | 'closeSession' | 'appendMessages' | 'listMessages' | 'clearMessages' | 'createRun' | 'finishRun' | 'getRun' | 'listRuns' | 'appendEvents' | 'listEvents';
+        op: 'getSession' | 'upsertSession' | 'closeSession' | 'appendMessages' | 'listMessages' | 'clearMessages' | 'createRun' | 'finishRun' | 'getRun' | 'listRuns' | 'appendEvents' | 'listEvents' | 'memory.get' | 'memory.set' | 'memory.delete' | 'memory.list' | 'memory.search';
         reason?: 'duplicate_message_id' | string;
+        adapter?: 'memory' | string;
+        memory_provider?: string;
+    }, cause?: unknown);
+}
+/** Durable workspace lifecycle, consistency, inspection, or backend failure. */
+export declare class WorkspaceError extends HarnessError {
+    constructor(message: string, meta: {
+        reason: 'idempotency_conflict' | 'not_found' | 'aborted' | 'expired' | 'missing_checkpoint' | 'backend_failure' | 'unsupported_operation' | 'invalid_reference' | 'checkpoint_conflict' | 'cleanup_pending';
+        workspace_ref?: string;
+        checkpoint_ref?: string;
+        snapshot_ref?: string;
+        run_id?: string;
+        session_id?: string;
+    }, cause?: unknown);
+}
+/** Durable workspace quota would be or was exceeded. */
+export declare class WorkspaceQuotaExceededError extends HarnessError {
+    constructor(message: string, meta: {
+        quota: string;
+        limit?: number;
+        actual?: number;
+        partial?: boolean;
+        workspace_ref?: string;
+        run_id?: string;
+        session_id?: string;
+    }, cause?: unknown);
+}
+/** Durable workspace cleanup could not complete in the current attempt. */
+export declare class WorkspaceCleanupError extends HarnessError {
+    constructor(message: string, meta: {
+        reason: 'backend_failure' | 'partial_delete' | 'invalid_reference';
+        workspace_ref: string;
+        remaining_refs?: readonly string[];
+        retry_after_ms?: number;
     }, cause?: unknown);
 }
 /** Timed execution budget expired. */
 export declare class OperationTimeoutError extends HarnessError {
     constructor(message: string, meta: {
-        scope: 'run' | 'model' | 'tool' | 'sandbox_run';
+        scope: 'run' | 'model' | 'tool' | 'sandbox_run' | 'memory' | 'workspace';
         timeout_ms: number;
     }, cause?: unknown);
 }
 /** Operation cancelled by abort signal or explicit cancellation path. */
 export declare class OperationCancelledError extends HarnessError {
     constructor(message: string, meta: {
-        scope: 'run' | 'workflow' | 'agent' | 'model' | 'tool' | 'sandbox';
+        scope: 'run' | 'workflow' | 'agent' | 'model' | 'tool' | 'sandbox' | 'memory' | 'workspace';
     }, cause?: unknown);
 }
 /** MCP transport/protocol failure. */

package/dist/errors/catalog.js CHANGED

@@ -111,6 +111,25 @@ export class StateError extends HarnessError {
         super({ code: 'STATE_ERROR', category: 'state', retriable: true, message, meta, cause });
     }
 }
+/** Durable workspace lifecycle, consistency, inspection, or backend failure. */
+export class WorkspaceError extends HarnessError {
+    constructor(message, meta, cause) {
+        const retriable = meta.reason === 'backend_failure' || meta.reason === 'cleanup_pending';
+        super({ code: 'WORKSPACE_ERROR', category: 'workspace', retriable, message, meta, cause });
+    }
+}
+/** Durable workspace quota would be or was exceeded. */
+export class WorkspaceQuotaExceededError extends HarnessError {
+    constructor(message, meta, cause) {
+        super({ code: 'WORKSPACE_QUOTA_EXCEEDED', category: 'workspace', retriable: false, message, meta, cause });
+    }
+}
+/** Durable workspace cleanup could not complete in the current attempt. */
+export class WorkspaceCleanupError extends HarnessError {
+    constructor(message, meta, cause) {
+        super({ code: 'WORKSPACE_CLEANUP_ERROR', category: 'workspace', retriable: true, message, meta, cause });
+    }
+}
 /** Timed execution budget expired. */
 export class OperationTimeoutError extends HarnessError {
     constructor(message, meta, cause) {

package/dist/errors/harness-error.d.ts CHANGED

@@ -22,6 +22,8 @@ export type ErrorCategory =
  | 'session'
 /** State-store persistence failures. */
  | 'state'
+/** Durable workspace lifecycle or backend failures. */
+ | 'workspace'
 /** Timeout budget failures. */
  | 'timeout'
 /** Cooperative cancellation events. */

package/dist/eval/index.d.ts ADDED

@@ -0,0 +1,57 @@
+import type { JsonValue } from '../models/json.js';
+export type DeterministicScorerDefinition = {
+    type: 'regex';
+    path: string;
+    pattern: string;
+    flags?: 'i' | 'm' | 'im';
+} | {
+    type: 'json-schema';
+    schema: JsonValue;
+} | {
+    type: 'contains';
+    path: string;
+    value: string;
+    caseInsensitive?: boolean;
+} | {
+    type: 'attribute-equality';
+    leftPath: string;
+    rightPath: string;
+};
+export interface ScorerTarget {
+    input: unknown;
+    output: unknown;
+    expected?: unknown;
+    context?: unknown[];
+}
+export interface ScorerResult {
+    score: number;
+    passed: boolean;
+    evidence?: JsonValue;
+}
+export interface PromptCandidate<I = unknown> {
+    id: string;
+    prompt: string;
+    metadata?: Record<string, JsonValue>;
+}
+export interface EvaluationItem<I = unknown> {
+    id: string;
+    input: I;
+    expected?: unknown;
+    context?: unknown[];
+}
+export interface CandidateScore {
+    candidateId: string;
+    meanScore: number;
+    passRate: number;
+    itemCount: number;
+    scorerCount: number;
+}
+export interface EvaluatePromptCandidatesInput<I = unknown> {
+    candidates: PromptCandidate<I>[];
+    items: EvaluationItem<I>[];
+    scorer: (target: ScorerTarget, signal: AbortSignal) => Promise<ScorerResult>;
+    runCandidate: (candidate: PromptCandidate<I>, item: EvaluationItem<I>, signal: AbortSignal) => Promise<unknown>;
+    signal: AbortSignal;
+}
+export declare function evaluateDeterministicScorer(definition: DeterministicScorerDefinition, target: ScorerTarget): ScorerResult;
+export declare function evaluatePromptCandidates<I = unknown>(input: EvaluatePromptCandidatesInput<I>): Promise<CandidateScore[]>;

package/dist/eval/index.js ADDED

@@ -0,0 +1,181 @@
+import { ValidationError } from '../errors/index.js';
+export function evaluateDeterministicScorer(definition, target) {
+    switch (definition.type) {
+        case 'regex': {
+            const selected = readPointer(target.output, definition.path);
+            if (!selected.found)
+                return missingPointer(definition.path);
+            return binary(new RegExp(definition.pattern, definition.flags ?? '').test(String(selected.value)));
+        }
+        case 'contains': {
+            const selected = readPointer(target.output, definition.path);
+            if (!selected.found)
+                return missingPointer(definition.path);
+            const haystack = String(selected.value);
+            const needle = definition.value;
+            return binary(definition.caseInsensitive
+                ? haystack.toLocaleLowerCase().includes(needle.toLocaleLowerCase())
+                : haystack.includes(needle));
+        }
+        case 'attribute-equality': {
+            const left = readPointer(target.output, definition.leftPath);
+            if (!left.found)
+                return missingPointer(definition.leftPath);
+            const right = readPointer(target.output, definition.rightPath);
+            if (!right.found)
+                return missingPointer(definition.rightPath);
+            return deepEqual(left.value, right.value)
+                ? binary(true)
+                : { score: 0, passed: false, evidence: { left: toJsonValue(left.value), right: toJsonValue(right.value) } };
+        }
+        case 'json-schema': {
+            const result = validateJsonSchema(definition.schema, target.output);
+            return result.passed
+                ? binary(true)
+                : { score: 0, passed: false, evidence: { reason: 'schema_validation_failed', issues: result.issues } };
+        }
+    }
+}
+export async function evaluatePromptCandidates(input) {
+    if (input.candidates.length === 0) {
+        throw new ValidationError('At least one prompt candidate is required.', { where: 'eval_input', issues: { candidates: 'empty' } });
+    }
+    if (input.items.length === 0) {
+        throw new ValidationError('At least one evaluation item is required.', { where: 'eval_input', issues: { items: 'empty' } });
+    }
+    const scores = [];
+    for (const candidate of input.candidates) {
+        input.signal.throwIfAborted();
+        let total = 0;
+        let passed = 0;
+        let scorerCount = 0;
+        for (const item of input.items) {
+            input.signal.throwIfAborted();
+            const output = await input.runCandidate(candidate, item, input.signal);
+            const target = {
+                input: item.input,
+                output
+            };
+            if (item.expected !== undefined)
+                target.expected = item.expected;
+            if (item.context !== undefined)
+                target.context = item.context;
+            const result = await input.scorer(target, input.signal);
+            total += result.score;
+            passed += result.passed ? 1 : 0;
+            scorerCount += 1;
+        }
+        scores.push({
+            candidateId: candidate.id,
+            meanScore: total / scorerCount,
+            passRate: passed / scorerCount,
+            itemCount: input.items.length,
+            scorerCount
+        });
+    }
+    return scores.sort((a, b) => {
+        if (a.meanScore !== b.meanScore)
+            return b.meanScore - a.meanScore;
+        if (a.passRate !== b.passRate)
+            return b.passRate - a.passRate;
+        return a.candidateId.localeCompare(b.candidateId);
+    });
+}
+function binary(passed) {
+    return { score: passed ? 1 : 0, passed };
+}
+function missingPointer(path) {
+    return { score: 0, passed: false, evidence: { reason: 'missing_pointer', path } };
+}
+function readPointer(value, pointer) {
+    if (pointer === '')
+        return { found: true, value };
+    if (!pointer.startsWith('/'))
+        return { found: false };
+    let current = value;
+    for (const rawPart of pointer.slice(1).split('/')) {
+        const part = rawPart.replace(/~1/g, '/').replace(/~0/g, '~');
+        if (Array.isArray(current)) {
+            const index = Number(part);
+            if (!Number.isInteger(index) || index < 0 || index >= current.length)
+                return { found: false };
+            current = current[index];
+            continue;
+        }
+        if (!isRecord(current) || !(part in current))
+            return { found: false };
+        current = current[part];
+    }
+    return { found: true, value: current };
+}
+function validateJsonSchema(schema, value) {
+    const issues = [];
+    validateSchemaAt(schema, value, '', issues);
+    return { passed: issues.length === 0, issues };
+}
+function validateSchemaAt(schema, value, path, issues) {
+    if (!isRecord(schema))
+        return;
+    if ('const' in schema && !deepEqual(value, schema['const'])) {
+        issues.push({ path, reason: 'const', expected: toJsonValue(schema['const']), actual: toJsonValue(value) });
+        return;
+    }
+    if (Array.isArray(schema['enum']) && !schema['enum'].some((entry) => deepEqual(entry, value))) {
+        issues.push({ path, reason: 'enum', actual: toJsonValue(value) });
+        return;
+    }
+    const type = typeof schema['type'] === 'string' ? schema['type'] : undefined;
+    if (type && !matchesType(value, type)) {
+        issues.push({ path, reason: 'type', expected: type, actual: typeof value });
+        return;
+    }
+    if (type === 'object' || schema['properties']) {
+        if (!isRecord(value)) {
+            issues.push({ path, reason: 'type', expected: 'object', actual: typeof value });
+            return;
+        }
+        const required = Array.isArray(schema['required']) ? schema['required'].filter((entry) => typeof entry === 'string') : [];
+        for (const key of required) {
+            if (!(key in value))
+                issues.push({ path: `${path}/${key}`, reason: 'required' });
+        }
+        const properties = isRecord(schema['properties']) ? schema['properties'] : {};
+        for (const [key, childSchema] of Object.entries(properties)) {
+            if (key in value)
+                validateSchemaAt(childSchema, value[key], `${path}/${key}`, issues);
+        }
+        if (schema['additionalProperties'] === false) {
+            for (const key of Object.keys(value)) {
+                if (!(key in properties))
+                    issues.push({ path: `${path}/${key}`, reason: 'additional_properties' });
+            }
+        }
+    }
+}
+function matchesType(value, type) {
+    switch (type) {
+        case 'object': return isRecord(value);
+        case 'array': return Array.isArray(value);
+        case 'string': return typeof value === 'string';
+        case 'number': return typeof value === 'number' && Number.isFinite(value);
+        case 'integer': return Number.isInteger(value);
+        case 'boolean': return typeof value === 'boolean';
+        case 'null': return value === null;
+        default: return true;
+    }
+}
+function isRecord(value) {
+    return value !== null && typeof value === 'object' && !Array.isArray(value);
+}
+function deepEqual(a, b) {
+    return JSON.stringify(a) === JSON.stringify(b);
+}
+function toJsonValue(value) {
+    if (value === null || typeof value === 'string' || typeof value === 'number' || typeof value === 'boolean')
+        return value;
+    if (Array.isArray(value))
+        return value.map((entry) => toJsonValue(entry));
+    if (isRecord(value))
+        return Object.fromEntries(Object.entries(value).map(([key, entry]) => [key, toJsonValue(entry)]));
+    return String(value);
+}