@purista/harness 1.0.0 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50)
  1. package/README.md +15 -0
  2. package/dist/agents/index.d.ts +5 -3
  3. package/dist/agents/index.js +84 -8
  4. package/dist/errors/catalog.d.ts +45 -5
  5. package/dist/errors/catalog.js +19 -0
  6. package/dist/errors/harness-error.d.ts +2 -0
  7. package/dist/eval/index.d.ts +57 -0
  8. package/dist/eval/index.js +181 -0
  9. package/dist/harness/defineHarness.d.ts +96 -20
  10. package/dist/harness/defineHarness.js +59 -2
  11. package/dist/index.d.ts +4 -0
  12. package/dist/index.js +4 -0
  13. package/dist/memory/sandbox/index.d.ts +17 -0
  14. package/dist/memory/sandbox/index.js +122 -0
  15. package/dist/models/registry.js +32 -7
  16. package/dist/ports/capabilities.d.ts +46 -2
  17. package/dist/ports/harness-context.d.ts +4 -1
  18. package/dist/ports/index.d.ts +2 -0
  19. package/dist/ports/index.js +2 -0
  20. package/dist/ports/memory/facade.d.ts +5 -0
  21. package/dist/ports/memory/facade.js +123 -0
  22. package/dist/ports/memory/telemetry.d.ts +16 -0
  23. package/dist/ports/memory/telemetry.js +77 -0
  24. package/dist/ports/memory/types.d.ts +204 -0
  25. package/dist/ports/memory/types.js +1 -0
  26. package/dist/ports/memory/validation.d.ts +19 -0
  27. package/dist/ports/memory/validation.js +160 -0
  28. package/dist/ports/memory.d.ts +3 -0
  29. package/dist/ports/memory.js +3 -0
  30. package/dist/ports/workspace.d.ts +177 -0
  31. package/dist/ports/workspace.js +32 -0
  32. package/dist/runtime/durable.d.ts +3 -0
  33. package/dist/runtime/durable.js +2 -1
  34. package/dist/sessions/index.d.ts +2 -0
  35. package/dist/sessions/index.js +275 -68
  36. package/dist/skills/index.d.ts +2 -1
  37. package/dist/skills/index.js +263 -35
  38. package/dist/telemetry/shim.d.ts +20 -0
  39. package/dist/telemetry/shim.js +28 -0
  40. package/dist/testing/durableWorkspaceStoreContract.d.ts +3 -0
  41. package/dist/testing/durableWorkspaceStoreContract.js +41 -0
  42. package/dist/testing/fakeMemoryAdapter.d.ts +16 -0
  43. package/dist/testing/fakeMemoryAdapter.js +110 -0
  44. package/dist/testing/index.d.ts +5 -0
  45. package/dist/testing/index.js +4 -0
  46. package/dist/workspace/in-memory.d.ts +35 -0
  47. package/dist/workspace/in-memory.js +142 -0
  48. package/dist/workspace/index.d.ts +1 -0
  49. package/dist/workspace/index.js +1 -0
  50. package/package.json +12 -6
package/README.md CHANGED
@@ -3,6 +3,21 @@
3
3
  Self-hosted enterprise agent harness for typed tools, agents, workflows, state,
4
4
  sandboxing, streaming, and OpenTelemetry instrumentation.
5
5
 
6
+ The core package also exports provider-neutral eval helpers:
7
+
8
+ - `evaluatePromptCandidates(...)` compares prompt candidates against a fixed
9
+ item set and deterministic or custom scorers.
10
+ - `evaluateDeterministicScorer(...)` runs JSON Pointer based deterministic
11
+ scorer definitions without provider calls. It is exported from the main
12
+ package and re-exported from `@purista/harness/testing`.
13
+
14
+ Telemetry defaults to dual GenAI and OpenInference attributes with no content
15
+ capture. `InvokeOptions.traceparent` and `tracestate` accept inbound W3C Trace
16
+ Context so application traces can parent harness run spans.
17
+
18
+ See [Evaluating Prompts](https://github.com/puristajs/harness/blob/main/docs/guides/evaluating-prompts.md)
19
+ for the execution model, scorer limits, and privacy behavior.
20
+
6
21
  ## Install
7
22
 
8
23
  ```bash
@@ -1,9 +1,10 @@
1
1
  import type { Logger } from '../logger/index.js';
2
2
  import type { JsonValue } from '../models/json.js';
3
3
  import type { Message } from '../models/state.js';
4
- import type { AgentDefinition, ResolvedSkill, RunEvent, SessionMemory, ToolsConfig } from '../harness/defineHarness.js';
4
+ import type { AgentDefinition, ResolvedSkill, RunEvent, ToolsConfig } from '../harness/defineHarness.js';
5
+ import type { MemoryFacade } from '../ports/memory.js';
5
6
  import type { SandboxSession } from '../sandbox/index.js';
6
- import type { TelemetryShim } from '../telemetry/index.js';
7
+ import { type TelemetryShim } from '../telemetry/index.js';
7
8
  import { type McpRunnerRegistry } from '../tools/mcp/runner.js';
8
9
  export declare function runDefaultAgent(args: {
9
10
  harnessName: string;
@@ -19,7 +20,7 @@ export declare function runDefaultAgent(args: {
19
20
  customTools: ToolsConfig;
20
21
  mcpRegistry?: McpRunnerRegistry;
21
22
  session: SandboxSession;
22
- memory: SessionMemory;
23
+ memory: MemoryFacade;
23
24
  mountedSkills: Set<string>;
24
25
  historyWindow?: number;
25
26
  maxSteps: number;
@@ -28,6 +29,7 @@ export declare function runDefaultAgent(args: {
28
29
  logger: Logger;
29
30
  telemetry: TelemetryShim;
30
31
  emitEvent?: (event: RunEvent) => Promise<void>;
32
+ metadata?: Readonly<Record<string, JsonValue>>;
31
33
  }): Promise<{
32
34
  output: JsonValue;
33
35
  emitted: Message[];
@@ -1,6 +1,7 @@
1
1
  import { z } from 'zod';
2
2
  import { ATTR_GEN_AI_AGENT_ID, ATTR_GEN_AI_AGENT_NAME, ATTR_GEN_AI_TOOL_CALL_ID, ATTR_GEN_AI_TOOL_NAME, ATTR_GEN_AI_TOOL_TYPE } from '@opentelemetry/semantic-conventions/incubating';
3
- import { AgentLoopBudgetError, HarnessError, OperationCancelledError, OperationTimeoutError, PermissionDeniedError, ToolError, ToolNotFoundError, ValidationError, serializeError } from '../errors/index.js';
3
+ import { AgentLoopBudgetError, HarnessConfigError, HarnessError, OperationCancelledError, OperationTimeoutError, PermissionDeniedError, ToolError, ToolNotFoundError, ValidationError, serializeError } from '../errors/index.js';
4
+ import { createMetrics } from '../telemetry/index.js';
4
5
  import { buildSkillIndex, mountSkillsOnce } from '../skills/index.js';
5
6
  import { BUILTIN_ALIAS_TO_CANONICAL, getBuiltinToolSpecs, invokeBuiltinTool } from '../tools/index.js';
6
7
  import { getMcpToolSpecs, invokeMcpTool, isMcpToolDefinition } from '../tools/mcp/runner.js';
@@ -31,14 +32,40 @@ export async function runDefaultAgent(args) {
31
32
  'harness.run.id': args.runId,
32
33
  ...(args.workflowId ? { 'harness.workflow.id': args.workflowId } : {}),
33
34
  'harness.agent.id': args.agentId,
35
+ 'gen_ai.operation.name': 'invoke_agent',
36
+ 'openinference.span.kind': 'AGENT',
37
+ 'metadata.agent_name': args.agentId,
38
+ 'metadata.agent_id': args.agentId,
34
39
  [ATTR_GEN_AI_AGENT_NAME]: args.agentId,
35
- [ATTR_GEN_AI_AGENT_ID]: args.runId,
40
+ [ATTR_GEN_AI_AGENT_ID]: args.agentId,
36
41
  'harness.agent.model': args.agent.model,
37
- 'harness.agent.has_handler': args.agent.handler !== undefined
42
+ 'harness.agent.has_handler': args.agent.handler !== undefined,
43
+ ...metadataSpanAttrs(args.metadata)
38
44
  };
39
- const execute = () => runDefaultAgentInner(args);
45
+ const metrics = createMetrics(args.telemetry, agentAttrs);
46
+ const execute = () => runDefaultAgentInner({ ...args, metrics });
40
47
  return args.telemetry.span(`invoke_agent ${args.agentId}`, agentAttrs, execute);
41
48
  }
49
/**
 * Projects invocation metadata onto span attributes named `harness.metadata.<key>`.
 *
 * A key is kept only when it matches the safe-name pattern (a letter followed by
 * up to 63 letters, digits, `_`, `.` or `-`). A value is kept only when it is a
 * string of at most 256 characters, a finite number, or a boolean. Everything
 * else is dropped without raising.
 *
 * @param metadata Optional metadata record; `undefined`/`null` yields `{}`.
 * @returns A fresh attribute object in the metadata's own key order.
 */
function metadataSpanAttrs(metadata) {
    const safeKey = /^[a-zA-Z][a-zA-Z0-9_.-]{0,63}$/;
    const isExportable = (value) => {
        switch (typeof value) {
            case 'string': return value.length <= 256;
            case 'number': return Number.isFinite(value);
            case 'boolean': return true;
            default: return false;
        }
    };
    const kept = Object.entries(metadata ?? {})
        .filter(([key, value]) => safeKey.test(key) && isExportable(value))
        .map(([key, value]) => [`harness.metadata.${key}`, value]);
    return Object.fromEntries(kept);
}
42
69
  async function runDefaultAgentInner(args) {
43
70
  args.signal.throwIfAborted();
44
71
  const inputSchema = args.agent.input ?? z.string();
@@ -49,6 +76,7 @@ async function runDefaultAgentInner(args) {
49
76
  throw new ValidationError('Unknown model alias', { where: 'agent_input', issues: { model: args.agent.model } });
50
77
  const skillIds = args.agent.skills ?? [];
51
78
  await mountSkillsOnce(args.session, args.mountedSkills, args.skills, skillIds);
79
+ const activatedSkills = new Set();
52
80
  if (args.agent.handler) {
53
81
  const output = await args.agent.handler({
54
82
  input: parsedInput,
@@ -57,16 +85,25 @@ async function runDefaultAgentInner(args) {
57
85
  runId: args.runId,
58
86
  sessionId: args.sessionId,
59
87
  history: { list: async () => args.history },
60
- memory: args.memory
88
+ memory: args.memory,
89
+ metadata: args.metadata ?? {},
90
+ metrics: args.metrics
61
91
  });
62
92
  const validated = parseAgentSchema(outputSchema, output, 'agent_output');
63
93
  return { output: validated, emitted: [{ id: `msg_${Date.now()}_a`, sessionId: args.sessionId, runId: args.runId, role: 'assistant', content: JSON.stringify(validated), timestamp: new Date().toISOString() }] };
64
94
  }
65
95
  const baseInstructions = typeof args.agent.instructions === 'function'
66
- ? args.agent.instructions({ input: parsedInput, runId: args.runId, sessionId: args.sessionId, history: { list: async () => args.history }, memory: args.memory })
96
+ ? args.agent.instructions({ input: parsedInput, runId: args.runId, sessionId: args.sessionId, history: { list: async () => args.history }, memory: args.memory, metadata: args.metadata ?? {}, metrics: args.metrics })
67
97
  : args.agent.instructions;
68
98
  const instructions = `${baseInstructions}${buildSkillIndex(args.skills, skillIds)}`;
69
99
  const enabledBuiltins = args.agent.builtinTools === false ? [] : args.agent.builtinTools?.slice() ?? ['bash', 'read', 'write', 'edit', 'glob', 'grep', 'list'];
100
+ if (skillIds.length > 0 && !enabledBuiltins.includes('read')) {
101
+ throw new HarnessConfigError('Agents with skills require the read built-in tool for skill activation.', {
102
+ reason: 'skill_read_tool_missing',
103
+ path: `agents.${args.agentId}.builtinTools`,
104
+ id: args.agentId
105
+ });
106
+ }
70
107
  const builtinSpecs = getBuiltinToolSpecs(enabledBuiltins, args.session);
71
108
  const enabledCustomTools = new Set((args.agent.tools ?? []));
72
109
  const tsCustomSpecs = Object.entries(args.customTools)
@@ -117,6 +154,7 @@ async function runDefaultAgentInner(args) {
117
154
  if (toolCalls.length === 0) {
118
155
  const validated = parseAgentSchema(outputSchema, response.object, 'agent_output');
119
156
  emitted.push({ id: `msg_${Date.now()}_a`, sessionId: args.sessionId, runId: args.runId, role: 'assistant', content: JSON.stringify(validated), timestamp: new Date().toISOString() });
157
+ await args.emitEvent?.({ type: 'model.object', runId: args.runId, agentId: args.agentId, object: validated, usage: response.usage });
120
158
  await args.emitEvent?.({ type: 'agent.finished', runId: args.runId, agentId: args.agentId, at: new Date().toISOString(), output: validated });
121
159
  return { output: validated, emitted };
122
160
  }
@@ -141,7 +179,10 @@ async function runDefaultAgentInner(args) {
141
179
  throw new PermissionDeniedError('Permission denied.', { tool_name: canonical, agent_id: args.agentId, reason: 'hook_deny' });
142
180
  }
143
181
  if (canonical in BUILTIN_ALIAS_TO_CANONICAL) {
144
- return { output: await withToolSignal(args.signal, args.toolTimeoutMs, (signal) => invokeBuiltinTool(canonical, input, withSandboxTelemetry(args, canonical), signal)) };
182
+ const output = await withToolSignal(args.signal, args.toolTimeoutMs, (signal) => invokeBuiltinTool(canonical, input, withSandboxTelemetry(args, canonical), signal));
183
+ if (canonical === 'read')
184
+ markSkillActivation(input, args.skills, activatedSkills);
185
+ return { output };
145
186
  }
146
187
  if (!enabledCustomTools.has(canonical)) {
147
188
  throw new ToolNotFoundError('Tool is not allowed for this agent.', { tool_id: canonical, where: 'agent_allowlist' });
@@ -159,7 +200,25 @@ async function runDefaultAgentInner(args) {
159
200
  }
160
201
  const tsTool = tool;
161
202
  const parsed = tsTool.input.parse(input);
162
- const out = await withToolSignal(args.signal, args.toolTimeoutMs, (signal) => tsTool.handler({ signal, sandbox: withSandboxTelemetry(args, canonical), logger: args.logger, telemetry: args.telemetry, runId: args.runId, sessionId: args.sessionId, agentId: args.agentId, toolId: canonical }, parsed));
203
+ const out = await withToolSignal(args.signal, args.toolTimeoutMs, (signal) => tsTool.handler({
204
+ signal,
205
+ sandbox: withSandboxTelemetry(args, canonical),
206
+ logger: args.logger,
207
+ telemetry: args.telemetry,
208
+ metrics: createMetrics(args.telemetry, {
209
+ 'harness.name': args.harnessName,
210
+ 'harness.session.id': args.sessionId,
211
+ 'harness.run.id': args.runId,
212
+ ...(args.workflowId ? { 'harness.workflow.id': args.workflowId } : {}),
213
+ 'harness.agent.id': args.agentId,
214
+ 'harness.tool.id': canonical
215
+ }),
216
+ memory: args.memory,
217
+ runId: args.runId,
218
+ sessionId: args.sessionId,
219
+ agentId: args.agentId,
220
+ toolId: canonical
221
+ }, parsed));
163
222
  return { output: tsTool.output.parse(out) };
164
223
  });
165
224
  }
@@ -182,6 +241,19 @@ async function runDefaultAgentInner(args) {
182
241
  steps += 1;
183
242
  }
184
243
  }
244
+ function markSkillActivation(input, skills, activated) {
245
+ if (!input || typeof input !== 'object')
246
+ return;
247
+ const readPath = input.path;
248
+ if (typeof readPath !== 'string')
249
+ return;
250
+ for (const skill of Object.values(skills)) {
251
+ if (readPath === `${skill.mountPath}/SKILL.md`) {
252
+ activated.add(skill.name);
253
+ return;
254
+ }
255
+ }
256
+ }
185
257
  async function withToolSignal(parent, timeoutMs, fn) {
186
258
  parent.throwIfAborted();
187
259
  const controller = new AbortController();
@@ -226,6 +298,10 @@ async function withToolSpan(args, toolId, callId, toolKind, mcpAttrs, fn) {
226
298
  ...(args.workflowId ? { 'harness.workflow.id': args.workflowId } : {}),
227
299
  'harness.agent.id': args.agentId,
228
300
  'harness.tool.id': toolId,
301
+ 'gen_ai.operation.name': 'execute_tool',
302
+ 'openinference.span.kind': 'TOOL',
303
+ 'tool.name': toolId,
304
+ 'tool.call.id': callId,
229
305
  [ATTR_GEN_AI_TOOL_NAME]: toolId,
230
306
  [ATTR_GEN_AI_TOOL_CALL_ID]: callId,
231
307
  [ATTR_GEN_AI_TOOL_TYPE]: toolKind,
@@ -22,9 +22,14 @@ export declare class ValidationError extends HarnessError {
22
22
  /** Model provider response shape is invalid. */ | 'model_response'
23
23
  /** Session memory key is invalid. */ | 'memory_key'
24
24
  /** Session memory value is invalid or non-serializable. */ | 'memory_value'
25
+ /** Session memory scope is invalid or unsupported. */ | 'memory_scope'
26
+ /** Session memory options are invalid or unsupported. */ | 'memory_write_options'
27
+ /** Session memory listing options are invalid. */ | 'memory_list_options'
28
+ /** Session memory search query is invalid. */ | 'memory_search_query'
25
29
  /** Message envelope validation failed. */ | 'message'
26
30
  /** Session history shape validation failed. */ | 'session_history'
27
- /** Invocation options are invalid. */ | 'invoke_options';
31
+ /** Invocation options are invalid. */ | 'invoke_options'
32
+ /** Evaluation helper input is invalid. */ | 'eval_input';
28
33
  issues: unknown;
29
34
  }, cause?: unknown);
30
35
  }
@@ -99,8 +104,9 @@ export declare class SkillNotFoundError extends HarnessError {
99
104
  export declare class SkillManifestError extends HarnessError {
100
105
  constructor(message: string, meta: {
101
106
  directory: string;
102
- reason: 'missing_skill_md' | 'invalid_frontmatter' | 'name_mismatch' | 'directory_missing' | 'reserved_name';
107
+ reason: 'missing_skill_md' | 'invalid_frontmatter' | 'missing_description' | 'invalid_name' | 'name_mismatch' | 'directory_missing' | 'collision_shadowed' | 'untrusted_project_skill' | 'scan_limit_reached' | 'reserved_name';
103
108
  skill_id?: string;
109
+ source?: string;
104
110
  }, cause?: unknown);
105
111
  }
106
112
  /** Workflow referenced an unknown agent id. */
@@ -139,21 +145,55 @@ export declare class SessionBusyError extends HarnessError {
139
145
  /** State backend operation failed. */
140
146
  export declare class StateError extends HarnessError {
141
147
  constructor(message: string, meta: {
142
- op: 'getSession' | 'upsertSession' | 'closeSession' | 'appendMessages' | 'listMessages' | 'clearMessages' | 'createRun' | 'finishRun' | 'getRun' | 'listRuns' | 'appendEvents' | 'listEvents';
148
+ op: 'getSession' | 'upsertSession' | 'closeSession' | 'appendMessages' | 'listMessages' | 'clearMessages' | 'createRun' | 'finishRun' | 'getRun' | 'listRuns' | 'appendEvents' | 'listEvents' | 'memory.get' | 'memory.set' | 'memory.delete' | 'memory.list' | 'memory.search';
143
149
  reason?: 'duplicate_message_id' | string;
150
+ adapter?: 'memory' | string;
151
+ memory_provider?: string;
152
+ }, cause?: unknown);
153
+ }
154
+ /** Durable workspace lifecycle, consistency, inspection, or backend failure. */
155
+ export declare class WorkspaceError extends HarnessError {
156
+ constructor(message: string, meta: {
157
+ reason: 'idempotency_conflict' | 'not_found' | 'aborted' | 'expired' | 'missing_checkpoint' | 'backend_failure' | 'unsupported_operation' | 'invalid_reference' | 'checkpoint_conflict' | 'cleanup_pending';
158
+ workspace_ref?: string;
159
+ checkpoint_ref?: string;
160
+ snapshot_ref?: string;
161
+ run_id?: string;
162
+ session_id?: string;
163
+ }, cause?: unknown);
164
+ }
165
+ /** Durable workspace quota would be or was exceeded. */
166
+ export declare class WorkspaceQuotaExceededError extends HarnessError {
167
+ constructor(message: string, meta: {
168
+ quota: string;
169
+ limit?: number;
170
+ actual?: number;
171
+ partial?: boolean;
172
+ workspace_ref?: string;
173
+ run_id?: string;
174
+ session_id?: string;
175
+ }, cause?: unknown);
176
+ }
177
+ /** Durable workspace cleanup could not complete in the current attempt. */
178
+ export declare class WorkspaceCleanupError extends HarnessError {
179
+ constructor(message: string, meta: {
180
+ reason: 'backend_failure' | 'partial_delete' | 'invalid_reference';
181
+ workspace_ref: string;
182
+ remaining_refs?: readonly string[];
183
+ retry_after_ms?: number;
144
184
  }, cause?: unknown);
145
185
  }
146
186
  /** Timed execution budget expired. */
147
187
  export declare class OperationTimeoutError extends HarnessError {
148
188
  constructor(message: string, meta: {
149
- scope: 'run' | 'model' | 'tool' | 'sandbox_run';
189
+ scope: 'run' | 'model' | 'tool' | 'sandbox_run' | 'memory' | 'workspace';
150
190
  timeout_ms: number;
151
191
  }, cause?: unknown);
152
192
  }
153
193
  /** Operation cancelled by abort signal or explicit cancellation path. */
154
194
  export declare class OperationCancelledError extends HarnessError {
155
195
  constructor(message: string, meta: {
156
- scope: 'run' | 'workflow' | 'agent' | 'model' | 'tool' | 'sandbox';
196
+ scope: 'run' | 'workflow' | 'agent' | 'model' | 'tool' | 'sandbox' | 'memory' | 'workspace';
157
197
  }, cause?: unknown);
158
198
  }
159
199
  /** MCP transport/protocol failure. */
@@ -111,6 +111,25 @@ export class StateError extends HarnessError {
111
111
  super({ code: 'STATE_ERROR', category: 'state', retriable: true, message, meta, cause });
112
112
  }
113
113
  }
114
/** Durable workspace lifecycle, consistency, inspection, or backend failure. */
export class WorkspaceError extends HarnessError {
    /**
     * @param message Human-readable description of the failure.
     * @param meta Failure details; `meta.reason` decides whether the error is retriable.
     * @param cause Optional underlying error.
     */
    constructor(message, meta, cause) {
        // Only backend failures and pending cleanup are flagged as retriable.
        const retriable = ['backend_failure', 'cleanup_pending'].includes(meta.reason);
        super({ code: 'WORKSPACE_ERROR', category: 'workspace', retriable, message, meta, cause });
    }
}
121
/** Durable workspace quota would be or was exceeded. */
export class WorkspaceQuotaExceededError extends HarnessError {
    /**
     * @param message Human-readable description of the failure.
     * @param meta Quota details forwarded unchanged to the base error.
     * @param cause Optional underlying error.
     */
    constructor(message, meta, cause) {
        // Quota violations are always reported as non-retriable.
        const options = { code: 'WORKSPACE_QUOTA_EXCEEDED', category: 'workspace', retriable: false, message, meta, cause };
        super(options);
    }
}
127
/** Durable workspace cleanup could not complete in the current attempt. */
export class WorkspaceCleanupError extends HarnessError {
    /**
     * @param message Human-readable description of the failure.
     * @param meta Cleanup details forwarded unchanged to the base error.
     * @param cause Optional underlying error.
     */
    constructor(message, meta, cause) {
        // Cleanup failures are always reported as retriable.
        const options = { code: 'WORKSPACE_CLEANUP_ERROR', category: 'workspace', retriable: true, message, meta, cause };
        super(options);
    }
}
114
133
  /** Timed execution budget expired. */
115
134
  export class OperationTimeoutError extends HarnessError {
116
135
  constructor(message, meta, cause) {
@@ -22,6 +22,8 @@ export type ErrorCategory =
22
22
  | 'session'
23
23
  /** State-store persistence failures. */
24
24
  | 'state'
25
+ /** Durable workspace lifecycle or backend failures. */
26
+ | 'workspace'
25
27
  /** Timeout budget failures. */
26
28
  | 'timeout'
27
29
  /** Cooperative cancellation events. */
@@ -0,0 +1,57 @@
1
+ import type { JsonValue } from '../models/json.js';
2
/**
 * Declarative scorer definition, discriminated by `type`:
 * 'regex' (path, pattern, optional flags limited to 'i' | 'm' | 'im'),
 * 'json-schema' (schema), 'contains' (path, value, optional caseInsensitive),
 * and 'attribute-equality' (leftPath, rightPath).
 */
+ export type DeterministicScorerDefinition = {
3
+ type: 'regex';
4
+ path: string;
5
+ pattern: string;
6
+ flags?: 'i' | 'm' | 'im';
7
+ } | {
8
+ type: 'json-schema';
9
+ schema: JsonValue;
10
+ } | {
11
+ type: 'contains';
12
+ path: string;
13
+ value: string;
14
+ caseInsensitive?: boolean;
15
+ } | {
16
+ type: 'attribute-equality';
17
+ leftPath: string;
18
+ rightPath: string;
19
+ };
20
/** Value handed to a scorer: the item input and candidate output, plus optional expected value and context entries. */
+ export interface ScorerTarget {
21
+ input: unknown;
22
+ output: unknown;
23
+ expected?: unknown;
24
+ context?: unknown[];
25
+ }
26
/** Outcome of one scorer call: a numeric score, a pass flag, and optional JSON evidence. */
+ export interface ScorerResult {
27
+ score: number;
28
+ passed: boolean;
29
+ evidence?: JsonValue;
30
+ }
31
/** A prompt variant under evaluation. NOTE(review): the type parameter `I` is not used by any member here. */
+ export interface PromptCandidate<I = unknown> {
32
+ id: string;
33
+ prompt: string;
34
+ metadata?: Record<string, JsonValue>;
35
+ }
36
/** One evaluation case: an id, a typed input, and optional expected value and context entries. */
+ export interface EvaluationItem<I = unknown> {
37
+ id: string;
38
+ input: I;
39
+ expected?: unknown;
40
+ context?: unknown[];
41
+ }
42
/** Aggregated result for a single candidate: mean score, pass rate, and the item and scorer-call counts. */
+ export interface CandidateScore {
43
+ candidateId: string;
44
+ meanScore: number;
45
+ passRate: number;
46
+ itemCount: number;
47
+ scorerCount: number;
48
+ }
49
/** Input for `evaluatePromptCandidates`: candidates, items, the two async callbacks, and a required abort signal. */
+ export interface EvaluatePromptCandidatesInput<I = unknown> {
50
+ candidates: PromptCandidate<I>[];
51
+ items: EvaluationItem<I>[];
52
+ scorer: (target: ScorerTarget, signal: AbortSignal) => Promise<ScorerResult>;
53
+ runCandidate: (candidate: PromptCandidate<I>, item: EvaluationItem<I>, signal: AbortSignal) => Promise<unknown>;
54
+ signal: AbortSignal;
55
+ }
56
/** Synchronously evaluates one deterministic scorer definition against a target. */
+ export declare function evaluateDeterministicScorer(definition: DeterministicScorerDefinition, target: ScorerTarget): ScorerResult;
57
/** Evaluates the given candidates against the given items and resolves to one `CandidateScore` per candidate. */
+ export declare function evaluatePromptCandidates<I = unknown>(input: EvaluatePromptCandidatesInput<I>): Promise<CandidateScore[]>;
@@ -0,0 +1,181 @@
1
+ import { ValidationError } from '../errors/index.js';
2
/**
 * Evaluates one deterministic scorer definition against `target.output`.
 *
 * - `regex`: selects a value by JSON Pointer and tests its string form against
 *   `pattern` (with optional `flags`).
 * - `contains`: selects a value by JSON Pointer and checks its string form for
 *   `value`, optionally case-insensitively.
 * - `attribute-equality`: selects two values from `target.output` and compares
 *   them structurally; on mismatch both values are returned as evidence.
 * - `json-schema`: validates the whole output against `schema` and returns the
 *   collected issues as evidence on failure.
 *
 * A pointer that does not resolve yields a failed result with
 * `missing_pointer` evidence rather than an exception.
 *
 * @param definition Scorer definition discriminated by `type`.
 * @param target Scorer target; only `target.output` is read here.
 * @returns `{ score, passed, evidence? }` with score 1 on pass and 0 on failure.
 * @throws ValidationError (`where: 'eval_input'`) when `definition.type` is not
 *   a supported scorer type. Without this the function fell through and
 *   returned `undefined`, which callers then dereferenced.
 */
export function evaluateDeterministicScorer(definition, target) {
    switch (definition.type) {
        case 'regex': {
            const selected = readPointer(target.output, definition.path);
            if (!selected.found)
                return missingPointer(definition.path);
            return binary(new RegExp(definition.pattern, definition.flags ?? '').test(String(selected.value)));
        }
        case 'contains': {
            const selected = readPointer(target.output, definition.path);
            if (!selected.found)
                return missingPointer(definition.path);
            const haystack = String(selected.value);
            const needle = definition.value;
            return binary(definition.caseInsensitive
                ? haystack.toLocaleLowerCase().includes(needle.toLocaleLowerCase())
                : haystack.includes(needle));
        }
        case 'attribute-equality': {
            const left = readPointer(target.output, definition.leftPath);
            if (!left.found)
                return missingPointer(definition.leftPath);
            const right = readPointer(target.output, definition.rightPath);
            if (!right.found)
                return missingPointer(definition.rightPath);
            return deepEqual(left.value, right.value)
                ? binary(true)
                : { score: 0, passed: false, evidence: { left: toJsonValue(left.value), right: toJsonValue(right.value) } };
        }
        case 'json-schema': {
            const result = validateJsonSchema(definition.schema, target.output);
            return result.passed
                ? binary(true)
                : { score: 0, passed: false, evidence: { reason: 'schema_validation_failed', issues: result.issues } };
        }
        default:
            // Untyped (JavaScript) callers can pass any string here; fail loudly.
            throw new ValidationError('Unknown deterministic scorer type.', { where: 'eval_input', issues: { type: 'unsupported' } });
    }
}
39
/**
 * Runs every candidate against every item, scores each output, and returns one
 * aggregate per candidate.
 *
 * Candidates and items are processed sequentially. The abort signal is checked
 * before each candidate and before each item, and is forwarded to both
 * callbacks. Results are sorted by mean score (descending), then pass rate
 * (descending), then candidate id.
 *
 * @param input Candidates, items, `runCandidate`, `scorer` and `signal`.
 * @returns Sorted list of `{ candidateId, meanScore, passRate, itemCount, scorerCount }`.
 * @throws ValidationError (`where: 'eval_input'`) when there are no candidates,
 *   no items, or a scorer returns a result whose `score` is not a finite
 *   number. Such a score previously turned the mean into NaN and made the sort
 *   comparator return NaN, leaving the ranking unspecified.
 */
export async function evaluatePromptCandidates(input) {
    if (input.candidates.length === 0) {
        throw new ValidationError('At least one prompt candidate is required.', { where: 'eval_input', issues: { candidates: 'empty' } });
    }
    if (input.items.length === 0) {
        throw new ValidationError('At least one evaluation item is required.', { where: 'eval_input', issues: { items: 'empty' } });
    }
    const scores = [];
    for (const candidate of input.candidates) {
        input.signal.throwIfAborted();
        let total = 0;
        let passed = 0;
        let scorerCount = 0;
        for (const item of input.items) {
            input.signal.throwIfAborted();
            const output = await input.runCandidate(candidate, item, input.signal);
            // Optional fields are only attached when the item actually defines them.
            const target = {
                input: item.input,
                output,
                ...(item.expected !== undefined ? { expected: item.expected } : {}),
                ...(item.context !== undefined ? { context: item.context } : {})
            };
            const result = await input.scorer(target, input.signal);
            if (typeof result?.score !== 'number' || !Number.isFinite(result.score)) {
                throw new ValidationError('Scorer returned a non-finite score.', {
                    where: 'eval_input',
                    issues: { candidate: candidate.id, item: item.id, score: 'non_finite' }
                });
            }
            total += result.score;
            passed += result.passed ? 1 : 0;
            scorerCount += 1;
        }
        scores.push({
            candidateId: candidate.id,
            meanScore: total / scorerCount,
            passRate: passed / scorerCount,
            itemCount: input.items.length,
            scorerCount
        });
    }
    return scores.sort((a, b) => {
        if (a.meanScore !== b.meanScore)
            return b.meanScore - a.meanScore;
        if (a.passRate !== b.passRate)
            return b.passRate - a.passRate;
        return a.candidateId.localeCompare(b.candidateId);
    });
}
84
/** Builds an all-or-nothing scorer result: score 1 when `passed` is truthy, otherwise 0. */
function binary(passed) {
    const score = passed ? 1 : 0;
    return { score, passed };
}
87
/** Builds the failed scorer result used when a JSON Pointer does not resolve; the pointer is echoed in the evidence. */
function missingPointer(path) {
    const evidence = { reason: 'missing_pointer', path };
    return { score: 0, passed: false, evidence };
}
90
/**
 * Resolves a JSON Pointer (RFC 6901) against `value`.
 *
 * @param value Document to read from.
 * @param pointer JSON Pointer string; `''` selects the whole document.
 * @returns `{ found: true, value }` when the pointer resolves, otherwise `{ found: false }`.
 *
 * Fixes relative to the previous implementation:
 * - Array indices must be `0` or digits without a leading zero. `Number(part)`
 *   used to accept `''` (as 0), `'01'`, `'1e0'` and whitespace-padded digits.
 * - Object members are looked up as own properties only, so pointers such as
 *   `/constructor` or `/toString` no longer resolve to inherited members.
 */
function readPointer(value, pointer) {
    if (pointer === '')
        return { found: true, value };
    if (!pointer.startsWith('/'))
        return { found: false };
    let current = value;
    for (const rawPart of pointer.slice(1).split('/')) {
        // Unescape "~1" before "~0" so that "~01" decodes to "~1", not "/".
        const part = rawPart.replace(/~1/g, '/').replace(/~0/g, '~');
        if (Array.isArray(current)) {
            if (!/^(?:0|[1-9][0-9]*)$/.test(part))
                return { found: false };
            const index = Number(part);
            if (index >= current.length)
                return { found: false };
            current = current[index];
            continue;
        }
        if (current === null || typeof current !== 'object' || !Object.prototype.hasOwnProperty.call(current, part))
            return { found: false };
        current = current[part];
    }
    return { found: true, value: current };
}
111
/**
 * Validates `value` against `schema`, starting at the root path `''`.
 *
 * @returns `{ passed, issues }` where `passed` is true only when no issue was collected.
 */
function validateJsonSchema(schema, value) {
    const collected = [];
    validateSchemaAt(schema, value, '', collected);
    const passed = collected.length === 0;
    return { passed, issues: collected };
}
116
/**
 * Validates `value` against a small JSON Schema subset and appends any issues.
 *
 * Supported keywords: `const`, `enum`, `type`, and for objects `required`,
 * `properties` (validated recursively) and `additionalProperties: false`.
 * A schema that is not a plain object is ignored. The first failing check among
 * `const`, `enum` and `type` stops validation at this level.
 *
 * Member presence is tested with own-property checks. The previous `in`
 * operator also matched inherited members, so `required: ['toString']` always
 * passed and an extra key named `constructor` escaped `additionalProperties: false`.
 *
 * @param schema Schema node to apply.
 * @param value Value located at `path`.
 * @param path Slash-separated location used in reported issues.
 * @param issues Output array; issue objects are pushed onto it.
 */
function validateSchemaAt(schema, value, path, issues) {
    if (!isRecord(schema))
        return;
    const owns = (target, key) => Object.prototype.hasOwnProperty.call(target, key);
    if ('const' in schema && !deepEqual(value, schema['const'])) {
        issues.push({ path, reason: 'const', expected: toJsonValue(schema['const']), actual: toJsonValue(value) });
        return;
    }
    if (Array.isArray(schema['enum']) && !schema['enum'].some((entry) => deepEqual(entry, value))) {
        issues.push({ path, reason: 'enum', actual: toJsonValue(value) });
        return;
    }
    const type = typeof schema['type'] === 'string' ? schema['type'] : undefined;
    if (type && !matchesType(value, type)) {
        issues.push({ path, reason: 'type', expected: type, actual: typeof value });
        return;
    }
    if (type === 'object' || schema['properties']) {
        if (!isRecord(value)) {
            issues.push({ path, reason: 'type', expected: 'object', actual: typeof value });
            return;
        }
        const required = Array.isArray(schema['required']) ? schema['required'].filter((entry) => typeof entry === 'string') : [];
        for (const key of required) {
            if (!owns(value, key))
                issues.push({ path: `${path}/${key}`, reason: 'required' });
        }
        const properties = isRecord(schema['properties']) ? schema['properties'] : {};
        for (const [key, childSchema] of Object.entries(properties)) {
            if (owns(value, key))
                validateSchemaAt(childSchema, value[key], `${path}/${key}`, issues);
        }
        if (schema['additionalProperties'] === false) {
            for (const key of Object.keys(value)) {
                if (!owns(properties, key))
                    issues.push({ path: `${path}/${key}`, reason: 'additional_properties' });
            }
        }
    }
}
155
/**
 * Reports whether `value` satisfies a JSON Schema primitive `type` name.
 *
 * `number` requires a finite number and `integer` uses `Number.isInteger`.
 * Unrecognized type names are treated as matching.
 */
function matchesType(value, type) {
    if (type === 'object')
        return isRecord(value);
    if (type === 'array')
        return Array.isArray(value);
    if (type === 'integer')
        return Number.isInteger(value);
    if (type === 'number')
        return typeof value === 'number' && Number.isFinite(value);
    if (type === 'null')
        return value === null;
    if (type === 'string' || type === 'boolean')
        return typeof value === type;
    return true;
}
167
/** True for any non-null, non-array value whose `typeof` is 'object'. */
function isRecord(value) {
    if (value === null || Array.isArray(value))
        return false;
    return typeof value === 'object';
}
170
/**
 * Structural equality for JSON-like values.
 *
 * The previous implementation compared `JSON.stringify` output, which depends
 * on key insertion order, so `{a:1,b:2}` and `{b:2,a:1}` were unequal. This
 * version compares arrays element-wise and objects by their own enumerable
 * keys regardless of order. Members whose value is `undefined` are ignored,
 * matching how they were dropped by serialization before.
 *
 * @returns true when both values have the same structure and leaf values.
 */
function deepEqual(a, b) {
    if (a === b)
        return true;
    if (typeof a === 'number' && typeof b === 'number')
        return Number.isNaN(a) && Number.isNaN(b);
    if (a === null || b === null || typeof a !== 'object' || typeof b !== 'object')
        return false;
    if (Array.isArray(a) !== Array.isArray(b))
        return false;
    if (Array.isArray(a)) {
        return a.length === b.length && a.every((entry, index) => deepEqual(entry, b[index]));
    }
    const definedKeys = (record) => Object.keys(record).filter((key) => record[key] !== undefined);
    const keysA = definedKeys(a);
    const keysB = definedKeys(b);
    return keysA.length === keysB.length
        && keysA.every((key) => Object.prototype.hasOwnProperty.call(b, key) && deepEqual(a[key], b[key]));
}
173
+ function toJsonValue(value) {
174
+ if (value === null || typeof value === 'string' || typeof value === 'number' || typeof value === 'boolean')
175
+ return value;
176
+ if (Array.isArray(value))
177
+ return value.map((entry) => toJsonValue(entry));
178
+ if (isRecord(value))
179
+ return Object.fromEntries(Object.entries(value).map(([key, entry]) => [key, toJsonValue(entry)]));
180
+ return String(value);
181
+ }