@agent-native/core 0.52.0 → 0.54.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +41 -95
- package/blueprints/action/crud.md +98 -0
- package/blueprints/channel/discord.md +74 -0
- package/blueprints/provider/stripe.md +87 -0
- package/blueprints/sandbox/docker.md +78 -0
- package/dist/action.d.ts +64 -1
- package/dist/action.d.ts.map +1 -1
- package/dist/action.js +73 -2
- package/dist/action.js.map +1 -1
- package/dist/agent/index.d.ts +1 -0
- package/dist/agent/index.d.ts.map +1 -1
- package/dist/agent/index.js +1 -0
- package/dist/agent/index.js.map +1 -1
- package/dist/agent/observational-memory/compactor.d.ts +43 -0
- package/dist/agent/observational-memory/compactor.d.ts.map +1 -0
- package/dist/agent/observational-memory/compactor.js +50 -0
- package/dist/agent/observational-memory/compactor.js.map +1 -0
- package/dist/agent/observational-memory/config.d.ts +37 -0
- package/dist/agent/observational-memory/config.d.ts.map +1 -0
- package/dist/agent/observational-memory/config.js +48 -0
- package/dist/agent/observational-memory/config.js.map +1 -0
- package/dist/agent/observational-memory/index.d.ts +26 -0
- package/dist/agent/observational-memory/index.d.ts.map +1 -0
- package/dist/agent/observational-memory/index.js +25 -0
- package/dist/agent/observational-memory/index.js.map +1 -0
- package/dist/agent/observational-memory/internal-run.d.ts +37 -0
- package/dist/agent/observational-memory/internal-run.d.ts.map +1 -0
- package/dist/agent/observational-memory/internal-run.js +59 -0
- package/dist/agent/observational-memory/internal-run.js.map +1 -0
- package/dist/agent/observational-memory/message-text.d.ts +13 -0
- package/dist/agent/observational-memory/message-text.d.ts.map +1 -0
- package/dist/agent/observational-memory/message-text.js +46 -0
- package/dist/agent/observational-memory/message-text.js.map +1 -0
- package/dist/agent/observational-memory/migrations.d.ts +13 -0
- package/dist/agent/observational-memory/migrations.d.ts.map +1 -0
- package/dist/agent/observational-memory/migrations.js +43 -0
- package/dist/agent/observational-memory/migrations.js.map +1 -0
- package/dist/agent/observational-memory/observer.d.ts +37 -0
- package/dist/agent/observational-memory/observer.d.ts.map +1 -0
- package/dist/agent/observational-memory/observer.js +82 -0
- package/dist/agent/observational-memory/observer.js.map +1 -0
- package/dist/agent/observational-memory/plugin.d.ts +16 -0
- package/dist/agent/observational-memory/plugin.d.ts.map +1 -0
- package/dist/agent/observational-memory/plugin.js +26 -0
- package/dist/agent/observational-memory/plugin.js.map +1 -0
- package/dist/agent/observational-memory/prompts.d.ts +27 -0
- package/dist/agent/observational-memory/prompts.d.ts.map +1 -0
- package/dist/agent/observational-memory/prompts.js +42 -0
- package/dist/agent/observational-memory/prompts.js.map +1 -0
- package/dist/agent/observational-memory/read.d.ts +45 -0
- package/dist/agent/observational-memory/read.d.ts.map +1 -0
- package/dist/agent/observational-memory/read.js +97 -0
- package/dist/agent/observational-memory/read.js.map +1 -0
- package/dist/agent/observational-memory/reflector.d.ts +31 -0
- package/dist/agent/observational-memory/reflector.d.ts.map +1 -0
- package/dist/agent/observational-memory/reflector.js +76 -0
- package/dist/agent/observational-memory/reflector.js.map +1 -0
- package/dist/agent/observational-memory/schema.d.ts +267 -0
- package/dist/agent/observational-memory/schema.d.ts.map +1 -0
- package/dist/agent/observational-memory/schema.js +48 -0
- package/dist/agent/observational-memory/schema.js.map +1 -0
- package/dist/agent/observational-memory/store.d.ts +52 -0
- package/dist/agent/observational-memory/store.d.ts.map +1 -0
- package/dist/agent/observational-memory/store.js +197 -0
- package/dist/agent/observational-memory/store.js.map +1 -0
- package/dist/agent/observational-memory/types.d.ts +61 -0
- package/dist/agent/observational-memory/types.d.ts.map +1 -0
- package/dist/agent/observational-memory/types.js +9 -0
- package/dist/agent/observational-memory/types.js.map +1 -0
- package/dist/agent/processors.d.ts +146 -0
- package/dist/agent/processors.d.ts.map +1 -0
- package/dist/agent/processors.js +122 -0
- package/dist/agent/processors.js.map +1 -0
- package/dist/agent/production-agent.d.ts +25 -0
- package/dist/agent/production-agent.d.ts.map +1 -1
- package/dist/agent/production-agent.js +341 -1
- package/dist/agent/production-agent.js.map +1 -1
- package/dist/agent/run-loop-with-resume.d.ts.map +1 -1
- package/dist/agent/run-loop-with-resume.js +48 -0
- package/dist/agent/run-loop-with-resume.js.map +1 -1
- package/dist/agent/run-store.d.ts +17 -0
- package/dist/agent/run-store.d.ts.map +1 -1
- package/dist/agent/run-store.js +55 -0
- package/dist/agent/run-store.js.map +1 -1
- package/dist/agent/runtime-context.d.ts +30 -0
- package/dist/agent/runtime-context.d.ts.map +1 -1
- package/dist/agent/runtime-context.js +54 -1
- package/dist/agent/runtime-context.js.map +1 -1
- package/dist/agent/tool-call-journal.d.ts +99 -0
- package/dist/agent/tool-call-journal.d.ts.map +1 -0
- package/dist/agent/tool-call-journal.js +212 -0
- package/dist/agent/tool-call-journal.js.map +1 -0
- package/dist/agent/types.d.ts +35 -0
- package/dist/agent/types.d.ts.map +1 -1
- package/dist/agent/types.js.map +1 -1
- package/dist/cli/add.d.ts +109 -0
- package/dist/cli/add.d.ts.map +1 -0
- package/dist/cli/add.js +352 -0
- package/dist/cli/add.js.map +1 -0
- package/dist/cli/connect.d.ts +2 -2
- package/dist/cli/connect.d.ts.map +1 -1
- package/dist/cli/connect.js +92 -24
- package/dist/cli/connect.js.map +1 -1
- package/dist/cli/eval.d.ts +17 -0
- package/dist/cli/eval.d.ts.map +1 -0
- package/dist/cli/eval.js +121 -0
- package/dist/cli/eval.js.map +1 -0
- package/dist/cli/index.js +44 -3
- package/dist/cli/index.js.map +1 -1
- package/dist/cli/mcp.d.ts.map +1 -1
- package/dist/cli/mcp.js +11 -5
- package/dist/cli/mcp.js.map +1 -1
- package/dist/cli/plan-local.d.ts +66 -5
- package/dist/cli/plan-local.d.ts.map +1 -1
- package/dist/cli/plan-local.js +622 -21
- package/dist/cli/plan-local.js.map +1 -1
- package/dist/cli/skills.d.ts +2 -2
- package/dist/cli/skills.d.ts.map +1 -1
- package/dist/cli/skills.js +108 -62
- package/dist/cli/skills.js.map +1 -1
- package/dist/client/AssistantChat.d.ts.map +1 -1
- package/dist/client/AssistantChat.js +118 -92
- package/dist/client/AssistantChat.js.map +1 -1
- package/dist/client/agent-chat-adapter.d.ts.map +1 -1
- package/dist/client/agent-chat-adapter.js +16 -0
- package/dist/client/agent-chat-adapter.js.map +1 -1
- package/dist/client/chat/tool-call-display.d.ts +20 -1
- package/dist/client/chat/tool-call-display.d.ts.map +1 -1
- package/dist/client/chat/tool-call-display.js +32 -7
- package/dist/client/chat/tool-call-display.js.map +1 -1
- package/dist/client/sse-event-processor.d.ts +13 -0
- package/dist/client/sse-event-processor.d.ts.map +1 -1
- package/dist/client/sse-event-processor.js +21 -0
- package/dist/client/sse-event-processor.js.map +1 -1
- package/dist/coding-tools/run-code.d.ts.map +1 -1
- package/dist/coding-tools/run-code.js +18 -2
- package/dist/coding-tools/run-code.js.map +1 -1
- package/dist/db/client.d.ts +4 -2
- package/dist/db/client.d.ts.map +1 -1
- package/dist/db/client.js +6 -4
- package/dist/db/client.js.map +1 -1
- package/dist/deploy/route-discovery.d.ts.map +1 -1
- package/dist/deploy/route-discovery.js +1 -0
- package/dist/deploy/route-discovery.js.map +1 -1
- package/dist/eval/agent-runner.d.ts +63 -0
- package/dist/eval/agent-runner.d.ts.map +1 -0
- package/dist/eval/agent-runner.js +142 -0
- package/dist/eval/agent-runner.js.map +1 -0
- package/dist/eval/define-eval.d.ts +29 -0
- package/dist/eval/define-eval.d.ts.map +1 -0
- package/dist/eval/define-eval.js +43 -0
- package/dist/eval/define-eval.js.map +1 -0
- package/dist/eval/index.d.ts +18 -0
- package/dist/eval/index.d.ts.map +1 -0
- package/dist/eval/index.js +17 -0
- package/dist/eval/index.js.map +1 -0
- package/dist/eval/report.d.ts +8 -0
- package/dist/eval/report.d.ts.map +1 -0
- package/dist/eval/report.js +44 -0
- package/dist/eval/report.js.map +1 -0
- package/dist/eval/runner.d.ts +67 -0
- package/dist/eval/runner.d.ts.map +1 -0
- package/dist/eval/runner.js +256 -0
- package/dist/eval/runner.js.map +1 -0
- package/dist/eval/scorer.d.ts +83 -0
- package/dist/eval/scorer.d.ts.map +1 -0
- package/dist/eval/scorer.js +195 -0
- package/dist/eval/scorer.js.map +1 -0
- package/dist/eval/types.d.ts +162 -0
- package/dist/eval/types.d.ts.map +1 -0
- package/dist/eval/types.js +20 -0
- package/dist/eval/types.js.map +1 -0
- package/dist/extensions/fetch-tool.d.ts.map +1 -1
- package/dist/extensions/fetch-tool.js +80 -15
- package/dist/extensions/fetch-tool.js.map +1 -1
- package/dist/extensions/web-content.d.ts +61 -0
- package/dist/extensions/web-content.d.ts.map +1 -0
- package/dist/extensions/web-content.js +468 -0
- package/dist/extensions/web-content.js.map +1 -0
- package/dist/extensions/web-search-tool.js +3 -3
- package/dist/extensions/web-search-tool.js.map +1 -1
- package/dist/mcp/build-server.d.ts.map +1 -1
- package/dist/mcp/build-server.js +4 -1
- package/dist/mcp/build-server.js.map +1 -1
- package/dist/observability/traces.d.ts.map +1 -1
- package/dist/observability/traces.js +100 -1
- package/dist/observability/traces.js.map +1 -1
- package/dist/observability/tracing.d.ts +73 -0
- package/dist/observability/tracing.d.ts.map +1 -0
- package/dist/observability/tracing.js +126 -0
- package/dist/observability/tracing.js.map +1 -0
- package/dist/onboarding/default-steps.d.ts.map +1 -1
- package/dist/onboarding/default-steps.js +4 -1
- package/dist/onboarding/default-steps.js.map +1 -1
- package/dist/provider-api/actions/query-staged-dataset.d.ts +1 -1
- package/dist/provider-api/corpus-jobs.d.ts +80 -0
- package/dist/provider-api/corpus-jobs.d.ts.map +1 -1
- package/dist/provider-api/corpus-jobs.js +219 -22
- package/dist/provider-api/corpus-jobs.js.map +1 -1
- package/dist/provider-api/index.d.ts +24 -32
- package/dist/provider-api/index.d.ts.map +1 -1
- package/dist/provider-api/index.js +28 -1
- package/dist/provider-api/index.js.map +1 -1
- package/dist/scripts/agent-engines/list-agent-engines.d.ts.map +1 -1
- package/dist/scripts/agent-engines/list-agent-engines.js +10 -3
- package/dist/scripts/agent-engines/list-agent-engines.js.map +1 -1
- package/dist/server/action-discovery.d.ts.map +1 -1
- package/dist/server/action-discovery.js +4 -0
- package/dist/server/action-discovery.js.map +1 -1
- package/dist/server/agent-chat-plugin.d.ts +9 -0
- package/dist/server/agent-chat-plugin.d.ts.map +1 -1
- package/dist/server/agent-chat-plugin.js +119 -111
- package/dist/server/agent-chat-plugin.js.map +1 -1
- package/dist/server/agent-teams.d.ts +62 -0
- package/dist/server/agent-teams.d.ts.map +1 -1
- package/dist/server/agent-teams.js +99 -2
- package/dist/server/agent-teams.js.map +1 -1
- package/dist/server/better-auth-instance.d.ts +7 -0
- package/dist/server/better-auth-instance.d.ts.map +1 -1
- package/dist/server/better-auth-instance.js +90 -0
- package/dist/server/better-auth-instance.js.map +1 -1
- package/dist/server/core-routes-plugin.d.ts.map +1 -1
- package/dist/server/core-routes-plugin.js +7 -4
- package/dist/server/core-routes-plugin.js.map +1 -1
- package/dist/server/credential-provider.d.ts.map +1 -1
- package/dist/server/credential-provider.js +2 -0
- package/dist/server/credential-provider.js.map +1 -1
- package/dist/server/deep-link.d.ts +7 -0
- package/dist/server/deep-link.d.ts.map +1 -1
- package/dist/server/deep-link.js +13 -2
- package/dist/server/deep-link.js.map +1 -1
- package/dist/server/framework-request-handler.d.ts.map +1 -1
- package/dist/server/framework-request-handler.js +33 -1
- package/dist/server/framework-request-handler.js.map +1 -1
- package/dist/server/index.d.ts +2 -1
- package/dist/server/index.d.ts.map +1 -1
- package/dist/server/index.js +2 -1
- package/dist/server/index.js.map +1 -1
- package/dist/templates/default/.agents/skills/actions/SKILL.md +52 -1
- package/dist/templates/default/.agents/skills/security/SKILL.md +22 -0
- package/dist/templates/workspace-core/.agents/skills/actions/SKILL.md +52 -1
- package/dist/templates/workspace-core/.agents/skills/external-agents/SKILL.md +16 -4
- package/dist/templates/workspace-core/.agents/skills/harness-agents/SKILL.md +20 -0
- package/dist/templates/workspace-core/.agents/skills/observability/SKILL.md +31 -0
- package/dist/templates/workspace-core/.agents/skills/security/SKILL.md +22 -0
- package/docs/content/actions.md +50 -0
- package/docs/content/agent-teams.md +32 -0
- package/docs/content/blueprint-installer.md +73 -0
- package/docs/content/durable-resume.md +49 -0
- package/docs/content/evals.md +141 -0
- package/docs/content/external-agents.md +2 -2
- package/docs/content/human-approval.md +101 -0
- package/docs/content/observability.md +21 -0
- package/docs/content/observational-memory.md +63 -0
- package/docs/content/plan-plugin.md +5 -0
- package/docs/content/pr-visual-recap.md +9 -5
- package/docs/content/processors.md +99 -0
- package/docs/content/sandbox-adapters.md +134 -0
- package/docs/content/template-plan.md +97 -21
- package/package.json +10 -1
- package/src/templates/default/.agents/skills/actions/SKILL.md +52 -1
- package/src/templates/default/.agents/skills/security/SKILL.md +22 -0
- package/src/templates/workspace-core/.agents/skills/actions/SKILL.md +52 -1
- package/src/templates/workspace-core/.agents/skills/external-agents/SKILL.md +16 -4
- package/src/templates/workspace-core/.agents/skills/harness-agents/SKILL.md +20 -0
- package/src/templates/workspace-core/.agents/skills/observability/SKILL.md +31 -0
- package/src/templates/workspace-core/.agents/skills/security/SKILL.md +22 -0
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* The headless agent-run seam used by the evals runner.
|
|
3
|
+
*
|
|
4
|
+
* This invokes the real `runAgentLoop` as a *caller* — resolving a provider-
|
|
5
|
+
* agnostic engine + model from the existing registry, converting the app's
|
|
6
|
+
* actions into engine tools, and collecting the assistant's text + tool calls
|
|
7
|
+
* off the `send` event stream into a compact `AgentRunOutput`. It deliberately
|
|
8
|
+
* does NOT modify `production-agent.ts`: everything it needs (`runAgentLoop`,
|
|
9
|
+
* `actionsToEngineTools`, `ActionEntry`) is already exported from there.
|
|
10
|
+
*
|
|
11
|
+
* The factory shape (`createAgentRunner`) keeps the runner unit-testable: tests
|
|
12
|
+
* inject a fake `runAgentLoop` and a fake engine so no real model is called,
|
|
13
|
+
* while production wires in the genuine loop. The same factory builds the
|
|
14
|
+
* `ScorerAnalyzeContext.judge` helper so LLM-judge scorers stream through the
|
|
15
|
+
* exact same resolved engine.
|
|
16
|
+
*/
|
|
17
|
+
import type { ActionEntry, AgentLoopUsage } from "../agent/production-agent.js";
|
|
18
|
+
import type { AgentEngine, EngineMessage, EngineTool } from "../agent/engine/types.js";
|
|
19
|
+
import type { AgentChatEvent } from "../agent/types.js";
|
|
20
|
+
import type { AgentRunOutput, EvalInput, ScorerAnalyzeContext } from "./types.js";
|
|
21
|
+
/**
* The slice of `runAgentLoop` the runner depends on — injectable for tests.
*
* The runner calls it once per eval input with the resolved engine/model, the
* system prompt, the engine tools derived from `actions`, the conversation
* messages, a `send` callback that receives each emitted `AgentChatEvent`, and
* an `AbortSignal` that the runner aborts when its per-run timeout elapses.
* The runner awaits the returned promise but does not read the usage value.
*/
|
|
22
|
+
export type RunAgentLoopFn = (opts: {
|
|
23
|
+
engine: AgentEngine;
|
|
24
|
+
model: string;
|
|
25
|
+
systemPrompt: string;
|
|
26
|
+
tools: EngineTool[];
|
|
27
|
+
messages: EngineMessage[];
|
|
28
|
+
actions: Record<string, ActionEntry>;
|
|
29
|
+
send: (event: AgentChatEvent) => void;
|
|
30
|
+
signal: AbortSignal;
|
|
31
|
+
}) => Promise<AgentLoopUsage>;
|
|
32
|
+
/** Options accepted by `createAgentRunner`; only `actions` is required. */
export interface AgentRunnerConfig {
|
|
33
|
+
/** App actions to expose to the agent under test. */
|
|
34
|
+
actions: Record<string, ActionEntry>;
|
|
35
|
+
/** System prompt for the run (an empty string is used when omitted). */
|
|
36
|
+
systemPrompt?: string;
|
|
37
|
+
/** Pre-resolved engine; resolved from the registry when omitted. */
|
|
38
|
+
engine?: AgentEngine;
|
|
39
|
+
/** Pre-resolved model; resolved from the engine's stored/default when omitted. */
|
|
40
|
+
model?: string;
|
|
41
|
+
/** Per-run wall-clock budget in ms (default 120s). */
|
|
42
|
+
timeoutMs?: number;
|
|
43
|
+
/**
|
|
44
|
+
* Seam for tests / custom hosts. Defaults to the real `runAgentLoop`. The
|
|
45
|
+
* runner only invokes the loop through this value, so it can be swapped.
|
|
46
|
+
*/
|
|
47
|
+
runLoop?: RunAgentLoopFn;
|
|
48
|
+
}
|
|
49
|
+
/** The object returned by `createAgentRunner`. */
export interface AgentRunner {
|
|
50
|
+
/** Run the agent loop for one eval input and collect a compact output. */
|
|
51
|
+
runAgent(input: EvalInput): Promise<AgentRunOutput>;
|
|
52
|
+
/** Analyze context handed to LLM-judge scorers (shares engine/model). */
|
|
53
|
+
analyzeContext(): ScorerAnalyzeContext;
|
|
54
|
+
/** The engine resolved once when the runner was created. */
readonly engine: AgentEngine;
|
|
55
|
+
/** The normalized model id resolved once when the runner was created. */
readonly model: string;
|
|
56
|
+
}
|
|
57
|
+
/**
|
|
58
|
+
* Build an agent runner, resolving the engine/model once up front so every
|
|
59
|
+
* eval case (and every LLM-judge scorer) reuses the same provider-agnostic
|
|
60
|
+
* config. Resolution goes through `resolveEngine` — no model is ever hardcoded.
*
* @param config Actions to expose plus optional engine, model, system prompt,
* timeout, and `runLoop` overrides.
* @returns A promise for a runner exposing `runAgent`, `analyzeContext`, and
* the resolved `engine` and `model`.
|
|
61
|
+
*/
|
|
62
|
+
export declare function createAgentRunner(config: AgentRunnerConfig): Promise<AgentRunner>;
|
|
63
|
+
//# sourceMappingURL=agent-runner.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"agent-runner.d.ts","sourceRoot":"","sources":["../../src/eval/agent-runner.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;GAeG;AAEH,OAAO,KAAK,EAAE,WAAW,EAAE,cAAc,EAAE,MAAM,8BAA8B,CAAC;AAKhF,OAAO,KAAK,EACV,WAAW,EACX,aAAa,EACb,UAAU,EACX,MAAM,0BAA0B,CAAC;AAClC,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,mBAAmB,CAAC;AAMxD,OAAO,KAAK,EACV,cAAc,EACd,SAAS,EACT,oBAAoB,EACrB,MAAM,YAAY,CAAC;AAKpB,gFAAgF;AAChF,MAAM,MAAM,cAAc,GAAG,CAAC,IAAI,EAAE;IAClC,MAAM,EAAE,WAAW,CAAC;IACpB,KAAK,EAAE,MAAM,CAAC;IACd,YAAY,EAAE,MAAM,CAAC;IACrB,KAAK,EAAE,UAAU,EAAE,CAAC;IACpB,QAAQ,EAAE,aAAa,EAAE,CAAC;IAC1B,OAAO,EAAE,MAAM,CAAC,MAAM,EAAE,WAAW,CAAC,CAAC;IACrC,IAAI,EAAE,CAAC,KAAK,EAAE,cAAc,KAAK,IAAI,CAAC;IACtC,MAAM,EAAE,WAAW,CAAC;CACrB,KAAK,OAAO,CAAC,cAAc,CAAC,CAAC;AAE9B,MAAM,WAAW,iBAAiB;IAChC,qDAAqD;IACrD,OAAO,EAAE,MAAM,CAAC,MAAM,EAAE,WAAW,CAAC,CAAC;IACrC,iCAAiC;IACjC,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,oEAAoE;IACpE,MAAM,CAAC,EAAE,WAAW,CAAC;IACrB,kFAAkF;IAClF,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,sDAAsD;IACtD,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB;;;OAGG;IACH,OAAO,CAAC,EAAE,cAAc,CAAC;CAC1B;AAED,MAAM,WAAW,WAAW;IAC1B,0EAA0E;IAC1E,QAAQ,CAAC,KAAK,EAAE,SAAS,GAAG,OAAO,CAAC,cAAc,CAAC,CAAC;IACpD,yEAAyE;IACzE,cAAc,IAAI,oBAAoB,CAAC;IACvC,QAAQ,CAAC,MAAM,EAAE,WAAW,CAAC;IAC7B,QAAQ,CAAC,KAAK,EAAE,MAAM,CAAC;CACxB;AAiBD;;;;GAIG;AACH,wBAAsB,iBAAiB,CACrC,MAAM,EAAE,iBAAiB,GACxB,OAAO,CAAC,WAAW,CAAC,CA0GtB"}
|
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* The headless agent-run seam used by the evals runner.
|
|
3
|
+
*
|
|
4
|
+
* This invokes the real `runAgentLoop` as a *caller* — resolving a provider-
|
|
5
|
+
* agnostic engine + model from the existing registry, converting the app's
|
|
6
|
+
* actions into engine tools, and collecting the assistant's text + tool calls
|
|
7
|
+
* off the `send` event stream into a compact `AgentRunOutput`. It deliberately
|
|
8
|
+
* does NOT modify `production-agent.ts`: everything it needs (`runAgentLoop`,
|
|
9
|
+
* `actionsToEngineTools`, `ActionEntry`) is already exported from there.
|
|
10
|
+
*
|
|
11
|
+
* The factory shape (`createAgentRunner`) keeps the runner unit-testable: tests
|
|
12
|
+
* inject a fake `runAgentLoop` and a fake engine so no real model is called,
|
|
13
|
+
* while production wires in the genuine loop. The same factory builds the
|
|
14
|
+
* `ScorerAnalyzeContext.judge` helper so LLM-judge scorers stream through the
|
|
15
|
+
* exact same resolved engine.
|
|
16
|
+
*/
|
|
17
|
+
import { runAgentLoop, actionsToEngineTools, } from "../agent/production-agent.js";
|
|
18
|
+
import { resolveEngine, getStoredModelForEngine, normalizeModelForEngine, } from "../agent/engine/index.js";
|
|
19
|
+
/** Timeout (ms) for a judge call; only applied when the caller passes no abort signal. */
const JUDGE_TIMEOUT_MS = 30_000;
|
|
20
|
+
/** Per-run agent timeout (ms) used when `config.timeoutMs` is omitted. */
const DEFAULT_AGENT_TIMEOUT_MS = 120_000;
|
|
21
|
+
/**
 * Convert an eval input into the message list handed to the agent loop.
 *
 * Every entry of `input.history` (if any) becomes a message that keeps the
 * turn's role and wraps its text in a single text part; the current
 * `input.prompt` is then appended as the final user message.
 */
function toEngineMessages(input) {
    // Shared shape for both history turns and the trailing prompt.
    const asTextMessage = (role, text) => ({
        role,
        content: [{ type: "text", text }],
    });
    const priorTurns = Array.from(input.history ?? [], (turn) => asTextMessage(turn.role, turn.text));
    return [...priorTurns, asTextMessage("user", input.prompt)];
}
|
|
35
|
+
/**
 * Build an agent runner, resolving the engine and model once up front so every
 * eval case (and every LLM-judge scorer) shares the same configuration.
 *
 * Resolution order: the engine is `config.engine`, else the result of
 * `resolveEngine`; the model is `config.model`, else the stored model for that
 * engine, else `engine.defaultModel`, passed through `normalizeModelForEngine`.
 *
 * @param config Actions to expose plus optional engine, model, system prompt,
 *   timeout, and `runLoop` overrides.
 * @returns An object with `runAgent`, `analyzeContext`, and the resolved
 *   `engine` and `model`.
 */
export async function createAgentRunner(config) {
    const engine = config.engine ?? (await resolveEngine({ engineOption: undefined }));
    const modelCandidate = config.model ??
        (await getStoredModelForEngine(engine)) ??
        engine.defaultModel;
    const model = normalizeModelForEngine(engine, modelCandidate);
    const systemPrompt = config.systemPrompt ?? "";
    // An injected loop wins; otherwise fall back to the imported production loop.
    const runLoop = config.runLoop ?? runAgentLoop;
    const timeoutMs = config.timeoutMs ?? DEFAULT_AGENT_TIMEOUT_MS;
    // Tools are derived from the actions once and reused for every run.
    const tools = actionsToEngineTools(config.actions);
    /**
     * Run the loop once for `input` and collect text, tool names, and status.
     *
     * Errors thrown by the loop or reported through an `error` event are
     * captured in `ok` / `error` instead of rejecting. A run that exceeds
     * `timeoutMs` is always reported as failed, even when the loop reacts to
     * the abort by returning quietly.
     */
    async function runAgent(input) {
        const runId = `eval:${crypto.randomUUID()}`;
        const messages = toEngineMessages(input);
        let text = "";
        const toolCalls = [];
        let ok = true;
        let error;
        let timedOut = false;
        const controller = new AbortController();
        const timer = setTimeout(() => {
            // Remember why we aborted so the outcome can be reported below.
            timedOut = true;
            controller.abort();
        }, timeoutMs);
        const started = Date.now();
        const send = (event) => {
            switch (event.type) {
                case "text":
                    text += event.text;
                    break;
                case "tool_start":
                    toolCalls.push(event.tool);
                    break;
                case "error":
                    ok = false;
                    error = event.error;
                    break;
                default:
                    // Other event types are not part of the compact output.
                    break;
            }
        };
        try {
            await runLoop({
                engine,
                model,
                systemPrompt,
                tools,
                messages,
                actions: config.actions,
                send,
                signal: controller.signal,
            });
        }
        catch (err) {
            ok = false;
            error = err instanceof Error ? err.message : String(err);
        }
        finally {
            clearTimeout(timer);
        }
        if (timedOut) {
            // A loop may stop on abort without throwing or emitting an error;
            // without this the truncated run would be reported as successful.
            // Keep any error already captured, otherwise name the timeout.
            ok = false;
            error = error ?? `Agent run timed out after ${timeoutMs}ms`;
        }
        return {
            text,
            toolCalls,
            ok,
            error,
            runId,
            durationMs: Date.now() - started,
        };
    }
    /**
     * Build the context handed to scorers: the shared engine/model plus a
     * `judge` helper that streams a single user prompt through that engine
     * (no tools, temperature 0) and returns the concatenated text deltas.
     */
    function analyzeContext() {
        return {
            engine,
            model,
            async judge(opts) {
                const controller = new AbortController();
                const signal = opts.signal ?? controller.signal;
                // NOTE(review): a caller-supplied signal replaces the built-in
                // timeout entirely; JUDGE_TIMEOUT_MS only applies without one.
                const timer = opts.signal
                    ? undefined
                    : setTimeout(() => controller.abort(), JUDGE_TIMEOUT_MS);
                let out = "";
                try {
                    const stream = engine.stream({
                        model,
                        systemPrompt: opts.systemPrompt ?? "",
                        messages: [
                            { role: "user", content: [{ type: "text", text: opts.prompt }] },
                        ],
                        tools: [],
                        abortSignal: signal,
                        maxOutputTokens: opts.maxOutputTokens ?? 512,
                        temperature: 0,
                    });
                    for await (const event of stream) {
                        if (event.type === "text-delta")
                            out += event.text;
                    }
                }
                finally {
                    if (timer)
                        clearTimeout(timer);
                }
                return out;
            },
        };
    }
    return { runAgent, analyzeContext, engine, model };
}
|
|
142
|
+
//# sourceMappingURL=agent-runner.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"agent-runner.js","sourceRoot":"","sources":["../../src/eval/agent-runner.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;GAeG;AAGH,OAAO,EACL,YAAY,EACZ,oBAAoB,GACrB,MAAM,8BAA8B,CAAC;AAOtC,OAAO,EACL,aAAa,EACb,uBAAuB,EACvB,uBAAuB,GACxB,MAAM,0BAA0B,CAAC;AAOlC,MAAM,gBAAgB,GAAG,MAAM,CAAC;AAChC,MAAM,wBAAwB,GAAG,OAAO,CAAC;AAyCzC,SAAS,gBAAgB,CAAC,KAAgB;IACxC,MAAM,QAAQ,GAAoB,EAAE,CAAC;IACrC,KAAK,MAAM,IAAI,IAAI,KAAK,CAAC,OAAO,IAAI,EAAE,EAAE,CAAC;QACvC,QAAQ,CAAC,IAAI,CAAC;YACZ,IAAI,EAAE,IAAI,CAAC,IAAI;YACf,OAAO,EAAE,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,IAAI,CAAC,IAAI,EAAE,CAAC;SAC7C,CAAC,CAAC;IACL,CAAC;IACD,QAAQ,CAAC,IAAI,CAAC;QACZ,IAAI,EAAE,MAAM;QACZ,OAAO,EAAE,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,KAAK,CAAC,MAAM,EAAE,CAAC;KAChD,CAAC,CAAC;IACH,OAAO,QAAQ,CAAC;AAClB,CAAC;AAED;;;;GAIG;AACH,MAAM,CAAC,KAAK,UAAU,iBAAiB,CACrC,MAAyB;IAEzB,MAAM,MAAM,GACV,MAAM,CAAC,MAAM,IAAI,CAAC,MAAM,aAAa,CAAC,EAAE,YAAY,EAAE,SAAS,EAAE,CAAC,CAAC,CAAC;IACtE,MAAM,cAAc,GAClB,MAAM,CAAC,KAAK;QACZ,CAAC,MAAM,uBAAuB,CAAC,MAAM,CAAC,CAAC;QACvC,MAAM,CAAC,YAAY,CAAC;IACtB,MAAM,KAAK,GAAG,uBAAuB,CAAC,MAAM,EAAE,cAAc,CAAC,CAAC;IAC9D,MAAM,YAAY,GAAG,MAAM,CAAC,YAAY,IAAI,EAAE,CAAC;IAC/C,MAAM,OAAO,GAAG,MAAM,CAAC,OAAO,IAAK,YAA+B,CAAC;IACnE,MAAM,SAAS,GAAG,MAAM,CAAC,SAAS,IAAI,wBAAwB,CAAC;IAC/D,MAAM,KAAK,GAAG,oBAAoB,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;IAEnD,KAAK,UAAU,QAAQ,CAAC,KAAgB;QACtC,MAAM,KAAK,GAAG,QAAQ,MAAM,CAAC,UAAU,EAAE,EAAE,CAAC;QAC5C,MAAM,QAAQ,GAAG,gBAAgB,CAAC,KAAK,CAAC,CAAC;QAEzC,IAAI,IAAI,GAAG,EAAE,CAAC;QACd,MAAM,SAAS,GAAa,EAAE,CAAC;QAC/B,IAAI,EAAE,GAAG,IAAI,CAAC;QACd,IAAI,KAAyB,CAAC;QAE9B,MAAM,UAAU,GAAG,IAAI,eAAe,EAAE,CAAC;QACzC,MAAM,KAAK,GAAG,UAAU,CAAC,GAAG,EAAE,CAAC,UAAU,CAAC,KAAK,EAAE,EAAE,SAAS,CAAC,CAAC;QAC9D,MAAM,OAAO,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;QAE3B,MAAM,IAAI,GAAG,CAAC,KAAqB,EAAQ,EAAE;YAC3C,QAAQ,KAAK,CAAC,IAAI,EAAE,CAAC;gBACnB,KAAK,MAAM;oBACT,IAAI,IAAI,KAAK,CAAC,IAAI,CAAC;oBACnB,MAAM;gBACR,KAAK,YAAY;oBACf,SAAS,CAAC,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;oBAC3B,MAAM;gBACR,KAAK,OAAO;oBACV,EAAE,G
AAG,KAAK,CAAC;oBACX,KAAK,GAAG,KAAK,CAAC,KAAK,CAAC;oBACpB,MAAM;gBACR;oBACE,MAAM;YACV,CAAC;QACH,CAAC,CAAC;QAEF,IAAI,CAAC;YACH,MAAM,OAAO,CAAC;gBACZ,MAAM;gBACN,KAAK;gBACL,YAAY;gBACZ,KAAK;gBACL,QAAQ;gBACR,OAAO,EAAE,MAAM,CAAC,OAAO;gBACvB,IAAI;gBACJ,MAAM,EAAE,UAAU,CAAC,MAAM;aAC1B,CAAC,CAAC;QACL,CAAC;QAAC,OAAO,GAAG,EAAE,CAAC;YACb,EAAE,GAAG,KAAK,CAAC;YACX,KAAK,GAAG,GAAG,YAAY,KAAK,CAAC,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC;QAC3D,CAAC;gBAAS,CAAC;YACT,YAAY,CAAC,KAAK,CAAC,CAAC;QACtB,CAAC;QAED,OAAO;YACL,IAAI;YACJ,SAAS;YACT,EAAE;YACF,KAAK;YACL,KAAK;YACL,UAAU,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,OAAO;SACjC,CAAC;IACJ,CAAC;IAED,SAAS,cAAc;QACrB,OAAO;YACL,MAAM;YACN,KAAK;YACL,KAAK,CAAC,KAAK,CAAC,IAAI;gBACd,MAAM,UAAU,GAAG,IAAI,eAAe,EAAE,CAAC;gBACzC,MAAM,MAAM,GAAG,IAAI,CAAC,MAAM,IAAI,UAAU,CAAC,MAAM,CAAC;gBAChD,MAAM,KAAK,GAAG,IAAI,CAAC,MAAM;oBACvB,CAAC,CAAC,SAAS;oBACX,CAAC,CAAC,UAAU,CAAC,GAAG,EAAE,CAAC,UAAU,CAAC,KAAK,EAAE,EAAE,gBAAgB,CAAC,CAAC;gBAC3D,IAAI,GAAG,GAAG,EAAE,CAAC;gBACb,IAAI,CAAC;oBACH,MAAM,MAAM,GAAG,MAAM,CAAC,MAAM,CAAC;wBAC3B,KAAK;wBACL,YAAY,EAAE,IAAI,CAAC,YAAY,IAAI,EAAE;wBACrC,QAAQ,EAAE;4BACR,EAAE,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,IAAI,CAAC,MAAM,EAAE,CAAC,EAAE;yBACjE;wBACD,KAAK,EAAE,EAAE;wBACT,WAAW,EAAE,MAAM;wBACnB,eAAe,EAAE,IAAI,CAAC,eAAe,IAAI,GAAG;wBAC5C,WAAW,EAAE,CAAC;qBACf,CAAC,CAAC;oBACH,IAAI,KAAK,EAAE,MAAM,KAAK,IAAI,MAAM,EAAE,CAAC;wBACjC,IAAI,KAAK,CAAC,IAAI,KAAK,YAAY;4BAAE,GAAG,IAAI,KAAK,CAAC,IAAI,CAAC;oBACrD,CAAC;gBACH,CAAC;wBAAS,CAAC;oBACT,IAAI,KAAK;wBAAE,YAAY,CAAC,KAAK,CAAC,CAAC;gBACjC,CAAC;gBACD,OAAO,GAAG,CAAC;YACb,CAAC;SACF,CAAC;IACJ,CAAC;IAED,OAAO,EAAE,QAAQ,EAAE,cAAc,EAAE,MAAM,EAAE,KAAK,EAAE,CAAC;AACrD,CAAC","sourcesContent":["/**\n * The headless agent-run seam used by the evals runner.\n *\n * This invokes the real `runAgentLoop` as a *caller* — resolving a provider-\n * agnostic engine + model from the existing registry, converting the app's\n * actions into engine tools, and collecting the assistant's 
text + tool calls\n * off the `send` event stream into a compact `AgentRunOutput`. It deliberately\n * does NOT modify `production-agent.ts`: everything it needs (`runAgentLoop`,\n * `actionsToEngineTools`, `ActionEntry`) is already exported from there.\n *\n * The factory shape (`createAgentRunner`) keeps the runner unit-testable: tests\n * inject a fake `runAgentLoop` and a fake engine so no real model is called,\n * while production wires in the genuine loop. The same factory builds the\n * `ScorerAnalyzeContext.judge` helper so LLM-judge scorers stream through the\n * exact same resolved engine.\n */\n\nimport type { ActionEntry, AgentLoopUsage } from \"../agent/production-agent.js\";\nimport {\n runAgentLoop,\n actionsToEngineTools,\n} from \"../agent/production-agent.js\";\nimport type {\n AgentEngine,\n EngineMessage,\n EngineTool,\n} from \"../agent/engine/types.js\";\nimport type { AgentChatEvent } from \"../agent/types.js\";\nimport {\n resolveEngine,\n getStoredModelForEngine,\n normalizeModelForEngine,\n} from \"../agent/engine/index.js\";\nimport type {\n AgentRunOutput,\n EvalInput,\n ScorerAnalyzeContext,\n} from \"./types.js\";\n\nconst JUDGE_TIMEOUT_MS = 30_000;\nconst DEFAULT_AGENT_TIMEOUT_MS = 120_000;\n\n/** The slice of `runAgentLoop` the runner depends on — injectable for tests. */\nexport type RunAgentLoopFn = (opts: {\n engine: AgentEngine;\n model: string;\n systemPrompt: string;\n tools: EngineTool[];\n messages: EngineMessage[];\n actions: Record<string, ActionEntry>;\n send: (event: AgentChatEvent) => void;\n signal: AbortSignal;\n}) => Promise<AgentLoopUsage>;\n\nexport interface AgentRunnerConfig {\n /** App actions to expose to the agent under test. */\n actions: Record<string, ActionEntry>;\n /** System prompt for the run. */\n systemPrompt?: string;\n /** Pre-resolved engine; resolved from the registry when omitted. */\n engine?: AgentEngine;\n /** Pre-resolved model; resolved from the engine's stored/default when omitted. 
*/\n model?: string;\n /** Per-run wall-clock budget in ms (default 120s). */\n timeoutMs?: number;\n /**\n * Seam for tests / custom hosts. Defaults to the real `runAgentLoop`. The\n * runner never imports `runAgentLoop` directly so this can be swapped.\n */\n runLoop?: RunAgentLoopFn;\n}\n\nexport interface AgentRunner {\n /** Run the agent loop for one eval input and collect a compact output. */\n runAgent(input: EvalInput): Promise<AgentRunOutput>;\n /** Analyze context handed to LLM-judge scorers (shares engine/model). */\n analyzeContext(): ScorerAnalyzeContext;\n readonly engine: AgentEngine;\n readonly model: string;\n}\n\nfunction toEngineMessages(input: EvalInput): EngineMessage[] {\n const messages: EngineMessage[] = [];\n for (const turn of input.history ?? []) {\n messages.push({\n role: turn.role,\n content: [{ type: \"text\", text: turn.text }],\n });\n }\n messages.push({\n role: \"user\",\n content: [{ type: \"text\", text: input.prompt }],\n });\n return messages;\n}\n\n/**\n * Build an agent runner, resolving the engine/model once up front so every\n * eval case (and every LLM-judge scorer) reuses the same provider-agnostic\n * config. Resolution goes through `resolveEngine` — no model is ever hardcoded.\n */\nexport async function createAgentRunner(\n config: AgentRunnerConfig,\n): Promise<AgentRunner> {\n const engine =\n config.engine ?? (await resolveEngine({ engineOption: undefined }));\n const modelCandidate =\n config.model ??\n (await getStoredModelForEngine(engine)) ??\n engine.defaultModel;\n const model = normalizeModelForEngine(engine, modelCandidate);\n const systemPrompt = config.systemPrompt ?? \"\";\n const runLoop = config.runLoop ?? (runAgentLoop as RunAgentLoopFn);\n const timeoutMs = config.timeoutMs ?? 
DEFAULT_AGENT_TIMEOUT_MS;\n const tools = actionsToEngineTools(config.actions);\n\n async function runAgent(input: EvalInput): Promise<AgentRunOutput> {\n const runId = `eval:${crypto.randomUUID()}`;\n const messages = toEngineMessages(input);\n\n let text = \"\";\n const toolCalls: string[] = [];\n let ok = true;\n let error: string | undefined;\n\n const controller = new AbortController();\n const timer = setTimeout(() => controller.abort(), timeoutMs);\n const started = Date.now();\n\n const send = (event: AgentChatEvent): void => {\n switch (event.type) {\n case \"text\":\n text += event.text;\n break;\n case \"tool_start\":\n toolCalls.push(event.tool);\n break;\n case \"error\":\n ok = false;\n error = event.error;\n break;\n default:\n break;\n }\n };\n\n try {\n await runLoop({\n engine,\n model,\n systemPrompt,\n tools,\n messages,\n actions: config.actions,\n send,\n signal: controller.signal,\n });\n } catch (err) {\n ok = false;\n error = err instanceof Error ? err.message : String(err);\n } finally {\n clearTimeout(timer);\n }\n\n return {\n text,\n toolCalls,\n ok,\n error,\n runId,\n durationMs: Date.now() - started,\n };\n }\n\n function analyzeContext(): ScorerAnalyzeContext {\n return {\n engine,\n model,\n async judge(opts): Promise<string> {\n const controller = new AbortController();\n const signal = opts.signal ?? controller.signal;\n const timer = opts.signal\n ? undefined\n : setTimeout(() => controller.abort(), JUDGE_TIMEOUT_MS);\n let out = \"\";\n try {\n const stream = engine.stream({\n model,\n systemPrompt: opts.systemPrompt ?? \"\",\n messages: [\n { role: \"user\", content: [{ type: \"text\", text: opts.prompt }] },\n ],\n tools: [],\n abortSignal: signal,\n maxOutputTokens: opts.maxOutputTokens ?? 
512,\n temperature: 0,\n });\n for await (const event of stream) {\n if (event.type === \"text-delta\") out += event.text;\n }\n } finally {\n if (timer) clearTimeout(timer);\n }\n return out;\n },\n };\n }\n\n return { runAgent, analyzeContext, engine, model };\n}\n"]}
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* `defineEval` — declare a named eval test case.
|
|
3
|
+
*
|
|
4
|
+
* An eval pairs an input (prompt + optional history/setup) with a list of
|
|
5
|
+
* scorers and a pass threshold. The runner (see `runner.ts`) actually runs the
|
|
6
|
+
* agent for the input, scores the output with each scorer, and gates on the
|
|
7
|
+
* threshold. Authors write `*.eval.ts` files that `export default defineEval(...)`
|
|
8
|
+
* or export an array of them.
|
|
9
|
+
*
|
|
10
|
+
* Example (`evals/greeting.eval.ts`):
|
|
11
|
+
* ```ts
|
|
12
|
+
* import { defineEval, contains, llmJudge } from "@agent-native/core/eval";
|
|
13
|
+
*
|
|
14
|
+
* export default defineEval({
|
|
15
|
+
* name: "greets the user by name",
|
|
16
|
+
* input: { prompt: "Say hi to Ada." },
|
|
17
|
+
* threshold: 0.7,
|
|
18
|
+
* scorers: [
|
|
19
|
+
* contains("Ada"),
|
|
20
|
+
* llmJudge({ criteria: "friendliness", rubric: "1.0 = warm greeting" }),
|
|
21
|
+
* ],
|
|
22
|
+
* });
|
|
23
|
+
* ```
|
|
24
|
+
*/
|
|
25
|
+
import type { Eval } from "./types.js";
|
|
26
|
+
/** Default per-scorer pass threshold when an eval doesn't specify one. */
|
|
27
|
+
export declare const DEFAULT_EVAL_THRESHOLD = 0.5;
|
|
28
|
+
/**
 * Validate an eval spec and return it unchanged.
 *
 * @param spec - The eval definition to validate.
 * @returns The same `spec` object that was passed in.
 * @throws Error when `name` is missing or not a string, when `input.prompt`
 *   is not a string, when `scorers` is not a non-empty array, or when a
 *   provided `threshold` lies outside [0, 1].
 */
export declare function defineEval(spec: Eval): Eval;
|
|
29
|
+
//# sourceMappingURL=define-eval.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"define-eval.d.ts","sourceRoot":"","sources":["../../src/eval/define-eval.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;GAuBG;AAEH,OAAO,KAAK,EAAE,IAAI,EAAE,MAAM,YAAY,CAAC;AAEvC,0EAA0E;AAC1E,eAAO,MAAM,sBAAsB,MAAM,CAAC;AAE1C,wBAAgB,UAAU,CAAC,IAAI,EAAE,IAAI,GAAG,IAAI,CAqB3C"}
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* `defineEval` — declare a named eval test case.
|
|
3
|
+
*
|
|
4
|
+
* An eval pairs an input (prompt + optional history/setup) with a list of
|
|
5
|
+
* scorers and a pass threshold. The runner (see `runner.ts`) actually runs the
|
|
6
|
+
* agent for the input, scores the output with each scorer, and gates on the
|
|
7
|
+
* threshold. Authors write `*.eval.ts` files that `export default defineEval(...)`
|
|
8
|
+
* or export an array of them.
|
|
9
|
+
*
|
|
10
|
+
* Example (`evals/greeting.eval.ts`):
|
|
11
|
+
* ```ts
|
|
12
|
+
* import { defineEval, contains, llmJudge } from "@agent-native/core/eval";
|
|
13
|
+
*
|
|
14
|
+
* export default defineEval({
|
|
15
|
+
* name: "greets the user by name",
|
|
16
|
+
* input: { prompt: "Say hi to Ada." },
|
|
17
|
+
* threshold: 0.7,
|
|
18
|
+
* scorers: [
|
|
19
|
+
* contains("Ada"),
|
|
20
|
+
* llmJudge({ criteria: "friendliness", rubric: "1.0 = warm greeting" }),
|
|
21
|
+
* ],
|
|
22
|
+
* });
|
|
23
|
+
* ```
|
|
24
|
+
*/
|
|
25
|
+
/** Default per-scorer pass threshold when an eval doesn't specify one. */
|
|
26
|
+
export const DEFAULT_EVAL_THRESHOLD = 0.5;
|
|
27
|
+
/**
 * Validate an eval spec at declaration time and hand it back unchanged.
 *
 * Checks, in order: `name` is a non-empty string, `input.prompt` is a string,
 * `scorers` is a non-empty array, and, when a threshold is provided, that it
 * is a real number within [0, 1].
 *
 * @param spec - The eval definition to validate.
 * @returns The same `spec` object (not a copy).
 * @throws Error if any of the checks above fails; the message names the
 *   offending field and (once known) the eval's name.
 */
export function defineEval(spec) {
    if (!spec.name || typeof spec.name !== "string") {
        throw new Error("defineEval: `name` is required");
    }
    if (!spec.input || typeof spec.input.prompt !== "string") {
        throw new Error(`defineEval("${spec.name}"): \`input.prompt\` is required`);
    }
    if (!Array.isArray(spec.scorers) || spec.scorers.length === 0) {
        throw new Error(`defineEval("${spec.name}"): at least one scorer is required`);
    }
    const threshold = spec.threshold;
    // `null` is left alone, exactly as before (it used to coerce to 0 and pass).
    if (threshold !== undefined && threshold !== null) {
        // Positive range test on purpose: NaN fails every comparison, so the
        // old `t < 0 || t > 1` form silently accepted it. Requiring a real
        // number also stops strings/booleans from sneaking in via coercion.
        const inRange = typeof threshold === "number" && threshold >= 0 && threshold <= 1;
        if (!inRange) {
            throw new Error(`defineEval("${spec.name}"): \`threshold\` must be in [0, 1]`);
        }
    }
    return spec;
}
|
|
43
|
+
//# sourceMappingURL=define-eval.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"define-eval.js","sourceRoot":"","sources":["../../src/eval/define-eval.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;GAuBG;AAIH,0EAA0E;AAC1E,MAAM,CAAC,MAAM,sBAAsB,GAAG,GAAG,CAAC;AAE1C,MAAM,UAAU,UAAU,CAAC,IAAU;IACnC,IAAI,CAAC,IAAI,CAAC,IAAI,IAAI,OAAO,IAAI,CAAC,IAAI,KAAK,QAAQ,EAAE,CAAC;QAChD,MAAM,IAAI,KAAK,CAAC,gCAAgC,CAAC,CAAC;IACpD,CAAC;IACD,IAAI,CAAC,IAAI,CAAC,KAAK,IAAI,OAAO,IAAI,CAAC,KAAK,CAAC,MAAM,KAAK,QAAQ,EAAE,CAAC;QACzD,MAAM,IAAI,KAAK,CAAC,eAAe,IAAI,CAAC,IAAI,kCAAkC,CAAC,CAAC;IAC9E,CAAC;IACD,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,IAAI,CAAC,OAAO,CAAC,IAAI,IAAI,CAAC,OAAO,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QAC9D,MAAM,IAAI,KAAK,CACb,eAAe,IAAI,CAAC,IAAI,qCAAqC,CAC9D,CAAC;IACJ,CAAC;IACD,IACE,IAAI,CAAC,SAAS,KAAK,SAAS;QAC5B,CAAC,IAAI,CAAC,SAAS,GAAG,CAAC,IAAI,IAAI,CAAC,SAAS,GAAG,CAAC,CAAC,EAC1C,CAAC;QACD,MAAM,IAAI,KAAK,CACb,eAAe,IAAI,CAAC,IAAI,qCAAqC,CAC9D,CAAC;IACJ,CAAC;IACD,OAAO,IAAI,CAAC;AACd,CAAC","sourcesContent":["/**\n * `defineEval` — declare a named eval test case.\n *\n * An eval pairs an input (prompt + optional history/setup) with a list of\n * scorers and a pass threshold. The runner (see `runner.ts`) actually runs the\n * agent for the input, scores the output with each scorer, and gates on the\n * threshold. Authors write `*.eval.ts` files that `export default defineEval(...)`\n * or export an array of them.\n *\n * Example (`evals/greeting.eval.ts`):\n * ```ts\n * import { defineEval, contains, llmJudge } from \"@agent-native/core/eval\";\n *\n * export default defineEval({\n * name: \"greets the user by name\",\n * input: { prompt: \"Say hi to Ada.\" },\n * threshold: 0.7,\n * scorers: [\n * contains(\"Ada\"),\n * llmJudge({ criteria: \"friendliness\", rubric: \"1.0 = warm greeting\" }),\n * ],\n * });\n * ```\n */\n\nimport type { Eval } from \"./types.js\";\n\n/** Default per-scorer pass threshold when an eval doesn't specify one. 
*/\nexport const DEFAULT_EVAL_THRESHOLD = 0.5;\n\nexport function defineEval(spec: Eval): Eval {\n if (!spec.name || typeof spec.name !== \"string\") {\n throw new Error(\"defineEval: `name` is required\");\n }\n if (!spec.input || typeof spec.input.prompt !== \"string\") {\n throw new Error(`defineEval(\"${spec.name}\"): \\`input.prompt\\` is required`);\n }\n if (!Array.isArray(spec.scorers) || spec.scorers.length === 0) {\n throw new Error(\n `defineEval(\"${spec.name}\"): at least one scorer is required`,\n );\n }\n if (\n spec.threshold !== undefined &&\n (spec.threshold < 0 || spec.threshold > 1)\n ) {\n throw new Error(\n `defineEval(\"${spec.name}\"): \\`threshold\\` must be in [0, 1]`,\n );\n }\n return spec;\n}\n"]}
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Public surface for the first-class evals primitive.
|
|
3
|
+
*
|
|
4
|
+
* Authors write `*.eval.ts` (or `evals/*.ts`) files that `export default
|
|
5
|
+
* defineEval(...)`, compose scorers with `createScorer` / the built-ins, and
|
|
6
|
+
* run them with `agent-native eval` (which gates CI on the thresholds).
|
|
7
|
+
*
|
|
8
|
+
* This is complementary to `@agent-native/core`'s observability run-scoring:
|
|
9
|
+
* that scores real production runs after the fact; this actively runs the
|
|
10
|
+
* agent against fixed inputs as a deterministic gate. See `types.ts`.
|
|
11
|
+
*/
|
|
12
|
+
export { defineEval, DEFAULT_EVAL_THRESHOLD } from "./define-eval.js";
|
|
13
|
+
export { createScorer, clamp01, exactMatch, contains, usesTool, llmJudge, type LlmJudgeOptions, } from "./scorer.js";
|
|
14
|
+
export { createAgentRunner, type AgentRunner, type AgentRunnerConfig, type RunAgentLoopFn, } from "./agent-runner.js";
|
|
15
|
+
export { runEvalSuite, runEvals, scoreEval, loadEvals, discoverEvalFiles, type RunEvalSuiteOptions, } from "./runner.js";
|
|
16
|
+
export { formatReport } from "./report.js";
|
|
17
|
+
export type { Eval, EvalInput, EvalRunContext, AgentRunOutput, Scorer, ScorerDefinition, ScorerAnalyzeContext, ScorerResult, EvalResultRow, EvalRunReport, } from "./types.js";
|
|
18
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/eval/index.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;GAUG;AAEH,OAAO,EAAE,UAAU,EAAE,sBAAsB,EAAE,MAAM,kBAAkB,CAAC;AACtE,OAAO,EACL,YAAY,EACZ,OAAO,EACP,UAAU,EACV,QAAQ,EACR,QAAQ,EACR,QAAQ,EACR,KAAK,eAAe,GACrB,MAAM,aAAa,CAAC;AACrB,OAAO,EACL,iBAAiB,EACjB,KAAK,WAAW,EAChB,KAAK,iBAAiB,EACtB,KAAK,cAAc,GACpB,MAAM,mBAAmB,CAAC;AAC3B,OAAO,EACL,YAAY,EACZ,QAAQ,EACR,SAAS,EACT,SAAS,EACT,iBAAiB,EACjB,KAAK,mBAAmB,GACzB,MAAM,aAAa,CAAC;AACrB,OAAO,EAAE,YAAY,EAAE,MAAM,aAAa,CAAC;AAC3C,YAAY,EACV,IAAI,EACJ,SAAS,EACT,cAAc,EACd,cAAc,EACd,MAAM,EACN,gBAAgB,EAChB,oBAAoB,EACpB,YAAY,EACZ,aAAa,EACb,aAAa,GACd,MAAM,YAAY,CAAC"}
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Public surface for the first-class evals primitive.
|
|
3
|
+
*
|
|
4
|
+
* Authors write `*.eval.ts` (or `evals/*.ts`) files that `export default
|
|
5
|
+
* defineEval(...)`, compose scorers with `createScorer` / the built-ins, and
|
|
6
|
+
* run them with `agent-native eval` (which gates CI on the thresholds).
|
|
7
|
+
*
|
|
8
|
+
* This is complementary to `@agent-native/core`'s observability run-scoring:
|
|
9
|
+
* that scores real production runs after the fact; this actively runs the
|
|
10
|
+
* agent against fixed inputs as a deterministic gate. See `types.ts`.
|
|
11
|
+
*/
|
|
12
|
+
export { defineEval, DEFAULT_EVAL_THRESHOLD } from "./define-eval.js";
|
|
13
|
+
export { createScorer, clamp01, exactMatch, contains, usesTool, llmJudge, } from "./scorer.js";
|
|
14
|
+
export { createAgentRunner, } from "./agent-runner.js";
|
|
15
|
+
export { runEvalSuite, runEvals, scoreEval, loadEvals, discoverEvalFiles, } from "./runner.js";
|
|
16
|
+
export { formatReport } from "./report.js";
|
|
17
|
+
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/eval/index.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;GAUG;AAEH,OAAO,EAAE,UAAU,EAAE,sBAAsB,EAAE,MAAM,kBAAkB,CAAC;AACtE,OAAO,EACL,YAAY,EACZ,OAAO,EACP,UAAU,EACV,QAAQ,EACR,QAAQ,EACR,QAAQ,GAET,MAAM,aAAa,CAAC;AACrB,OAAO,EACL,iBAAiB,GAIlB,MAAM,mBAAmB,CAAC;AAC3B,OAAO,EACL,YAAY,EACZ,QAAQ,EACR,SAAS,EACT,SAAS,EACT,iBAAiB,GAElB,MAAM,aAAa,CAAC;AACrB,OAAO,EAAE,YAAY,EAAE,MAAM,aAAa,CAAC","sourcesContent":["/**\n * Public surface for the first-class evals primitive.\n *\n * Authors write `*.eval.ts` (or `evals/*.ts`) files that `export default\n * defineEval(...)`, compose scorers with `createScorer` / the built-ins, and\n * run them with `agent-native eval` (which gates CI on the thresholds).\n *\n * This is complementary to `@agent-native/core`'s observability run-scoring:\n * that scores real production runs after the fact; this actively runs the\n * agent against fixed inputs as a deterministic gate. See `types.ts`.\n */\n\nexport { defineEval, DEFAULT_EVAL_THRESHOLD } from \"./define-eval.js\";\nexport {\n createScorer,\n clamp01,\n exactMatch,\n contains,\n usesTool,\n llmJudge,\n type LlmJudgeOptions,\n} from \"./scorer.js\";\nexport {\n createAgentRunner,\n type AgentRunner,\n type AgentRunnerConfig,\n type RunAgentLoopFn,\n} from \"./agent-runner.js\";\nexport {\n runEvalSuite,\n runEvals,\n scoreEval,\n loadEvals,\n discoverEvalFiles,\n type RunEvalSuiteOptions,\n} from \"./runner.js\";\nexport { formatReport } from \"./report.js\";\nexport type {\n Eval,\n EvalInput,\n EvalRunContext,\n AgentRunOutput,\n Scorer,\n ScorerDefinition,\n ScorerAnalyzeContext,\n ScorerResult,\n EvalResultRow,\n EvalRunReport,\n} from \"./types.js\";\n"]}
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Human-readable formatting for an eval run report. Kept separate from the
|
|
3
|
+
* runner so the CLI can print a table while CI consumes the JSON shape.
|
|
4
|
+
*/
|
|
5
|
+
import type { EvalRunReport } from "./types.js";
|
|
6
|
+
/**
 * Render a scored table for the terminal.
 *
 * The output is one newline-joined string: an "Evals" header, then for each
 * result a pass/fail line with its average score and threshold, an optional
 * run-error line, and one line per scorer; it ends with a PASS/FAIL summary
 * of how many evals passed.
 *
 * @param report - Aggregated eval run results to render.
 * @returns The formatted, multi-line report text.
 */
|
|
7
|
+
export declare function formatReport(report: EvalRunReport): string;
|
|
8
|
+
//# sourceMappingURL=report.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"report.d.ts","sourceRoot":"","sources":["../../src/eval/report.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,YAAY,CAAC;AAgBhD,8CAA8C;AAC9C,wBAAgB,YAAY,CAAC,MAAM,EAAE,aAAa,GAAG,MAAM,CAqC1D"}
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Human-readable formatting for an eval run report. Kept separate from the
|
|
3
|
+
* runner so the CLI can print a table while CI consumes the JSON shape.
|
|
4
|
+
*/
|
|
5
|
+
/**
 * Draw a fixed-width progress bar for a score.
 *
 * The score is clamped to [0, 1] first; the bar is `width` cells long, with
 * the rounded proportion drawn as "█" and the remainder as "░".
 *
 * @param score - Score to visualise.
 * @param width - Total number of cells (defaults to 10).
 * @returns The bar string.
 */
function bar(score, width = 10) {
    const solidCells = Math.round(width * clamp01(score));
    const hollowCells = width - solidCells;
    const solid = "█".repeat(solidCells);
    const hollow = "░".repeat(hollowCells);
    return solid + hollow;
}
|
|
9
|
+
/**
 * Clamp a value into the closed interval [0, 1].
 *
 * Anything that is not a finite number (NaN, ±Infinity, non-numbers) maps to 0.
 *
 * @param n - Value to clamp.
 * @returns `n` limited to [0, 1], or 0 when `n` is not finite.
 */
function clamp01(n) {
    // Math.max/Math.min are kept (rather than comparisons) so -0 still
    // normalises to +0 exactly as before.
    return Number.isFinite(n) ? Math.min(1, Math.max(0, n)) : 0;
}
|
|
14
|
+
/**
 * Format a score as a whole-number percentage, left-padded to 4 characters.
 *
 * The score is clamped to [0, 1] before being scaled and rounded.
 *
 * @param score - Score to format.
 * @returns Text such as " 70%" or "100%".
 */
function pct(score) {
    const wholePercent = Math.round(100 * clamp01(score));
    const label = String(wholePercent) + "%";
    return label.padStart(4);
}
|
|
17
|
+
/**
 * Render a scored table for the terminal.
 *
 * Layout: a header, then per eval a pass/fail line (average score and
 * threshold), an optional run-error line and one line per scorer (mark, padded
 * scorer name, bar, percentage, optional reason); finally a PASS/FAIL summary.
 *
 * @param report - Aggregated eval run results (`results`, `passed`, `failed`, `total`).
 * @returns The report as one newline-joined string.
 */
export function formatReport(report) {
    const divider = " ─────";
    const out = ["", " Evals", divider];
    for (const row of report.results) {
        const rowMark = row.passed ? "✓" : "✗";
        out.push("", ` ${rowMark} ${row.eval} (avg ${pct(row.avgScore)}, threshold ${pct(row.threshold)})`);
        if (row.error) {
            out.push(` ⚠ run error: ${row.error}`);
        }
        for (const entry of row.scores) {
            const entryMark = entry.passed ? "✓" : "✗";
            let suffix = "";
            if (entry.reason) {
                suffix = ` — ${entry.reason}`;
            }
            const label = entry.scorer.padEnd(20);
            out.push(` ${entryMark} ${label} ${bar(entry.score)} ${pct(entry.score)}${suffix}`);
        }
    }
    // The run passes only when no eval failed.
    const allPassed = report.failed === 0;
    let summary = ` ${allPassed ? "PASS" : "FAIL"}: ${report.passed}/${report.total} evals passed`;
    if (report.failed > 0) {
        summary += `, ${report.failed} below threshold`;
    }
    out.push("", divider, summary, "");
    return out.join("\n");
}
|
|
44
|
+
//# sourceMappingURL=report.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"report.js","sourceRoot":"","sources":["../../src/eval/report.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAIH,SAAS,GAAG,CAAC,KAAa,EAAE,KAAK,GAAG,EAAE;IACpC,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,KAAK,CAAC,GAAG,KAAK,CAAC,CAAC;IAClD,OAAO,GAAG,CAAC,MAAM,CAAC,MAAM,CAAC,GAAG,GAAG,CAAC,MAAM,CAAC,KAAK,GAAG,MAAM,CAAC,CAAC;AACzD,CAAC;AAED,SAAS,OAAO,CAAC,CAAS;IACxB,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,CAAC,CAAC;QAAE,OAAO,CAAC,CAAC;IAClC,OAAO,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC;AACrC,CAAC;AAED,SAAS,GAAG,CAAC,KAAa;IACxB,OAAO,GAAG,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,KAAK,CAAC,GAAG,GAAG,CAAC,GAAG,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC;AAC5D,CAAC;AAED,8CAA8C;AAC9C,MAAM,UAAU,YAAY,CAAC,MAAqB;IAChD,MAAM,KAAK,GAAa,EAAE,CAAC;IAC3B,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;IACf,KAAK,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;IACtB,KAAK,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;IAEtB,KAAK,MAAM,GAAG,IAAI,MAAM,CAAC,OAAO,EAAE,CAAC;QACjC,MAAM,IAAI,GAAG,GAAG,CAAC,MAAM,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,GAAG,CAAC;QACpC,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;QACf,KAAK,CAAC,IAAI,CACR,KAAK,IAAI,IAAI,GAAG,CAAC,IAAI,UAAU,GAAG,CAAC,GAAG,CAAC,QAAQ,CAAC,eAAe,GAAG,CAChE,GAAG,CAAC,SAAS,CACd,GAAG,CACL,CAAC;QACF,IAAI,GAAG,CAAC,KAAK,EAAE,CAAC;YACd,KAAK,CAAC,IAAI,CAAC,sBAAsB,GAAG,CAAC,KAAK,EAAE,CAAC,CAAC;QAChD,CAAC;QACD,KAAK,MAAM,CAAC,IAAI,GAAG,CAAC,MAAM,EAAE,CAAC;YAC3B,MAAM,KAAK,GAAG,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,GAAG,CAAC;YACnC,MAAM,MAAM,GAAG,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;YACjD,KAAK,CAAC,IAAI,CACR,SAAS,KAAK,IAAI,CAAC,CAAC,MAAM,CAAC,MAAM,CAAC,EAAE,CAAC,IAAI,GAAG,CAAC,CAAC,CAAC,KAAK,CAAC,IAAI,GAAG,CAC1D,CAAC,CAAC,KAAK,CACR,GAAG,MAAM,EAAE,CACb,CAAC;QACJ,CAAC;IACH,CAAC;IAED,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;IACf,KAAK,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;IACtB,MAAM,OAAO,GAAG,MAAM,CAAC,MAAM,KAAK,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM,CAAC;IACtD,KAAK,CAAC,IAAI,CACR,KAAK,OAAO,KAAK,MAAM,CAAC,MAAM,IAAI,MAAM,CAAC,KAAK,eAAe;QAC3D,CAAC,MAAM
,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,KAAK,MAAM,CAAC,MAAM,kBAAkB,CAAC,CAAC,CAAC,EAAE,CAAC,CAClE,CAAC;IACF,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;IACf,OAAO,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AAC1B,CAAC","sourcesContent":["/**\n * Human-readable formatting for an eval run report. Kept separate from the\n * runner so the CLI can print a table while CI consumes the JSON shape.\n */\n\nimport type { EvalRunReport } from \"./types.js\";\n\nfunction bar(score: number, width = 10): string {\n const filled = Math.round(clamp01(score) * width);\n return \"█\".repeat(filled) + \"░\".repeat(width - filled);\n}\n\nfunction clamp01(n: number): number {\n if (!Number.isFinite(n)) return 0;\n return Math.max(0, Math.min(1, n));\n}\n\nfunction pct(score: number): string {\n return `${Math.round(clamp01(score) * 100)}%`.padStart(4);\n}\n\n/** Render a scored table for the terminal. */\nexport function formatReport(report: EvalRunReport): string {\n const lines: string[] = [];\n lines.push(\"\");\n lines.push(\" Evals\");\n lines.push(\" ─────\");\n\n for (const row of report.results) {\n const mark = row.passed ? \"✓\" : \"✗\";\n lines.push(\"\");\n lines.push(\n ` ${mark} ${row.eval} (avg ${pct(row.avgScore)}, threshold ${pct(\n row.threshold,\n )})`,\n );\n if (row.error) {\n lines.push(` ⚠ run error: ${row.error}`);\n }\n for (const s of row.scores) {\n const smark = s.passed ? \"✓\" : \"✗\";\n const reason = s.reason ? ` — ${s.reason}` : \"\";\n lines.push(\n ` ${smark} ${s.scorer.padEnd(20)} ${bar(s.score)} ${pct(\n s.score,\n )}${reason}`,\n );\n }\n }\n\n lines.push(\"\");\n lines.push(\" ─────\");\n const verdict = report.failed === 0 ? \"PASS\" : \"FAIL\";\n lines.push(\n ` ${verdict}: ${report.passed}/${report.total} evals passed` +\n (report.failed > 0 ? `, ${report.failed} below threshold` : \"\"),\n );\n lines.push(\"\");\n return lines.join(\"\\n\");\n}\n"]}
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* The evals runner: discover `*.eval.ts` / `evals/*.ts` files, run each eval
|
|
3
|
+
* through its scorer pipeline against the *real* agent loop, score, and report.
|
|
4
|
+
*
|
|
5
|
+
* It is the engine behind `agent-native eval` — when used as a CI deploy gate
|
|
6
|
+
* the CLI exits non-zero if any eval scores below its threshold.
|
|
7
|
+
*
|
|
8
|
+
* Two layers:
|
|
9
|
+
* - `scoreEval` / `runEvals` — pure orchestration over an `AgentRunner` and
|
|
10
|
+
* a list of evals. Fully unit-testable with an injected runner (no model).
|
|
11
|
+
* - `discoverEvalFiles` / `loadEvals` — filesystem discovery + dynamic import
|
|
12
|
+
* of author-written eval modules.
|
|
13
|
+
*
|
|
14
|
+
* Results are also (best-effort) written to the observability eval store so a
|
|
15
|
+
* dashboard can surface CI eval history next to production run evals.
|
|
16
|
+
*/
|
|
17
|
+
import type { ActionEntry } from "../agent/production-agent.js";
|
|
18
|
+
import type { AgentRunner } from "./agent-runner.js";
|
|
19
|
+
import type { Eval, EvalResultRow, EvalRunReport } from "./types.js";
|
|
20
|
+
/** Run a single eval: invoke the agent, then score with each scorer. */
|
|
21
|
+
export declare function scoreEval(evalCase: Eval, runner: AgentRunner, opts?: {
|
|
22
|
+
thresholdOverride?: number;
|
|
23
|
+
}): Promise<EvalResultRow>;
|
|
24
|
+
/** Run a batch of evals against one runner and aggregate a report. */
|
|
25
|
+
export declare function runEvals(evals: Eval[], runner: AgentRunner, opts?: {
|
|
26
|
+
thresholdOverride?: number;
|
|
27
|
+
persist?: boolean;
|
|
28
|
+
}): Promise<EvalRunReport>;
|
|
29
|
+
/**
|
|
30
|
+
* Walk `root` for eval files. Matches two conventions:
|
|
31
|
+
* - any `**\/*.eval.ts` (co-located with code), and
|
|
32
|
+
* - any `*.ts` directly inside an `evals/` directory.
|
|
33
|
+
* `pattern` further filters by substring of the relative path.
|
|
34
|
+
*/
|
|
35
|
+
export declare function discoverEvalFiles(root: string, pattern?: string): Promise<string[]>;
|
|
36
|
+
/** Discover and import all eval files under `root`, returning their evals. */
|
|
37
|
+
export declare function loadEvals(root: string, pattern?: string): Promise<{
|
|
38
|
+
files: string[];
|
|
39
|
+
evals: Eval[];
|
|
40
|
+
}>;
|
|
41
|
+
export interface RunEvalSuiteOptions {
|
|
42
|
+
/** App root to discover eval files + actions under. Defaults to cwd. */
|
|
43
|
+
cwd?: string;
|
|
44
|
+
/** Substring filter on the eval file path. */
|
|
45
|
+
pattern?: string;
|
|
46
|
+
/** Global threshold override (wins over per-eval thresholds). */
|
|
47
|
+
thresholdOverride?: number;
|
|
48
|
+
/** App actions to expose to the agent. Auto-discovered when omitted. */
|
|
49
|
+
actions?: Record<string, ActionEntry>;
|
|
50
|
+
/** System prompt for runs. */
|
|
51
|
+
systemPrompt?: string;
|
|
52
|
+
/** Write results to the observability eval store (default true). */
|
|
53
|
+
persist?: boolean;
|
|
54
|
+
/** Pre-built runner (tests inject this to avoid touching engine/loop). */
|
|
55
|
+
runner?: AgentRunner;
|
|
56
|
+
/** Pre-loaded evals (tests inject this to skip filesystem discovery). */
|
|
57
|
+
evals?: Eval[];
|
|
58
|
+
}
|
|
59
|
+
/**
|
|
60
|
+
* End-to-end: load evals, build a runner, score, report. The CLI wraps this
|
|
61
|
+
* and maps `report.failed > 0` to a non-zero exit code (the CI gate).
|
|
62
|
+
*/
|
|
63
|
+
export declare function runEvalSuite(opts?: RunEvalSuiteOptions): Promise<{
|
|
64
|
+
report: EvalRunReport;
|
|
65
|
+
files: string[];
|
|
66
|
+
}>;
|
|
67
|
+
//# sourceMappingURL=runner.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"runner.d.ts","sourceRoot":"","sources":["../../src/eval/runner.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;GAeG;AAKH,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,8BAA8B,CAAC;AAMhE,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,mBAAmB,CAAC;AAErD,OAAO,KAAK,EAEV,IAAI,EACJ,aAAa,EACb,aAAa,EAEd,MAAM,YAAY,CAAC;AAsCpB,wEAAwE;AACxE,wBAAsB,SAAS,CAC7B,QAAQ,EAAE,IAAI,EACd,MAAM,EAAE,WAAW,EACnB,IAAI,GAAE;IAAE,iBAAiB,CAAC,EAAE,MAAM,CAAA;CAAO,GACxC,OAAO,CAAC,aAAa,CAAC,CAkCxB;AAED,sEAAsE;AACtE,wBAAsB,QAAQ,CAC5B,KAAK,EAAE,IAAI,EAAE,EACb,MAAM,EAAE,WAAW,EACnB,IAAI,GAAE;IAAE,iBAAiB,CAAC,EAAE,MAAM,CAAC;IAAC,OAAO,CAAC,EAAE,OAAO,CAAA;CAAO,GAC3D,OAAO,CAAC,aAAa,CAAC,CAexB;AA4CD;;;;;GAKG;AACH,wBAAsB,iBAAiB,CACrC,IAAI,EAAE,MAAM,EACZ,OAAO,CAAC,EAAE,MAAM,GACf,OAAO,CAAC,MAAM,EAAE,CAAC,CAqCnB;AA0BD,8EAA8E;AAC9E,wBAAsB,SAAS,CAC7B,IAAI,EAAE,MAAM,EACZ,OAAO,CAAC,EAAE,MAAM,GACf,OAAO,CAAC;IAAE,KAAK,EAAE,MAAM,EAAE,CAAC;IAAC,KAAK,EAAE,IAAI,EAAE,CAAA;CAAE,CAAC,CAW7C;AAID,MAAM,WAAW,mBAAmB;IAClC,wEAAwE;IACxE,GAAG,CAAC,EAAE,MAAM,CAAC;IACb,8CAA8C;IAC9C,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,iEAAiE;IACjE,iBAAiB,CAAC,EAAE,MAAM,CAAC;IAC3B,wEAAwE;IACxE,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,WAAW,CAAC,CAAC;IACtC,8BAA8B;IAC9B,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,oEAAoE;IACpE,OAAO,CAAC,EAAE,OAAO,CAAC;IAClB,0EAA0E;IAC1E,MAAM,CAAC,EAAE,WAAW,CAAC;IACrB,yEAAyE;IACzE,KAAK,CAAC,EAAE,IAAI,EAAE,CAAC;CAChB;AAED;;;GAGG;AACH,wBAAsB,YAAY,CAChC,IAAI,GAAE,mBAAwB,GAC7B,OAAO,CAAC;IAAE,MAAM,EAAE,aAAa,CAAC;IAAC,KAAK,EAAE,MAAM,EAAE,CAAA;CAAE,CAAC,CAuBrD"}
|