@wix/evalforge-evaluator 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,22 @@
+ /**
+  * API Client for fetching data from the eval server.
+  */
+ import type { EvalRun, EvalRunResult, SkillsGroup, TestScenario, Template, Skill, Agent } from '@wix/evalforge-types';
+ export interface ApiClient {
+   getEvalRun(projectId: string, id: string): Promise<EvalRun>;
+   getSkillsGroup(projectId: string, id: string): Promise<SkillsGroup>;
+   getScenario(projectId: string, id: string): Promise<TestScenario>;
+   getSkill(projectId: string, id: string): Promise<Skill>;
+   getAgent(projectId: string, id: string): Promise<Agent>;
+   getTemplate(projectId: string, id: string): Promise<Template>;
+   addResult(projectId: string, evalRunId: string, result: EvalRunResult): Promise<void>;
+   clearResults(projectId: string, evalRunId: string): Promise<void>;
+   updateEvalRun(projectId: string, evalRunId: string, update: Partial<EvalRun>): Promise<void>;
+ }
+ /**
+  * Create an API client for the eval server.
+  *
+  * @param serverUrl - Base URL of the server (e.g., "https://dev.wix.com/_api/evalforge-backend")
+  * @param apiPrefix - API path prefix (default: empty string, set to "/api/v1" if needed)
+  */
+ export declare function createApiClient(serverUrl: string, apiPrefix?: string): ApiClient;
@@ -0,0 +1,24 @@
+ /**
+  * Evaluator Configuration
+  *
+  * Reads configuration from environment variables set by the server
+  * when spawning the evaluator process.
+  */
+ export interface EvaluatorConfig {
+   /** Backend server URL for fetching data and reporting results */
+   serverUrl: string;
+   /** API path prefix (default: empty, set to "/api/v1" if needed for production) */
+   apiPrefix: string;
+   /** AI Gateway base URL for LLM calls */
+   aiGatewayUrl: string;
+   /** Custom headers for AI Gateway requests (parsed from "key:value" lines) */
+   aiGatewayHeaders: Record<string, string>;
+   /** Directory for storing evaluation working directories */
+   evaluationsDir?: string;
+ }
+ /**
+  * Load evaluator configuration from environment variables.
+  *
+  * @throws Error if required environment variables are missing
+  */
+ export declare function loadConfig(): EvaluatorConfig;
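
A minimal wiring sketch, assuming the relative module layout implied by these declarations: loadConfig() supplies the server URL and API prefix that createApiClient expects, and throws early if the spawning server did not set the required environment variables.

import { loadConfig } from './config.js';
import { createApiClient } from './api-client.js';

// Fail fast if required environment variables are missing.
const config = loadConfig();

// The same two config fields feed the API client factory.
const api = createApiClient(config.serverUrl, config.apiPrefix);
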
@@ -0,0 +1,29 @@
+ import type { EvalRun, TestScenario, Skill, Agent, Template } from '@wix/evalforge-types';
+ import type { ApiClient } from './api-client.js';
+ /**
+  * A scenario paired with its optional template.
+  */
+ export interface ScenarioItem {
+   scenario: TestScenario;
+   template?: Template;
+ }
+ /**
+  * Data shape returned by fetchEvaluationData for the run flow.
+  * Run-level code agent + skills; per-scenario template.
+  */
+ export interface EvaluationData {
+   evalRun: EvalRun;
+   codeAgent: Agent | null;
+   skills: Skill[];
+   scenarioItems: ScenarioItem[];
+ }
+ /**
+  * Fetch all data needed to run an evaluation.
+  *
+  * Uses evalRun.agentId and evalRun.skillsGroupId as run-level inputs.
+  * Loads skills from the skills group; loads code agent; loads scenarios and templates by scenario.templateId.
+  *
+  * @throws Error if eval run has no code agent when expected
+  * @throws Error if eval run has no skills group or skills group has no skills when expected
+  */
+ export declare function fetchEvaluationData(api: ApiClient, projectId: string, evalRunId: string): Promise<EvaluationData>;
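
The EvaluationData shape drives a run-level plus per-scenario split. A hedged sketch of consuming it (types as declared above, the summary logic is illustrative):

import type { ApiClient } from './api-client.js';
import { fetchEvaluationData } from './fetch-evaluation-data.js';

async function summarize(api: ApiClient, projectId: string, evalRunId: string) {
  const data = await fetchEvaluationData(api, projectId, evalRunId);
  // Run-level inputs resolved from evalRun.agentId and evalRun.skillsGroupId.
  const hasCodeAgent = data.codeAgent !== null;
  // Per-scenario inputs: each scenario is optionally paired with a pre-fetched template.
  const templatedScenarios = data.scenarioItems.filter(({ template }) => template !== undefined).length;
  return { hasCodeAgent, skillCount: data.skills.length, templatedScenarios };
}
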
@@ -0,0 +1,2 @@
+ #!/usr/bin/env node
+ export {};
@@ -0,0 +1,2 @@
+ #!/usr/bin/env node
+ export {};
@@ -0,0 +1,13 @@
+ import type { Agent, TestScenario } from '@wix/evalforge-types';
+ import type { EvaluatorConfig } from '../config.js';
+ import type { PartialEvalRunResult } from './types.js';
+ /**
+  * Execute a scenario against a standalone coding agent.
+  *
+  * @param config - Evaluator configuration
+  * @param scenario - The test scenario to run
+  * @param agent - The coding agent to test
+  * @param workDir - Optional working directory for the scenario
+  * @returns Partial result without assertion fields
+  */
+ export declare function callAgent(config: EvaluatorConfig, scenario: TestScenario, agent: Agent, workDir?: string): Promise<PartialEvalRunResult>;
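
A usage sketch under the assumption that the caller already has the config, scenario, and agent in hand; the './call-agent.js' file name is inferred from this declaration, not confirmed by the diff.

import type { Agent, TestScenario } from '@wix/evalforge-types';
import type { EvaluatorConfig } from '../config.js';
import { callAgent } from './call-agent.js';   // module name assumed

async function runAgentScenario(config: EvaluatorConfig, scenario: TestScenario, agent: Agent, workDir?: string) {
  // workDir is only set when a template was extracted for this scenario.
  const partial = await callAgent(config, scenario, agent, workDir);
  return partial;   // PartialEvalRunResult: assertion fields are not present yet
}
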
@@ -0,0 +1,15 @@
+ import type { Skill, Agent, TestScenario } from '@wix/evalforge-types';
+ import type { EvaluatorConfig } from '../config.js';
+ import type { PartialEvalRunResult } from './types.js';
+ /**
+  * Execute a scenario against a skill (optionally with a linked coding agent).
+  *
+  * @param config - Evaluator configuration
+  * @param evalRunId - The evaluation run ID (for live trace context)
+  * @param scenario - The test scenario to run
+  * @param skill - The skill to test
+  * @param agent - Optional linked coding agent to run the skill within
+  * @param workDir - Optional working directory for the scenario
+  * @returns Partial result without assertion fields
+  */
+ export declare function callSkill(config: EvaluatorConfig, evalRunId: string, scenario: TestScenario, skill: Skill, agent?: Agent, workDir?: string): Promise<PartialEvalRunResult>;
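
The skill variant mirrors callAgent but also threads the eval run ID (for live trace context) and an optional linked coding agent. A hedged sketch; the './call-skill.js' file name is assumed:

import type { Agent, Skill, TestScenario } from '@wix/evalforge-types';
import type { EvaluatorConfig } from '../config.js';
import { callSkill } from './call-skill.js';   // module name assumed

async function runSkillScenario(
  config: EvaluatorConfig,
  evalRunId: string,
  scenario: TestScenario,
  skill: Skill,
  codeAgent?: Agent,
  workDir?: string,
) {
  // With a linked code agent the skill runs inside that agent; otherwise it runs standalone.
  return callSkill(config, evalRunId, scenario, skill, codeAgent, workDir);
}
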
@@ -0,0 +1,14 @@
+ import type { Skill, TestScenario, LLMTrace } from '@wix/evalforge-types';
+ import type { ClaudeCodeExecutionOptions, ClaudeCodeExecutionResult } from './types.js';
+ /**
+  * Execute a skill using the Claude Agent SDK.
+  *
+  * @param skill - The skill to execute (SKILL.md content used as system prompt)
+  * @param scenario - The test scenario containing the trigger prompt
+  * @param options - Execution options (cwd, mcpServers, etc.)
+  * @returns Execution result with output, usage, and LLM trace
+  */
+ export declare function executeWithClaudeCode(skill: Skill, scenario: TestScenario, options: ClaudeCodeExecutionOptions): Promise<{
+   result: ClaudeCodeExecutionResult;
+   llmTrace: LLMTrace;
+ }>;
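
A hedged call sketch; the working directory, system prompt variable, and model id are placeholders, not values confirmed by this package.

import type { Skill, TestScenario } from '@wix/evalforge-types';
import { executeWithClaudeCode } from './execute.js';

async function runWithClaude(skill: Skill, scenario: TestScenario, workDir: string, skillMarkdown: string) {
  const { result, llmTrace } = await executeWithClaudeCode(skill, scenario, {
    cwd: workDir,
    systemPrompt: skillMarkdown,   // SKILL.md content, per the JSDoc above
    model: 'claude-sonnet-4-5',    // placeholder model id
    maxTurns: 25,
  });
  console.log(result.outputText, result.usage.totalTokens, result.durationMs);
  return llmTrace;
}
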
@@ -0,0 +1,8 @@
+ /**
+  * Claude Agent SDK integration module.
+  *
+  * This module provides functionality to execute skills using the
+  * @anthropic-ai/claude-agent-sdk with full LLM trace support.
+  */
+ export { executeWithClaudeCode } from './execute.js';
+ export type { ClaudeCodeExecutionOptions, ClaudeCodeExecutionResult } from './types.js';
@@ -0,0 +1,68 @@
+ /**
+  * MCP server configuration compatible with Claude Agent SDK.
+  */
+ export interface McpServerConfig {
+   type?: 'stdio';
+   command: string;
+   args?: string[];
+   env?: Record<string, string>;
+ }
+ /**
+  * Trace context for live streaming of agent execution.
+  */
+ export interface TraceContext {
+   /** The evaluation run ID */
+   evalRunId: string;
+   /** The scenario ID being executed */
+   scenarioId: string;
+   /** The scenario name for display */
+   scenarioName: string;
+   /** The target ID (skill, agent, etc.) */
+   targetId: string;
+   /** The target name for display */
+   targetName: string;
+ }
+ /**
+  * Options for Claude Agent SDK execution.
+  */
+ export interface ClaudeCodeExecutionOptions {
+   /** Working directory for the execution */
+   cwd: string;
+   /** System prompt to append (typically the SKILL.md content) */
+   systemPrompt?: string;
+   /** MCP server configurations */
+   mcpServers?: Record<string, McpServerConfig>;
+   /** Model to use for generation */
+   model?: string;
+   /** Temperature for generation (0–1, from agent modelConfig) */
+   temperature?: number;
+   /** Max output tokens (from agent modelConfig) */
+   maxTokens?: number;
+   /** Maximum number of agent turns */
+   maxTurns?: number;
+   /** Maximum thinking tokens */
+   maxThinkingTokens?: number;
+   /** AI Gateway base URL for Anthropic API */
+   aiGatewayUrl?: string;
+   /** AI Gateway custom headers as key-value pairs */
+   aiGatewayHeaders?: Record<string, string>;
+   /** Trace context for live streaming (optional) */
+   traceContext?: TraceContext;
+ }
+ /**
+  * Result from Claude Code execution.
+  */
+ export interface ClaudeCodeExecutionResult {
+   /** Final output text */
+   outputText: string;
+   /** Total duration in milliseconds */
+   durationMs: number;
+   /** Token usage statistics */
+   usage: {
+     inputTokens: number;
+     outputTokens: number;
+     totalTokens: number;
+   };
+   /** Cost in USD (if available) */
+   costUsd?: number;
+ }
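
Putting the option types together: a sketch of a fully populated ClaudeCodeExecutionOptions value. The MCP server command, model id, and trace identifiers are illustrative placeholders.

import type { ClaudeCodeExecutionOptions } from './types.js';

function buildOptions(workDir: string, aiGatewayUrl: string, aiGatewayHeaders: Record<string, string>): ClaudeCodeExecutionOptions {
  return {
    cwd: workDir,
    systemPrompt: '# SKILL.md contents go here',
    mcpServers: {
      // Hypothetical stdio MCP server; any command/args pair satisfies McpServerConfig.
      tools: { type: 'stdio', command: 'node', args: ['my-mcp-server.js'], env: { LOG_LEVEL: 'info' } },
    },
    model: 'claude-sonnet-4-5',   // placeholder
    temperature: 0.2,
    maxTokens: 8192,
    maxTurns: 25,
    aiGatewayUrl,
    aiGatewayHeaders,
    traceContext: {
      evalRunId: 'eval-run-id',   // placeholder trace identifiers
      scenarioId: 'scenario-id',
      scenarioName: 'Example scenario',
      targetId: 'skill-id',
      targetName: 'Example skill',
    },
  };
}
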
@@ -0,0 +1,13 @@
+ import type { Template } from '@wix/evalforge-types';
+ import type { EvaluatorConfig } from '../config.js';
+ /**
+  * Prepare the working directory for a scenario.
+  * If a template is provided, download and extract it.
+  *
+  * @param config - Evaluator configuration
+  * @param evalRunId - The evaluation run ID
+  * @param targetId - The target ID (prompt agent, skill, or agent)
+  * @param template - Optional pre-fetched template entity
+  * @returns The working directory path, or undefined if no template
+  */
+ export declare function prepareWorkingDirectory(config: EvaluatorConfig, evalRunId: string, targetId: string, template?: Template): Promise<string | undefined>;
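
A short sketch of the template flow implied above; the module file name is assumed:

import type { Template } from '@wix/evalforge-types';
import type { EvaluatorConfig } from '../config.js';
import { prepareWorkingDirectory } from './prepare-working-directory.js';   // module name assumed

async function resolveWorkDir(config: EvaluatorConfig, evalRunId: string, targetId: string, template?: Template) {
  // Resolves to undefined when there is no template to download and extract.
  return prepareWorkingDirectory(config, evalRunId, targetId, template);
}
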
@@ -0,0 +1,16 @@
+ import { type EvalRunResult, type TestScenario, type Template } from '@wix/evalforge-types';
+ import type { EvaluatorConfig } from '../config.js';
+ import type { TargetInput } from './types.js';
+ export type { TargetInput, PartialEvalRunResult } from './types.js';
+ export type { ScenarioItem, EvaluationData } from '../fetch-evaluation-data.js';
+ /**
+  * Run a single scenario against a target.
+  *
+  * @param config - Evaluator configuration
+  * @param evalRunId - The evaluation run ID
+  * @param scenario - The test scenario to run
+  * @param target - The target to test (skill or agent)
+  * @param template - Optional pre-fetched template entity
+  * @returns Complete evaluation result
+  */
+ export declare function runScenario(config: EvaluatorConfig, evalRunId: string, scenario: TestScenario, target: TargetInput, template?: Template): Promise<EvalRunResult>;
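
Composing the pieces above gives an obvious orchestration loop. This is a sketch, not the package's actual entry-point logic: import paths are assumed from the relative layout of these declarations, and the skill/agent pairing may differ.

import { runScenario, type TargetInput } from './index.js';
import { fetchEvaluationData } from '../fetch-evaluation-data.js';
import type { ApiClient } from '../api-client.js';
import type { EvaluatorConfig } from '../config.js';

async function runAll(api: ApiClient, config: EvaluatorConfig, projectId: string, evalRunId: string) {
  const data = await fetchEvaluationData(api, projectId, evalRunId);
  await api.clearResults(projectId, evalRunId);
  for (const { scenario, template } of data.scenarioItems) {
    for (const skill of data.skills) {
      // Run-level code agent (if any) is attached to every skill target.
      const target: TargetInput = { type: 'skill', skill, agent: data.codeAgent ?? undefined };
      const result = await runScenario(config, evalRunId, scenario, target, template);
      await api.addResult(projectId, evalRunId, result);
    }
  }
}
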
@@ -0,0 +1,6 @@
+ import { type ModelIds, type LLMTrace } from '@wix/evalforge-types';
+ import type { CapturedStep } from './types.js';
+ /**
+  * Build LLMTrace from captured steps.
+  */
+ export declare function buildLLMTrace(capturedSteps: CapturedStep[], modelId: ModelIds): LLMTrace;
@@ -0,0 +1,37 @@
+ import type { EvalRunResult, Skill, Agent, LLMStepType } from '@wix/evalforge-types';
+ /**
+  * Partial result returned by call functions (without assertion fields).
+  */
+ export type PartialEvalRunResult = Omit<EvalRunResult, 'assertionResults' | 'passed' | 'failed' | 'passRate'>;
+ /**
+  * Target types for dispatching scenario execution.
+  * PR4: only skill (with optional code agent) and standalone agent are used.
+  */
+ export type TargetInput = {
+   type: 'skill';
+   skill: Skill;
+   agent?: Agent;
+ } | {
+   type: 'agent';
+   agent: Agent;
+ };
+ /**
+  * Step data captured during streaming with timing info.
+  */
+ export interface CapturedStep {
+   text: string;
+   usage: {
+     inputTokens: number;
+     outputTokens: number;
+     totalTokens: number;
+   };
+   finishReason: string;
+   toolCalls?: Array<{
+     toolName: string;
+     args: unknown;
+   }>;
+   toolResults?: unknown[];
+   startedAt: Date;
+   durationMs: number;
+ }
+ export { LLMStepType };
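
The TargetInput union narrows cleanly on its type discriminant; a small example grounded in the declaration above:

import type { TargetInput } from './types.js';

// Exhaustive dispatch over the discriminated union; TypeScript narrows on `type`.
function describeTarget(target: TargetInput): string {
  switch (target.type) {
    case 'skill':
      return target.agent ? 'skill running inside its linked code agent' : 'standalone skill';
    case 'agent':
      return 'standalone coding agent';
  }
}
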
package/package.json ADDED
@@ -0,0 +1,66 @@
+ {
+   "name": "@wix/evalforge-evaluator",
+   "version": "0.6.0",
+   "description": "EvalForge Evaluator",
+   "bin": "./build/index.js",
+   "files": [
+     "build"
+   ],
+   "scripts": {
+     "clean": "rm -rf build",
+     "build:cjs": "esbuild src/index.ts --bundle --platform=node --outfile=build/index.js --format=cjs --sourcemap --external:@wix/evalforge-types --external:@anthropic-ai/claude-agent-sdk",
+     "build:esm": "esbuild src/index.ts --bundle --platform=node --outfile=build/index.mjs --format=esm --sourcemap --external:@wix/evalforge-types --external:@anthropic-ai/claude-agent-sdk",
+     "build:types": "tsc --emitDeclarationOnly --outDir ./build/types",
+     "build": "yarn run clean && yarn run build:cjs && yarn run build:esm && yarn run build:types",
+     "lint": "eslint .",
+     "test": "echo 'No tests specified' && exit 0"
+   },
+   "dependencies": {
+     "@ai-sdk/anthropic": "^3.0.2",
+     "@anthropic-ai/claude-code": "^2.0.76",
+     "@wix/evalforge-types": "0.3.0",
+     "ai": "^6.0.6",
+     "tar": "^7.5.3"
+   },
+   "peerDependencies": {
+     "@anthropic-ai/claude-agent-sdk": "^0.2.12"
+   },
+   "peerDependenciesMeta": {
+     "@anthropic-ai/claude-agent-sdk": {
+       "optional": false
+     }
+   },
+   "devDependencies": {
+     "@eslint/js": "^9.39.2",
+     "@types/node": "^22.19.3",
+     "@types/tar": "^6.1.13",
+     "dotenv": "^17.2.3",
+     "esbuild": "^0.27.2",
+     "eslint": "^9.39.2",
+     "eslint-config-prettier": "^10.1.8",
+     "eslint-plugin-prettier": "^5.5.4",
+     "prettier": "^3.7.4",
+     "tsx": "^4.21.0",
+     "typescript": "^5.9.3",
+     "typescript-eslint": "^8.51.0"
+   },
+   "exports": {
+     ".": {
+       "types": "./build/types/index.d.ts",
+       "import": "./build/index.mjs",
+       "require": "./build/index.js"
+     },
+     "./package.json": "./package.json"
+   },
+   "publishConfig": {
+     "registry": "https://registry.npmjs.org/",
+     "access": "public"
+   },
+   "wix": {
+     "artifact": {
+       "groupId": "com.wixpress",
+       "artifactId": "evalforge-evaluator"
+     }
+   },
+   "falconPackageHash": "948c3ac942079bd1a8592d8449148cee8a5198ae1654ff42e6d30952"
+ }