@wix/evalforge-evaluator 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/index.js +6689 -0
- package/build/index.js.map +7 -0
- package/build/index.mjs +6672 -0
- package/build/index.mjs.map +7 -0
- package/build/types/api-client.d.ts +22 -0
- package/build/types/config.d.ts +24 -0
- package/build/types/fetch-evaluation-data.d.ts +29 -0
- package/build/types/index.d.ts +2 -0
- package/build/types/local.d.ts +2 -0
- package/build/types/run-scenario/callAgent.d.ts +13 -0
- package/build/types/run-scenario/callSkill.d.ts +15 -0
- package/build/types/run-scenario/claude-code/execute.d.ts +14 -0
- package/build/types/run-scenario/claude-code/index.d.ts +8 -0
- package/build/types/run-scenario/claude-code/types.d.ts +68 -0
- package/build/types/run-scenario/environment.d.ts +13 -0
- package/build/types/run-scenario/index.d.ts +16 -0
- package/build/types/run-scenario/llm-trace.d.ts +6 -0
- package/build/types/run-scenario/types.d.ts +37 -0
- package/package.json +66 -0
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* API Client for fetching data from the eval server.
|
|
3
|
+
*/
|
|
4
|
+
import type { EvalRun, EvalRunResult, SkillsGroup, TestScenario, Template, Skill, Agent } from '@wix/evalforge-types';
|
|
5
|
+
/**
 * Client contract for talking to the eval server.
 *
 * Every method takes the owning `projectId` first and returns a promise.
 * NOTE(review): only signatures are visible in this declaration file; the
 * per-method summaries below are inferred from names and types — confirm
 * against the implementation.
 */
export interface ApiClient {
  /** Resolve an eval run by its ID. */
  getEvalRun(projectId: string, id: string): Promise<EvalRun>;
  /** Resolve a skills group by its ID. */
  getSkillsGroup(projectId: string, id: string): Promise<SkillsGroup>;
  /** Resolve a test scenario by its ID. */
  getScenario(projectId: string, id: string): Promise<TestScenario>;
  /** Resolve a skill by its ID. */
  getSkill(projectId: string, id: string): Promise<Skill>;
  /** Resolve an agent by its ID. */
  getAgent(projectId: string, id: string): Promise<Agent>;
  /** Resolve a template by its ID. */
  getTemplate(projectId: string, id: string): Promise<Template>;
  /** Report one result for the given eval run. */
  addResult(projectId: string, evalRunId: string, result: EvalRunResult): Promise<void>;
  /** Clear the results recorded for the given eval run. */
  clearResults(projectId: string, evalRunId: string): Promise<void>;
  /** Send a partial update for the given eval run. */
  updateEvalRun(projectId: string, evalRunId: string, update: Partial<EvalRun>): Promise<void>;
}
|
|
16
|
+
/**
 * Create an API client for the eval server.
 *
 * @param serverUrl - Base URL of the server (e.g., "https://dev.wix.com/_api/evalforge-backend")
 * @param apiPrefix - API path prefix (default: empty string, set to "/api/v1" if needed)
 * @returns The created `ApiClient`
 */
export declare function createApiClient(serverUrl: string, apiPrefix?: string): ApiClient;
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
/**
 * Evaluator Configuration
 *
 * Reads configuration from environment variables set by the server
 * when spawning the evaluator process.
 */
export interface EvaluatorConfig {
  /** Backend server URL for fetching data and reporting results */
  serverUrl: string;
  /** API path prefix (default: empty, set to "/api/v1" if needed for production) */
  apiPrefix: string;
  /** AI Gateway base URL for LLM calls */
  aiGatewayUrl: string;
  /** Custom headers for AI Gateway requests (parsed from "key:value" lines) */
  aiGatewayHeaders: Record<string, string>;
  /** Directory for storing evaluation working directories */
  evaluationsDir?: string;
}
|
|
19
|
+
/**
 * Load evaluator configuration from environment variables.
 *
 * @returns The evaluator configuration
 * @throws Error if required environment variables are missing
 */
export declare function loadConfig(): EvaluatorConfig;
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
import type { EvalRun, TestScenario, Skill, Agent, Template } from '@wix/evalforge-types';
|
|
2
|
+
import type { ApiClient } from './api-client.js';
|
|
3
|
+
/**
 * A scenario paired with its optional template.
 */
export interface ScenarioItem {
  /** The test scenario. */
  scenario: TestScenario;
  /** The template associated with the scenario, when there is one. */
  template?: Template;
}
|
|
10
|
+
/**
 * Data shape returned by fetchEvaluationData for the run flow.
 * Run-level code agent + skills; per-scenario template.
 */
export interface EvaluationData {
  /** The eval run being executed. */
  evalRun: EvalRun;
  /** Run-level code agent, or `null` when there is none. */
  codeAgent: Agent | null;
  /** Run-level skills. */
  skills: Skill[];
  /** Scenarios, each paired with its optional template. */
  scenarioItems: ScenarioItem[];
}
|
|
20
|
+
/**
 * Fetch all data needed to run an evaluation.
 *
 * Uses evalRun.agentId and evalRun.skillsGroupId as run-level inputs.
 * Loads skills from the skills group; loads code agent; loads scenarios and templates by scenario.templateId.
 *
 * @param api - API client used to load the data
 * @param projectId - ID of the project that owns the eval run
 * @param evalRunId - ID of the eval run to load
 * @returns The eval run together with its code agent, skills and scenario items
 * @throws Error if eval run has no code agent when expected
 * @throws Error if eval run has no skills group or skills group has no skills when expected
 */
export declare function fetchEvaluationData(
  api: ApiClient,
  projectId: string,
  evalRunId: string,
): Promise<EvaluationData>;
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
import type { Agent, TestScenario } from '@wix/evalforge-types';
|
|
2
|
+
import type { EvaluatorConfig } from '../config.js';
|
|
3
|
+
import type { PartialEvalRunResult } from './types.js';
|
|
4
|
+
/**
 * Execute a scenario against a standalone coding agent.
 *
 * @param config - Evaluator configuration
 * @param scenario - The test scenario to run
 * @param agent - The coding agent to test
 * @param workDir - Optional working directory for the scenario
 * @returns Partial result without assertion fields
 */
export declare function callAgent(
  config: EvaluatorConfig,
  scenario: TestScenario,
  agent: Agent,
  workDir?: string,
): Promise<PartialEvalRunResult>;
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
import type { Skill, Agent, TestScenario } from '@wix/evalforge-types';
|
|
2
|
+
import type { EvaluatorConfig } from '../config.js';
|
|
3
|
+
import type { PartialEvalRunResult } from './types.js';
|
|
4
|
+
/**
 * Execute a scenario against a skill (optionally with a linked coding agent).
 *
 * @param config - Evaluator configuration
 * @param evalRunId - The evaluation run ID (for live trace context)
 * @param scenario - The test scenario to run
 * @param skill - The skill to test
 * @param agent - Optional linked coding agent to run the skill within
 * @param workDir - Optional working directory for the scenario
 * @returns Partial result without assertion fields
 */
export declare function callSkill(
  config: EvaluatorConfig,
  evalRunId: string,
  scenario: TestScenario,
  skill: Skill,
  agent?: Agent,
  workDir?: string,
): Promise<PartialEvalRunResult>;
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
import type { Skill, TestScenario, LLMTrace } from '@wix/evalforge-types';
|
|
2
|
+
import type { ClaudeCodeExecutionOptions, ClaudeCodeExecutionResult } from './types.js';
|
|
3
|
+
/**
 * Execute a skill using the Claude Agent SDK.
 *
 * @param skill - The skill to execute (SKILL.md content used as system prompt)
 * @param scenario - The test scenario containing the trigger prompt
 * @param options - Execution options (cwd, mcpServers, etc.)
 * @returns Execution result with output, usage, and LLM trace
 */
export declare function executeWithClaudeCode(
  skill: Skill,
  scenario: TestScenario,
  options: ClaudeCodeExecutionOptions,
): Promise<{
  result: ClaudeCodeExecutionResult;
  llmTrace: LLMTrace;
}>;
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Claude Agent SDK integration module.
|
|
3
|
+
*
|
|
4
|
+
* This module provides functionality to execute skills using the
|
|
5
|
+
* @anthropic-ai/claude-agent-sdk with full LLM trace support.
|
|
6
|
+
*/
|
|
7
|
+
export { executeWithClaudeCode } from './execute.js';
|
|
8
|
+
export type { ClaudeCodeExecutionOptions, ClaudeCodeExecutionResult } from './types.js';
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
/**
 * MCP server configuration compatible with Claude Agent SDK.
 */
export interface McpServerConfig {
  /** Transport kind; `'stdio'` is the only value this type allows. */
  type?: 'stdio';
  /** Command for the server (presumably the executable to launch — confirm against usage). */
  command: string;
  /** Optional arguments accompanying `command`. */
  args?: string[];
  /** Optional environment variables, as name/value pairs. */
  env?: Record<string, string>;
}
|
|
10
|
+
/**
 * Trace context for live streaming of agent execution.
 */
export interface TraceContext {
  /** The evaluation run ID */
  evalRunId: string;
  /** The scenario ID being executed */
  scenarioId: string;
  /** The scenario name for display */
  scenarioName: string;
  /** The target ID (skill, agent, etc.) */
  targetId: string;
  /** The target name for display */
  targetName: string;
}
|
|
25
|
+
/**
 * Options for Claude Agent SDK execution.
 *
 * Only `cwd` is required; every other option may be omitted.
 */
export interface ClaudeCodeExecutionOptions {
  /** Working directory for the execution */
  cwd: string;
  /** System prompt to append (typically the SKILL.md content) */
  systemPrompt?: string;
  /** MCP server configurations */
  mcpServers?: Record<string, McpServerConfig>;
  /** Model to use for generation */
  model?: string;
  /** Temperature for generation (0–1, from agent modelConfig) */
  temperature?: number;
  /** Max output tokens (from agent modelConfig) */
  maxTokens?: number;
  /** Maximum number of agent turns */
  maxTurns?: number;
  /** Maximum thinking tokens */
  maxThinkingTokens?: number;
  /** AI Gateway base URL for Anthropic API */
  aiGatewayUrl?: string;
  /** AI Gateway custom headers as key-value pairs */
  aiGatewayHeaders?: Record<string, string>;
  /** Trace context for live streaming (optional) */
  traceContext?: TraceContext;
}
|
|
52
|
+
/**
 * Result from Claude Code execution.
 */
export interface ClaudeCodeExecutionResult {
  /** Final output text */
  outputText: string;
  /** Total duration in milliseconds */
  durationMs: number;
  /** Token usage statistics */
  usage: {
    inputTokens: number;
    outputTokens: number;
    totalTokens: number;
  };
  /** Cost in USD (if available) */
  costUsd?: number;
}
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
import type { Template } from '@wix/evalforge-types';
|
|
2
|
+
import type { EvaluatorConfig } from '../config.js';
|
|
3
|
+
/**
 * Prepare the working directory for a scenario.
 * If a template is provided, download and extract it.
 *
 * @param config - Evaluator configuration
 * @param evalRunId - The evaluation run ID
 * @param targetId - The target ID (prompt agent, skill, or agent)
 * @param template - Optional pre-fetched template entity
 * @returns The working directory path, or undefined if no template
 */
export declare function prepareWorkingDirectory(
  config: EvaluatorConfig,
  evalRunId: string,
  targetId: string,
  template?: Template,
): Promise<string | undefined>;
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
import { type EvalRunResult, type TestScenario, type Template } from '@wix/evalforge-types';
|
|
2
|
+
import type { EvaluatorConfig } from '../config.js';
|
|
3
|
+
import type { TargetInput } from './types.js';
|
|
4
|
+
export type { TargetInput, PartialEvalRunResult } from './types.js';
|
|
5
|
+
export type { ScenarioItem, EvaluationData } from '../fetch-evaluation-data.js';
|
|
6
|
+
/**
 * Run a single scenario against a target.
 *
 * @param config - Evaluator configuration
 * @param evalRunId - The evaluation run ID
 * @param scenario - The test scenario to run
 * @param target - The target to test (skill or agent)
 * @param template - Optional pre-fetched template entity
 * @returns Complete evaluation result
 */
export declare function runScenario(
  config: EvaluatorConfig,
  evalRunId: string,
  scenario: TestScenario,
  target: TargetInput,
  template?: Template,
): Promise<EvalRunResult>;
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
import { type ModelIds, type LLMTrace } from '@wix/evalforge-types';
|
|
2
|
+
import type { CapturedStep } from './types.js';
|
|
3
|
+
/**
 * Build LLMTrace from captured steps.
 *
 * @param capturedSteps - Steps captured during execution
 * @param modelId - Identifier of the model the trace is built for
 * @returns The LLM trace built from the captured steps
 */
export declare function buildLLMTrace(capturedSteps: CapturedStep[], modelId: ModelIds): LLMTrace;
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
import type { EvalRunResult, Skill, Agent, LLMStepType } from '@wix/evalforge-types';
|
|
2
|
+
/**
 * Partial result returned by call functions (without assertion fields).
 *
 * This is `EvalRunResult` minus `assertionResults`, `passed`, `failed` and `passRate`.
 */
export type PartialEvalRunResult = Omit<
  EvalRunResult,
  'assertionResults' | 'passed' | 'failed' | 'passRate'
>;
|
|
6
|
+
/**
 * Target types for dispatching scenario execution.
 * PR4: only skill (with optional code agent) and standalone agent are used.
 *
 * Discriminated on `type`:
 * - `'skill'` carries the skill and an optional agent.
 * - `'agent'` carries the agent alone.
 */
export type TargetInput =
  | {
      type: 'skill';
      skill: Skill;
      agent?: Agent;
    }
  | {
      type: 'agent';
      agent: Agent;
    };
|
|
18
|
+
/**
 * Step data captured during streaming with timing info.
 */
export interface CapturedStep {
  /** Text of the step. */
  text: string;
  /** Token counts for the step. */
  usage: {
    inputTokens: number;
    outputTokens: number;
    totalTokens: number;
  };
  /** Finish reason of the step, as a free-form string. */
  finishReason: string;
  /** Tool calls of the step, if any. */
  toolCalls?: Array<{
    toolName: string;
    args: unknown;
  }>;
  /** Tool results of the step, if any (shape not constrained by this type). */
  toolResults?: unknown[];
  /** When the step started. */
  startedAt: Date;
  /** Step duration (milliseconds, per the field name). */
  durationMs: number;
}
|
|
37
|
+
export { LLMStepType };
|
package/package.json
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@wix/evalforge-evaluator",
|
|
3
|
+
"version": "0.6.0",
|
|
4
|
+
"description": "EvalForge Evaluator",
|
|
5
|
+
"bin": "./build/index.js",
|
|
6
|
+
"files": [
|
|
7
|
+
"build"
|
|
8
|
+
],
|
|
9
|
+
"scripts": {
|
|
10
|
+
"clean": "rm -rf build",
|
|
11
|
+
"build:cjs": "esbuild src/index.ts --bundle --platform=node --outfile=build/index.js --format=cjs --sourcemap --external:@wix/evalforge-types --external:@anthropic-ai/claude-agent-sdk",
|
|
12
|
+
"build:esm": "esbuild src/index.ts --bundle --platform=node --outfile=build/index.mjs --format=esm --sourcemap --external:@wix/evalforge-types --external:@anthropic-ai/claude-agent-sdk",
|
|
13
|
+
"build:types": "tsc --emitDeclarationOnly --outDir ./build/types",
|
|
14
|
+
"build": "yarn run clean && yarn run build:cjs && yarn run build:esm && yarn run build:types",
|
|
15
|
+
"lint": "eslint .",
|
|
16
|
+
"test": "echo 'No tests specified' && exit 0"
|
|
17
|
+
},
|
|
18
|
+
"dependencies": {
|
|
19
|
+
"@ai-sdk/anthropic": "^3.0.2",
|
|
20
|
+
"@anthropic-ai/claude-code": "^2.0.76",
|
|
21
|
+
"@wix/evalforge-types": "0.3.0",
|
|
22
|
+
"ai": "^6.0.6",
|
|
23
|
+
"tar": "^7.5.3"
|
|
24
|
+
},
|
|
25
|
+
"peerDependencies": {
|
|
26
|
+
"@anthropic-ai/claude-agent-sdk": "^0.2.12"
|
|
27
|
+
},
|
|
28
|
+
"peerDependenciesMeta": {
|
|
29
|
+
"@anthropic-ai/claude-agent-sdk": {
|
|
30
|
+
"optional": false
|
|
31
|
+
}
|
|
32
|
+
},
|
|
33
|
+
"devDependencies": {
|
|
34
|
+
"@eslint/js": "^9.39.2",
|
|
35
|
+
"@types/node": "^22.19.3",
|
|
36
|
+
"@types/tar": "^6.1.13",
|
|
37
|
+
"dotenv": "^17.2.3",
|
|
38
|
+
"esbuild": "^0.27.2",
|
|
39
|
+
"eslint": "^9.39.2",
|
|
40
|
+
"eslint-config-prettier": "^10.1.8",
|
|
41
|
+
"eslint-plugin-prettier": "^5.5.4",
|
|
42
|
+
"prettier": "^3.7.4",
|
|
43
|
+
"tsx": "^4.21.0",
|
|
44
|
+
"typescript": "^5.9.3",
|
|
45
|
+
"typescript-eslint": "^8.51.0"
|
|
46
|
+
},
|
|
47
|
+
"exports": {
|
|
48
|
+
".": {
|
|
49
|
+
"types": "./build/types/index.d.ts",
|
|
50
|
+
"import": "./build/index.mjs",
|
|
51
|
+
"require": "./build/index.js"
|
|
52
|
+
},
|
|
53
|
+
"./package.json": "./package.json"
|
|
54
|
+
},
|
|
55
|
+
"publishConfig": {
|
|
56
|
+
"registry": "https://registry.npmjs.org/",
|
|
57
|
+
"access": "public"
|
|
58
|
+
},
|
|
59
|
+
"wix": {
|
|
60
|
+
"artifact": {
|
|
61
|
+
"groupId": "com.wixpress",
|
|
62
|
+
"artifactId": "evalforge-evaluator"
|
|
63
|
+
}
|
|
64
|
+
},
|
|
65
|
+
"falconPackageHash": "948c3ac942079bd1a8592d8449148cee8a5198ae1654ff42e6d30952"
|
|
66
|
+
}
|