@wix/evalforge-evaluator 0.54.0 → 0.56.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/index.js +106 -118
- package/build/index.js.map +3 -3
- package/build/index.mjs +106 -118
- package/build/index.mjs.map +3 -3
- package/build/types/fetch-evaluation-data.d.ts +5 -2
- package/build/types/run-scenario/agents/claude-code/execute.d.ts +7 -3
- package/build/types/run-scenario/agents/claude-code/types.d.ts +0 -2
- package/build/types/run-scenario/index.d.ts +5 -5
- package/build/types/run-scenario/run-agent-with-context.d.ts +21 -0
- package/build/types/run-scenario/types.d.ts +1 -13
- package/package.json +3 -3
- package/build/types/run-scenario/callAgent.d.ts +0 -13
- package/build/types/run-scenario/callSkill.d.ts +0 -18
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import type { EvalRun, TestScenario, Skill, Agent, Template, Assertion } from '@wix/evalforge-types';
|
|
1
|
+
import type { EvalRun, TestScenario, Skill, SkillsGroup, Agent, Template, Assertion } from '@wix/evalforge-types';
|
|
2
2
|
import type { ApiClient } from './api-client.js';
|
|
3
3
|
/**
|
|
4
4
|
* A scenario paired with its optional template and resolved assertions.
|
|
@@ -11,12 +11,15 @@ export interface ScenarioItem {
|
|
|
11
11
|
}
|
|
12
12
|
/**
|
|
13
13
|
* Data shape returned by fetchEvaluationData for the run flow.
|
|
14
|
-
* Run-level code agent + skills; per-scenario template.
|
|
14
|
+
* Run-level code agent + skills + skills group; per-scenario template.
|
|
15
15
|
*/
|
|
16
16
|
export interface EvaluationData {
|
|
17
17
|
evalRun: EvalRun;
|
|
18
18
|
codeAgent: Agent | null;
|
|
19
19
|
skills: Skill[];
|
|
20
|
+
skillsGroup: SkillsGroup | null;
|
|
21
|
+
/** Display name for the skills group (from skillsGroup.name when present) */
|
|
22
|
+
skillsGroupName: string;
|
|
20
23
|
scenarioItems: ScenarioItem[];
|
|
21
24
|
}
|
|
22
25
|
/**
|
|
@@ -1,14 +1,18 @@
|
|
|
1
1
|
import type { Skill, TestScenario, LLMTrace } from '@wix/evalforge-types';
|
|
2
2
|
import type { ClaudeCodeExecutionOptions, ClaudeCodeExecutionResult } from './types.js';
|
|
3
3
|
/**
|
|
4
|
-
* Execute
|
|
4
|
+
* Execute skills using the Claude Agent SDK.
|
|
5
5
|
*
|
|
6
|
-
*
|
|
6
|
+
* All skills are written to .claude/skills/<name>/SKILL.md. The SDK discovers
|
|
7
|
+
* them from the filesystem; no system prompt is passed. The agent has access
|
|
8
|
+
* to all skills as a group.
|
|
9
|
+
*
|
|
10
|
+
* @param skills - All skills to execute (each written to its own directory)
|
|
7
11
|
* @param scenario - The test scenario containing the trigger prompt
|
|
8
12
|
* @param options - Execution options (cwd, mcpServers, etc.)
|
|
9
13
|
* @returns Execution result with output, usage, and LLM trace
|
|
10
14
|
*/
|
|
11
|
-
export declare function executeWithClaudeCode(
|
|
15
|
+
export declare function executeWithClaudeCode(skills: Skill[], scenario: TestScenario, options: ClaudeCodeExecutionOptions): Promise<{
|
|
12
16
|
result: ClaudeCodeExecutionResult;
|
|
13
17
|
llmTrace: LLMTrace;
|
|
14
18
|
}>;
|
|
@@ -23,8 +23,6 @@ export interface McpServerConfig {
|
|
|
23
23
|
export interface ClaudeCodeExecutionOptions {
|
|
24
24
|
/** Working directory for the execution */
|
|
25
25
|
cwd: string;
|
|
26
|
-
/** System prompt to append (typically the SKILL.md content) */
|
|
27
|
-
systemPrompt?: string;
|
|
28
26
|
/** MCP server configurations */
|
|
29
27
|
mcpServers?: Record<string, McpServerConfig>;
|
|
30
28
|
/** Model to use for generation */
|
|
@@ -1,18 +1,18 @@
|
|
|
1
1
|
import type { EvalRunResult, TestScenario, Template } from '@wix/evalforge-types';
|
|
2
2
|
import type { EvaluatorConfig } from '../config.js';
|
|
3
|
-
import type { TargetInput } from './types.js';
|
|
4
3
|
import type { Assertion } from '@wix/evalforge-types';
|
|
5
|
-
|
|
4
|
+
import type { EvaluationData } from '../fetch-evaluation-data.js';
|
|
5
|
+
export type { PartialEvalRunResult } from './types.js';
|
|
6
6
|
export type { ScenarioItem, EvaluationData } from '../fetch-evaluation-data.js';
|
|
7
7
|
/**
|
|
8
|
-
* Run a single scenario
|
|
8
|
+
* Run a single scenario using evaluation data.
|
|
9
9
|
*
|
|
10
10
|
* @param config - Evaluator configuration
|
|
11
11
|
* @param evalRunId - The evaluation run ID
|
|
12
12
|
* @param scenario - The test scenario to run
|
|
13
|
-
* @param
|
|
13
|
+
* @param evalData - Fetched evaluation data (skills, skillsGroup, codeAgent)
|
|
14
14
|
* @param template - Optional pre-fetched template entity
|
|
15
15
|
* @param resolvedAssertions - Optional assertions resolved from assertionIds
|
|
16
16
|
* @returns Complete evaluation result
|
|
17
17
|
*/
|
|
18
|
-
export declare function runScenario(config: EvaluatorConfig, evalRunId: string, scenario: TestScenario,
|
|
18
|
+
export declare function runScenario(config: EvaluatorConfig, evalRunId: string, scenario: TestScenario, evalData: EvaluationData, template?: Template, resolvedAssertions?: Assertion[]): Promise<EvalRunResult>;
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
import type { Skill, Agent, TestScenario } from '@wix/evalforge-types';
|
|
2
|
+
import type { EvaluatorConfig } from '../config.js';
|
|
3
|
+
import type { PartialEvalRunResult } from './types.js';
|
|
4
|
+
/**
|
|
5
|
+
* Run the agent with the full execution context (skills, MCPs, sub-agents, etc.).
|
|
6
|
+
*
|
|
7
|
+
* Uses the agent adapter registry to select the appropriate adapter based on
|
|
8
|
+
* the agent's runCommand. If no agent is specified, defaults to 'claude'.
|
|
9
|
+
* The context includes skills group, optional MCPs, optional sub-agents.
|
|
10
|
+
*
|
|
11
|
+
* @param config - Evaluator configuration
|
|
12
|
+
* @param evalRunId - The evaluation run ID (for live trace context)
|
|
13
|
+
* @param scenario - The test scenario to run
|
|
14
|
+
* @param skills - All skills in the group to test together
|
|
15
|
+
* @param skillsGroupId - ID of the skills group (for workDir and result targetId)
|
|
16
|
+
* @param skillsGroupName - Display name for the skills group
|
|
17
|
+
* @param agent - Optional linked coding agent to run within
|
|
18
|
+
* @param workDir - Optional working directory for the scenario
|
|
19
|
+
* @returns Partial result without assertion fields
|
|
20
|
+
*/
|
|
21
|
+
export declare function runAgentWithContext(config: EvaluatorConfig, evalRunId: string, scenario: TestScenario, skills: Skill[], skillsGroupId: string, skillsGroupName: string, agent?: Agent, workDir?: string): Promise<PartialEvalRunResult>;
|
|
@@ -1,20 +1,8 @@
|
|
|
1
|
-
import type { EvalRunResult,
|
|
1
|
+
import type { EvalRunResult, LLMStepType } from '@wix/evalforge-types';
|
|
2
2
|
/**
|
|
3
3
|
* Partial result returned by call functions (without assertion fields).
|
|
4
4
|
*/
|
|
5
5
|
export type PartialEvalRunResult = Omit<EvalRunResult, 'assertionResults' | 'passed' | 'failed' | 'passRate'>;
|
|
6
|
-
/**
|
|
7
|
-
* Target types for dispatching scenario execution.
|
|
8
|
-
* PR4: only skill (with optional code agent) and standalone agent are used.
|
|
9
|
-
*/
|
|
10
|
-
export type TargetInput = {
|
|
11
|
-
type: 'skill';
|
|
12
|
-
skill: Skill;
|
|
13
|
-
agent?: Agent;
|
|
14
|
-
} | {
|
|
15
|
-
type: 'agent';
|
|
16
|
-
agent: Agent;
|
|
17
|
-
};
|
|
18
6
|
/**
|
|
19
7
|
* Step data captured during streaming with timing info.
|
|
20
8
|
*/
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@wix/evalforge-evaluator",
|
|
3
|
-
"version": "0.54.0",
|
|
3
|
+
"version": "0.56.0",
|
|
4
4
|
"description": "EvalForge Evaluator",
|
|
5
5
|
"bin": "./build/index.js",
|
|
6
6
|
"files": [
|
|
@@ -20,7 +20,7 @@
|
|
|
20
20
|
"@anthropic-ai/claude-agent-sdk": "^0.2.12",
|
|
21
21
|
"@anthropic-ai/claude-code": "^2.0.76",
|
|
22
22
|
"@wix/eval-assertions": "0.7.0",
|
|
23
|
-
"@wix/evalforge-types": "0.
|
|
23
|
+
"@wix/evalforge-types": "0.22.0",
|
|
24
24
|
"ai": "^6.0.6",
|
|
25
25
|
"diff": "^7.0.0",
|
|
26
26
|
"tar": "^7.5.3",
|
|
@@ -59,5 +59,5 @@
|
|
|
59
59
|
"artifactId": "evalforge-evaluator"
|
|
60
60
|
}
|
|
61
61
|
},
|
|
62
|
-
"falconPackageHash": "
|
|
62
|
+
"falconPackageHash": "26eed2a12041f36af6cb43667cdd603be49ece4e6b44410dfcf29efb"
|
|
63
63
|
}
|
|
@@ -1,13 +0,0 @@
|
|
|
1
|
-
import type { Agent, TestScenario } from '@wix/evalforge-types';
|
|
2
|
-
import type { EvaluatorConfig } from '../config.js';
|
|
3
|
-
import type { PartialEvalRunResult } from './types.js';
|
|
4
|
-
/**
|
|
5
|
-
* Execute a scenario against a standalone coding agent.
|
|
6
|
-
*
|
|
7
|
-
* @param config - Evaluator configuration
|
|
8
|
-
* @param scenario - The test scenario to run
|
|
9
|
-
* @param agent - The coding agent to test
|
|
10
|
-
* @param workDir - Optional working directory for the scenario
|
|
11
|
-
* @returns Partial result without assertion fields
|
|
12
|
-
*/
|
|
13
|
-
export declare function callAgent(config: EvaluatorConfig, scenario: TestScenario, agent: Agent, workDir?: string): Promise<PartialEvalRunResult>;
|
|
@@ -1,18 +0,0 @@
|
|
|
1
|
-
import type { Skill, Agent, TestScenario } from '@wix/evalforge-types';
|
|
2
|
-
import type { EvaluatorConfig } from '../config.js';
|
|
3
|
-
import type { PartialEvalRunResult } from './types.js';
|
|
4
|
-
/**
|
|
5
|
-
* Execute a scenario against a skill (optionally with a linked coding agent).
|
|
6
|
-
*
|
|
7
|
-
* Uses the agent adapter registry to select the appropriate adapter based on
|
|
8
|
-
* the agent's runCommand. If no agent is specified, defaults to 'claude'.
|
|
9
|
-
*
|
|
10
|
-
* @param config - Evaluator configuration
|
|
11
|
-
* @param evalRunId - The evaluation run ID (for live trace context)
|
|
12
|
-
* @param scenario - The test scenario to run
|
|
13
|
-
* @param skill - The skill to test
|
|
14
|
-
* @param agent - Optional linked coding agent to run the skill within
|
|
15
|
-
* @param workDir - Optional working directory for the scenario
|
|
16
|
-
* @returns Partial result without assertion fields
|
|
17
|
-
*/
|
|
18
|
-
export declare function callSkill(config: EvaluatorConfig, evalRunId: string, scenario: TestScenario, skill: Skill, agent?: Agent, workDir?: string): Promise<PartialEvalRunResult>;
|