@wix/evalforge-evaluator 0.54.0 → 0.56.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,4 +1,4 @@
1
- import type { EvalRun, TestScenario, Skill, Agent, Template, Assertion } from '@wix/evalforge-types';
1
+ import type { EvalRun, TestScenario, Skill, SkillsGroup, Agent, Template, Assertion } from '@wix/evalforge-types';
2
2
  import type { ApiClient } from './api-client.js';
3
3
  /**
4
4
  * A scenario paired with its optional template and resolved assertions.
@@ -11,12 +11,15 @@ export interface ScenarioItem {
11
11
  }
12
12
  /**
13
13
  * Data shape returned by fetchEvaluationData for the run flow.
14
- * Run-level code agent + skills; per-scenario template.
14
+ * Run-level code agent + skills + skills group; per-scenario template.
15
15
  */
16
16
  export interface EvaluationData {
17
17
  evalRun: EvalRun;
18
18
  codeAgent: Agent | null;
19
19
  skills: Skill[];
20
+ skillsGroup: SkillsGroup | null;
21
+ /** Display name for the skills group (from skillsGroup.name when present) */
22
+ skillsGroupName: string;
20
23
  scenarioItems: ScenarioItem[];
21
24
  }
22
25
  /**
@@ -1,14 +1,18 @@
1
1
  import type { Skill, TestScenario, LLMTrace } from '@wix/evalforge-types';
2
2
  import type { ClaudeCodeExecutionOptions, ClaudeCodeExecutionResult } from './types.js';
3
3
  /**
4
- * Execute a skill using the Claude Agent SDK.
4
+ * Execute skills using the Claude Agent SDK.
5
5
  *
6
- * @param skill - The skill to execute (SKILL.md content used as system prompt)
6
+ * All skills are written to .claude/skills/<name>/SKILL.md. The SDK discovers
7
+ * them from the filesystem; no system prompt is passed. The agent has access
8
+ * to all skills as a group.
9
+ *
10
+ * @param skills - All skills to execute (each written to its own directory)
7
11
  * @param scenario - The test scenario containing the trigger prompt
8
12
  * @param options - Execution options (cwd, mcpServers, etc.)
9
13
  * @returns Execution result with output, usage, and LLM trace
10
14
  */
11
- export declare function executeWithClaudeCode(skill: Skill, scenario: TestScenario, options: ClaudeCodeExecutionOptions): Promise<{
15
+ export declare function executeWithClaudeCode(skills: Skill[], scenario: TestScenario, options: ClaudeCodeExecutionOptions): Promise<{
12
16
  result: ClaudeCodeExecutionResult;
13
17
  llmTrace: LLMTrace;
14
18
  }>;
@@ -23,8 +23,6 @@ export interface McpServerConfig {
23
23
  export interface ClaudeCodeExecutionOptions {
24
24
  /** Working directory for the execution */
25
25
  cwd: string;
26
- /** System prompt to append (typically the SKILL.md content) */
27
- systemPrompt?: string;
28
26
  /** MCP server configurations */
29
27
  mcpServers?: Record<string, McpServerConfig>;
30
28
  /** Model to use for generation */
@@ -1,18 +1,18 @@
1
1
  import type { EvalRunResult, TestScenario, Template } from '@wix/evalforge-types';
2
2
  import type { EvaluatorConfig } from '../config.js';
3
- import type { TargetInput } from './types.js';
4
3
  import type { Assertion } from '@wix/evalforge-types';
5
- export type { TargetInput, PartialEvalRunResult } from './types.js';
4
+ import type { EvaluationData } from '../fetch-evaluation-data.js';
5
+ export type { PartialEvalRunResult } from './types.js';
6
6
  export type { ScenarioItem, EvaluationData } from '../fetch-evaluation-data.js';
7
7
  /**
8
- * Run a single scenario against a target.
8
+ * Run a single scenario using evaluation data.
9
9
  *
10
10
  * @param config - Evaluator configuration
11
11
  * @param evalRunId - The evaluation run ID
12
12
  * @param scenario - The test scenario to run
13
- * @param target - The target to test (skill or agent)
13
+ * @param evalData - Fetched evaluation data (skills, skillsGroup, codeAgent)
14
14
  * @param template - Optional pre-fetched template entity
15
15
  * @param resolvedAssertions - Optional assertions resolved from assertionIds
16
16
  * @returns Complete evaluation result
17
17
  */
18
- export declare function runScenario(config: EvaluatorConfig, evalRunId: string, scenario: TestScenario, target: TargetInput, template?: Template, resolvedAssertions?: Assertion[]): Promise<EvalRunResult>;
18
+ export declare function runScenario(config: EvaluatorConfig, evalRunId: string, scenario: TestScenario, evalData: EvaluationData, template?: Template, resolvedAssertions?: Assertion[]): Promise<EvalRunResult>;
@@ -0,0 +1,21 @@
1
+ import type { Skill, Agent, TestScenario } from '@wix/evalforge-types';
2
+ import type { EvaluatorConfig } from '../config.js';
3
+ import type { PartialEvalRunResult } from './types.js';
4
+ /**
5
+ * Run the agent with the full execution context (skills, MCPs, sub-agents, etc.).
6
+ *
7
+ * Uses the agent adapter registry to select the appropriate adapter based on
8
+ * the agent's runCommand. If no agent is specified, defaults to 'claude'.
9
+ * The context includes skills group, optional MCPs, optional sub-agents.
10
+ *
11
+ * @param config - Evaluator configuration
12
+ * @param evalRunId - The evaluation run ID (for live trace context)
13
+ * @param scenario - The test scenario to run
14
+ * @param skills - All skills in the group to test together
15
+ * @param skillsGroupId - ID of the skills group (for workDir and result targetId)
16
+ * @param skillsGroupName - Display name for the skills group
17
+ * @param agent - Optional linked coding agent to run within
18
+ * @param workDir - Optional working directory for the scenario
19
+ * @returns Partial result without assertion fields
20
+ */
21
+ export declare function runAgentWithContext(config: EvaluatorConfig, evalRunId: string, scenario: TestScenario, skills: Skill[], skillsGroupId: string, skillsGroupName: string, agent?: Agent, workDir?: string): Promise<PartialEvalRunResult>;
@@ -1,20 +1,8 @@
1
- import type { EvalRunResult, Skill, Agent, LLMStepType } from '@wix/evalforge-types';
1
+ import type { EvalRunResult, LLMStepType } from '@wix/evalforge-types';
2
2
  /**
3
3
  * Partial result returned by call functions (without assertion fields).
4
4
  */
5
5
  export type PartialEvalRunResult = Omit<EvalRunResult, 'assertionResults' | 'passed' | 'failed' | 'passRate'>;
6
- /**
7
- * Target types for dispatching scenario execution.
8
- * PR4: only skill (with optional code agent) and standalone agent are used.
9
- */
10
- export type TargetInput = {
11
- type: 'skill';
12
- skill: Skill;
13
- agent?: Agent;
14
- } | {
15
- type: 'agent';
16
- agent: Agent;
17
- };
18
6
  /**
19
7
  * Step data captured during streaming with timing info.
20
8
  */
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@wix/evalforge-evaluator",
3
- "version": "0.54.0",
3
+ "version": "0.56.0",
4
4
  "description": "EvalForge Evaluator",
5
5
  "bin": "./build/index.js",
6
6
  "files": [
@@ -20,7 +20,7 @@
20
20
  "@anthropic-ai/claude-agent-sdk": "^0.2.12",
21
21
  "@anthropic-ai/claude-code": "^2.0.76",
22
22
  "@wix/eval-assertions": "0.7.0",
23
- "@wix/evalforge-types": "0.20.0",
23
+ "@wix/evalforge-types": "0.22.0",
24
24
  "ai": "^6.0.6",
25
25
  "diff": "^7.0.0",
26
26
  "tar": "^7.5.3",
@@ -59,5 +59,5 @@
59
59
  "artifactId": "evalforge-evaluator"
60
60
  }
61
61
  },
62
- "falconPackageHash": "c70c247a18a6cbb88083e45b4d517001dfeead7d50f93b4d43bd4f0e"
62
+ "falconPackageHash": "26eed2a12041f36af6cb43667cdd603be49ece4e6b44410dfcf29efb"
63
63
  }
@@ -1,13 +0,0 @@
1
- import type { Agent, TestScenario } from '@wix/evalforge-types';
2
- import type { EvaluatorConfig } from '../config.js';
3
- import type { PartialEvalRunResult } from './types.js';
4
- /**
5
- * Execute a scenario against a standalone coding agent.
6
- *
7
- * @param config - Evaluator configuration
8
- * @param scenario - The test scenario to run
9
- * @param agent - The coding agent to test
10
- * @param workDir - Optional working directory for the scenario
11
- * @returns Partial result without assertion fields
12
- */
13
- export declare function callAgent(config: EvaluatorConfig, scenario: TestScenario, agent: Agent, workDir?: string): Promise<PartialEvalRunResult>;
@@ -1,18 +0,0 @@
1
- import type { Skill, Agent, TestScenario } from '@wix/evalforge-types';
2
- import type { EvaluatorConfig } from '../config.js';
3
- import type { PartialEvalRunResult } from './types.js';
4
- /**
5
- * Execute a scenario against a skill (optionally with a linked coding agent).
6
- *
7
- * Uses the agent adapter registry to select the appropriate adapter based on
8
- * the agent's runCommand. If no agent is specified, defaults to 'claude'.
9
- *
10
- * @param config - Evaluator configuration
11
- * @param evalRunId - The evaluation run ID (for live trace context)
12
- * @param scenario - The test scenario to run
13
- * @param skill - The skill to test
14
- * @param agent - Optional linked coding agent to run the skill within
15
- * @param workDir - Optional working directory for the scenario
16
- * @returns Partial result without assertion fields
17
- */
18
- export declare function callSkill(config: EvaluatorConfig, evalRunId: string, scenario: TestScenario, skill: Skill, agent?: Agent, workDir?: string): Promise<PartialEvalRunResult>;