@wix/evalforge-evaluator 0.99.0 → 0.100.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/index.js +147 -148
- package/build/index.js.map +4 -4
- package/build/index.mjs +127 -128
- package/build/index.mjs.map +4 -4
- package/build/types/fetch-evaluation-data.d.ts +2 -2
- package/build/types/run-scenario/agents/claude-code/write-skills.d.ts +3 -1
- package/build/types/run-scenario/agents/registry.d.ts +32 -63
- package/build/types/run-scenario/index.d.ts +1 -1
- package/build/types/run-scenario/run-agent-with-context.d.ts +3 -3
- package/build/types/run-scenario/utils/write-files.d.ts +6 -0
- package/package.json +4 -4
|
@@ -31,11 +31,11 @@ export interface ScenarioItem {
|
|
|
31
31
|
}
|
|
32
32
|
/**
|
|
33
33
|
* Data shape returned by fetchEvaluationData for the run flow.
|
|
34
|
-
* Run-level
|
|
34
|
+
* Run-level agent + skills + skills group + mcps + subAgents; per-scenario template.
|
|
35
35
|
*/
|
|
36
36
|
export interface EvaluationData {
|
|
37
37
|
evalRun: EvalRun;
|
|
38
|
-
|
|
38
|
+
agent: Agent | null;
|
|
39
39
|
skills: SkillWithLatestVersion[];
|
|
40
40
|
skillsGroup: SkillsGroup | null;
|
|
41
41
|
/** Display name for the skills group (from skillsGroup.name when present) */
|
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import type { GitHubSource, SkillFile, SkillWithLatestVersion } from '@wix/evalforge-types';
|
|
2
|
+
import { writeFilesToDirectory } from '../../utils/write-files.js';
|
|
2
3
|
export type FetchGitHubFolderFn = (source: GitHubSource, options?: {
|
|
3
4
|
userAgent?: string;
|
|
4
5
|
}) => Promise<SkillFile[]>;
|
|
@@ -17,5 +18,6 @@ export declare function writeSkillsToFilesystem(cwd: string, skills: SkillWithLa
|
|
|
17
18
|
export declare function writeSkillToFilesystem(cwd: string, skill: SkillWithLatestVersion, fetchFn?: FetchGitHubFolderFn): Promise<void>;
|
|
18
19
|
/**
|
|
19
20
|
* Write skill files from a snapshot array to the filesystem.
|
|
21
|
+
* @deprecated Use writeFilesToDirectory from utils/write-files.ts instead
|
|
20
22
|
*/
|
|
21
|
-
export declare
|
|
23
|
+
export declare const writeSkillFiles: typeof writeFilesToDirectory;
|
|
@@ -1,103 +1,72 @@
|
|
|
1
|
-
import type { AgentAdapter, AgentRunCommand } from '@wix/evalforge-types';
|
|
1
|
+
import type { AgentAdapter } from '@wix/evalforge-types';
|
|
2
2
|
/**
|
|
3
3
|
* Registry for agent adapters.
|
|
4
4
|
*
|
|
5
|
-
*
|
|
6
|
-
*
|
|
7
|
-
*
|
|
5
|
+
* Supports two lookup strategies:
|
|
6
|
+
* - By CLI command (e.g. 'claude') for CLI-based agents
|
|
7
|
+
* - By adapter ID (e.g. 'simple-agent') for SDK-based agents
|
|
8
8
|
*
|
|
9
9
|
* @example
|
|
10
10
|
* ```typescript
|
|
11
11
|
* const registry = new AgentAdapterRegistry();
|
|
12
12
|
* registry.register(new ClaudeCodeAdapter());
|
|
13
13
|
*
|
|
14
|
-
* const adapter = registry.
|
|
14
|
+
* const adapter = registry.resolve('claude');
|
|
15
15
|
* if (adapter) {
|
|
16
16
|
* const result = await adapter.execute(context);
|
|
17
17
|
* }
|
|
18
18
|
* ```
|
|
19
19
|
*/
|
|
20
20
|
export declare class AgentAdapterRegistry {
|
|
21
|
-
/**
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
/**
|
|
27
|
-
* Set of all registered adapter instances (for getAll).
|
|
28
|
-
*/
|
|
21
|
+
/** Map of CLI commands to their registered adapters. */
|
|
22
|
+
private commandMap;
|
|
23
|
+
/** Map of adapter IDs to their registered adapters. */
|
|
24
|
+
private idMap;
|
|
25
|
+
/** Set of all registered adapter instances (for getAll). */
|
|
29
26
|
private registeredAdapters;
|
|
30
27
|
/**
|
|
31
28
|
* Register an agent adapter.
|
|
32
29
|
*
|
|
33
|
-
* The adapter
|
|
34
|
-
* If a command is already registered, it will be overwritten with a warning.
|
|
35
|
-
*
|
|
36
|
-
* @param adapter - The adapter to register
|
|
30
|
+
* The adapter is registered by its ID and for all commands in its supportedCommands array.
|
|
31
|
+
* If a command or ID is already registered, it will be overwritten with a warning.
|
|
37
32
|
*/
|
|
38
33
|
register(adapter: AgentAdapter): void;
|
|
34
|
+
/** Get an adapter by CLI command. */
|
|
35
|
+
getByCommand(command: string): AgentAdapter | undefined;
|
|
36
|
+
/** Get an adapter by adapter ID. */
|
|
37
|
+
getById(adapterId: string): AgentAdapter | undefined;
|
|
39
38
|
/**
|
|
40
|
-
*
|
|
41
|
-
*
|
|
42
|
-
* @param runCommand - The run command to look up
|
|
43
|
-
* @returns The registered adapter, or undefined if not found
|
|
44
|
-
*/
|
|
45
|
-
get(runCommand: AgentRunCommand): AgentAdapter | undefined;
|
|
46
|
-
/**
|
|
47
|
-
* Check if a command has a registered adapter.
|
|
48
|
-
*
|
|
49
|
-
* @param runCommand - The run command to check
|
|
50
|
-
* @returns True if an adapter is registered for this command
|
|
51
|
-
*/
|
|
52
|
-
has(runCommand: AgentRunCommand): boolean;
|
|
53
|
-
/**
|
|
54
|
-
* Get all registered adapters.
|
|
55
|
-
*
|
|
56
|
-
* @returns Array of all unique registered adapters
|
|
39
|
+
* Unified lookup: tries CLI command first, then adapter ID.
|
|
40
|
+
* Use this when the identifier could be either a command or an adapter ID.
|
|
57
41
|
*/
|
|
42
|
+
resolve(identifier: string): AgentAdapter | undefined;
|
|
43
|
+
/** Check if a command or adapter ID has a registered adapter. */
|
|
44
|
+
has(identifier: string): boolean;
|
|
45
|
+
/** Get all registered adapters. */
|
|
58
46
|
getAll(): AgentAdapter[];
|
|
59
|
-
/**
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
*/
|
|
64
|
-
getSupportedCommands(): AgentRunCommand[];
|
|
47
|
+
/** Get all supported CLI commands. */
|
|
48
|
+
getSupportedCommands(): string[];
|
|
49
|
+
/** Get all registered adapter IDs. */
|
|
50
|
+
getAdapterIds(): string[];
|
|
65
51
|
/**
|
|
66
52
|
* Unregister an adapter by its ID.
|
|
67
|
-
*
|
|
68
53
|
* Removes the adapter and all its command mappings.
|
|
69
|
-
*
|
|
70
|
-
* @param adapterId - The ID of the adapter to remove
|
|
71
|
-
* @returns True if the adapter was found and removed
|
|
72
54
|
*/
|
|
73
55
|
unregister(adapterId: string): boolean;
|
|
74
|
-
/**
|
|
75
|
-
* Clear all registered adapters.
|
|
76
|
-
* Primarily useful for testing.
|
|
77
|
-
*/
|
|
56
|
+
/** Clear all registered adapters. Primarily useful for testing. */
|
|
78
57
|
clear(): void;
|
|
79
58
|
}
|
|
80
59
|
/**
|
|
81
60
|
* Default global registry instance.
|
|
82
|
-
*
|
|
83
|
-
* This is the main registry used by the evaluation system.
|
|
84
61
|
* Adapters are auto-registered here when their modules are imported.
|
|
85
62
|
*/
|
|
86
63
|
export declare const defaultRegistry: AgentAdapterRegistry;
|
|
87
64
|
/**
|
|
88
65
|
* Get an adapter from the default registry.
|
|
89
66
|
*
|
|
90
|
-
*
|
|
91
|
-
*
|
|
92
|
-
* @param runCommand - The run command to look up
|
|
93
|
-
* @returns The registered adapter
|
|
94
|
-
* @throws Error if no adapter is registered for the command
|
|
95
|
-
*/
|
|
96
|
-
export declare function getAdapter(runCommand: AgentRunCommand): AgentAdapter;
|
|
97
|
-
/**
|
|
98
|
-
* Check if a command has a registered adapter in the default registry.
|
|
99
|
-
*
|
|
100
|
-
* @param runCommand - The run command to check
|
|
101
|
-
* @returns True if an adapter is registered for this command
|
|
67
|
+
* Uses unified lookup (command first, then adapter ID).
|
|
68
|
+
* Throws a helpful error if the adapter is not found.
|
|
102
69
|
*/
|
|
103
|
-
export declare function hasAdapter(runCommand: AgentRunCommand): boolean;
|
|
70
|
+
export declare function getAdapter(identifier: string): AgentAdapter;
|
|
71
|
+
/** Check if an identifier has a registered adapter in the default registry. */
|
|
72
|
+
export declare function hasAdapter(identifier: string): boolean;
|
|
@@ -10,7 +10,7 @@ export type { ScenarioItem, EvaluationData } from '../fetch-evaluation-data.js';
|
|
|
10
10
|
* @param config - Evaluator configuration
|
|
11
11
|
* @param evalRunId - The evaluation run ID
|
|
12
12
|
* @param scenario - The test scenario to run
|
|
13
|
-
* @param evalData - Fetched evaluation data (skills, skillsGroup,
|
|
13
|
+
* @param evalData - Fetched evaluation data (skills, skillsGroup, agent, mcps, subAgents)
|
|
14
14
|
* @param template - Optional pre-fetched template entity
|
|
15
15
|
* @param resolvedAssertions - Optional assertions resolved from assertionIds
|
|
16
16
|
* @returns Complete evaluation result
|
|
@@ -6,13 +6,13 @@ import type { EvaluationData } from '../fetch-evaluation-data.js';
|
|
|
6
6
|
* Run the agent with the full execution context (skills, MCPs, sub-agents, etc.).
|
|
7
7
|
*
|
|
8
8
|
* Uses the agent adapter registry to select the appropriate adapter based on
|
|
9
|
-
* the agent's runCommand
|
|
10
|
-
*
|
|
9
|
+
* the agent's runCommand (for CLI agents) or adapter ID (for SDK agents).
|
|
10
|
+
* If no agent is specified, defaults to 'claude'.
|
|
11
11
|
*
|
|
12
12
|
* @param config - Evaluator configuration
|
|
13
13
|
* @param evalRunId - The evaluation run ID (for live trace context)
|
|
14
14
|
* @param scenario - The test scenario to run
|
|
15
|
-
* @param evalData - Fetched evaluation data (skills, skillsGroup,
|
|
15
|
+
* @param evalData - Fetched evaluation data (skills, skillsGroup, agent, mcps, subAgents)
|
|
16
16
|
* @param workDir - Optional working directory for the scenario
|
|
17
17
|
* @returns Partial result without assertion fields
|
|
18
18
|
*/
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
import type { SkillFile } from '@wix/evalforge-types';
|
|
2
|
+
/**
|
|
3
|
+
* Write an array of {path, content} files to a target directory.
|
|
4
|
+
* Validates paths to prevent directory traversal.
|
|
5
|
+
*/
|
|
6
|
+
export declare function writeFilesToDirectory(targetDir: string, files: SkillFile[]): Promise<void>;
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@wix/evalforge-evaluator",
|
|
3
|
-
"version": "0.99.0",
|
|
3
|
+
"version": "0.100.0",
|
|
4
4
|
"description": "EvalForge Evaluator",
|
|
5
5
|
"bin": "./build/index.js",
|
|
6
6
|
"files": [
|
|
@@ -20,8 +20,8 @@
|
|
|
20
20
|
"@anthropic-ai/claude-agent-sdk": "^0.2.44",
|
|
21
21
|
"@anthropic-ai/claude-code": "^2.1.44",
|
|
22
22
|
"@wix/eval-assertions": "0.21.0",
|
|
23
|
-
"@wix/evalforge-github-client": "0.
|
|
24
|
-
"@wix/evalforge-types": "0.
|
|
23
|
+
"@wix/evalforge-github-client": "0.18.0",
|
|
24
|
+
"@wix/evalforge-types": "0.43.0",
|
|
25
25
|
"ai": "^6.0.6",
|
|
26
26
|
"diff": "^7.0.0",
|
|
27
27
|
"tar": "^7.5.3",
|
|
@@ -60,5 +60,5 @@
|
|
|
60
60
|
"artifactId": "evalforge-evaluator"
|
|
61
61
|
}
|
|
62
62
|
},
|
|
63
|
-
"falconPackageHash": "
|
|
63
|
+
"falconPackageHash": "04a26e31de9664dd48aa040a4bd2f8da8bcef67de281fc36482e8272"
|
|
64
64
|
}
|