@wix/evalforge-evaluator 0.104.0 → 0.106.0
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- package/build/index.js +47 -37
- package/build/index.js.map +2 -2
- package/build/index.mjs +47 -37
- package/build/index.mjs.map +2 -2
- package/build/types/api-client.d.ts +2 -2
- package/build/types/fetch-evaluation-data.d.ts +6 -7
- package/build/types/run-scenario/index.d.ts +1 -1
- package/build/types/run-scenario/run-agent-with-context.d.ts +1 -1
- package/package.json +4 -4
|
@@ -1,10 +1,9 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* API Client for fetching data from the eval server.
|
|
3
3
|
*/
|
|
4
|
-
import type { EvalRun, EvalRunResult,
|
|
4
|
+
import type { EvalRun, EvalRunResult, TestScenario, Template, SkillWithLatestVersion, SkillVersion, Agent, Preset, CustomAssertion, MCPEntity, SubAgent, Rule } from '@wix/evalforge-types';
|
|
5
5
|
export interface ApiClient {
|
|
6
6
|
getEvalRun(projectId: string, id: string): Promise<EvalRun>;
|
|
7
|
-
getSkillsGroup(projectId: string, id: string): Promise<SkillsGroup>;
|
|
8
7
|
getScenario(projectId: string, id: string): Promise<TestScenario>;
|
|
9
8
|
getSkill(projectId: string, id: string): Promise<SkillWithLatestVersion>;
|
|
10
9
|
getSkillVersion(projectId: string, skillId: string, versionId: string): Promise<SkillVersion>;
|
|
@@ -14,6 +13,7 @@ export interface ApiClient {
|
|
|
14
13
|
getMcp(projectId: string, id: string): Promise<MCPEntity>;
|
|
15
14
|
getSubAgent(projectId: string, id: string): Promise<SubAgent>;
|
|
16
15
|
getRule(projectId: string, id: string): Promise<Rule>;
|
|
16
|
+
getPreset(projectId: string, id: string): Promise<Preset>;
|
|
17
17
|
getAssertion(projectId: string, id: string): Promise<CustomAssertion>;
|
|
18
18
|
addResult(projectId: string, evalRunId: string, result: EvalRunResult): Promise<void>;
|
|
19
19
|
clearResults(projectId: string, evalRunId: string): Promise<void>;
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import type { EvalRun, TestScenario, SkillWithLatestVersion,
|
|
1
|
+
import type { EvalRun, TestScenario, SkillWithLatestVersion, Agent, Template, CustomAssertion, Assertion, MCPEntity, SubAgent, Rule } from '@wix/evalforge-types';
|
|
2
2
|
import { type SystemAssertionId } from '@wix/evalforge-types';
|
|
3
3
|
import type { ApiClient } from './api-client.js';
|
|
4
4
|
/**
|
|
@@ -31,15 +31,14 @@ export interface ScenarioItem {
|
|
|
31
31
|
}
|
|
32
32
|
/**
|
|
33
33
|
* Data shape returned by fetchEvaluationData for the run flow.
|
|
34
|
-
* Run-level agent + skills +
|
|
34
|
+
* Run-level agent + skills + mcps + subAgents; per-scenario template.
|
|
35
35
|
*/
|
|
36
36
|
export interface EvaluationData {
|
|
37
37
|
evalRun: EvalRun;
|
|
38
38
|
agent: Agent | null;
|
|
39
39
|
skills: SkillWithLatestVersion[];
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
skillsGroupName: string;
|
|
40
|
+
/** Display name: preset name from DB when available, otherwise joined skill names */
|
|
41
|
+
presetName: string;
|
|
43
42
|
mcps: MCPEntity[];
|
|
44
43
|
subAgents: SubAgent[];
|
|
45
44
|
rules: Rule[];
|
|
@@ -48,8 +47,8 @@ export interface EvaluationData {
|
|
|
48
47
|
/**
|
|
49
48
|
* Fetch all data needed to run an evaluation.
|
|
50
49
|
*
|
|
51
|
-
* Uses evalRun.agentId and evalRun.
|
|
52
|
-
* Loads skills
|
|
50
|
+
* Uses evalRun.agentId and evalRun.skillIds as run-level inputs.
|
|
51
|
+
* Loads skills by ID; loads code agent; loads scenarios and templates by scenario.templateId.
|
|
53
52
|
*
|
|
54
53
|
* @throws Error if eval run has no code agent when expected
|
|
55
54
|
* @throws Error if eval run has no skills group or skills group has no skills when expected
|
|
@@ -10,7 +10,7 @@ export type { ScenarioItem, EvaluationData } from '../fetch-evaluation-data.js';
|
|
|
10
10
|
* @param config - Evaluator configuration
|
|
11
11
|
* @param evalRunId - The evaluation run ID
|
|
12
12
|
* @param scenario - The test scenario to run
|
|
13
|
-
* @param evalData - Fetched evaluation data (skills,
|
|
13
|
+
* @param evalData - Fetched evaluation data (skills, agent, mcps, subAgents)
|
|
14
14
|
* @param template - Optional pre-fetched template entity
|
|
15
15
|
* @param resolvedAssertions - Optional assertions resolved from assertionIds
|
|
16
16
|
* @returns Complete evaluation result
|
|
@@ -12,7 +12,7 @@ import type { EvaluationData } from '../fetch-evaluation-data.js';
|
|
|
12
12
|
* @param config - Evaluator configuration
|
|
13
13
|
* @param evalRunId - The evaluation run ID (for live trace context)
|
|
14
14
|
* @param scenario - The test scenario to run
|
|
15
|
-
* @param evalData - Fetched evaluation data (skills,
|
|
15
|
+
* @param evalData - Fetched evaluation data (skills, agent, mcps, subAgents)
|
|
16
16
|
* @param workDir - Optional working directory for the scenario
|
|
17
17
|
* @returns Partial result without assertion fields
|
|
18
18
|
*/
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@wix/evalforge-evaluator",
|
|
3
|
-
"version": "0.104.0",
|
|
3
|
+
"version": "0.106.0",
|
|
4
4
|
"description": "EvalForge Evaluator",
|
|
5
5
|
"bin": "./build/index.js",
|
|
6
6
|
"files": [
|
|
@@ -22,8 +22,8 @@
|
|
|
22
22
|
"@anthropic-ai/claude-agent-sdk": "^0.2.49",
|
|
23
23
|
"@anthropic-ai/claude-code": "^2.1.49",
|
|
24
24
|
"@wix/eval-assertions": "0.23.0",
|
|
25
|
-
"@wix/evalforge-github-client": "0.
|
|
26
|
-
"@wix/evalforge-types": "0.
|
|
25
|
+
"@wix/evalforge-github-client": "0.24.0",
|
|
26
|
+
"@wix/evalforge-types": "0.49.0",
|
|
27
27
|
"ai": "^6.0.93",
|
|
28
28
|
"diff": "^7.0.0",
|
|
29
29
|
"tar": "^7.5.3",
|
|
@@ -62,5 +62,5 @@
|
|
|
62
62
|
"artifactId": "evalforge-evaluator"
|
|
63
63
|
}
|
|
64
64
|
},
|
|
65
|
-
"falconPackageHash": "
|
|
65
|
+
"falconPackageHash": "b023f8b862d171c04024e06788df0308e961466bde8f723fc5846f5a"
|
|
66
66
|
}
|