@wix/evalforge-evaluator 0.55.0 → 0.56.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/index.js +106 -118
- package/build/index.js.map +3 -3
- package/build/index.mjs +106 -118
- package/build/index.mjs.map +3 -3
- package/build/types/fetch-evaluation-data.d.ts +5 -2
- package/build/types/run-scenario/agents/claude-code/execute.d.ts +7 -3
- package/build/types/run-scenario/agents/claude-code/types.d.ts +0 -2
- package/build/types/run-scenario/index.d.ts +5 -5
- package/build/types/run-scenario/run-agent-with-context.d.ts +21 -0
- package/build/types/run-scenario/types.d.ts +1 -13
- package/package.json +3 -3
- package/build/types/run-scenario/callAgent.d.ts +0 -13
- package/build/types/run-scenario/callSkill.d.ts +0 -18
package/build/index.mjs
CHANGED
|
@@ -267,11 +267,9 @@ async function fetchEvaluationData(api, projectId2, evalRunId2) {
|
|
|
267
267
|
codeAgent = await api.getAgent(projectId2, evalRun.agentId);
|
|
268
268
|
}
|
|
269
269
|
let skills = [];
|
|
270
|
+
let skillsGroup = null;
|
|
270
271
|
if (evalRun.skillsGroupId) {
|
|
271
|
-
|
|
272
|
-
projectId2,
|
|
273
|
-
evalRun.skillsGroupId
|
|
274
|
-
);
|
|
272
|
+
skillsGroup = await api.getSkillsGroup(projectId2, evalRun.skillsGroupId);
|
|
275
273
|
if (skillsGroup.skillIds.length > 0) {
|
|
276
274
|
skills = await Promise.all(
|
|
277
275
|
skillsGroup.skillIds.map((id) => api.getSkill(projectId2, id))
|
|
@@ -320,10 +318,13 @@ async function fetchEvaluationData(api, projectId2, evalRunId2) {
|
|
|
320
318
|
resolvedAssertions: resolvedAssertions.length > 0 ? resolvedAssertions : void 0
|
|
321
319
|
};
|
|
322
320
|
});
|
|
321
|
+
const skillsGroupName = skillsGroup?.name ?? "";
|
|
323
322
|
return {
|
|
324
323
|
evalRun,
|
|
325
324
|
codeAgent,
|
|
326
325
|
skills,
|
|
326
|
+
skillsGroup,
|
|
327
|
+
skillsGroupName,
|
|
327
328
|
scenarioItems
|
|
328
329
|
};
|
|
329
330
|
}
|
|
@@ -6206,7 +6207,7 @@ async function prepareWorkingDirectory(config, evalRunId2, targetId, scenarioId,
|
|
|
6206
6207
|
return workDir;
|
|
6207
6208
|
}
|
|
6208
6209
|
|
|
6209
|
-
// src/run-scenario/callSkill.ts
|
|
6210
|
+
// src/run-scenario/run-agent-with-context.ts
|
|
6210
6211
|
import { randomUUID as randomUUID2 } from "crypto";
|
|
6211
6212
|
|
|
6212
6213
|
// src/run-scenario/agents/registry.ts
|
|
@@ -6520,10 +6521,11 @@ function createTraceEventFromAnyMessage(message, context, stepNumber, isComplete
|
|
|
6520
6521
|
outputPreview: `Message type: ${message.type}`
|
|
6521
6522
|
};
|
|
6522
6523
|
}
|
|
6523
|
-
async function executeWithClaudeCode(skill, scenario, options) {
|
|
6524
|
+
async function executeWithClaudeCode(skills, scenario, options) {
|
|
6525
|
+
const skillNames = skills.map((s) => s.name).join(", ");
|
|
6524
6526
|
console.log("[executeWithClaudeCode] Starting execution", {
|
|
6525
|
-
|
|
6526
|
-
|
|
6527
|
+
skillCount: skills.length,
|
|
6528
|
+
skillNames,
|
|
6527
6529
|
scenarioId: scenario.id,
|
|
6528
6530
|
scenarioName: scenario.name,
|
|
6529
6531
|
cwd: options.cwd,
|
|
@@ -6559,22 +6561,23 @@ async function executeWithClaudeCode(skill, scenario, options) {
|
|
|
6559
6561
|
const startTime = /* @__PURE__ */ new Date();
|
|
6560
6562
|
const allMessages = [];
|
|
6561
6563
|
console.error(
|
|
6562
|
-
"[DEBUG-H4]
|
|
6564
|
+
"[DEBUG-H4] writeSkillsToFilesystem START",
|
|
6563
6565
|
JSON.stringify({
|
|
6564
6566
|
cwd: options.cwd,
|
|
6565
|
-
|
|
6567
|
+
skillCount: skills.length,
|
|
6568
|
+
skillNames: skills.map((s) => s.name),
|
|
6566
6569
|
timestamp: Date.now()
|
|
6567
6570
|
})
|
|
6568
6571
|
);
|
|
6569
6572
|
try {
|
|
6570
|
-
await
|
|
6573
|
+
await writeSkillsToFilesystem(options.cwd, skills);
|
|
6571
6574
|
console.error(
|
|
6572
|
-
"[DEBUG-H4]
|
|
6575
|
+
"[DEBUG-H4] writeSkillsToFilesystem SUCCESS",
|
|
6573
6576
|
JSON.stringify({ timestamp: Date.now() })
|
|
6574
6577
|
);
|
|
6575
6578
|
} catch (writeError) {
|
|
6576
6579
|
console.error(
|
|
6577
|
-
"[DEBUG-H4]
|
|
6580
|
+
"[DEBUG-H4] writeSkillsToFilesystem FAILED",
|
|
6578
6581
|
JSON.stringify({
|
|
6579
6582
|
error: writeError instanceof Error ? writeError.message : String(writeError),
|
|
6580
6583
|
stack: writeError instanceof Error ? writeError.stack : void 0,
|
|
@@ -6582,7 +6585,7 @@ async function executeWithClaudeCode(skill, scenario, options) {
|
|
|
6582
6585
|
})
|
|
6583
6586
|
);
|
|
6584
6587
|
throw new Error(
|
|
6585
|
-
`Failed to write
|
|
6588
|
+
`Failed to write skills to filesystem: ${writeError instanceof Error ? writeError.message : String(writeError)}`
|
|
6586
6589
|
);
|
|
6587
6590
|
}
|
|
6588
6591
|
const sdkEnv = buildSdkEnvironment(options);
|
|
@@ -6618,7 +6621,10 @@ async function executeWithClaudeCode(skill, scenario, options) {
|
|
|
6618
6621
|
}
|
|
6619
6622
|
console.log("[SDK-DEBUG] PATH available:", !!sdkEnv.PATH);
|
|
6620
6623
|
console.log("[SDK-DEBUG] HOME:", sdkEnv.HOME || "NOT SET");
|
|
6621
|
-
console.log(
|
|
6624
|
+
console.log(
|
|
6625
|
+
"[SDK-DEBUG] Skills:",
|
|
6626
|
+
skills.map((s) => `${s.id} - ${s.name}`).join(", ")
|
|
6627
|
+
);
|
|
6622
6628
|
console.log("[SDK-DEBUG] Scenario:", scenario.id, "-", scenario.name);
|
|
6623
6629
|
console.log(
|
|
6624
6630
|
"[SDK-DEBUG] Prompt preview:",
|
|
@@ -6728,7 +6734,7 @@ async function executeWithClaudeCode(skill, scenario, options) {
|
|
|
6728
6734
|
timedOut = true;
|
|
6729
6735
|
reject(
|
|
6730
6736
|
new Error(
|
|
6731
|
-
`SDK execution timed out after ${SDK_TIMEOUT_MS}ms.
|
|
6737
|
+
`SDK execution timed out after ${SDK_TIMEOUT_MS}ms. Skills: ${skillNames}, Scenario: ${scenario.name}, Messages received: ${messageCount}, MaxTurns: ${maxTurns}`
|
|
6732
6738
|
)
|
|
6733
6739
|
);
|
|
6734
6740
|
}, SDK_TIMEOUT_MS);
|
|
@@ -6936,8 +6942,8 @@ IMPORTANT: This is an automated evaluation run. Follow these guidelines:
|
|
|
6936
6942
|
}
|
|
6937
6943
|
}
|
|
6938
6944
|
console.error("[SDK-ERROR] Execution context:");
|
|
6939
|
-
console.error("[SDK-ERROR]
|
|
6940
|
-
console.error("[SDK-ERROR]
|
|
6945
|
+
console.error("[SDK-ERROR] skillCount:", skills.length);
|
|
6946
|
+
console.error("[SDK-ERROR] skillNames:", skillNames);
|
|
6941
6947
|
console.error("[SDK-ERROR] scenarioId:", scenario.id);
|
|
6942
6948
|
console.error("[SDK-ERROR] scenarioName:", scenario.name);
|
|
6943
6949
|
console.error("[SDK-ERROR] cwd:", options.cwd);
|
|
@@ -6997,7 +7003,8 @@ IMPORTANT: This is an automated evaluation run. Follow these guidelines:
|
|
|
6997
7003
|
messageCount,
|
|
6998
7004
|
errorName,
|
|
6999
7005
|
errorMessage,
|
|
7000
|
-
|
|
7006
|
+
skillCount: skills.length,
|
|
7007
|
+
skillNames,
|
|
7001
7008
|
scenarioId: scenario.id,
|
|
7002
7009
|
model: options.model || DEFAULT_MODEL,
|
|
7003
7010
|
sdkEnv: sdkEnvDebug,
|
|
@@ -7090,13 +7097,15 @@ Stack: ${errorStack.split("\n").slice(0, 5).join("\n")}` : "")
|
|
|
7090
7097
|
llmTrace
|
|
7091
7098
|
};
|
|
7092
7099
|
}
|
|
7093
|
-
async function
|
|
7094
|
-
const
|
|
7095
|
-
|
|
7096
|
-
|
|
7097
|
-
|
|
7098
|
-
|
|
7099
|
-
|
|
7100
|
+
async function writeSkillsToFilesystem(cwd, skills) {
|
|
7101
|
+
for (const skill of skills) {
|
|
7102
|
+
const skillName = skill.name;
|
|
7103
|
+
const skillDir = join2(cwd, ".claude", "skills", skillName);
|
|
7104
|
+
await mkdir2(skillDir, { recursive: true });
|
|
7105
|
+
const skillPath = join2(skillDir, "SKILL.md");
|
|
7106
|
+
await writeFile(skillPath, skill.skillMd, "utf-8");
|
|
7107
|
+
console.log(`[Skill] Written to ${skillPath}`);
|
|
7108
|
+
}
|
|
7100
7109
|
}
|
|
7101
7110
|
function buildSdkEnvironment(options) {
|
|
7102
7111
|
const env = { ...process.env };
|
|
@@ -7292,7 +7301,7 @@ var ClaudeCodeAdapter = class {
|
|
|
7292
7301
|
*/
|
|
7293
7302
|
async execute(context) {
|
|
7294
7303
|
const {
|
|
7295
|
-
|
|
7304
|
+
skills,
|
|
7296
7305
|
scenario,
|
|
7297
7306
|
cwd,
|
|
7298
7307
|
modelConfig,
|
|
@@ -7303,7 +7312,6 @@ var ClaudeCodeAdapter = class {
|
|
|
7303
7312
|
const modelForSdk = modelConfig?.model ? AVAILABLE_MODELS_MAP[modelConfig.model]?.providerModelId ?? modelConfig.model : void 0;
|
|
7304
7313
|
const options = {
|
|
7305
7314
|
cwd,
|
|
7306
|
-
systemPrompt: skill.skillMd,
|
|
7307
7315
|
model: modelForSdk,
|
|
7308
7316
|
temperature: modelConfig?.temperature,
|
|
7309
7317
|
maxTokens: modelConfig?.maxTokens,
|
|
@@ -7312,7 +7320,7 @@ var ClaudeCodeAdapter = class {
|
|
|
7312
7320
|
traceContext
|
|
7313
7321
|
};
|
|
7314
7322
|
const { result, llmTrace } = await executeWithClaudeCode(
|
|
7315
|
-
|
|
7323
|
+
skills,
|
|
7316
7324
|
scenario,
|
|
7317
7325
|
options
|
|
7318
7326
|
);
|
|
@@ -7895,7 +7903,6 @@ arrayDiff.join = arrayDiff.removeEmpty = function(value) {
|
|
|
7895
7903
|
var IGNORED_PATTERNS = [
|
|
7896
7904
|
"node_modules",
|
|
7897
7905
|
".git",
|
|
7898
|
-
".claude",
|
|
7899
7906
|
".cursor",
|
|
7900
7907
|
"dist",
|
|
7901
7908
|
"build",
|
|
@@ -8061,15 +8068,15 @@ function extractTemplateFiles(before, after) {
|
|
|
8061
8068
|
return files;
|
|
8062
8069
|
}
|
|
8063
8070
|
|
|
8064
|
-
// src/run-scenario/callSkill.ts
|
|
8071
|
+
// src/run-scenario/run-agent-with-context.ts
|
|
8065
8072
|
var DEFAULT_AGENT_COMMAND = "claude";
|
|
8066
|
-
async function callSkill(config, evalRunId2, scenario, skill, agent, workDir) {
|
|
8073
|
+
async function runAgentWithContext(config, evalRunId2, scenario, skills, skillsGroupId, skillsGroupName, agent, workDir) {
|
|
8067
8074
|
const runCommand = agent?.runCommand ?? DEFAULT_AGENT_COMMAND;
|
|
8068
8075
|
const adapter = getAdapter(runCommand);
|
|
8069
8076
|
const startedAt = (/* @__PURE__ */ new Date()).toISOString();
|
|
8070
8077
|
const beforeSnapshot = workDir ? snapshotDirectory(workDir) : {};
|
|
8071
8078
|
const executionContext = {
|
|
8072
|
-
|
|
8079
|
+
skills,
|
|
8073
8080
|
scenario,
|
|
8074
8081
|
cwd: workDir || process.cwd(),
|
|
8075
8082
|
modelConfig: agent?.modelConfig,
|
|
@@ -8079,8 +8086,8 @@ async function callSkill(config, evalRunId2, scenario, skill, agent, workDir) {
|
|
|
8079
8086
|
evalRunId: evalRunId2,
|
|
8080
8087
|
scenarioId: scenario.id,
|
|
8081
8088
|
scenarioName: scenario.name,
|
|
8082
|
-
targetId:
|
|
8083
|
-
targetName:
|
|
8089
|
+
targetId: skillsGroupId,
|
|
8090
|
+
targetName: skillsGroupName,
|
|
8084
8091
|
tracePushUrl: config.tracePushUrl,
|
|
8085
8092
|
routeHeader: config.routeHeader,
|
|
8086
8093
|
authToken: config.authToken
|
|
@@ -8093,8 +8100,8 @@ async function callSkill(config, evalRunId2, scenario, skill, agent, workDir) {
|
|
|
8093
8100
|
const templateFiles = workDir ? extractTemplateFiles(beforeSnapshot, afterSnapshot) : void 0;
|
|
8094
8101
|
return {
|
|
8095
8102
|
id: randomUUID2(),
|
|
8096
|
-
targetId:
|
|
8097
|
-
targetName:
|
|
8103
|
+
targetId: skillsGroupId,
|
|
8104
|
+
targetName: skillsGroupName,
|
|
8098
8105
|
scenarioId: scenario.id,
|
|
8099
8106
|
scenarioName: scenario.name,
|
|
8100
8107
|
modelConfig: agent?.modelConfig,
|
|
@@ -8108,45 +8115,26 @@ async function callSkill(config, evalRunId2, scenario, skill, agent, workDir) {
|
|
|
8108
8115
|
};
|
|
8109
8116
|
}
|
|
8110
8117
|
|
|
8111
|
-
// src/run-scenario/callAgent.ts
|
|
8112
|
-
async function callAgent(config, scenario, agent, workDir) {
|
|
8113
|
-
throw new Error("Agent execution not yet implemented");
|
|
8114
|
-
}
|
|
8115
|
-
|
|
8116
8118
|
// src/run-scenario/index.ts
|
|
8117
|
-
function getTargetId(target) {
|
|
8118
|
-
|
|
8119
|
-
case "skill":
|
|
8120
|
-
return target.skill.id;
|
|
8121
|
-
case "agent":
|
|
8122
|
-
return target.agent.id;
|
|
8123
|
-
}
|
|
8124
|
-
}
|
|
8125
|
-
async function runScenario(config, evalRunId2, scenario, target, template, resolvedAssertions) {
|
|
8126
|
-
const targetId = getTargetId(target);
|
|
8119
|
+
async function runScenario(config, evalRunId2, scenario, evalData, template, resolvedAssertions) {
|
|
8120
|
+
const skillsGroupId = evalData.evalRun.skillsGroupId;
|
|
8127
8121
|
const workDir = await prepareWorkingDirectory(
|
|
8128
8122
|
config,
|
|
8129
8123
|
evalRunId2,
|
|
8130
|
-
|
|
8124
|
+
skillsGroupId,
|
|
8131
8125
|
scenario.id,
|
|
8132
8126
|
template
|
|
8133
8127
|
);
|
|
8134
|
-
|
|
8135
|
-
|
|
8136
|
-
|
|
8137
|
-
|
|
8138
|
-
|
|
8139
|
-
|
|
8140
|
-
|
|
8141
|
-
|
|
8142
|
-
|
|
8143
|
-
|
|
8144
|
-
);
|
|
8145
|
-
break;
|
|
8146
|
-
case "agent":
|
|
8147
|
-
partialResult = await callAgent(config, scenario, target.agent, workDir);
|
|
8148
|
-
break;
|
|
8149
|
-
}
|
|
8128
|
+
const partialResult = await runAgentWithContext(
|
|
8129
|
+
config,
|
|
8130
|
+
evalRunId2,
|
|
8131
|
+
scenario,
|
|
8132
|
+
evalData.skills,
|
|
8133
|
+
skillsGroupId,
|
|
8134
|
+
evalData.skillsGroupName,
|
|
8135
|
+
evalData.codeAgent ?? void 0,
|
|
8136
|
+
workDir
|
|
8137
|
+
);
|
|
8150
8138
|
const inlineAssertions = scenario.assertions ?? [];
|
|
8151
8139
|
const assertions = [
|
|
8152
8140
|
...inlineAssertions,
|
|
@@ -8377,60 +8365,60 @@ async function runEvaluation(projectId2, evalRunId2) {
|
|
|
8377
8365
|
);
|
|
8378
8366
|
}
|
|
8379
8367
|
let completedScenarios = 0;
|
|
8380
|
-
const totalScenarios = scenarioItems.length
|
|
8368
|
+
const totalScenarios = scenarioItems.length;
|
|
8381
8369
|
for (const { scenario, template, resolvedAssertions } of scenarioItems) {
|
|
8382
|
-
|
|
8383
|
-
|
|
8370
|
+
state.currentPhase = ExecutionPhase.EXECUTE_SKILL;
|
|
8371
|
+
state.currentContext = {
|
|
8372
|
+
projectId: projectId2,
|
|
8373
|
+
evalRunId: evalRunId2,
|
|
8374
|
+
scenarioId: scenario.id,
|
|
8375
|
+
scenarioName: scenario.name,
|
|
8376
|
+
skillsGroupId: evalData.evalRun.skillsGroupId,
|
|
8377
|
+
skillsGroupName: evalData.skillsGroupName,
|
|
8378
|
+
agentId: codeAgent?.id,
|
|
8379
|
+
agentName: codeAgent?.name,
|
|
8380
|
+
progress: `${completedScenarios + 1}/${totalScenarios}`
|
|
8381
|
+
};
|
|
8382
|
+
const skillNames = evalData.skills.map((s) => s.name).join(", ");
|
|
8383
|
+
console.log(
|
|
8384
|
+
"[Evaluator] Running scenario with skills group:",
|
|
8385
|
+
evalData.skillsGroupName,
|
|
8386
|
+
skillNames ? `(${skillNames})` : "",
|
|
8387
|
+
codeAgent ? `with agent: ${codeAgent.name}` : "",
|
|
8388
|
+
`(${completedScenarios + 1}/${totalScenarios})`
|
|
8389
|
+
);
|
|
8390
|
+
try {
|
|
8391
|
+
const result = await runScenario(
|
|
8392
|
+
config,
|
|
8393
|
+
evalRunId2,
|
|
8394
|
+
scenario,
|
|
8395
|
+
evalData,
|
|
8396
|
+
template,
|
|
8397
|
+
resolvedAssertions
|
|
8398
|
+
);
|
|
8399
|
+
console.log("[Evaluator] Scenario completed, adding result");
|
|
8400
|
+
state.currentPhase = ExecutionPhase.ADD_RESULT;
|
|
8384
8401
|
state.currentContext = {
|
|
8385
|
-
|
|
8386
|
-
|
|
8387
|
-
scenarioId: scenario.id,
|
|
8388
|
-
scenarioName: scenario.name,
|
|
8389
|
-
skillId: skill.id,
|
|
8390
|
-
skillName: skill.name,
|
|
8391
|
-
agentId: codeAgent?.id,
|
|
8392
|
-
agentName: codeAgent?.name,
|
|
8393
|
-
progress: `${completedScenarios + 1}/${totalScenarios}`
|
|
8402
|
+
...state.currentContext,
|
|
8403
|
+
resultId: result.id
|
|
8394
8404
|
};
|
|
8395
|
-
|
|
8396
|
-
|
|
8397
|
-
|
|
8398
|
-
|
|
8399
|
-
|
|
8405
|
+
await api.addResult(projectId2, evalRunId2, result);
|
|
8406
|
+
completedScenarios++;
|
|
8407
|
+
} catch (err) {
|
|
8408
|
+
const errorMsg = err instanceof Error ? err.message : String(err);
|
|
8409
|
+
const errorStack = err instanceof Error ? err.stack : void 0;
|
|
8410
|
+
console.error(
|
|
8411
|
+
"[Evaluator] Failed to run scenario with skills group:",
|
|
8412
|
+
evalData.skillsGroupName,
|
|
8413
|
+
"Error:",
|
|
8414
|
+
errorMsg
|
|
8400
8415
|
);
|
|
8401
|
-
|
|
8402
|
-
|
|
8403
|
-
config,
|
|
8404
|
-
evalRunId2,
|
|
8405
|
-
scenario,
|
|
8406
|
-
{ type: "skill", skill, agent: codeAgent ?? void 0 },
|
|
8407
|
-
template,
|
|
8408
|
-
resolvedAssertions
|
|
8409
|
-
);
|
|
8410
|
-
console.log("[Evaluator] Skill completed, adding result");
|
|
8411
|
-
state.currentPhase = ExecutionPhase.ADD_RESULT;
|
|
8412
|
-
state.currentContext = {
|
|
8413
|
-
...state.currentContext,
|
|
8414
|
-
resultId: result.id
|
|
8415
|
-
};
|
|
8416
|
-
await api.addResult(projectId2, evalRunId2, result);
|
|
8417
|
-
completedScenarios++;
|
|
8418
|
-
} catch (err) {
|
|
8419
|
-
const errorMsg = err instanceof Error ? err.message : String(err);
|
|
8420
|
-
const errorStack = err instanceof Error ? err.stack : void 0;
|
|
8421
|
-
console.error(
|
|
8422
|
-
"[Evaluator] Failed to run skill:",
|
|
8423
|
-
skill.name,
|
|
8424
|
-
"Error:",
|
|
8425
|
-
errorMsg
|
|
8426
|
-
);
|
|
8427
|
-
if (errorStack) {
|
|
8428
|
-
console.error("[Evaluator] Stack trace:", errorStack);
|
|
8429
|
-
}
|
|
8430
|
-
throw new Error(
|
|
8431
|
-
`[${state.currentPhase}] Failed to execute skill "${skill.name}" on scenario "${scenario.name}": ${errorMsg}`
|
|
8432
|
-
);
|
|
8416
|
+
if (errorStack) {
|
|
8417
|
+
console.error("[Evaluator] Stack trace:", errorStack);
|
|
8433
8418
|
}
|
|
8419
|
+
throw new Error(
|
|
8420
|
+
`[${state.currentPhase}] Failed to execute skills group "${evalData.skillsGroupName}" on scenario "${scenario.name}": ${errorMsg}`
|
|
8421
|
+
);
|
|
8434
8422
|
}
|
|
8435
8423
|
}
|
|
8436
8424
|
state.currentPhase = ExecutionPhase.UPDATE_STATUS;
|