@wix/evalforge-evaluator 0.54.0 → 0.56.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/index.js +106 -118
- package/build/index.js.map +3 -3
- package/build/index.mjs +106 -118
- package/build/index.mjs.map +3 -3
- package/build/types/fetch-evaluation-data.d.ts +5 -2
- package/build/types/run-scenario/agents/claude-code/execute.d.ts +7 -3
- package/build/types/run-scenario/agents/claude-code/types.d.ts +0 -2
- package/build/types/run-scenario/index.d.ts +5 -5
- package/build/types/run-scenario/run-agent-with-context.d.ts +21 -0
- package/build/types/run-scenario/types.d.ts +1 -13
- package/package.json +3 -3
- package/build/types/run-scenario/callAgent.d.ts +0 -13
- package/build/types/run-scenario/callSkill.d.ts +0 -18
package/build/index.js
CHANGED
|
@@ -287,11 +287,9 @@ async function fetchEvaluationData(api, projectId2, evalRunId2) {
|
|
|
287
287
|
codeAgent = await api.getAgent(projectId2, evalRun.agentId);
|
|
288
288
|
}
|
|
289
289
|
let skills = [];
|
|
290
|
+
let skillsGroup = null;
|
|
290
291
|
if (evalRun.skillsGroupId) {
|
|
291
|
-
|
|
292
|
-
projectId2,
|
|
293
|
-
evalRun.skillsGroupId
|
|
294
|
-
);
|
|
292
|
+
skillsGroup = await api.getSkillsGroup(projectId2, evalRun.skillsGroupId);
|
|
295
293
|
if (skillsGroup.skillIds.length > 0) {
|
|
296
294
|
skills = await Promise.all(
|
|
297
295
|
skillsGroup.skillIds.map((id) => api.getSkill(projectId2, id))
|
|
@@ -340,10 +338,13 @@ async function fetchEvaluationData(api, projectId2, evalRunId2) {
|
|
|
340
338
|
resolvedAssertions: resolvedAssertions.length > 0 ? resolvedAssertions : void 0
|
|
341
339
|
};
|
|
342
340
|
});
|
|
341
|
+
const skillsGroupName = skillsGroup?.name ?? "";
|
|
343
342
|
return {
|
|
344
343
|
evalRun,
|
|
345
344
|
codeAgent,
|
|
346
345
|
skills,
|
|
346
|
+
skillsGroup,
|
|
347
|
+
skillsGroupName,
|
|
347
348
|
scenarioItems
|
|
348
349
|
};
|
|
349
350
|
}
|
|
@@ -6224,7 +6225,7 @@ async function prepareWorkingDirectory(config, evalRunId2, targetId, scenarioId,
|
|
|
6224
6225
|
return workDir;
|
|
6225
6226
|
}
|
|
6226
6227
|
|
|
6227
|
-
// src/run-scenario/
|
|
6228
|
+
// src/run-scenario/run-agent-with-context.ts
|
|
6228
6229
|
var import_crypto2 = require("crypto");
|
|
6229
6230
|
|
|
6230
6231
|
// src/run-scenario/agents/registry.ts
|
|
@@ -6533,10 +6534,11 @@ function createTraceEventFromAnyMessage(message, context, stepNumber, isComplete
|
|
|
6533
6534
|
outputPreview: `Message type: ${message.type}`
|
|
6534
6535
|
};
|
|
6535
6536
|
}
|
|
6536
|
-
async function executeWithClaudeCode(
|
|
6537
|
+
async function executeWithClaudeCode(skills, scenario, options) {
|
|
6538
|
+
const skillNames = skills.map((s) => s.name).join(", ");
|
|
6537
6539
|
console.log("[executeWithClaudeCode] Starting execution", {
|
|
6538
|
-
|
|
6539
|
-
|
|
6540
|
+
skillCount: skills.length,
|
|
6541
|
+
skillNames,
|
|
6540
6542
|
scenarioId: scenario.id,
|
|
6541
6543
|
scenarioName: scenario.name,
|
|
6542
6544
|
cwd: options.cwd,
|
|
@@ -6572,22 +6574,23 @@ async function executeWithClaudeCode(skill, scenario, options) {
|
|
|
6572
6574
|
const startTime = /* @__PURE__ */ new Date();
|
|
6573
6575
|
const allMessages = [];
|
|
6574
6576
|
console.error(
|
|
6575
|
-
"[DEBUG-H4]
|
|
6577
|
+
"[DEBUG-H4] writeSkillsToFilesystem START",
|
|
6576
6578
|
JSON.stringify({
|
|
6577
6579
|
cwd: options.cwd,
|
|
6578
|
-
|
|
6580
|
+
skillCount: skills.length,
|
|
6581
|
+
skillNames: skills.map((s) => s.name),
|
|
6579
6582
|
timestamp: Date.now()
|
|
6580
6583
|
})
|
|
6581
6584
|
);
|
|
6582
6585
|
try {
|
|
6583
|
-
await
|
|
6586
|
+
await writeSkillsToFilesystem(options.cwd, skills);
|
|
6584
6587
|
console.error(
|
|
6585
|
-
"[DEBUG-H4]
|
|
6588
|
+
"[DEBUG-H4] writeSkillsToFilesystem SUCCESS",
|
|
6586
6589
|
JSON.stringify({ timestamp: Date.now() })
|
|
6587
6590
|
);
|
|
6588
6591
|
} catch (writeError) {
|
|
6589
6592
|
console.error(
|
|
6590
|
-
"[DEBUG-H4]
|
|
6593
|
+
"[DEBUG-H4] writeSkillsToFilesystem FAILED",
|
|
6591
6594
|
JSON.stringify({
|
|
6592
6595
|
error: writeError instanceof Error ? writeError.message : String(writeError),
|
|
6593
6596
|
stack: writeError instanceof Error ? writeError.stack : void 0,
|
|
@@ -6595,7 +6598,7 @@ async function executeWithClaudeCode(skill, scenario, options) {
|
|
|
6595
6598
|
})
|
|
6596
6599
|
);
|
|
6597
6600
|
throw new Error(
|
|
6598
|
-
`Failed to write
|
|
6601
|
+
`Failed to write skills to filesystem: ${writeError instanceof Error ? writeError.message : String(writeError)}`
|
|
6599
6602
|
);
|
|
6600
6603
|
}
|
|
6601
6604
|
const sdkEnv = buildSdkEnvironment(options);
|
|
@@ -6631,7 +6634,10 @@ async function executeWithClaudeCode(skill, scenario, options) {
|
|
|
6631
6634
|
}
|
|
6632
6635
|
console.log("[SDK-DEBUG] PATH available:", !!sdkEnv.PATH);
|
|
6633
6636
|
console.log("[SDK-DEBUG] HOME:", sdkEnv.HOME || "NOT SET");
|
|
6634
|
-
console.log(
|
|
6637
|
+
console.log(
|
|
6638
|
+
"[SDK-DEBUG] Skills:",
|
|
6639
|
+
skills.map((s) => `${s.id} - ${s.name}`).join(", ")
|
|
6640
|
+
);
|
|
6635
6641
|
console.log("[SDK-DEBUG] Scenario:", scenario.id, "-", scenario.name);
|
|
6636
6642
|
console.log(
|
|
6637
6643
|
"[SDK-DEBUG] Prompt preview:",
|
|
@@ -6741,7 +6747,7 @@ async function executeWithClaudeCode(skill, scenario, options) {
|
|
|
6741
6747
|
timedOut = true;
|
|
6742
6748
|
reject(
|
|
6743
6749
|
new Error(
|
|
6744
|
-
`SDK execution timed out after ${SDK_TIMEOUT_MS}ms.
|
|
6750
|
+
`SDK execution timed out after ${SDK_TIMEOUT_MS}ms. Skills: ${skillNames}, Scenario: ${scenario.name}, Messages received: ${messageCount}, MaxTurns: ${maxTurns}`
|
|
6745
6751
|
)
|
|
6746
6752
|
);
|
|
6747
6753
|
}, SDK_TIMEOUT_MS);
|
|
@@ -6949,8 +6955,8 @@ IMPORTANT: This is an automated evaluation run. Follow these guidelines:
|
|
|
6949
6955
|
}
|
|
6950
6956
|
}
|
|
6951
6957
|
console.error("[SDK-ERROR] Execution context:");
|
|
6952
|
-
console.error("[SDK-ERROR]
|
|
6953
|
-
console.error("[SDK-ERROR]
|
|
6958
|
+
console.error("[SDK-ERROR] skillCount:", skills.length);
|
|
6959
|
+
console.error("[SDK-ERROR] skillNames:", skillNames);
|
|
6954
6960
|
console.error("[SDK-ERROR] scenarioId:", scenario.id);
|
|
6955
6961
|
console.error("[SDK-ERROR] scenarioName:", scenario.name);
|
|
6956
6962
|
console.error("[SDK-ERROR] cwd:", options.cwd);
|
|
@@ -7010,7 +7016,8 @@ IMPORTANT: This is an automated evaluation run. Follow these guidelines:
|
|
|
7010
7016
|
messageCount,
|
|
7011
7017
|
errorName,
|
|
7012
7018
|
errorMessage,
|
|
7013
|
-
|
|
7019
|
+
skillCount: skills.length,
|
|
7020
|
+
skillNames,
|
|
7014
7021
|
scenarioId: scenario.id,
|
|
7015
7022
|
model: options.model || DEFAULT_MODEL,
|
|
7016
7023
|
sdkEnv: sdkEnvDebug,
|
|
@@ -7103,13 +7110,15 @@ Stack: ${errorStack.split("\n").slice(0, 5).join("\n")}` : "")
|
|
|
7103
7110
|
llmTrace
|
|
7104
7111
|
};
|
|
7105
7112
|
}
|
|
7106
|
-
async function
|
|
7107
|
-
const
|
|
7108
|
-
|
|
7109
|
-
|
|
7110
|
-
|
|
7111
|
-
|
|
7112
|
-
|
|
7113
|
+
async function writeSkillsToFilesystem(cwd, skills) {
|
|
7114
|
+
for (const skill of skills) {
|
|
7115
|
+
const skillName = skill.name;
|
|
7116
|
+
const skillDir = (0, import_path5.join)(cwd, ".claude", "skills", skillName);
|
|
7117
|
+
await (0, import_promises3.mkdir)(skillDir, { recursive: true });
|
|
7118
|
+
const skillPath = (0, import_path5.join)(skillDir, "SKILL.md");
|
|
7119
|
+
await (0, import_promises3.writeFile)(skillPath, skill.skillMd, "utf-8");
|
|
7120
|
+
console.log(`[Skill] Written to ${skillPath}`);
|
|
7121
|
+
}
|
|
7113
7122
|
}
|
|
7114
7123
|
function buildSdkEnvironment(options) {
|
|
7115
7124
|
const env = { ...process.env };
|
|
@@ -7305,7 +7314,7 @@ var ClaudeCodeAdapter = class {
|
|
|
7305
7314
|
*/
|
|
7306
7315
|
async execute(context) {
|
|
7307
7316
|
const {
|
|
7308
|
-
|
|
7317
|
+
skills,
|
|
7309
7318
|
scenario,
|
|
7310
7319
|
cwd,
|
|
7311
7320
|
modelConfig,
|
|
@@ -7316,7 +7325,6 @@ var ClaudeCodeAdapter = class {
|
|
|
7316
7325
|
const modelForSdk = modelConfig?.model ? import_evalforge_types3.AVAILABLE_MODELS_MAP[modelConfig.model]?.providerModelId ?? modelConfig.model : void 0;
|
|
7317
7326
|
const options = {
|
|
7318
7327
|
cwd,
|
|
7319
|
-
systemPrompt: skill.skillMd,
|
|
7320
7328
|
model: modelForSdk,
|
|
7321
7329
|
temperature: modelConfig?.temperature,
|
|
7322
7330
|
maxTokens: modelConfig?.maxTokens,
|
|
@@ -7325,7 +7333,7 @@ var ClaudeCodeAdapter = class {
|
|
|
7325
7333
|
traceContext
|
|
7326
7334
|
};
|
|
7327
7335
|
const { result, llmTrace } = await executeWithClaudeCode(
|
|
7328
|
-
|
|
7336
|
+
skills,
|
|
7329
7337
|
scenario,
|
|
7330
7338
|
options
|
|
7331
7339
|
);
|
|
@@ -7908,7 +7916,6 @@ arrayDiff.join = arrayDiff.removeEmpty = function(value) {
|
|
|
7908
7916
|
var IGNORED_PATTERNS = [
|
|
7909
7917
|
"node_modules",
|
|
7910
7918
|
".git",
|
|
7911
|
-
".claude",
|
|
7912
7919
|
".cursor",
|
|
7913
7920
|
"dist",
|
|
7914
7921
|
"build",
|
|
@@ -8074,15 +8081,15 @@ function extractTemplateFiles(before, after) {
|
|
|
8074
8081
|
return files;
|
|
8075
8082
|
}
|
|
8076
8083
|
|
|
8077
|
-
// src/run-scenario/
|
|
8084
|
+
// src/run-scenario/run-agent-with-context.ts
|
|
8078
8085
|
var DEFAULT_AGENT_COMMAND = "claude";
|
|
8079
|
-
async function
|
|
8086
|
+
async function runAgentWithContext(config, evalRunId2, scenario, skills, skillsGroupId, skillsGroupName, agent, workDir) {
|
|
8080
8087
|
const runCommand = agent?.runCommand ?? DEFAULT_AGENT_COMMAND;
|
|
8081
8088
|
const adapter = getAdapter(runCommand);
|
|
8082
8089
|
const startedAt = (/* @__PURE__ */ new Date()).toISOString();
|
|
8083
8090
|
const beforeSnapshot = workDir ? snapshotDirectory(workDir) : {};
|
|
8084
8091
|
const executionContext = {
|
|
8085
|
-
|
|
8092
|
+
skills,
|
|
8086
8093
|
scenario,
|
|
8087
8094
|
cwd: workDir || process.cwd(),
|
|
8088
8095
|
modelConfig: agent?.modelConfig,
|
|
@@ -8092,8 +8099,8 @@ async function callSkill(config, evalRunId2, scenario, skill, agent, workDir) {
|
|
|
8092
8099
|
evalRunId: evalRunId2,
|
|
8093
8100
|
scenarioId: scenario.id,
|
|
8094
8101
|
scenarioName: scenario.name,
|
|
8095
|
-
targetId:
|
|
8096
|
-
targetName:
|
|
8102
|
+
targetId: skillsGroupId,
|
|
8103
|
+
targetName: skillsGroupName,
|
|
8097
8104
|
tracePushUrl: config.tracePushUrl,
|
|
8098
8105
|
routeHeader: config.routeHeader,
|
|
8099
8106
|
authToken: config.authToken
|
|
@@ -8106,8 +8113,8 @@ async function callSkill(config, evalRunId2, scenario, skill, agent, workDir) {
|
|
|
8106
8113
|
const templateFiles = workDir ? extractTemplateFiles(beforeSnapshot, afterSnapshot) : void 0;
|
|
8107
8114
|
return {
|
|
8108
8115
|
id: (0, import_crypto2.randomUUID)(),
|
|
8109
|
-
targetId:
|
|
8110
|
-
targetName:
|
|
8116
|
+
targetId: skillsGroupId,
|
|
8117
|
+
targetName: skillsGroupName,
|
|
8111
8118
|
scenarioId: scenario.id,
|
|
8112
8119
|
scenarioName: scenario.name,
|
|
8113
8120
|
modelConfig: agent?.modelConfig,
|
|
@@ -8121,45 +8128,26 @@ async function callSkill(config, evalRunId2, scenario, skill, agent, workDir) {
|
|
|
8121
8128
|
};
|
|
8122
8129
|
}
|
|
8123
8130
|
|
|
8124
|
-
// src/run-scenario/callAgent.ts
|
|
8125
|
-
async function callAgent(config, scenario, agent, workDir) {
|
|
8126
|
-
throw new Error("Agent execution not yet implemented");
|
|
8127
|
-
}
|
|
8128
|
-
|
|
8129
8131
|
// src/run-scenario/index.ts
|
|
8130
|
-
function
|
|
8131
|
-
|
|
8132
|
-
case "skill":
|
|
8133
|
-
return target.skill.id;
|
|
8134
|
-
case "agent":
|
|
8135
|
-
return target.agent.id;
|
|
8136
|
-
}
|
|
8137
|
-
}
|
|
8138
|
-
async function runScenario(config, evalRunId2, scenario, target, template, resolvedAssertions) {
|
|
8139
|
-
const targetId = getTargetId(target);
|
|
8132
|
+
async function runScenario(config, evalRunId2, scenario, evalData, template, resolvedAssertions) {
|
|
8133
|
+
const skillsGroupId = evalData.evalRun.skillsGroupId;
|
|
8140
8134
|
const workDir = await prepareWorkingDirectory(
|
|
8141
8135
|
config,
|
|
8142
8136
|
evalRunId2,
|
|
8143
|
-
|
|
8137
|
+
skillsGroupId,
|
|
8144
8138
|
scenario.id,
|
|
8145
8139
|
template
|
|
8146
8140
|
);
|
|
8147
|
-
|
|
8148
|
-
|
|
8149
|
-
|
|
8150
|
-
|
|
8151
|
-
|
|
8152
|
-
|
|
8153
|
-
|
|
8154
|
-
|
|
8155
|
-
|
|
8156
|
-
|
|
8157
|
-
);
|
|
8158
|
-
break;
|
|
8159
|
-
case "agent":
|
|
8160
|
-
partialResult = await callAgent(config, scenario, target.agent, workDir);
|
|
8161
|
-
break;
|
|
8162
|
-
}
|
|
8141
|
+
const partialResult = await runAgentWithContext(
|
|
8142
|
+
config,
|
|
8143
|
+
evalRunId2,
|
|
8144
|
+
scenario,
|
|
8145
|
+
evalData.skills,
|
|
8146
|
+
skillsGroupId,
|
|
8147
|
+
evalData.skillsGroupName,
|
|
8148
|
+
evalData.codeAgent ?? void 0,
|
|
8149
|
+
workDir
|
|
8150
|
+
);
|
|
8163
8151
|
const inlineAssertions = scenario.assertions ?? [];
|
|
8164
8152
|
const assertions = [
|
|
8165
8153
|
...inlineAssertions,
|
|
@@ -8390,60 +8378,60 @@ async function runEvaluation(projectId2, evalRunId2) {
|
|
|
8390
8378
|
);
|
|
8391
8379
|
}
|
|
8392
8380
|
let completedScenarios = 0;
|
|
8393
|
-
const totalScenarios = scenarioItems.length
|
|
8381
|
+
const totalScenarios = scenarioItems.length;
|
|
8394
8382
|
for (const { scenario, template, resolvedAssertions } of scenarioItems) {
|
|
8395
|
-
|
|
8396
|
-
|
|
8383
|
+
state.currentPhase = ExecutionPhase.EXECUTE_SKILL;
|
|
8384
|
+
state.currentContext = {
|
|
8385
|
+
projectId: projectId2,
|
|
8386
|
+
evalRunId: evalRunId2,
|
|
8387
|
+
scenarioId: scenario.id,
|
|
8388
|
+
scenarioName: scenario.name,
|
|
8389
|
+
skillsGroupId: evalData.evalRun.skillsGroupId,
|
|
8390
|
+
skillsGroupName: evalData.skillsGroupName,
|
|
8391
|
+
agentId: codeAgent?.id,
|
|
8392
|
+
agentName: codeAgent?.name,
|
|
8393
|
+
progress: `${completedScenarios + 1}/${totalScenarios}`
|
|
8394
|
+
};
|
|
8395
|
+
const skillNames = evalData.skills.map((s) => s.name).join(", ");
|
|
8396
|
+
console.log(
|
|
8397
|
+
"[Evaluator] Running scenario with skills group:",
|
|
8398
|
+
evalData.skillsGroupName,
|
|
8399
|
+
skillNames ? `(${skillNames})` : "",
|
|
8400
|
+
codeAgent ? `with agent: ${codeAgent.name}` : "",
|
|
8401
|
+
`(${completedScenarios + 1}/${totalScenarios})`
|
|
8402
|
+
);
|
|
8403
|
+
try {
|
|
8404
|
+
const result = await runScenario(
|
|
8405
|
+
config,
|
|
8406
|
+
evalRunId2,
|
|
8407
|
+
scenario,
|
|
8408
|
+
evalData,
|
|
8409
|
+
template,
|
|
8410
|
+
resolvedAssertions
|
|
8411
|
+
);
|
|
8412
|
+
console.log("[Evaluator] Scenario completed, adding result");
|
|
8413
|
+
state.currentPhase = ExecutionPhase.ADD_RESULT;
|
|
8397
8414
|
state.currentContext = {
|
|
8398
|
-
|
|
8399
|
-
|
|
8400
|
-
scenarioId: scenario.id,
|
|
8401
|
-
scenarioName: scenario.name,
|
|
8402
|
-
skillId: skill.id,
|
|
8403
|
-
skillName: skill.name,
|
|
8404
|
-
agentId: codeAgent?.id,
|
|
8405
|
-
agentName: codeAgent?.name,
|
|
8406
|
-
progress: `${completedScenarios + 1}/${totalScenarios}`
|
|
8415
|
+
...state.currentContext,
|
|
8416
|
+
resultId: result.id
|
|
8407
8417
|
};
|
|
8408
|
-
|
|
8409
|
-
|
|
8410
|
-
|
|
8411
|
-
|
|
8412
|
-
|
|
8418
|
+
await api.addResult(projectId2, evalRunId2, result);
|
|
8419
|
+
completedScenarios++;
|
|
8420
|
+
} catch (err) {
|
|
8421
|
+
const errorMsg = err instanceof Error ? err.message : String(err);
|
|
8422
|
+
const errorStack = err instanceof Error ? err.stack : void 0;
|
|
8423
|
+
console.error(
|
|
8424
|
+
"[Evaluator] Failed to run scenario with skills group:",
|
|
8425
|
+
evalData.skillsGroupName,
|
|
8426
|
+
"Error:",
|
|
8427
|
+
errorMsg
|
|
8413
8428
|
);
|
|
8414
|
-
|
|
8415
|
-
|
|
8416
|
-
config,
|
|
8417
|
-
evalRunId2,
|
|
8418
|
-
scenario,
|
|
8419
|
-
{ type: "skill", skill, agent: codeAgent ?? void 0 },
|
|
8420
|
-
template,
|
|
8421
|
-
resolvedAssertions
|
|
8422
|
-
);
|
|
8423
|
-
console.log("[Evaluator] Skill completed, adding result");
|
|
8424
|
-
state.currentPhase = ExecutionPhase.ADD_RESULT;
|
|
8425
|
-
state.currentContext = {
|
|
8426
|
-
...state.currentContext,
|
|
8427
|
-
resultId: result.id
|
|
8428
|
-
};
|
|
8429
|
-
await api.addResult(projectId2, evalRunId2, result);
|
|
8430
|
-
completedScenarios++;
|
|
8431
|
-
} catch (err) {
|
|
8432
|
-
const errorMsg = err instanceof Error ? err.message : String(err);
|
|
8433
|
-
const errorStack = err instanceof Error ? err.stack : void 0;
|
|
8434
|
-
console.error(
|
|
8435
|
-
"[Evaluator] Failed to run skill:",
|
|
8436
|
-
skill.name,
|
|
8437
|
-
"Error:",
|
|
8438
|
-
errorMsg
|
|
8439
|
-
);
|
|
8440
|
-
if (errorStack) {
|
|
8441
|
-
console.error("[Evaluator] Stack trace:", errorStack);
|
|
8442
|
-
}
|
|
8443
|
-
throw new Error(
|
|
8444
|
-
`[${state.currentPhase}] Failed to execute skill "${skill.name}" on scenario "${scenario.name}": ${errorMsg}`
|
|
8445
|
-
);
|
|
8429
|
+
if (errorStack) {
|
|
8430
|
+
console.error("[Evaluator] Stack trace:", errorStack);
|
|
8446
8431
|
}
|
|
8432
|
+
throw new Error(
|
|
8433
|
+
`[${state.currentPhase}] Failed to execute skills group "${evalData.skillsGroupName}" on scenario "${scenario.name}": ${errorMsg}`
|
|
8434
|
+
);
|
|
8447
8435
|
}
|
|
8448
8436
|
}
|
|
8449
8437
|
state.currentPhase = ExecutionPhase.UPDATE_STATUS;
|