@wix/evalforge-evaluator 0.85.0 → 0.86.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/index.js +35 -36
- package/build/index.js.map +4 -4
- package/build/index.mjs +33 -31
- package/build/index.mjs.map +4 -4
- package/package.json +5 -5
- package/build/types/run-scenario/llm-trace.d.ts +0 -6
package/build/index.js
CHANGED
|
@@ -24,7 +24,7 @@ var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__ge
|
|
|
24
24
|
));
|
|
25
25
|
|
|
26
26
|
// src/index.ts
|
|
27
|
-
var
|
|
27
|
+
var import_evalforge_types6 = require("@wix/evalforge-types");
|
|
28
28
|
|
|
29
29
|
// src/config.ts
|
|
30
30
|
function loadConfig() {
|
|
@@ -399,7 +399,7 @@ async function fetchEvaluationData(api, projectId2, evalRunId2) {
|
|
|
399
399
|
}
|
|
400
400
|
|
|
401
401
|
// src/run-scenario/index.ts
|
|
402
|
-
var
|
|
402
|
+
var import_evalforge_types4 = require("@wix/evalforge-types");
|
|
403
403
|
var import_eval_assertions = require("@wix/eval-assertions");
|
|
404
404
|
|
|
405
405
|
// src/run-scenario/environment.ts
|
|
@@ -636,9 +636,6 @@ function getAdapter(runCommand) {
|
|
|
636
636
|
return adapter;
|
|
637
637
|
}
|
|
638
638
|
|
|
639
|
-
// src/run-scenario/agents/claude-code/claude-code-adapter.ts
|
|
640
|
-
var import_evalforge_types4 = require("@wix/evalforge-types");
|
|
641
|
-
|
|
642
639
|
// src/run-scenario/agents/claude-code/execute.ts
|
|
643
640
|
var import_evalforge_types3 = require("@wix/evalforge-types");
|
|
644
641
|
var import_crypto = require("crypto");
|
|
@@ -695,21 +692,7 @@ async function writeSubAgentsToFilesystem(cwd, subAgents) {
|
|
|
695
692
|
}
|
|
696
693
|
|
|
697
694
|
// src/run-scenario/agents/claude-code/execute.ts
|
|
698
|
-
var DEFAULT_MODEL =
|
|
699
|
-
function calculateStepCost(inputTokens, outputTokens, modelName) {
|
|
700
|
-
const model = import_evalforge_types3.AVAILABLE_MODELS.find(
|
|
701
|
-
(m) => m.name === modelName || m.providerModelId === modelName || // Handle model aliases like "claude-3-5-sonnet-latest" -> "claude-3-5-sonnet-20241022"
|
|
702
|
-
modelName.includes("claude-3-5-sonnet") ? m.providerModelId.includes("claude-3-5-sonnet") : modelName.includes("claude-4-sonnet") ? m.providerModelId.includes("claude-4-sonnet") : modelName.includes("claude-4-opus") ? m.providerModelId.includes("claude-4-opus") : false
|
|
703
|
-
);
|
|
704
|
-
if (!model) {
|
|
705
|
-
const inputCost2 = inputTokens / 1e6 * 3;
|
|
706
|
-
const outputCost2 = outputTokens / 1e6 * 15;
|
|
707
|
-
return inputCost2 + outputCost2;
|
|
708
|
-
}
|
|
709
|
-
const inputCost = inputTokens / 1e6 * model.pricing.inputPer1M;
|
|
710
|
-
const outputCost = outputTokens / 1e6 * model.pricing.outputPer1M;
|
|
711
|
-
return inputCost + outputCost;
|
|
712
|
-
}
|
|
695
|
+
var DEFAULT_MODEL = import_evalforge_types3.ClaudeModel.CLAUDE_4_5_SONNET_1_0;
|
|
713
696
|
function emitTraceEvent(event, tracePushUrl, routeHeader, authToken) {
|
|
714
697
|
console.log(`${import_evalforge_types3.TRACE_EVENT_PREFIX}${JSON.stringify(event)}`);
|
|
715
698
|
if (tracePushUrl) {
|
|
@@ -922,6 +905,13 @@ async function executeWithClaudeCode(skills, scenario, options) {
|
|
|
922
905
|
}
|
|
923
906
|
const startTime = /* @__PURE__ */ new Date();
|
|
924
907
|
const allMessages = [];
|
|
908
|
+
const { mkdir: mkdirAsync, writeFile: writeFile4 } = await import("fs/promises");
|
|
909
|
+
const claudeDir = `${options.cwd}/.claude`;
|
|
910
|
+
await mkdirAsync(claudeDir, { recursive: true });
|
|
911
|
+
await writeFile4(`${claudeDir}/settings.json`, "{}", {
|
|
912
|
+
flag: "wx"
|
|
913
|
+
}).catch(() => {
|
|
914
|
+
});
|
|
925
915
|
if (options.mcps && options.mcps.length > 0) {
|
|
926
916
|
await writeMcpToFilesystem(options.cwd, options.mcps);
|
|
927
917
|
}
|
|
@@ -1448,7 +1438,14 @@ Stack: ${errorStack.split("\n").slice(0, 5).join("\n")}` : "")
|
|
|
1448
1438
|
startTime,
|
|
1449
1439
|
endTime
|
|
1450
1440
|
);
|
|
1451
|
-
const
|
|
1441
|
+
const rawOutput = extractFinalOutput(allMessages);
|
|
1442
|
+
const isError = sdkResult?.subtype !== "success";
|
|
1443
|
+
let outputText = rawOutput;
|
|
1444
|
+
if (!rawOutput && isError) {
|
|
1445
|
+
const hasErrors = sdkResult && "errors" in sdkResult && sdkResult.errors?.length;
|
|
1446
|
+
const errorDetails = hasErrors ? sdkResult.errors.join("; ") : sdkResult?.subtype ?? "unknown";
|
|
1447
|
+
outputText = `[ERROR] Agent execution failed: ${errorDetails}`;
|
|
1448
|
+
}
|
|
1452
1449
|
const usage = extractTotalUsage(sdkResult);
|
|
1453
1450
|
const llmTrace = buildLLMTraceFromSteps(
|
|
1454
1451
|
steps,
|
|
@@ -1590,12 +1587,13 @@ function extractTotalUsage(result) {
|
|
|
1590
1587
|
};
|
|
1591
1588
|
}
|
|
1592
1589
|
function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
|
|
1590
|
+
const totalCost = usage.costUsd ?? 0;
|
|
1591
|
+
const totalStepTokens = steps.reduce(
|
|
1592
|
+
(sum, s) => sum + s.usage.totalTokens,
|
|
1593
|
+
0
|
|
1594
|
+
);
|
|
1593
1595
|
const traceSteps = steps.map((step, index) => {
|
|
1594
|
-
const
|
|
1595
|
-
step.usage.inputTokens,
|
|
1596
|
-
step.usage.outputTokens,
|
|
1597
|
-
model
|
|
1598
|
-
);
|
|
1596
|
+
const proportion = totalStepTokens > 0 ? step.usage.totalTokens / totalStepTokens : 0;
|
|
1599
1597
|
return {
|
|
1600
1598
|
id: (0, import_crypto.randomUUID)(),
|
|
1601
1599
|
stepNumber: index + 1,
|
|
@@ -1609,7 +1607,7 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
|
|
|
1609
1607
|
completion: step.usage.outputTokens,
|
|
1610
1608
|
total: step.usage.totalTokens
|
|
1611
1609
|
},
|
|
1612
|
-
costUsd:
|
|
1610
|
+
costUsd: totalCost * proportion,
|
|
1613
1611
|
toolName: step.toolCalls?.[0]?.toolName,
|
|
1614
1612
|
toolArguments: step.toolCalls?.[0] ? JSON.stringify(step.toolCalls[0].args) : void 0,
|
|
1615
1613
|
outputPreview: step.text?.slice(0, 200),
|
|
@@ -1622,13 +1620,12 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
|
|
|
1622
1620
|
completion: traceSteps.reduce((sum, s) => sum + s.tokenUsage.completion, 0),
|
|
1623
1621
|
total: traceSteps.reduce((sum, s) => sum + s.tokenUsage.total, 0)
|
|
1624
1622
|
};
|
|
1625
|
-
const stepsTotalCost = traceSteps.reduce((sum, s) => sum + s.costUsd, 0);
|
|
1626
1623
|
const finalTokens = {
|
|
1627
1624
|
prompt: usage.inputTokens > 0 ? usage.inputTokens : stepsTokens.prompt,
|
|
1628
1625
|
completion: usage.outputTokens > 0 ? usage.outputTokens : stepsTokens.completion,
|
|
1629
1626
|
total: usage.totalTokens > 0 ? usage.totalTokens : stepsTokens.total
|
|
1630
1627
|
};
|
|
1631
|
-
const finalCost =
|
|
1628
|
+
const finalCost = totalCost;
|
|
1632
1629
|
const summary = {
|
|
1633
1630
|
totalSteps: traceSteps.length,
|
|
1634
1631
|
totalDurationMs,
|
|
@@ -1674,7 +1671,7 @@ var ClaudeCodeAdapter = class {
|
|
|
1674
1671
|
mcps,
|
|
1675
1672
|
subAgents
|
|
1676
1673
|
} = context;
|
|
1677
|
-
const modelForSdk = modelConfig?.model
|
|
1674
|
+
const modelForSdk = modelConfig?.model;
|
|
1678
1675
|
const options = {
|
|
1679
1676
|
cwd,
|
|
1680
1677
|
model: modelForSdk,
|
|
@@ -2523,8 +2520,10 @@ async function runScenario(config, evalRunId2, scenario, evalData, template, res
|
|
|
2523
2520
|
}))
|
|
2524
2521
|
};
|
|
2525
2522
|
const { "x-wix-ai-gateway-stream": _stream, ...judgeHeaders } = config.aiGatewayHeaders;
|
|
2523
|
+
const defaultJudgeModel = import_evalforge_types4.AVAILABLE_MODEL_IDS[0];
|
|
2526
2524
|
const assertionContext = {
|
|
2527
2525
|
workDir,
|
|
2526
|
+
defaultJudgeModel,
|
|
2528
2527
|
llmConfig: {
|
|
2529
2528
|
baseUrl: config.aiGatewayUrl,
|
|
2530
2529
|
headers: judgeHeaders
|
|
@@ -2536,10 +2535,10 @@ async function runScenario(config, evalRunId2, scenario, evalData, template, res
|
|
|
2536
2535
|
assertionContext
|
|
2537
2536
|
) : [];
|
|
2538
2537
|
const passed = assertionResults.filter(
|
|
2539
|
-
(r) => r.status ===
|
|
2538
|
+
(r) => r.status === import_evalforge_types4.AssertionResultStatus.PASSED
|
|
2540
2539
|
).length;
|
|
2541
2540
|
const failed = assertionResults.filter(
|
|
2542
|
-
(r) => r.status ===
|
|
2541
|
+
(r) => r.status === import_evalforge_types4.AssertionResultStatus.FAILED
|
|
2543
2542
|
).length;
|
|
2544
2543
|
const total = assertionResults.length;
|
|
2545
2544
|
const passRate = total > 0 ? Math.round(passed / total * 100) : 100;
|
|
@@ -2553,7 +2552,7 @@ async function runScenario(config, evalRunId2, scenario, evalData, template, res
|
|
|
2553
2552
|
}
|
|
2554
2553
|
|
|
2555
2554
|
// src/error-reporter.ts
|
|
2556
|
-
var
|
|
2555
|
+
var import_evalforge_types5 = require("@wix/evalforge-types");
|
|
2557
2556
|
function formatError(error, phase, context) {
|
|
2558
2557
|
const timestamp = (/* @__PURE__ */ new Date()).toISOString();
|
|
2559
2558
|
if (error instanceof Error) {
|
|
@@ -2802,7 +2801,7 @@ async function runEvaluation(projectId2, evalRunId2) {
|
|
|
2802
2801
|
};
|
|
2803
2802
|
try {
|
|
2804
2803
|
await api.updateEvalRun(projectId2, evalRunId2, {
|
|
2805
|
-
status:
|
|
2804
|
+
status: import_evalforge_types6.EvalStatus.COMPLETED,
|
|
2806
2805
|
completedAt: (/* @__PURE__ */ new Date()).toISOString()
|
|
2807
2806
|
});
|
|
2808
2807
|
} catch (updateErr) {
|
|
@@ -2843,7 +2842,7 @@ runEvaluation(projectId, evalRunId).then(() => {
|
|
|
2843
2842
|
authToken: config.authToken
|
|
2844
2843
|
});
|
|
2845
2844
|
await api.updateEvalRun(projectId, evalRunId, {
|
|
2846
|
-
status:
|
|
2845
|
+
status: import_evalforge_types6.EvalStatus.FAILED,
|
|
2847
2846
|
completedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
2848
2847
|
jobError,
|
|
2849
2848
|
jobStatus: "FAILED"
|
|
@@ -2866,7 +2865,7 @@ runEvaluation(projectId, evalRunId).then(() => {
|
|
|
2866
2865
|
authToken
|
|
2867
2866
|
});
|
|
2868
2867
|
await api.updateEvalRun(projectId, evalRunId, {
|
|
2869
|
-
status:
|
|
2868
|
+
status: import_evalforge_types6.EvalStatus.FAILED,
|
|
2870
2869
|
completedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
2871
2870
|
jobError: `Config load failed, then: ${jobError}`,
|
|
2872
2871
|
jobStatus: "FAILED"
|