@wix/evalforge-evaluator 0.85.0 → 0.86.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/index.js +35 -36
- package/build/index.js.map +4 -4
- package/build/index.mjs +33 -31
- package/build/index.mjs.map +4 -4
- package/package.json +5 -5
- package/build/types/run-scenario/llm-trace.d.ts +0 -6
package/build/index.mjs
CHANGED
|
@@ -379,7 +379,10 @@ async function fetchEvaluationData(api, projectId2, evalRunId2) {
|
|
|
379
379
|
}
|
|
380
380
|
|
|
381
381
|
// src/run-scenario/index.ts
|
|
382
|
-
import {
|
|
382
|
+
import {
|
|
383
|
+
AssertionResultStatus,
|
|
384
|
+
AVAILABLE_MODEL_IDS
|
|
385
|
+
} from "@wix/evalforge-types";
|
|
383
386
|
import {
|
|
384
387
|
evaluateAssertions as evaluateAssertionsBase
|
|
385
388
|
} from "@wix/eval-assertions";
|
|
@@ -618,15 +621,12 @@ function getAdapter(runCommand) {
|
|
|
618
621
|
return adapter;
|
|
619
622
|
}
|
|
620
623
|
|
|
621
|
-
// src/run-scenario/agents/claude-code/claude-code-adapter.ts
|
|
622
|
-
import { AVAILABLE_MODELS_MAP } from "@wix/evalforge-types";
|
|
623
|
-
|
|
624
624
|
// src/run-scenario/agents/claude-code/execute.ts
|
|
625
625
|
import {
|
|
626
|
+
ClaudeModel,
|
|
626
627
|
LLMStepType,
|
|
627
628
|
LiveTraceEventType,
|
|
628
|
-
TRACE_EVENT_PREFIX
|
|
629
|
-
AVAILABLE_MODELS
|
|
629
|
+
TRACE_EVENT_PREFIX
|
|
630
630
|
} from "@wix/evalforge-types";
|
|
631
631
|
import { randomUUID } from "crypto";
|
|
632
632
|
|
|
@@ -682,21 +682,7 @@ async function writeSubAgentsToFilesystem(cwd, subAgents) {
|
|
|
682
682
|
}
|
|
683
683
|
|
|
684
684
|
// src/run-scenario/agents/claude-code/execute.ts
|
|
685
|
-
var DEFAULT_MODEL =
|
|
686
|
-
function calculateStepCost(inputTokens, outputTokens, modelName) {
|
|
687
|
-
const model = AVAILABLE_MODELS.find(
|
|
688
|
-
(m) => m.name === modelName || m.providerModelId === modelName || // Handle model aliases like "claude-3-5-sonnet-latest" -> "claude-3-5-sonnet-20241022"
|
|
689
|
-
modelName.includes("claude-3-5-sonnet") ? m.providerModelId.includes("claude-3-5-sonnet") : modelName.includes("claude-4-sonnet") ? m.providerModelId.includes("claude-4-sonnet") : modelName.includes("claude-4-opus") ? m.providerModelId.includes("claude-4-opus") : false
|
|
690
|
-
);
|
|
691
|
-
if (!model) {
|
|
692
|
-
const inputCost2 = inputTokens / 1e6 * 3;
|
|
693
|
-
const outputCost2 = outputTokens / 1e6 * 15;
|
|
694
|
-
return inputCost2 + outputCost2;
|
|
695
|
-
}
|
|
696
|
-
const inputCost = inputTokens / 1e6 * model.pricing.inputPer1M;
|
|
697
|
-
const outputCost = outputTokens / 1e6 * model.pricing.outputPer1M;
|
|
698
|
-
return inputCost + outputCost;
|
|
699
|
-
}
|
|
685
|
+
var DEFAULT_MODEL = ClaudeModel.CLAUDE_4_5_SONNET_1_0;
|
|
700
686
|
function emitTraceEvent(event, tracePushUrl, routeHeader, authToken) {
|
|
701
687
|
console.log(`${TRACE_EVENT_PREFIX}${JSON.stringify(event)}`);
|
|
702
688
|
if (tracePushUrl) {
|
|
@@ -909,6 +895,13 @@ async function executeWithClaudeCode(skills, scenario, options) {
|
|
|
909
895
|
}
|
|
910
896
|
const startTime = /* @__PURE__ */ new Date();
|
|
911
897
|
const allMessages = [];
|
|
898
|
+
const { mkdir: mkdirAsync, writeFile: writeFile4 } = await import("fs/promises");
|
|
899
|
+
const claudeDir = `${options.cwd}/.claude`;
|
|
900
|
+
await mkdirAsync(claudeDir, { recursive: true });
|
|
901
|
+
await writeFile4(`${claudeDir}/settings.json`, "{}", {
|
|
902
|
+
flag: "wx"
|
|
903
|
+
}).catch(() => {
|
|
904
|
+
});
|
|
912
905
|
if (options.mcps && options.mcps.length > 0) {
|
|
913
906
|
await writeMcpToFilesystem(options.cwd, options.mcps);
|
|
914
907
|
}
|
|
@@ -1435,7 +1428,14 @@ Stack: ${errorStack.split("\n").slice(0, 5).join("\n")}` : "")
|
|
|
1435
1428
|
startTime,
|
|
1436
1429
|
endTime
|
|
1437
1430
|
);
|
|
1438
|
-
const
|
|
1431
|
+
const rawOutput = extractFinalOutput(allMessages);
|
|
1432
|
+
const isError = sdkResult?.subtype !== "success";
|
|
1433
|
+
let outputText = rawOutput;
|
|
1434
|
+
if (!rawOutput && isError) {
|
|
1435
|
+
const hasErrors = sdkResult && "errors" in sdkResult && sdkResult.errors?.length;
|
|
1436
|
+
const errorDetails = hasErrors ? sdkResult.errors.join("; ") : sdkResult?.subtype ?? "unknown";
|
|
1437
|
+
outputText = `[ERROR] Agent execution failed: ${errorDetails}`;
|
|
1438
|
+
}
|
|
1439
1439
|
const usage = extractTotalUsage(sdkResult);
|
|
1440
1440
|
const llmTrace = buildLLMTraceFromSteps(
|
|
1441
1441
|
steps,
|
|
@@ -1577,12 +1577,13 @@ function extractTotalUsage(result) {
|
|
|
1577
1577
|
};
|
|
1578
1578
|
}
|
|
1579
1579
|
function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
|
|
1580
|
+
const totalCost = usage.costUsd ?? 0;
|
|
1581
|
+
const totalStepTokens = steps.reduce(
|
|
1582
|
+
(sum, s) => sum + s.usage.totalTokens,
|
|
1583
|
+
0
|
|
1584
|
+
);
|
|
1580
1585
|
const traceSteps = steps.map((step, index) => {
|
|
1581
|
-
const
|
|
1582
|
-
step.usage.inputTokens,
|
|
1583
|
-
step.usage.outputTokens,
|
|
1584
|
-
model
|
|
1585
|
-
);
|
|
1586
|
+
const proportion = totalStepTokens > 0 ? step.usage.totalTokens / totalStepTokens : 0;
|
|
1586
1587
|
return {
|
|
1587
1588
|
id: randomUUID(),
|
|
1588
1589
|
stepNumber: index + 1,
|
|
@@ -1596,7 +1597,7 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
|
|
|
1596
1597
|
completion: step.usage.outputTokens,
|
|
1597
1598
|
total: step.usage.totalTokens
|
|
1598
1599
|
},
|
|
1599
|
-
costUsd:
|
|
1600
|
+
costUsd: totalCost * proportion,
|
|
1600
1601
|
toolName: step.toolCalls?.[0]?.toolName,
|
|
1601
1602
|
toolArguments: step.toolCalls?.[0] ? JSON.stringify(step.toolCalls[0].args) : void 0,
|
|
1602
1603
|
outputPreview: step.text?.slice(0, 200),
|
|
@@ -1609,13 +1610,12 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
|
|
|
1609
1610
|
completion: traceSteps.reduce((sum, s) => sum + s.tokenUsage.completion, 0),
|
|
1610
1611
|
total: traceSteps.reduce((sum, s) => sum + s.tokenUsage.total, 0)
|
|
1611
1612
|
};
|
|
1612
|
-
const stepsTotalCost = traceSteps.reduce((sum, s) => sum + s.costUsd, 0);
|
|
1613
1613
|
const finalTokens = {
|
|
1614
1614
|
prompt: usage.inputTokens > 0 ? usage.inputTokens : stepsTokens.prompt,
|
|
1615
1615
|
completion: usage.outputTokens > 0 ? usage.outputTokens : stepsTokens.completion,
|
|
1616
1616
|
total: usage.totalTokens > 0 ? usage.totalTokens : stepsTokens.total
|
|
1617
1617
|
};
|
|
1618
|
-
const finalCost =
|
|
1618
|
+
const finalCost = totalCost;
|
|
1619
1619
|
const summary = {
|
|
1620
1620
|
totalSteps: traceSteps.length,
|
|
1621
1621
|
totalDurationMs,
|
|
@@ -1661,7 +1661,7 @@ var ClaudeCodeAdapter = class {
|
|
|
1661
1661
|
mcps,
|
|
1662
1662
|
subAgents
|
|
1663
1663
|
} = context;
|
|
1664
|
-
const modelForSdk = modelConfig?.model
|
|
1664
|
+
const modelForSdk = modelConfig?.model;
|
|
1665
1665
|
const options = {
|
|
1666
1666
|
cwd,
|
|
1667
1667
|
model: modelForSdk,
|
|
@@ -2510,8 +2510,10 @@ async function runScenario(config, evalRunId2, scenario, evalData, template, res
|
|
|
2510
2510
|
}))
|
|
2511
2511
|
};
|
|
2512
2512
|
const { "x-wix-ai-gateway-stream": _stream, ...judgeHeaders } = config.aiGatewayHeaders;
|
|
2513
|
+
const defaultJudgeModel = AVAILABLE_MODEL_IDS[0];
|
|
2513
2514
|
const assertionContext = {
|
|
2514
2515
|
workDir,
|
|
2516
|
+
defaultJudgeModel,
|
|
2515
2517
|
llmConfig: {
|
|
2516
2518
|
baseUrl: config.aiGatewayUrl,
|
|
2517
2519
|
headers: judgeHeaders
|