@wix/evalforge-evaluator 0.202.0 → 0.204.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -0
- package/build/index.js +309 -161
- package/build/index.js.map +4 -4
- package/build/index.mjs +309 -161
- package/build/index.mjs.map +4 -4
- package/build/types/run-scenario/agents/opencode/build-trace.d.ts +15 -1
- package/build/types/run-scenario/agents/opencode/execute.d.ts +4 -8
- package/build/types/run-scenario/agents/opencode/gateway-cost-interceptor.d.ts +28 -0
- package/package.json +2 -2
package/build/index.js
CHANGED
|
@@ -2407,9 +2407,9 @@ var require_debug = __commonJS({
|
|
|
2407
2407
|
var require_follow_redirects = __commonJS({
|
|
2408
2408
|
"../../node_modules/follow-redirects/index.js"(exports2, module2) {
|
|
2409
2409
|
var url = require("url");
|
|
2410
|
-
var
|
|
2411
|
-
var
|
|
2412
|
-
var
|
|
2410
|
+
var URL3 = url.URL;
|
|
2411
|
+
var http2 = require("http");
|
|
2412
|
+
var https2 = require("https");
|
|
2413
2413
|
var Writable = require("stream").Writable;
|
|
2414
2414
|
var assert = require("assert");
|
|
2415
2415
|
var debug = require_debug();
|
|
@@ -2423,7 +2423,7 @@ var require_follow_redirects = __commonJS({
|
|
|
2423
2423
|
})();
|
|
2424
2424
|
var useNativeURL = false;
|
|
2425
2425
|
try {
|
|
2426
|
-
assert(new
|
|
2426
|
+
assert(new URL3(""));
|
|
2427
2427
|
} catch (error) {
|
|
2428
2428
|
useNativeURL = error.code === "ERR_INVALID_URL";
|
|
2429
2429
|
}
|
|
@@ -2803,7 +2803,7 @@ var require_follow_redirects = __commonJS({
|
|
|
2803
2803
|
function parseUrl(input) {
|
|
2804
2804
|
var parsed;
|
|
2805
2805
|
if (useNativeURL) {
|
|
2806
|
-
parsed = new
|
|
2806
|
+
parsed = new URL3(input);
|
|
2807
2807
|
} else {
|
|
2808
2808
|
parsed = validateUrl(url.parse(input));
|
|
2809
2809
|
if (!isString(parsed.protocol)) {
|
|
@@ -2813,7 +2813,7 @@ var require_follow_redirects = __commonJS({
|
|
|
2813
2813
|
return parsed;
|
|
2814
2814
|
}
|
|
2815
2815
|
function resolveUrl2(relative2, base) {
|
|
2816
|
-
return useNativeURL ? new
|
|
2816
|
+
return useNativeURL ? new URL3(relative2, base) : parseUrl(url.resolve(base, relative2));
|
|
2817
2817
|
}
|
|
2818
2818
|
function validateUrl(input) {
|
|
2819
2819
|
if (/^\[/.test(input.hostname) && !/^\[[:0-9a-f]+\]$/i.test(input.hostname)) {
|
|
@@ -2892,9 +2892,9 @@ var require_follow_redirects = __commonJS({
|
|
|
2892
2892
|
return typeof value === "object" && "length" in value;
|
|
2893
2893
|
}
|
|
2894
2894
|
function isURL(value) {
|
|
2895
|
-
return
|
|
2895
|
+
return URL3 && value instanceof URL3;
|
|
2896
2896
|
}
|
|
2897
|
-
module2.exports = wrap({ http, https });
|
|
2897
|
+
module2.exports = wrap({ http: http2, https: https2 });
|
|
2898
2898
|
module2.exports.wrap = wrap;
|
|
2899
2899
|
}
|
|
2900
2900
|
});
|
|
@@ -2997,8 +2997,8 @@ var require_http = __commonJS({
|
|
|
2997
2997
|
var settle = require_settle();
|
|
2998
2998
|
var buildFullPath = require_buildFullPath();
|
|
2999
2999
|
var buildURL = require_buildURL();
|
|
3000
|
-
var
|
|
3001
|
-
var
|
|
3000
|
+
var http2 = require("http");
|
|
3001
|
+
var https2 = require("https");
|
|
3002
3002
|
var httpFollow = require_follow_redirects().http;
|
|
3003
3003
|
var httpsFollow = require_follow_redirects().https;
|
|
3004
3004
|
var url = require("url");
|
|
@@ -3137,7 +3137,7 @@ var require_http = __commonJS({
|
|
|
3137
3137
|
if (config.transport) {
|
|
3138
3138
|
transport = config.transport;
|
|
3139
3139
|
} else if (config.maxRedirects === 0) {
|
|
3140
|
-
transport = isHttpsProxy ?
|
|
3140
|
+
transport = isHttpsProxy ? https2 : http2;
|
|
3141
3141
|
} else {
|
|
3142
3142
|
if (config.maxRedirects) {
|
|
3143
3143
|
options.maxRedirects = config.maxRedirects;
|
|
@@ -3282,8 +3282,8 @@ var require_helpers = __commonJS({
|
|
|
3282
3282
|
};
|
|
3283
3283
|
Object.defineProperty(exports2, "__esModule", { value: true });
|
|
3284
3284
|
exports2.req = exports2.json = exports2.toBuffer = void 0;
|
|
3285
|
-
var
|
|
3286
|
-
var
|
|
3285
|
+
var http2 = __importStar2(require("http"));
|
|
3286
|
+
var https2 = __importStar2(require("https"));
|
|
3287
3287
|
async function toBuffer(stream) {
|
|
3288
3288
|
let length = 0;
|
|
3289
3289
|
const chunks = [];
|
|
@@ -3308,7 +3308,7 @@ var require_helpers = __commonJS({
|
|
|
3308
3308
|
exports2.json = json;
|
|
3309
3309
|
function req(url, opts = {}) {
|
|
3310
3310
|
const href = typeof url === "string" ? url : url.href;
|
|
3311
|
-
const req2 = (href.startsWith("https:") ?
|
|
3311
|
+
const req2 = (href.startsWith("https:") ? https2 : http2).request(url, opts);
|
|
3312
3312
|
const promise = new Promise((resolve3, reject) => {
|
|
3313
3313
|
req2.once("response", resolve3).once("error", reject).end();
|
|
3314
3314
|
});
|
|
@@ -3356,11 +3356,11 @@ var require_dist = __commonJS({
|
|
|
3356
3356
|
Object.defineProperty(exports2, "__esModule", { value: true });
|
|
3357
3357
|
exports2.Agent = void 0;
|
|
3358
3358
|
var net = __importStar2(require("net"));
|
|
3359
|
-
var
|
|
3359
|
+
var http2 = __importStar2(require("http"));
|
|
3360
3360
|
var https_1 = require("https");
|
|
3361
3361
|
__exportStar2(require_helpers(), exports2);
|
|
3362
3362
|
var INTERNAL = /* @__PURE__ */ Symbol("AgentBaseInternalState");
|
|
3363
|
-
var Agent = class extends
|
|
3363
|
+
var Agent = class extends http2.Agent {
|
|
3364
3364
|
constructor(opts) {
|
|
3365
3365
|
super(opts);
|
|
3366
3366
|
this[INTERNAL] = {};
|
|
@@ -3432,7 +3432,7 @@ var require_dist = __commonJS({
|
|
|
3432
3432
|
const fakeSocket = this.incrementSockets(name);
|
|
3433
3433
|
Promise.resolve().then(() => this.connect(req, connectOpts)).then((socket) => {
|
|
3434
3434
|
this.decrementSockets(name, fakeSocket);
|
|
3435
|
-
if (socket instanceof
|
|
3435
|
+
if (socket instanceof http2.Agent) {
|
|
3436
3436
|
try {
|
|
3437
3437
|
return socket.addRequest(req, connectOpts);
|
|
3438
3438
|
} catch (err) {
|
|
@@ -7372,7 +7372,7 @@ var import_fs = require("fs");
|
|
|
7372
7372
|
var import_crypto = require("crypto");
|
|
7373
7373
|
var import_path2 = __toESM(require("path"));
|
|
7374
7374
|
var import_child_process = require("child_process");
|
|
7375
|
-
var INSTALL_TIMEOUT_MS =
|
|
7375
|
+
var INSTALL_TIMEOUT_MS = 18e4;
|
|
7376
7376
|
var HEARTBEAT_INTERVAL_MS = 5e3;
|
|
7377
7377
|
function reportRegistry(workDir, onProgress) {
|
|
7378
7378
|
try {
|
|
@@ -9650,14 +9650,12 @@ function toCanonicalModelId(modelId) {
|
|
|
9650
9650
|
const slashIndex = modelId.indexOf("/");
|
|
9651
9651
|
return slashIndex > 0 ? modelId.slice(slashIndex + 1) : modelId;
|
|
9652
9652
|
}
|
|
9653
|
-
function
|
|
9654
|
-
|
|
9653
|
+
/**
 * Tells whether a value can be used as a numeric cost.
 *
 * @param {unknown} cost - Candidate value, e.g. a captured gateway cost slot.
 * @returns {boolean} `true` only for finite values of type `number`;
 *   `null`, `undefined`, `NaN`, infinities and numeric strings yield `false`.
 */
function isValidCost(cost) {
  if (typeof cost !== "number") {
    return false;
  }
  return Number.isFinite(cost);
}
|
|
9656
|
+
function groupEventsIntoTurns(timestampedEvents) {
|
|
9655
9657
|
const turns = [];
|
|
9656
|
-
let current = {
|
|
9657
|
-
textParts: [],
|
|
9658
|
-
reasoningParts: [],
|
|
9659
|
-
toolCalls: []
|
|
9660
|
-
};
|
|
9658
|
+
let current = { textParts: [], reasoningParts: [], toolCalls: [] };
|
|
9661
9659
|
for (const { event: evt, receivedAt } of timestampedEvents) {
|
|
9662
9660
|
switch (evt.type) {
|
|
9663
9661
|
case "text":
|
|
@@ -9679,160 +9677,197 @@ function buildLLMTrace(timestampedEvents, totalDurationMs, model, provider, exec
|
|
|
9679
9677
|
current.stepFinish = sf.part;
|
|
9680
9678
|
current.receivedAt = receivedAt;
|
|
9681
9679
|
turns.push(current);
|
|
9682
|
-
current = {
|
|
9683
|
-
textParts: [],
|
|
9684
|
-
reasoningParts: [],
|
|
9685
|
-
toolCalls: []
|
|
9686
|
-
};
|
|
9680
|
+
current = { textParts: [], reasoningParts: [], toolCalls: [] };
|
|
9687
9681
|
break;
|
|
9688
9682
|
}
|
|
9689
9683
|
}
|
|
9690
9684
|
}
|
|
9691
|
-
|
|
9685
|
+
const hasTrailingContent = current.textParts.length > 0 || current.reasoningParts.length > 0 || current.toolCalls.length > 0;
|
|
9686
|
+
if (hasTrailingContent) {
|
|
9692
9687
|
if (timestampedEvents.length > 0) {
|
|
9693
9688
|
current.receivedAt = timestampedEvents[timestampedEvents.length - 1].receivedAt;
|
|
9694
9689
|
}
|
|
9695
9690
|
turns.push(current);
|
|
9696
9691
|
}
|
|
9697
|
-
|
|
9698
|
-
|
|
9699
|
-
|
|
9700
|
-
|
|
9701
|
-
|
|
9702
|
-
const
|
|
9703
|
-
|
|
9704
|
-
|
|
9705
|
-
|
|
9706
|
-
|
|
9707
|
-
|
|
9708
|
-
|
|
9709
|
-
|
|
9710
|
-
|
|
9711
|
-
|
|
9712
|
-
|
|
9713
|
-
|
|
9714
|
-
|
|
9715
|
-
|
|
9716
|
-
|
|
9717
|
-
|
|
9718
|
-
|
|
9719
|
-
const
|
|
9720
|
-
|
|
9721
|
-
|
|
9722
|
-
|
|
9692
|
+
return turns;
|
|
9693
|
+
}
|
|
9694
|
+
/**
 * Resolves the cost of every turn, preferring costs captured from the
 * gateway over the cost carried by the turn's `stepFinish` part.
 *
 * Captured costs are matched to "request turns" (turns that have a
 * `stepFinish`) by request order: the n-th captured cost belongs to the n-th
 * request turn. Turns without a `stepFinish` always cost 0 and do not consume
 * a captured cost. Captured costs beyond the number of request turns are
 * summed and folded into the last request turn.
 *
 * @param {Array<{stepFinish?: {cost?: number}}>} turns - Turns in order.
 * @param {Array<number|null>|undefined} gatewayCosts - Captured costs in
 *   request order; entries that are not finite numbers count as missing.
 * @returns {number[]} One finite cost per turn, aligned with `turns`.
 */
function resolveTurnCosts(turns, gatewayCosts) {
  const captured = gatewayCosts ?? [];
  const turnCosts = new Array(turns.length).fill(0);
  const requestTurnIndexes = [];
  let missingCount = 0;

  turns.forEach((turn, turnIndex) => {
    if (!turn.stepFinish) return;
    // Index captured costs by request ordinal, not by turn position, so the
    // per-turn lookup and the extra-calls accounting below always agree.
    const capturedCost = captured[requestTurnIndexes.length];
    requestTurnIndexes.push(turnIndex);
    if (Number.isFinite(capturedCost)) {
      turnCosts[turnIndex] = capturedCost;
      return;
    }
    missingCount += 1;
    // Never let an absent or non-finite reported cost poison the totals.
    const reportedCost = turn.stepFinish.cost;
    turnCosts[turnIndex] = Number.isFinite(reportedCost) ? reportedCost : 0;
  });

  // Without any captured cost there is nothing to reconcile or warn about.
  if (captured.length === 0) return turnCosts;

  if (missingCount > 0) {
    console.warn(
      `[opencode] gateway cost missing for ${missingCount}/${requestTurnIndexes.length} turn(s); using OpenCode-reported cost for those`
    );
  }

  let extraCallsCost = 0;
  for (let i = requestTurnIndexes.length; i < captured.length; i++) {
    const capturedCost = captured[i];
    if (Number.isFinite(capturedCost)) extraCallsCost += capturedCost;
  }

  if (extraCallsCost > 0) {
    if (requestTurnIndexes.length > 0) {
      const lastTurnIndex = requestTurnIndexes[requestTurnIndexes.length - 1];
      turnCosts[lastTurnIndex] += extraCallsCost;
      console.warn(
        `[opencode] ${captured.length} gateway call(s) for ${requestTurnIndexes.length} turn(s); folded $${extraCallsCost} of extra calls into the last turn`
      );
    } else {
      // There is no request turn to carry the cost; report it instead of
      // dropping it silently.
      console.warn(
        `[opencode] ${captured.length} gateway call(s) but no completed turn; $${extraCallsCost} of gateway cost is not attributed to any turn`
      );
    }
  }
  return turnCosts;
}
|
|
9726
|
+
function buildTurnSteps(turn, turnIndex, ctx) {
|
|
9727
|
+
const {
|
|
9728
|
+
turns,
|
|
9729
|
+
turnCosts,
|
|
9730
|
+
totalDurationMs,
|
|
9731
|
+
executionStartMs,
|
|
9732
|
+
model,
|
|
9733
|
+
provider
|
|
9734
|
+
} = ctx;
|
|
9735
|
+
const sf = turn.stepFinish;
|
|
9736
|
+
const stepInputTokens = sf?.tokens.input ?? 0;
|
|
9737
|
+
const stepOutputTokens = sf?.tokens.output ?? 0;
|
|
9738
|
+
const stepCost = turnCosts[turnIndex];
|
|
9739
|
+
const finishReason = sf?.reason ?? "unknown";
|
|
9740
|
+
const stepModel = toCanonicalModelId(sf?.modelID || model);
|
|
9741
|
+
const stepProvider = sf?.providerID || provider;
|
|
9742
|
+
const turnEndMs = turn.receivedAt ?? executionStartMs + totalDurationMs;
|
|
9743
|
+
const prevEndMs = turnIndex > 0 ? turns[turnIndex - 1].receivedAt ?? executionStartMs : executionStartMs;
|
|
9744
|
+
const durationMs = Math.max(0, turnEndMs - prevEndMs);
|
|
9745
|
+
const startedAt = new Date(prevEndMs).toISOString();
|
|
9746
|
+
const text = turn.textParts.join("");
|
|
9747
|
+
const thinking = turn.reasoningParts.join("");
|
|
9748
|
+
const toolCallCount = turn.toolCalls.length;
|
|
9749
|
+
const hasThinking = !!thinking;
|
|
9750
|
+
const hasText = !!text;
|
|
9751
|
+
const isSuccess = finishReason !== "error";
|
|
9752
|
+
const errorMsg = finishReason === "error" ? "Generation failed" : void 0;
|
|
9753
|
+
const subSteps = [];
|
|
9754
|
+
const thinkingSubSteps = hasThinking && (hasText || toolCallCount > 0) ? 1 : 0;
|
|
9755
|
+
const toolSubSteps = toolCallCount;
|
|
9756
|
+
const textSubSteps = hasText && toolCallCount > 0 ? 1 : 0;
|
|
9757
|
+
const totalSubSteps = thinkingSubSteps + toolSubSteps + textSubSteps || 1;
|
|
9758
|
+
if (hasThinking && (hasText || toolCallCount > 0)) {
|
|
9759
|
+
subSteps.push({
|
|
9760
|
+
id: (0, import_crypto3.randomUUID)(),
|
|
9761
|
+
stepNumber: 0,
|
|
9762
|
+
turnIndex,
|
|
9763
|
+
type: import_evalforge_types7.LLMStepType.THINKING,
|
|
9764
|
+
model: stepModel,
|
|
9765
|
+
provider: stepProvider,
|
|
9766
|
+
startedAt,
|
|
9767
|
+
durationMs: Math.round(durationMs / totalSubSteps),
|
|
9768
|
+
tokenUsage: {
|
|
9769
|
+
prompt: Math.round(stepInputTokens / totalSubSteps),
|
|
9770
|
+
completion: Math.round(stepOutputTokens / totalSubSteps),
|
|
9771
|
+
total: Math.round((stepInputTokens + stepOutputTokens) / totalSubSteps)
|
|
9772
|
+
},
|
|
9773
|
+
costUsd: stepCost / totalSubSteps,
|
|
9774
|
+
outputPreview: thinking.slice(0, 200),
|
|
9775
|
+
success: isSuccess,
|
|
9776
|
+
error: errorMsg
|
|
9777
|
+
});
|
|
9778
|
+
}
|
|
9779
|
+
if (toolCallCount > 0) {
|
|
9780
|
+
for (let tcIdx = 0; tcIdx < toolCallCount; tcIdx++) {
|
|
9781
|
+
const tc = turn.toolCalls[tcIdx];
|
|
9782
|
+
const isLast = tcIdx === toolCallCount - 1 && textSubSteps === 0;
|
|
9783
|
+
const toolBudgetSteps = toolSubSteps + textSubSteps;
|
|
9784
|
+
const toolFraction = toolBudgetSteps > 0 ? 1 / toolBudgetSteps : 1;
|
|
9785
|
+
const remainingFraction = (totalSubSteps - thinkingSubSteps) / totalSubSteps;
|
|
9723
9786
|
subSteps.push({
|
|
9724
9787
|
id: (0, import_crypto3.randomUUID)(),
|
|
9725
9788
|
stepNumber: 0,
|
|
9726
9789
|
turnIndex,
|
|
9727
|
-
type: import_evalforge_types7.LLMStepType.
|
|
9790
|
+
type: import_evalforge_types7.LLMStepType.TOOL_USE,
|
|
9728
9791
|
model: stepModel,
|
|
9729
9792
|
provider: stepProvider,
|
|
9730
9793
|
startedAt,
|
|
9731
|
-
durationMs: Math.round(durationMs
|
|
9794
|
+
durationMs: isLast ? durationMs - subSteps.reduce((s, ss) => s + ss.durationMs, 0) : Math.round(durationMs * remainingFraction * toolFraction),
|
|
9732
9795
|
tokenUsage: {
|
|
9733
|
-
prompt: Math.round(
|
|
9734
|
-
|
|
9796
|
+
prompt: Math.round(
|
|
9797
|
+
stepInputTokens * remainingFraction * toolFraction
|
|
9798
|
+
),
|
|
9799
|
+
completion: Math.round(
|
|
9800
|
+
stepOutputTokens * remainingFraction * toolFraction
|
|
9801
|
+
),
|
|
9735
9802
|
total: Math.round(
|
|
9736
|
-
(stepInputTokens + stepOutputTokens)
|
|
9803
|
+
(stepInputTokens + stepOutputTokens) * remainingFraction * toolFraction
|
|
9737
9804
|
)
|
|
9738
9805
|
},
|
|
9739
|
-
costUsd: stepCost
|
|
9740
|
-
|
|
9741
|
-
|
|
9742
|
-
|
|
9743
|
-
});
|
|
9744
|
-
}
|
|
9745
|
-
if (toolCallCount > 0) {
|
|
9746
|
-
for (let tcIdx = 0; tcIdx < toolCallCount; tcIdx++) {
|
|
9747
|
-
const tc = turn.toolCalls[tcIdx];
|
|
9748
|
-
const isLast = tcIdx === toolCallCount - 1 && textSubSteps === 0;
|
|
9749
|
-
const toolBudgetSteps = toolSubSteps + textSubSteps;
|
|
9750
|
-
const toolFraction = toolBudgetSteps > 0 ? 1 / toolBudgetSteps : 1;
|
|
9751
|
-
const remainingFraction = (totalSubSteps - thinkingSubSteps) / totalSubSteps;
|
|
9752
|
-
subSteps.push({
|
|
9753
|
-
id: (0, import_crypto3.randomUUID)(),
|
|
9754
|
-
stepNumber: 0,
|
|
9755
|
-
turnIndex,
|
|
9756
|
-
type: import_evalforge_types7.LLMStepType.TOOL_USE,
|
|
9757
|
-
model: stepModel,
|
|
9758
|
-
provider: stepProvider,
|
|
9759
|
-
startedAt,
|
|
9760
|
-
durationMs: isLast ? durationMs - subSteps.reduce((s, ss) => s + ss.durationMs, 0) : Math.round(durationMs * remainingFraction * toolFraction),
|
|
9761
|
-
tokenUsage: {
|
|
9762
|
-
prompt: Math.round(
|
|
9763
|
-
stepInputTokens * remainingFraction * toolFraction
|
|
9764
|
-
),
|
|
9765
|
-
completion: Math.round(
|
|
9766
|
-
stepOutputTokens * remainingFraction * toolFraction
|
|
9767
|
-
),
|
|
9768
|
-
total: Math.round(
|
|
9769
|
-
(stepInputTokens + stepOutputTokens) * remainingFraction * toolFraction
|
|
9770
|
-
)
|
|
9771
|
-
},
|
|
9772
|
-
costUsd: stepCost * remainingFraction * toolFraction,
|
|
9773
|
-
toolName: tc.toolName,
|
|
9774
|
-
toolArguments: JSON.stringify(tc.args),
|
|
9775
|
-
outputPreview: tcIdx === 0 && !hasText ? (text || thinking)?.slice(0, 200) : void 0,
|
|
9776
|
-
success: isSuccess,
|
|
9777
|
-
error: errorMsg
|
|
9778
|
-
});
|
|
9779
|
-
}
|
|
9780
|
-
}
|
|
9781
|
-
if (hasText && toolCallCount > 0) {
|
|
9782
|
-
subSteps.push({
|
|
9783
|
-
id: (0, import_crypto3.randomUUID)(),
|
|
9784
|
-
stepNumber: 0,
|
|
9785
|
-
turnIndex,
|
|
9786
|
-
type: import_evalforge_types7.LLMStepType.COMPLETION,
|
|
9787
|
-
model: stepModel,
|
|
9788
|
-
provider: stepProvider,
|
|
9789
|
-
startedAt,
|
|
9790
|
-
durationMs: durationMs - subSteps.reduce((s, ss) => s + ss.durationMs, 0),
|
|
9791
|
-
tokenUsage: {
|
|
9792
|
-
prompt: stepInputTokens - subSteps.reduce((s, ss) => s + ss.tokenUsage.prompt, 0),
|
|
9793
|
-
completion: stepOutputTokens - subSteps.reduce((s, ss) => s + ss.tokenUsage.completion, 0),
|
|
9794
|
-
total: stepInputTokens + stepOutputTokens - subSteps.reduce((s, ss) => s + ss.tokenUsage.total, 0)
|
|
9795
|
-
},
|
|
9796
|
-
costUsd: stepCost - subSteps.reduce((s, ss) => s + ss.costUsd, 0),
|
|
9797
|
-
outputPreview: text.slice(0, 200),
|
|
9798
|
-
success: isSuccess,
|
|
9799
|
-
error: errorMsg
|
|
9800
|
-
});
|
|
9801
|
-
}
|
|
9802
|
-
if (subSteps.length === 0) {
|
|
9803
|
-
const stepType = hasThinking && !hasText ? import_evalforge_types7.LLMStepType.THINKING : import_evalforge_types7.LLMStepType.COMPLETION;
|
|
9804
|
-
subSteps.push({
|
|
9805
|
-
id: (0, import_crypto3.randomUUID)(),
|
|
9806
|
-
stepNumber: 0,
|
|
9807
|
-
turnIndex,
|
|
9808
|
-
type: stepType,
|
|
9809
|
-
model: stepModel,
|
|
9810
|
-
provider: stepProvider,
|
|
9811
|
-
startedAt,
|
|
9812
|
-
durationMs,
|
|
9813
|
-
tokenUsage: {
|
|
9814
|
-
prompt: stepInputTokens,
|
|
9815
|
-
completion: stepOutputTokens,
|
|
9816
|
-
total: stepInputTokens + stepOutputTokens
|
|
9817
|
-
},
|
|
9818
|
-
costUsd: stepCost,
|
|
9819
|
-
outputPreview: (text || thinking)?.slice(0, 200),
|
|
9806
|
+
costUsd: stepCost * remainingFraction * toolFraction,
|
|
9807
|
+
toolName: tc.toolName,
|
|
9808
|
+
toolArguments: JSON.stringify(tc.args),
|
|
9809
|
+
outputPreview: tcIdx === 0 && !hasText ? (text || thinking)?.slice(0, 200) : void 0,
|
|
9820
9810
|
success: isSuccess,
|
|
9821
9811
|
error: errorMsg
|
|
9822
9812
|
});
|
|
9823
9813
|
}
|
|
9824
|
-
|
|
9825
|
-
|
|
9814
|
+
}
|
|
9815
|
+
if (hasText && toolCallCount > 0) {
|
|
9816
|
+
subSteps.push({
|
|
9817
|
+
id: (0, import_crypto3.randomUUID)(),
|
|
9818
|
+
stepNumber: 0,
|
|
9819
|
+
turnIndex,
|
|
9820
|
+
type: import_evalforge_types7.LLMStepType.COMPLETION,
|
|
9821
|
+
model: stepModel,
|
|
9822
|
+
provider: stepProvider,
|
|
9823
|
+
startedAt,
|
|
9824
|
+
durationMs: durationMs - subSteps.reduce((s, ss) => s + ss.durationMs, 0),
|
|
9825
|
+
tokenUsage: {
|
|
9826
|
+
prompt: stepInputTokens - subSteps.reduce((s, ss) => s + ss.tokenUsage.prompt, 0),
|
|
9827
|
+
completion: stepOutputTokens - subSteps.reduce((s, ss) => s + ss.tokenUsage.completion, 0),
|
|
9828
|
+
total: stepInputTokens + stepOutputTokens - subSteps.reduce((s, ss) => s + ss.tokenUsage.total, 0)
|
|
9829
|
+
},
|
|
9830
|
+
costUsd: stepCost - subSteps.reduce((s, ss) => s + ss.costUsd, 0),
|
|
9831
|
+
outputPreview: text.slice(0, 200),
|
|
9832
|
+
success: isSuccess,
|
|
9833
|
+
error: errorMsg
|
|
9834
|
+
});
|
|
9835
|
+
}
|
|
9836
|
+
if (subSteps.length === 0) {
|
|
9837
|
+
const stepType = hasThinking && !hasText ? import_evalforge_types7.LLMStepType.THINKING : import_evalforge_types7.LLMStepType.COMPLETION;
|
|
9838
|
+
subSteps.push({
|
|
9839
|
+
id: (0, import_crypto3.randomUUID)(),
|
|
9840
|
+
stepNumber: 0,
|
|
9841
|
+
turnIndex,
|
|
9842
|
+
type: stepType,
|
|
9843
|
+
model: stepModel,
|
|
9844
|
+
provider: stepProvider,
|
|
9845
|
+
startedAt,
|
|
9846
|
+
durationMs,
|
|
9847
|
+
tokenUsage: {
|
|
9848
|
+
prompt: stepInputTokens,
|
|
9849
|
+
completion: stepOutputTokens,
|
|
9850
|
+
total: stepInputTokens + stepOutputTokens
|
|
9851
|
+
},
|
|
9852
|
+
costUsd: stepCost,
|
|
9853
|
+
outputPreview: (text || thinking)?.slice(0, 200),
|
|
9854
|
+
success: isSuccess,
|
|
9855
|
+
error: errorMsg
|
|
9856
|
+
});
|
|
9857
|
+
}
|
|
9858
|
+
return subSteps;
|
|
9859
|
+
}
|
|
9860
|
+
function buildSummary(allSteps, turns, turnCosts, totalDurationMs, canonicalModel) {
|
|
9826
9861
|
let totalPrompt = 0;
|
|
9827
9862
|
let totalCompletion = 0;
|
|
9828
9863
|
let totalCost = 0;
|
|
9829
|
-
|
|
9864
|
+
turns.forEach((turn, turnIndex) => {
|
|
9830
9865
|
if (turn.stepFinish) {
|
|
9831
9866
|
totalPrompt += turn.stepFinish.tokens.input;
|
|
9832
9867
|
totalCompletion += turn.stepFinish.tokens.output;
|
|
9833
|
-
totalCost +=
|
|
9868
|
+
totalCost += turnCosts[turnIndex];
|
|
9834
9869
|
}
|
|
9835
|
-
}
|
|
9870
|
+
});
|
|
9836
9871
|
const totalTokens = {
|
|
9837
9872
|
prompt: totalPrompt,
|
|
9838
9873
|
completion: totalCompletion,
|
|
@@ -9853,7 +9888,7 @@ function buildLLMTrace(timestampedEvents, totalDurationMs, model, provider, exec
|
|
|
9853
9888
|
stepTypeBreakdown[step.type] = entry;
|
|
9854
9889
|
}
|
|
9855
9890
|
const modelUsed = allSteps[0]?.model || canonicalModel;
|
|
9856
|
-
|
|
9891
|
+
return {
|
|
9857
9892
|
totalSteps: allSteps.length,
|
|
9858
9893
|
totalTurns: turns.length,
|
|
9859
9894
|
totalDurationMs,
|
|
@@ -9870,11 +9905,97 @@ function buildLLMTrace(timestampedEvents, totalDurationMs, model, provider, exec
|
|
|
9870
9905
|
modelsUsed: [modelUsed],
|
|
9871
9906
|
stepTypeBreakdown
|
|
9872
9907
|
};
|
|
9873
|
-
|
|
9874
|
-
|
|
9875
|
-
|
|
9876
|
-
|
|
9908
|
+
}
|
|
9909
|
+
/**
 * Builds the LLM trace object for a run from its timestamped events.
 *
 * The events are grouped into turns, a cost is resolved for each turn
 * (taking `gatewayCosts` into account), every turn is expanded into steps,
 * and the steps are numbered consecutively starting at 1 before the summary
 * is computed.
 *
 * @param {Array} timestampedEvents - Events passed to `groupEventsIntoTurns`.
 * @param {number} totalDurationMs - Total run duration in milliseconds.
 * @param {string} model - Model identifier as configured for the run.
 * @param {string} provider - Provider identifier for the run.
 * @param {Date} executionStartTime - Start of the run; read via `getTime()`.
 * @param {Array<number|null>} [gatewayCosts] - Captured gateway costs, if any.
 * @returns {{id: string, steps: Array, summary: object}} The assembled trace.
 */
function buildLLMTrace(timestampedEvents, totalDurationMs, model, provider, executionStartTime, gatewayCosts) {
  const canonicalModel = toCanonicalModelId(model);
  const turns = groupEventsIntoTurns(timestampedEvents);
  const turnCosts = resolveTurnCosts(turns, gatewayCosts);
  const executionStartMs = executionStartTime.getTime();
  const ctx = { turns, turnCosts, totalDurationMs, executionStartMs, model, provider };

  // Expand each turn into its steps, then number the flattened list from 1.
  const unnumberedSteps = [];
  turns.forEach((turn, turnIndex) => {
    unnumberedSteps.push(...buildTurnSteps(turn, turnIndex, ctx));
  });
  const allSteps = unnumberedSteps.map((step, position) => {
    return { ...step, stepNumber: position + 1 };
  });

  const summary = buildSummary(allSteps, turns, turnCosts, totalDurationMs, canonicalModel);
  const id = (0, import_crypto3.randomUUID)();
  return { id, steps: allSteps, summary };
}
|
|
9931
|
+
|
|
9932
|
+
// src/run-scenario/agents/opencode/gateway-cost-interceptor.ts
|
|
9933
|
+
var import_node_http = __toESM(require("node:http"));
|
|
9934
|
+
var import_node_https = __toESM(require("node:https"));
|
|
9935
|
+
var import_node_url = require("node:url");
|
|
9936
|
+
// Size of the trailing window of each proxied response kept for cost
// extraction. NOTE(review): despite the name, it is applied with
// `String.prototype.slice` to the decoded text, so it limits characters
// (UTF-16 code units) rather than bytes.
var TAIL_BYTES = 64 * 1024;
|
|
9937
|
+
// Matches every `"total_cost_usd": <number>` occurrence; the numeric literal
// (optional sign, fraction and exponent) is captured in group 1.
var COST_RE = /"total_cost_usd"\s*:\s*(-?\d+(?:\.\d+)?(?:[eE][+-]?\d+)?)/g;

/**
 * Finds the last `total_cost_usd` value that appears in a piece of text.
 *
 * @param {string} text - Text to scan (e.g. the tail of a response body).
 * @returns {number|null} The last finite value found, or `null` if none.
 */
function extractLastCost(text) {
  let lastCost = null;
  // COST_RE is a shared global regex; always start scanning from the top.
  COST_RE.lastIndex = 0;
  for (let hit = COST_RE.exec(text); hit !== null; hit = COST_RE.exec(text)) {
    const parsed = Number(hit[1]);
    if (Number.isFinite(parsed)) {
      lastCost = parsed;
    }
  }
  return lastCost;
}
|
|
9948
|
+
/**
 * Starts a loopback HTTP proxy in front of the real gateway that forwards
 * every request unchanged (except for `host` and `accept-encoding`) and
 * records the last `total_cost_usd` value seen in each response body.
 *
 * One slot is reserved per incoming request, in arrival order; the slot stays
 * `null` until the upstream response ends and a cost was found in its tail.
 *
 * @param {string} realGatewayUrl - Base URL of the real gateway; a single
 *   trailing slash is stripped before the request path is appended.
 * @returns {Promise<{url: string, getCapturedCosts: () => Array<number|null>, close: () => Promise<void>}>}
 *   Resolves once the proxy is listening on 127.0.0.1 (ephemeral port).
 *   Rejects if the server cannot start listening.
 */
function startGatewayCostInterceptor(realGatewayUrl) {
  const base = realGatewayUrl.replace(/\/$/, "");
  const captured = [];

  const handleRequest = (clientReq, clientRes) => {
    // Reserve the slot immediately so costs stay in request-arrival order
    // even when responses finish out of order.
    const slot = captured.length;
    captured.push(null);

    const onUpstreamResponse = (proxyRes) => {
      clientRes.writeHead(proxyRes.statusCode ?? 502, proxyRes.headers);
      let tail = "";
      proxyRes.on("data", (chunk) => {
        // Only the end of the body is kept; the cost is expected there.
        tail = (tail + chunk.toString("utf8")).slice(-TAIL_BYTES);
      });
      // Registered before pipe() so the cost is recorded before the client
      // response is ended by the pipe.
      proxyRes.on("end", () => {
        captured[slot] = extractLastCost(tail);
      });
      proxyRes.on("error", () => clientRes.destroy());
      // pipe() forwards the body and honours backpressure from the client.
      proxyRes.pipe(clientRes);
    };

    let proxyReq;
    try {
      const target = new import_node_url.URL(base + (clientReq.url ?? ""));
      const transport = target.protocol === "https:" ? import_node_https.default : import_node_http.default;
      // Ask for an uncompressed body so the cost can be read as plain text.
      const headers = { ...clientReq.headers, "accept-encoding": "identity" };
      delete headers.host;
      proxyReq = transport.request(
        {
          protocol: target.protocol,
          hostname: target.hostname,
          port: target.port,
          path: target.pathname + target.search,
          method: clientReq.method,
          headers
        },
        onUpstreamResponse
      );
    } catch (err) {
      // A bad target URL or request option must fail this one request, not
      // surface as an uncaught exception inside the server callback.
      console.warn(
        `[opencode] gateway cost interceptor could not forward request: ${err instanceof Error ? err.message : String(err)}`
      );
      clientReq.resume();
      if (!clientRes.headersSent) clientRes.writeHead(502);
      clientRes.end();
      return;
    }

    let clientClosed = false;
    clientRes.on("close", () => {
      clientClosed = true;
      // Client went away mid-response: stop pulling data from the gateway.
      if (!clientRes.writableFinished) proxyReq.destroy();
    });
    proxyReq.on("error", () => {
      if (clientClosed) return;
      if (!clientRes.headersSent) clientRes.writeHead(502);
      clientRes.end();
    });
    clientReq.pipe(proxyReq);
  };

  const server = import_node_http.default.createServer(handleRequest);
  return new Promise((resolve, reject) => {
    // Without this listener a failed listen() would leave the promise
    // pending and raise an unhandled 'error' event.
    server.once("error", reject);
    server.listen(0, "127.0.0.1", () => {
      server.removeListener("error", reject);
      server.on("error", (err) => {
        console.warn(`[opencode] gateway cost interceptor server error: ${err.message}`);
      });
      const addr = server.address();
      const port = typeof addr === "object" && addr ? addr.port : 0;
      resolve({
        url: `http://127.0.0.1:${port}`,
        getCapturedCosts: () => captured.slice(),
        close: () => new Promise((done) => {
          server.close(() => done());
          // Drop lingering keep-alive sockets so close() cannot wait on them
          // (method is absent on older Node.js versions).
          server.closeAllConnections?.();
        })
      });
    });
  });
}
|
|
9879
10000
|
|
|
9880
10001
|
// src/run-scenario/agents/opencode/build-conversation.ts
|
|
@@ -10150,6 +10271,7 @@ function spawnOpenCodeProcess(opts) {
|
|
|
10150
10271
|
return new Promise((resolve3) => {
|
|
10151
10272
|
let resolved = false;
|
|
10152
10273
|
let stderr = "";
|
|
10274
|
+
let rawStdout = "";
|
|
10153
10275
|
let lineBuffer = "";
|
|
10154
10276
|
let lastOutputTime = Date.now();
|
|
10155
10277
|
let traceStepNumber = initialStepNumber;
|
|
@@ -10271,6 +10393,7 @@ function spawnOpenCodeProcess(opts) {
|
|
|
10271
10393
|
child.stdout?.on("data", (data) => {
|
|
10272
10394
|
const text = data.toString();
|
|
10273
10395
|
lastOutputTime = Date.now();
|
|
10396
|
+
rawStdout += text;
|
|
10274
10397
|
lineBuffer += text;
|
|
10275
10398
|
const lines = lineBuffer.split("\n");
|
|
10276
10399
|
lineBuffer = lines.pop() || "";
|
|
@@ -10314,7 +10437,7 @@ function spawnOpenCodeProcess(opts) {
|
|
|
10314
10437
|
stderr += text;
|
|
10315
10438
|
lastOutputTime = Date.now();
|
|
10316
10439
|
});
|
|
10317
|
-
child.on("close", (code) => {
|
|
10440
|
+
child.on("close", (code, signal) => {
|
|
10318
10441
|
if (lineBuffer.trim()) {
|
|
10319
10442
|
const evt = tryParseJson(lineBuffer);
|
|
10320
10443
|
if (evt && evt.type) {
|
|
@@ -10322,8 +10445,18 @@ function spawnOpenCodeProcess(opts) {
|
|
|
10322
10445
|
}
|
|
10323
10446
|
}
|
|
10324
10447
|
console.log(
|
|
10325
|
-
`[executeWithOpenCode] Process exited with code ${code}, ${events.length} events collected`
|
|
10448
|
+
`[executeWithOpenCode] Process exited with code ${code}, signal ${signal}, ${events.length} events collected`
|
|
10326
10449
|
);
|
|
10450
|
+
if (events.length === 0) {
|
|
10451
|
+
console.error(
|
|
10452
|
+
`[executeWithOpenCode] No events. exitCode=${code} signal=${signal}
|
|
10453
|
+
--- raw stdout (first 4000) ---
|
|
10454
|
+
${rawStdout.slice(0, 4e3)}
|
|
10455
|
+
--- raw stderr (first 4000) ---
|
|
10456
|
+
${stderr.slice(0, 4e3)}
|
|
10457
|
+
--- end raw output ---`
|
|
10458
|
+
);
|
|
10459
|
+
}
|
|
10327
10460
|
if (code === 0) {
|
|
10328
10461
|
finalize(true, false);
|
|
10329
10462
|
} else {
|
|
@@ -10346,7 +10479,7 @@ Stderr: ${stderr.slice(0, 1e3)}`
|
|
|
10346
10479
|
});
|
|
10347
10480
|
});
|
|
10348
10481
|
}
|
|
10349
|
-
async function
|
|
10482
|
+
async function executeWithOpenCodeInner(skills, scenario, options, interceptor) {
|
|
10350
10483
|
const skillNames = skills.map((s) => s.name).join(", ");
|
|
10351
10484
|
console.log("[executeWithOpenCode] Starting execution", {
|
|
10352
10485
|
skillCount: skills.length,
|
|
@@ -10365,7 +10498,9 @@ async function executeWithOpenCode(skills, scenario, options) {
|
|
|
10365
10498
|
temperature: options.temperature,
|
|
10366
10499
|
topP: options.topP,
|
|
10367
10500
|
maxTurns,
|
|
10368
|
-
|
|
10501
|
+
// Point OpenCode at the local interceptor (which forwards to the real
|
|
10502
|
+
// gateway and captures the true cost); fall back to the gateway directly.
|
|
10503
|
+
aiGatewayUrl: interceptor?.url ?? options.aiGatewayUrl,
|
|
10369
10504
|
aiGatewayHeaders: options.aiGatewayHeaders,
|
|
10370
10505
|
mcps: options.mcps,
|
|
10371
10506
|
rules: options.rules,
|
|
@@ -10518,13 +10653,11 @@ Attempt: ${attempt}, Events: ${accumulatedEvents.length}`
|
|
|
10518
10653
|
}
|
|
10519
10654
|
let inputTokens = 0;
|
|
10520
10655
|
let outputTokens = 0;
|
|
10521
|
-
let costUsd = 0;
|
|
10522
10656
|
for (const { event: evt } of accumulatedEvents) {
|
|
10523
10657
|
if (evt.type === "step_finish") {
|
|
10524
10658
|
const sf = evt;
|
|
10525
10659
|
inputTokens += sf.part.tokens.input;
|
|
10526
10660
|
outputTokens += sf.part.tokens.output;
|
|
10527
|
-
costUsd += sf.part.cost;
|
|
10528
10661
|
}
|
|
10529
10662
|
}
|
|
10530
10663
|
if (traceContext) {
|
|
@@ -10550,7 +10683,8 @@ Attempt: ${attempt}, Events: ${accumulatedEvents.length}`
|
|
|
10550
10683
|
totalDurationMs,
|
|
10551
10684
|
modelStr,
|
|
10552
10685
|
providerID,
|
|
10553
|
-
startTime
|
|
10686
|
+
startTime,
|
|
10687
|
+
interceptor?.getCapturedCosts()
|
|
10554
10688
|
);
|
|
10555
10689
|
const conversation = buildConversation2(accumulatedEvents);
|
|
10556
10690
|
return {
|
|
@@ -10562,12 +10696,26 @@ Attempt: ${attempt}, Events: ${accumulatedEvents.length}`
|
|
|
10562
10696
|
outputTokens,
|
|
10563
10697
|
totalTokens: inputTokens + outputTokens
|
|
10564
10698
|
},
|
|
10565
|
-
|
|
10699
|
+
// Single source of truth: gateway-derived cost aggregated in the trace.
|
|
10700
|
+
costUsd: llmTrace.summary.totalCostUsd
|
|
10566
10701
|
},
|
|
10567
10702
|
llmTrace,
|
|
10568
10703
|
conversation
|
|
10569
10704
|
};
|
|
10570
10705
|
}
|
|
10706
|
+
/**
 * Runs a scenario with OpenCode, routing gateway traffic through a local
 * cost interceptor when `options.aiGatewayUrl` is set.
 *
 * Cost capture is auxiliary: if the interceptor cannot be started, the run
 * proceeds without it (the inner execution then uses the gateway URL from
 * `options` directly). The interceptor, when started, is always closed once
 * the inner execution settles.
 *
 * @param {Array} skills - Skills forwarded to the inner execution.
 * @param {object} scenario - Scenario forwarded to the inner execution.
 * @param {object} options - Execution options; `aiGatewayUrl` enables the
 *   interceptor.
 * @returns {Promise<object>} Result of `executeWithOpenCodeInner`.
 */
async function executeWithOpenCode(skills, scenario, options) {
  let interceptor;
  if (options.aiGatewayUrl) {
    try {
      interceptor = await startGatewayCostInterceptor(options.aiGatewayUrl);
    } catch (err) {
      // Losing gateway cost capture should not abort the whole run.
      console.warn(
        `[executeWithOpenCode] Gateway cost interceptor unavailable, using the gateway directly: ${err instanceof Error ? err.message : String(err)}`
      );
    }
  }
  try {
    return await executeWithOpenCodeInner(
      skills,
      scenario,
      options,
      interceptor
    );
  } finally {
    await interceptor?.close();
  }
}
|
|
10571
10719
|
|
|
10572
10720
|
// src/run-scenario/agents/opencode/opencode-adapter.ts
|
|
10573
10721
|
var OpenCodeAdapter = class {
|