agentv 3.3.0 → 3.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +7 -9
- package/dist/{agentv-provider-HDSAUUEF-LUBMM7TH.js → agentv-provider-NFFLXG5M-TJAWCWCX.js} +2 -2
- package/dist/{chunk-6LP5Z5Y4.js → chunk-5GG6DDP5.js} +256 -128
- package/dist/chunk-5GG6DDP5.js.map +1 -0
- package/dist/{chunk-AR3QEKXH.js → chunk-BJV6MDBE.js} +3 -3
- package/dist/{chunk-AR3QEKXH.js.map → chunk-BJV6MDBE.js.map} +1 -1
- package/dist/{chunk-5M3K2DMV.js → chunk-D6G4N2H2.js} +550 -516
- package/dist/chunk-D6G4N2H2.js.map +1 -0
- package/dist/{chunk-4ZMSAQWS.js → chunk-RLL4QGNL.js} +172 -81
- package/dist/chunk-RLL4QGNL.js.map +1 -0
- package/dist/cli.js +4 -4
- package/dist/{dist-OC53WD3P.js → dist-MZFXE6B5.js} +3 -5
- package/dist/index.js +4 -4
- package/dist/{interactive-NA6SAIAG.js → interactive-J7SUWZH2.js} +45 -5
- package/dist/interactive-J7SUWZH2.js.map +1 -0
- package/dist/templates/.agentv/.env.example +11 -9
- package/dist/templates/.agentv/config.yaml +5 -0
- package/dist/templates/.agentv/targets.yaml +0 -16
- package/package.json +2 -2
- package/dist/chunk-4ZMSAQWS.js.map +0 -1
- package/dist/chunk-5M3K2DMV.js.map +0 -1
- package/dist/chunk-6LP5Z5Y4.js.map +0 -1
- package/dist/interactive-NA6SAIAG.js.map +0 -1
- /package/dist/{agentv-provider-HDSAUUEF-LUBMM7TH.js.map → agentv-provider-NFFLXG5M-TJAWCWCX.js.map} +0 -0
- /package/dist/{dist-OC53WD3P.js.map → dist-MZFXE6B5.js.map} +0 -0
|
@@ -149,7 +149,7 @@ import {
|
|
|
149
149
|
withUserAgentSuffix,
|
|
150
150
|
withoutTrailingSlash,
|
|
151
151
|
zodSchema
|
|
152
|
-
} from "./chunk-
|
|
152
|
+
} from "./chunk-BJV6MDBE.js";
|
|
153
153
|
import {
|
|
154
154
|
SpanStatusCode,
|
|
155
155
|
context,
|
|
@@ -301,7 +301,7 @@ var require_dist = __commonJS({
|
|
|
301
301
|
}
|
|
302
302
|
});
|
|
303
303
|
|
|
304
|
-
// ../../packages/core/dist/chunk-
|
|
304
|
+
// ../../packages/core/dist/chunk-EFR4JHPL.js
|
|
305
305
|
import { constants } from "node:fs";
|
|
306
306
|
import { access, readFile } from "node:fs/promises";
|
|
307
307
|
import path from "node:path";
|
|
@@ -419,7 +419,7 @@ __export(external_exports2, {
|
|
|
419
419
|
void: () => voidType
|
|
420
420
|
});
|
|
421
421
|
|
|
422
|
-
// ../../packages/core/dist/chunk-
|
|
422
|
+
// ../../packages/core/dist/chunk-EFR4JHPL.js
|
|
423
423
|
var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
|
|
424
424
|
var TEST_MESSAGE_ROLES = TEST_MESSAGE_ROLE_VALUES;
|
|
425
425
|
var TEST_MESSAGE_ROLE_SET = new Set(TEST_MESSAGE_ROLE_VALUES);
|
|
@@ -498,9 +498,6 @@ var EVALUATOR_KIND_SET = new Set(EVALUATOR_KIND_VALUES);
|
|
|
498
498
|
function isEvaluatorKind(value) {
|
|
499
499
|
return typeof value === "string" && EVALUATOR_KIND_SET.has(value);
|
|
500
500
|
}
|
|
501
|
-
function getHitCount(result) {
|
|
502
|
-
return result.hits.length;
|
|
503
|
-
}
|
|
504
501
|
async function fileExists(filePath) {
|
|
505
502
|
try {
|
|
506
503
|
await access(filePath, constants.F_OK);
|
|
@@ -1810,6 +1807,7 @@ var AGENT_PROVIDER_KINDS = [
|
|
|
1810
1807
|
"copilot-sdk",
|
|
1811
1808
|
"copilot-cli",
|
|
1812
1809
|
"pi-coding-agent",
|
|
1810
|
+
"pi-agent-sdk",
|
|
1813
1811
|
"claude",
|
|
1814
1812
|
"claude-cli",
|
|
1815
1813
|
"claude-sdk",
|
|
@@ -17622,7 +17620,7 @@ var AzureProvider = class {
|
|
|
17622
17620
|
};
|
|
17623
17621
|
this.retryConfig = config.retry;
|
|
17624
17622
|
const azure = createAzure(buildAzureOptions(config));
|
|
17625
|
-
this.model = azure(config.deploymentName);
|
|
17623
|
+
this.model = azure.chat(config.deploymentName);
|
|
17626
17624
|
}
|
|
17627
17625
|
id;
|
|
17628
17626
|
kind = "azure";
|
|
@@ -20912,6 +20910,29 @@ var MockProvider = class {
|
|
|
20912
20910
|
return this.delayMs;
|
|
20913
20911
|
}
|
|
20914
20912
|
};
|
|
20913
|
+
function extractPiTextContent(content) {
|
|
20914
|
+
if (typeof content === "string") {
|
|
20915
|
+
return content;
|
|
20916
|
+
}
|
|
20917
|
+
if (!Array.isArray(content)) {
|
|
20918
|
+
return void 0;
|
|
20919
|
+
}
|
|
20920
|
+
const textParts = [];
|
|
20921
|
+
for (const part of content) {
|
|
20922
|
+
if (!part || typeof part !== "object") {
|
|
20923
|
+
continue;
|
|
20924
|
+
}
|
|
20925
|
+
const p = part;
|
|
20926
|
+
if (p.type === "text" && typeof p.text === "string") {
|
|
20927
|
+
textParts.push(p.text);
|
|
20928
|
+
}
|
|
20929
|
+
}
|
|
20930
|
+
return textParts.length > 0 ? textParts.join("\n") : void 0;
|
|
20931
|
+
}
|
|
20932
|
+
function toFiniteNumber(value) {
|
|
20933
|
+
if (typeof value === "number" && Number.isFinite(value)) return value;
|
|
20934
|
+
return void 0;
|
|
20935
|
+
}
|
|
20915
20936
|
var piAgentModule = null;
|
|
20916
20937
|
var piAiModule = null;
|
|
20917
20938
|
async function loadPiModules() {
|
|
@@ -20952,7 +20973,8 @@ var PiAgentSdkProvider = class {
|
|
|
20952
20973
|
throw new Error("Pi agent SDK request was aborted before execution");
|
|
20953
20974
|
}
|
|
20954
20975
|
const { Agent, getModel, getEnvApiKey } = await loadPiModules();
|
|
20955
|
-
const
|
|
20976
|
+
const startTimeIso = (/* @__PURE__ */ new Date()).toISOString();
|
|
20977
|
+
const startMs = Date.now();
|
|
20956
20978
|
const providerName = this.config.provider ?? "anthropic";
|
|
20957
20979
|
const modelId = this.config.model ?? "claude-sonnet-4-20250514";
|
|
20958
20980
|
const model = getModel(providerName, modelId);
|
|
@@ -20969,16 +20991,73 @@ var PiAgentSdkProvider = class {
|
|
|
20969
20991
|
return this.config.apiKey ?? getEnvApiKey(provider) ?? void 0;
|
|
20970
20992
|
}
|
|
20971
20993
|
});
|
|
20972
|
-
|
|
20973
|
-
let
|
|
20994
|
+
let tokenUsage;
|
|
20995
|
+
let costUsd;
|
|
20996
|
+
const toolTrackers = /* @__PURE__ */ new Map();
|
|
20997
|
+
const completedToolResults = /* @__PURE__ */ new Map();
|
|
20974
20998
|
const unsubscribe = agent.subscribe((event) => {
|
|
20975
|
-
|
|
20976
|
-
|
|
20977
|
-
|
|
20978
|
-
|
|
20979
|
-
|
|
20980
|
-
|
|
20999
|
+
switch (event.type) {
|
|
21000
|
+
case "message_end": {
|
|
21001
|
+
const msg = event.message;
|
|
21002
|
+
if (msg && typeof msg === "object" && "role" in msg && msg.role === "assistant" && "usage" in msg) {
|
|
21003
|
+
const usage = msg.usage;
|
|
21004
|
+
if (usage && typeof usage === "object") {
|
|
21005
|
+
const u = usage;
|
|
21006
|
+
const input = toFiniteNumber(u.input);
|
|
21007
|
+
const output = toFiniteNumber(u.output);
|
|
21008
|
+
const cached = toFiniteNumber(u.cacheRead);
|
|
21009
|
+
let callDelta;
|
|
21010
|
+
if (input !== void 0 || output !== void 0) {
|
|
21011
|
+
callDelta = {
|
|
21012
|
+
input: input ?? 0,
|
|
21013
|
+
output: output ?? 0,
|
|
21014
|
+
...cached !== void 0 ? { cached } : {}
|
|
21015
|
+
};
|
|
21016
|
+
tokenUsage = {
|
|
21017
|
+
input: (tokenUsage?.input ?? 0) + callDelta.input,
|
|
21018
|
+
output: (tokenUsage?.output ?? 0) + callDelta.output,
|
|
21019
|
+
...cached !== void 0 ? { cached: (tokenUsage?.cached ?? 0) + cached } : tokenUsage?.cached !== void 0 ? { cached: tokenUsage.cached } : {}
|
|
21020
|
+
};
|
|
21021
|
+
}
|
|
21022
|
+
const cost = u.cost;
|
|
21023
|
+
if (cost && typeof cost === "object") {
|
|
21024
|
+
const total = toFiniteNumber(cost.total);
|
|
21025
|
+
if (total !== void 0) {
|
|
21026
|
+
costUsd = (costUsd ?? 0) + total;
|
|
21027
|
+
}
|
|
21028
|
+
}
|
|
21029
|
+
request.streamCallbacks?.onLlmCallEnd?.(modelId, callDelta);
|
|
21030
|
+
}
|
|
20981
21031
|
}
|
|
21032
|
+
break;
|
|
21033
|
+
}
|
|
21034
|
+
case "tool_execution_start": {
|
|
21035
|
+
toolTrackers.set(event.toolCallId, {
|
|
21036
|
+
toolCallId: event.toolCallId,
|
|
21037
|
+
toolName: event.toolName,
|
|
21038
|
+
args: event.args,
|
|
21039
|
+
startMs: Date.now(),
|
|
21040
|
+
startTime: (/* @__PURE__ */ new Date()).toISOString()
|
|
21041
|
+
});
|
|
21042
|
+
request.streamCallbacks?.onToolCallStart?.(event.toolName, event.toolCallId);
|
|
21043
|
+
break;
|
|
21044
|
+
}
|
|
21045
|
+
case "tool_execution_end": {
|
|
21046
|
+
const tracker = toolTrackers.get(event.toolCallId);
|
|
21047
|
+
const durationMs = tracker ? Date.now() - tracker.startMs : 0;
|
|
21048
|
+
completedToolResults.set(event.toolCallId, {
|
|
21049
|
+
output: event.result,
|
|
21050
|
+
durationMs
|
|
21051
|
+
});
|
|
21052
|
+
request.streamCallbacks?.onToolCallEnd?.(
|
|
21053
|
+
event.toolName,
|
|
21054
|
+
tracker?.args,
|
|
21055
|
+
event.result,
|
|
21056
|
+
durationMs,
|
|
21057
|
+
event.toolCallId
|
|
21058
|
+
);
|
|
21059
|
+
toolTrackers.delete(event.toolCallId);
|
|
21060
|
+
break;
|
|
20982
21061
|
}
|
|
20983
21062
|
}
|
|
20984
21063
|
});
|
|
@@ -20997,10 +21076,12 @@ var PiAgentSdkProvider = class {
|
|
|
20997
21076
|
}
|
|
20998
21077
|
await agent.waitForIdle();
|
|
20999
21078
|
const agentMessages = agent.state.messages;
|
|
21079
|
+
const output = [];
|
|
21000
21080
|
for (const msg of agentMessages) {
|
|
21001
|
-
output.push(convertAgentMessage(msg));
|
|
21081
|
+
output.push(convertAgentMessage(msg, toolTrackers, completedToolResults));
|
|
21002
21082
|
}
|
|
21003
|
-
const
|
|
21083
|
+
const endTimeIso = (/* @__PURE__ */ new Date()).toISOString();
|
|
21084
|
+
const durationMs = Date.now() - startMs;
|
|
21004
21085
|
return {
|
|
21005
21086
|
raw: {
|
|
21006
21087
|
messages: agentMessages,
|
|
@@ -21009,49 +21090,54 @@ var PiAgentSdkProvider = class {
|
|
|
21009
21090
|
provider: this.config.provider
|
|
21010
21091
|
},
|
|
21011
21092
|
output,
|
|
21012
|
-
|
|
21093
|
+
tokenUsage,
|
|
21094
|
+
costUsd,
|
|
21095
|
+
durationMs,
|
|
21096
|
+
startTime: startTimeIso,
|
|
21097
|
+
endTime: endTimeIso
|
|
21013
21098
|
};
|
|
21014
21099
|
} finally {
|
|
21015
21100
|
unsubscribe();
|
|
21016
21101
|
}
|
|
21017
21102
|
}
|
|
21018
21103
|
};
|
|
21019
|
-
function
|
|
21020
|
-
if (typeof content === "string") {
|
|
21021
|
-
return content;
|
|
21022
|
-
}
|
|
21023
|
-
if (!Array.isArray(content)) {
|
|
21024
|
-
return void 0;
|
|
21025
|
-
}
|
|
21026
|
-
const textParts = [];
|
|
21027
|
-
for (const part of content) {
|
|
21028
|
-
if (!part || typeof part !== "object") {
|
|
21029
|
-
continue;
|
|
21030
|
-
}
|
|
21031
|
-
const p = part;
|
|
21032
|
-
if (p.type === "text" && typeof p.text === "string") {
|
|
21033
|
-
textParts.push(p.text);
|
|
21034
|
-
}
|
|
21035
|
-
}
|
|
21036
|
-
return textParts.length > 0 ? textParts.join("\n") : void 0;
|
|
21037
|
-
}
|
|
21038
|
-
function convertAgentMessage(message) {
|
|
21104
|
+
function convertAgentMessage(message, toolTrackers, completedToolResults) {
|
|
21039
21105
|
if (!message || typeof message !== "object") {
|
|
21040
21106
|
return { role: "unknown", content: String(message) };
|
|
21041
21107
|
}
|
|
21042
21108
|
const msg = message;
|
|
21043
21109
|
const role = typeof msg.role === "string" ? msg.role : "unknown";
|
|
21044
|
-
const content =
|
|
21045
|
-
const toolCalls = extractToolCalls3(msg.content);
|
|
21110
|
+
const content = extractPiTextContent(msg.content);
|
|
21111
|
+
const toolCalls = extractToolCalls3(msg.content, toolTrackers, completedToolResults);
|
|
21046
21112
|
const startTime = typeof msg.timestamp === "number" ? new Date(msg.timestamp).toISOString() : typeof msg.timestamp === "string" ? msg.timestamp : void 0;
|
|
21113
|
+
let msgTokenUsage;
|
|
21114
|
+
if (msg.usage && typeof msg.usage === "object") {
|
|
21115
|
+
const u = msg.usage;
|
|
21116
|
+
const input = toFiniteNumber(u.input);
|
|
21117
|
+
const output = toFiniteNumber(u.output);
|
|
21118
|
+
if (input !== void 0 || output !== void 0) {
|
|
21119
|
+
msgTokenUsage = {
|
|
21120
|
+
input: input ?? 0,
|
|
21121
|
+
output: output ?? 0,
|
|
21122
|
+
...toFiniteNumber(u.cacheRead) !== void 0 ? { cached: toFiniteNumber(u.cacheRead) } : {}
|
|
21123
|
+
};
|
|
21124
|
+
}
|
|
21125
|
+
}
|
|
21126
|
+
const metadata = {};
|
|
21127
|
+
if (msg.api) metadata.api = msg.api;
|
|
21128
|
+
if (msg.provider) metadata.provider = msg.provider;
|
|
21129
|
+
if (msg.model) metadata.model = msg.model;
|
|
21130
|
+
if (msg.stopReason) metadata.stopReason = msg.stopReason;
|
|
21047
21131
|
return {
|
|
21048
21132
|
role,
|
|
21049
21133
|
content,
|
|
21050
21134
|
toolCalls: toolCalls.length > 0 ? toolCalls : void 0,
|
|
21051
|
-
startTime
|
|
21135
|
+
startTime,
|
|
21136
|
+
metadata: Object.keys(metadata).length > 0 ? metadata : void 0,
|
|
21137
|
+
tokenUsage: msgTokenUsage
|
|
21052
21138
|
};
|
|
21053
21139
|
}
|
|
21054
|
-
function extractToolCalls3(content) {
|
|
21140
|
+
function extractToolCalls3(content, toolTrackers, completedToolResults) {
|
|
21055
21141
|
if (!Array.isArray(content)) {
|
|
21056
21142
|
return [];
|
|
21057
21143
|
}
|
|
@@ -21062,10 +21148,17 @@ function extractToolCalls3(content) {
|
|
|
21062
21148
|
}
|
|
21063
21149
|
const p = part;
|
|
21064
21150
|
if (p.type === "toolCall" && typeof p.name === "string") {
|
|
21151
|
+
const id = typeof p.id === "string" ? p.id : void 0;
|
|
21152
|
+
const tracker = id ? toolTrackers.get(id) : void 0;
|
|
21153
|
+
const completed = id ? completedToolResults.get(id) : void 0;
|
|
21065
21154
|
toolCalls.push({
|
|
21066
21155
|
tool: p.name,
|
|
21067
21156
|
input: p.arguments,
|
|
21068
|
-
id
|
|
21157
|
+
id,
|
|
21158
|
+
output: completed?.output,
|
|
21159
|
+
durationMs: completed?.durationMs,
|
|
21160
|
+
startTime: tracker?.startTime,
|
|
21161
|
+
endTime: tracker?.startTime && completed?.durationMs !== void 0 ? new Date(new Date(tracker.startTime).getTime() + completed.durationMs).toISOString() : void 0
|
|
21069
21162
|
});
|
|
21070
21163
|
}
|
|
21071
21164
|
}
|
|
@@ -21597,14 +21690,14 @@ function extractTokenUsage(events) {
|
|
|
21597
21690
|
const usage = record.usage;
|
|
21598
21691
|
if (usage && typeof usage === "object") {
|
|
21599
21692
|
const u = usage;
|
|
21600
|
-
const input =
|
|
21601
|
-
const output =
|
|
21693
|
+
const input = toFiniteNumber(u.input_tokens ?? u.inputTokens ?? u.input);
|
|
21694
|
+
const output = toFiniteNumber(u.output_tokens ?? u.outputTokens ?? u.output);
|
|
21602
21695
|
if (input !== void 0 || output !== void 0) {
|
|
21603
21696
|
const result = {
|
|
21604
21697
|
input: input ?? 0,
|
|
21605
21698
|
output: output ?? 0
|
|
21606
21699
|
};
|
|
21607
|
-
const cached =
|
|
21700
|
+
const cached = toFiniteNumber(u.cache_read_input_tokens ?? u.cached ?? u.cachedTokens);
|
|
21608
21701
|
if (cached !== void 0) {
|
|
21609
21702
|
return { ...result, cached };
|
|
21610
21703
|
}
|
|
@@ -21629,13 +21722,13 @@ function aggregateUsageFromMessages(messages) {
|
|
|
21629
21722
|
const usage = m.usage;
|
|
21630
21723
|
if (!usage || typeof usage !== "object") continue;
|
|
21631
21724
|
const u = usage;
|
|
21632
|
-
const input =
|
|
21633
|
-
const output =
|
|
21725
|
+
const input = toFiniteNumber(u.input_tokens ?? u.inputTokens ?? u.input);
|
|
21726
|
+
const output = toFiniteNumber(u.output_tokens ?? u.outputTokens ?? u.output);
|
|
21634
21727
|
if (input !== void 0 || output !== void 0) {
|
|
21635
21728
|
found = true;
|
|
21636
21729
|
totalInput += input ?? 0;
|
|
21637
21730
|
totalOutput += output ?? 0;
|
|
21638
|
-
const cached =
|
|
21731
|
+
const cached = toFiniteNumber(u.cache_read_input_tokens ?? u.cached ?? u.cachedTokens);
|
|
21639
21732
|
if (cached !== void 0) {
|
|
21640
21733
|
totalCached = (totalCached ?? 0) + cached;
|
|
21641
21734
|
}
|
|
@@ -21648,10 +21741,6 @@ function aggregateUsageFromMessages(messages) {
|
|
|
21648
21741
|
}
|
|
21649
21742
|
return result;
|
|
21650
21743
|
}
|
|
21651
|
-
function toNumber(value) {
|
|
21652
|
-
if (typeof value === "number" && Number.isFinite(value)) return value;
|
|
21653
|
-
return void 0;
|
|
21654
|
-
}
|
|
21655
21744
|
function convertPiMessage(message) {
|
|
21656
21745
|
if (!message || typeof message !== "object") {
|
|
21657
21746
|
return void 0;
|
|
@@ -21661,7 +21750,7 @@ function convertPiMessage(message) {
|
|
|
21661
21750
|
if (typeof role !== "string") {
|
|
21662
21751
|
return void 0;
|
|
21663
21752
|
}
|
|
21664
|
-
const content =
|
|
21753
|
+
const content = extractPiTextContent(msg.content);
|
|
21665
21754
|
const toolCalls = extractToolCalls4(msg.content);
|
|
21666
21755
|
const startTime = typeof msg.timestamp === "number" ? new Date(msg.timestamp).toISOString() : typeof msg.timestamp === "string" ? msg.timestamp : void 0;
|
|
21667
21756
|
const metadata = {};
|
|
@@ -21678,25 +21767,6 @@ function convertPiMessage(message) {
|
|
|
21678
21767
|
metadata: Object.keys(metadata).length > 0 ? metadata : void 0
|
|
21679
21768
|
};
|
|
21680
21769
|
}
|
|
21681
|
-
function extractTextContent4(content) {
|
|
21682
|
-
if (typeof content === "string") {
|
|
21683
|
-
return content;
|
|
21684
|
-
}
|
|
21685
|
-
if (!Array.isArray(content)) {
|
|
21686
|
-
return void 0;
|
|
21687
|
-
}
|
|
21688
|
-
const textParts = [];
|
|
21689
|
-
for (const part of content) {
|
|
21690
|
-
if (!part || typeof part !== "object") {
|
|
21691
|
-
continue;
|
|
21692
|
-
}
|
|
21693
|
-
const p = part;
|
|
21694
|
-
if (p.type === "text" && typeof p.text === "string") {
|
|
21695
|
-
textParts.push(p.text);
|
|
21696
|
-
}
|
|
21697
|
-
}
|
|
21698
|
-
return textParts.length > 0 ? textParts.join("\n") : void 0;
|
|
21699
|
-
}
|
|
21700
21770
|
function extractToolCalls4(content) {
|
|
21701
21771
|
if (!Array.isArray(content)) {
|
|
21702
21772
|
return [];
|
|
@@ -23410,9 +23480,11 @@ function negateScore(score) {
|
|
|
23410
23480
|
...score,
|
|
23411
23481
|
score: negatedScore,
|
|
23412
23482
|
verdict: negatedVerdict,
|
|
23413
|
-
|
|
23414
|
-
|
|
23415
|
-
|
|
23483
|
+
assertions: score.assertions.map((a) => ({
|
|
23484
|
+
...a,
|
|
23485
|
+
passed: !a.passed,
|
|
23486
|
+
evidence: a.evidence ? `[Negated] ${a.evidence}` : void 0
|
|
23487
|
+
}))
|
|
23416
23488
|
};
|
|
23417
23489
|
}
|
|
23418
23490
|
function shellEscapePath(value) {
|
|
@@ -23912,9 +23984,13 @@ var CodeEvaluator = class {
|
|
|
23912
23984
|
);
|
|
23913
23985
|
const parsed = parseJsonSafe(stdout);
|
|
23914
23986
|
const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
|
|
23915
|
-
const
|
|
23916
|
-
|
|
23917
|
-
|
|
23987
|
+
const assertions = Array.isArray(parsed?.assertions) ? parsed.assertions.filter(
|
|
23988
|
+
(a) => typeof a === "object" && a !== null && typeof a.text === "string"
|
|
23989
|
+
).map((a) => ({
|
|
23990
|
+
text: String(a.text),
|
|
23991
|
+
passed: Boolean(a.passed),
|
|
23992
|
+
...typeof a.evidence === "string" ? { evidence: a.evidence } : {}
|
|
23993
|
+
})) : [];
|
|
23918
23994
|
const details = parsed?.details && typeof parsed.details === "object" && !Array.isArray(parsed.details) ? parsed.details : void 0;
|
|
23919
23995
|
const proxyUsage = getProxyUsage?.();
|
|
23920
23996
|
const evaluatorRawRequest = {
|
|
@@ -23930,10 +24006,8 @@ var CodeEvaluator = class {
|
|
|
23930
24006
|
return {
|
|
23931
24007
|
score,
|
|
23932
24008
|
verdict: scoreToVerdict(score),
|
|
23933
|
-
|
|
23934
|
-
|
|
23935
|
-
expectedAspectCount: hits.length + misses.length || 1,
|
|
23936
|
-
reasoning,
|
|
24009
|
+
assertions,
|
|
24010
|
+
expectedAspectCount: assertions.length || 1,
|
|
23937
24011
|
evaluatorRawRequest,
|
|
23938
24012
|
...details ? { details } : {},
|
|
23939
24013
|
tokenUsage: proxyUsage?.tokenUsage
|
|
@@ -23944,10 +24018,8 @@ var CodeEvaluator = class {
|
|
|
23944
24018
|
return {
|
|
23945
24019
|
score: 0,
|
|
23946
24020
|
verdict: "fail",
|
|
23947
|
-
|
|
23948
|
-
misses: [`Code evaluator failed: ${message}`],
|
|
24021
|
+
assertions: [{ text: `Code evaluator failed: ${message}`, passed: false }],
|
|
23949
24022
|
expectedAspectCount: 1,
|
|
23950
|
-
reasoning: message,
|
|
23951
24023
|
evaluatorRawRequest: {
|
|
23952
24024
|
command: this.command,
|
|
23953
24025
|
...this.cwd ? { cwd: this.cwd } : {},
|
|
@@ -24046,9 +24118,13 @@ Be concise and focused in your evaluation. Provide succinct, specific feedback r
|
|
|
24046
24118
|
{{${TEMPLATE_VARIABLES.ANSWER}}}`;
|
|
24047
24119
|
var freeformEvaluationSchema = external_exports2.object({
|
|
24048
24120
|
score: external_exports2.number().min(0).max(1).describe("Score between 0.0 and 1.0"),
|
|
24049
|
-
|
|
24050
|
-
|
|
24051
|
-
|
|
24121
|
+
assertions: external_exports2.array(
|
|
24122
|
+
external_exports2.object({
|
|
24123
|
+
text: external_exports2.string().describe("Brief description of what was checked"),
|
|
24124
|
+
passed: external_exports2.boolean().describe("Whether this aspect was satisfied"),
|
|
24125
|
+
evidence: external_exports2.string().describe("Concise evidence (1-2 sentences)").optional()
|
|
24126
|
+
})
|
|
24127
|
+
).describe("Per-aspect evaluation results \u2014 one entry per aspect checked").optional()
|
|
24052
24128
|
});
|
|
24053
24129
|
var rubricCheckResultSchema = external_exports2.object({
|
|
24054
24130
|
id: external_exports2.string().describe("The ID of the rubric item being checked"),
|
|
@@ -24150,17 +24226,12 @@ ${context2.fileChanges}`;
|
|
|
24150
24226
|
schema: freeformEvaluationSchema
|
|
24151
24227
|
});
|
|
24152
24228
|
const score = clampScore(data.score);
|
|
24153
|
-
const
|
|
24154
|
-
const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
|
|
24155
|
-
const reasoning = data.reasoning;
|
|
24156
|
-
const expectedAspectCount = Math.max(hits.length + misses.length, 1);
|
|
24229
|
+
const assertions = Array.isArray(data.assertions) ? data.assertions.slice(0, 8) : [];
|
|
24157
24230
|
return {
|
|
24158
24231
|
score,
|
|
24159
24232
|
verdict: scoreToVerdict(score),
|
|
24160
|
-
|
|
24161
|
-
|
|
24162
|
-
expectedAspectCount,
|
|
24163
|
-
reasoning,
|
|
24233
|
+
assertions,
|
|
24234
|
+
expectedAspectCount: Math.max(assertions.length, 1),
|
|
24164
24235
|
evaluatorRawRequest,
|
|
24165
24236
|
tokenUsage
|
|
24166
24237
|
};
|
|
@@ -24171,10 +24242,8 @@ ${context2.fileChanges}`;
|
|
|
24171
24242
|
return {
|
|
24172
24243
|
score: 0,
|
|
24173
24244
|
verdict: "skip",
|
|
24174
|
-
|
|
24175
|
-
misses: [`Grader parse failure after 3 attempts: ${message}`],
|
|
24245
|
+
assertions: [{ text: `Grader parse failure after 3 attempts: ${message}`, passed: false }],
|
|
24176
24246
|
expectedAspectCount: 1,
|
|
24177
|
-
reasoning: `Grader parse failure after 3 attempts: ${message}`,
|
|
24178
24247
|
evaluatorRawRequest
|
|
24179
24248
|
};
|
|
24180
24249
|
}
|
|
@@ -24204,14 +24273,12 @@ ${context2.fileChanges}`;
|
|
|
24204
24273
|
userPrompt: prompt,
|
|
24205
24274
|
schema: rubricEvaluationSchema
|
|
24206
24275
|
});
|
|
24207
|
-
const { score, verdict,
|
|
24276
|
+
const { score, verdict, assertions } = calculateRubricScore(data, rubrics);
|
|
24208
24277
|
return {
|
|
24209
24278
|
score,
|
|
24210
24279
|
verdict,
|
|
24211
|
-
|
|
24212
|
-
misses,
|
|
24280
|
+
assertions,
|
|
24213
24281
|
expectedAspectCount: rubrics.length,
|
|
24214
|
-
reasoning: data.overall_reasoning,
|
|
24215
24282
|
evaluatorRawRequest,
|
|
24216
24283
|
tokenUsage
|
|
24217
24284
|
};
|
|
@@ -24222,10 +24289,8 @@ ${context2.fileChanges}`;
|
|
|
24222
24289
|
return {
|
|
24223
24290
|
score: 0,
|
|
24224
24291
|
verdict: "skip",
|
|
24225
|
-
|
|
24226
|
-
misses: [`Grader parse failure after 3 attempts: ${message}`],
|
|
24292
|
+
assertions: [{ text: `Grader parse failure after 3 attempts: ${message}`, passed: false }],
|
|
24227
24293
|
expectedAspectCount: rubrics.length,
|
|
24228
|
-
reasoning: `Grader parse failure after 3 attempts: ${message}`,
|
|
24229
24294
|
evaluatorRawRequest
|
|
24230
24295
|
};
|
|
24231
24296
|
}
|
|
@@ -24250,14 +24315,12 @@ ${context2.fileChanges}`;
|
|
|
24250
24315
|
userPrompt: prompt,
|
|
24251
24316
|
schema: scoreRangeEvaluationSchema
|
|
24252
24317
|
});
|
|
24253
|
-
const { score, verdict,
|
|
24318
|
+
const { score, verdict, assertions, details } = calculateScoreRangeResult(data, rubrics);
|
|
24254
24319
|
return {
|
|
24255
24320
|
score,
|
|
24256
24321
|
verdict,
|
|
24257
|
-
|
|
24258
|
-
misses,
|
|
24322
|
+
assertions,
|
|
24259
24323
|
expectedAspectCount: rubrics.length,
|
|
24260
|
-
reasoning: data.overall_reasoning,
|
|
24261
24324
|
evaluatorRawRequest,
|
|
24262
24325
|
details,
|
|
24263
24326
|
tokenUsage
|
|
@@ -24269,10 +24332,8 @@ ${context2.fileChanges}`;
|
|
|
24269
24332
|
return {
|
|
24270
24333
|
score: 0,
|
|
24271
24334
|
verdict: "skip",
|
|
24272
|
-
|
|
24273
|
-
misses: [`Grader parse failure after 3 attempts: ${message}`],
|
|
24335
|
+
assertions: [{ text: `Grader parse failure after 3 attempts: ${message}`, passed: false }],
|
|
24274
24336
|
expectedAspectCount: rubrics.length,
|
|
24275
|
-
reasoning: `Grader parse failure after 3 attempts: ${message}`,
|
|
24276
24337
|
evaluatorRawRequest
|
|
24277
24338
|
};
|
|
24278
24339
|
}
|
|
@@ -24329,8 +24390,7 @@ ${context2.fileChanges}`;
|
|
|
24329
24390
|
return {
|
|
24330
24391
|
score: 0,
|
|
24331
24392
|
verdict: "fail",
|
|
24332
|
-
|
|
24333
|
-
misses: [`llm-grader built-in evaluation failed: ${message}`],
|
|
24393
|
+
assertions: [{ text: `llm-grader built-in evaluation failed: ${message}`, passed: false }],
|
|
24334
24394
|
expectedAspectCount: 1,
|
|
24335
24395
|
evaluatorRawRequest,
|
|
24336
24396
|
details: { mode: "built-in", error: message }
|
|
@@ -24380,8 +24440,9 @@ ${context2.fileChanges}`;
|
|
|
24380
24440
|
return {
|
|
24381
24441
|
score: 0,
|
|
24382
24442
|
verdict: "fail",
|
|
24383
|
-
|
|
24384
|
-
|
|
24443
|
+
assertions: [
|
|
24444
|
+
{ text: `llm-grader ${modeLabel} returned no assistant response`, passed: false }
|
|
24445
|
+
],
|
|
24385
24446
|
expectedAspectCount: 1,
|
|
24386
24447
|
evaluatorRawRequest,
|
|
24387
24448
|
details: { mode: modeLabel, grader_target: provider.targetName }
|
|
@@ -24399,8 +24460,9 @@ ${context2.fileChanges}`;
|
|
|
24399
24460
|
return {
|
|
24400
24461
|
score: 0,
|
|
24401
24462
|
verdict: "fail",
|
|
24402
|
-
|
|
24403
|
-
|
|
24463
|
+
assertions: [
|
|
24464
|
+
{ text: `llm-grader ${modeLabel} evaluation failed: ${message}`, passed: false }
|
|
24465
|
+
],
|
|
24404
24466
|
expectedAspectCount: 1,
|
|
24405
24467
|
evaluatorRawRequest,
|
|
24406
24468
|
details: {
|
|
@@ -24552,29 +24614,24 @@ ${outputSchema2}`;
|
|
|
24552
24614
|
const parsed = parseJsonFromText(text2);
|
|
24553
24615
|
if (rubrics && rubrics.length > 0) {
|
|
24554
24616
|
const data2 = rubricEvaluationSchema.parse(parsed);
|
|
24555
|
-
const { score: score2, verdict,
|
|
24617
|
+
const { score: score2, verdict, assertions: assertions2 } = calculateRubricScore(data2, rubrics);
|
|
24556
24618
|
return {
|
|
24557
24619
|
score: score2,
|
|
24558
24620
|
verdict,
|
|
24559
|
-
|
|
24560
|
-
misses: misses2,
|
|
24621
|
+
assertions: assertions2,
|
|
24561
24622
|
expectedAspectCount: rubrics.length,
|
|
24562
|
-
reasoning: data2.overall_reasoning,
|
|
24563
24623
|
evaluatorRawRequest,
|
|
24564
24624
|
details
|
|
24565
24625
|
};
|
|
24566
24626
|
}
|
|
24567
24627
|
const data = freeformEvaluationSchema.parse(parsed);
|
|
24568
24628
|
const score = clampScore(data.score);
|
|
24569
|
-
const
|
|
24570
|
-
const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
|
|
24629
|
+
const assertions = Array.isArray(data.assertions) ? data.assertions.slice(0, 8) : [];
|
|
24571
24630
|
return {
|
|
24572
24631
|
score,
|
|
24573
24632
|
verdict: scoreToVerdict(score),
|
|
24574
|
-
|
|
24575
|
-
|
|
24576
|
-
expectedAspectCount: Math.max(hits.length + misses.length, 1),
|
|
24577
|
-
reasoning: data.reasoning,
|
|
24633
|
+
assertions,
|
|
24634
|
+
expectedAspectCount: Math.max(assertions.length, 1),
|
|
24578
24635
|
evaluatorRawRequest,
|
|
24579
24636
|
details
|
|
24580
24637
|
};
|
|
@@ -24582,8 +24639,12 @@ ${outputSchema2}`;
|
|
|
24582
24639
|
return {
|
|
24583
24640
|
score: 0,
|
|
24584
24641
|
verdict: "fail",
|
|
24585
|
-
|
|
24586
|
-
|
|
24642
|
+
assertions: [
|
|
24643
|
+
{
|
|
24644
|
+
text: "Failed to parse llm-grader agent response as valid evaluation JSON",
|
|
24645
|
+
passed: false
|
|
24646
|
+
}
|
|
24647
|
+
],
|
|
24587
24648
|
expectedAspectCount: 1,
|
|
24588
24649
|
evaluatorRawRequest,
|
|
24589
24650
|
details
|
|
@@ -24712,9 +24773,13 @@ function buildOutputSchema() {
|
|
|
24712
24773
|
"",
|
|
24713
24774
|
"{",
|
|
24714
24775
|
' "score": <number between 0.0 and 1.0>,',
|
|
24715
|
-
' "
|
|
24716
|
-
|
|
24717
|
-
'
|
|
24776
|
+
' "assertions": [',
|
|
24777
|
+
" {",
|
|
24778
|
+
' "text": "<brief description of what was checked>",',
|
|
24779
|
+
' "passed": <boolean>,',
|
|
24780
|
+
' "evidence": "<concise evidence, 1-2 sentences, optional>"',
|
|
24781
|
+
" }",
|
|
24782
|
+
" ]",
|
|
24718
24783
|
"}"
|
|
24719
24784
|
].join("\n");
|
|
24720
24785
|
}
|
|
@@ -24739,8 +24804,7 @@ function substituteVariables(template, variables) {
|
|
|
24739
24804
|
}
|
|
24740
24805
|
function calculateRubricScore(result, rubrics) {
|
|
24741
24806
|
const rubricMap = new Map(rubrics.map((rubric) => [rubric.id, rubric]));
|
|
24742
|
-
const
|
|
24743
|
-
const misses = [];
|
|
24807
|
+
const assertions = [];
|
|
24744
24808
|
let totalWeight = 0;
|
|
24745
24809
|
let earnedWeight = 0;
|
|
24746
24810
|
let failedRequired = false;
|
|
@@ -24750,19 +24814,20 @@ function calculateRubricScore(result, rubrics) {
|
|
|
24750
24814
|
continue;
|
|
24751
24815
|
}
|
|
24752
24816
|
totalWeight += rubric.weight;
|
|
24817
|
+
assertions.push({
|
|
24818
|
+
text: `[${rubric.id}] ${rubric.outcome}`,
|
|
24819
|
+
passed: check.satisfied,
|
|
24820
|
+
evidence: check.reasoning
|
|
24821
|
+
});
|
|
24753
24822
|
if (check.satisfied) {
|
|
24754
24823
|
earnedWeight += rubric.weight;
|
|
24755
|
-
|
|
24756
|
-
|
|
24757
|
-
misses.push(`[${rubric.id}] ${rubric.outcome}: ${check.reasoning}`);
|
|
24758
|
-
if (rubric.required) {
|
|
24759
|
-
failedRequired = true;
|
|
24760
|
-
}
|
|
24824
|
+
} else if (rubric.required) {
|
|
24825
|
+
failedRequired = true;
|
|
24761
24826
|
}
|
|
24762
24827
|
}
|
|
24763
24828
|
const score = totalWeight > 0 ? Math.min(1, Math.max(0, earnedWeight / totalWeight)) : 0;
|
|
24764
24829
|
const verdict = failedRequired ? "fail" : scoreToVerdict(score);
|
|
24765
|
-
return { score, verdict,
|
|
24830
|
+
return { score, verdict, assertions };
|
|
24766
24831
|
}
|
|
24767
24832
|
function buildScoreRangeOutputSchema() {
|
|
24768
24833
|
return `You are an expert evaluator. Score the candidate answer on each criterion.
|
|
@@ -24782,8 +24847,7 @@ Important: The "score" must be an integer from 0 to 10 that falls within one of
|
|
|
24782
24847
|
}
|
|
24783
24848
|
function calculateScoreRangeResult(result, rubrics) {
|
|
24784
24849
|
const rubricMap = new Map(rubrics.map((rubric) => [rubric.id, rubric]));
|
|
24785
|
-
const
|
|
24786
|
-
const misses = [];
|
|
24850
|
+
const assertions = [];
|
|
24787
24851
|
const rawScores = {};
|
|
24788
24852
|
let totalWeight = 0;
|
|
24789
24853
|
let weightedScoreSum = 0;
|
|
@@ -24809,24 +24873,22 @@ function calculateScoreRangeResult(result, rubrics) {
|
|
|
24809
24873
|
);
|
|
24810
24874
|
const rangeDescription = matchingRange?.outcome ?? "";
|
|
24811
24875
|
const criterionLabel = rubric.outcome ?? rubric.id;
|
|
24812
|
-
const
|
|
24813
|
-
const scoreInfo = `[${rubric.id}] ${criterionLabel} - Score: ${rawScore}/10 (${rangeDescription})${reasoningText}`;
|
|
24876
|
+
const passed = !(requiredMinScore !== void 0 && rawScore < requiredMinScore) && rawScore >= 7;
|
|
24814
24877
|
if (requiredMinScore !== void 0 && rawScore < requiredMinScore) {
|
|
24815
24878
|
failedRequired = true;
|
|
24816
|
-
misses.push(scoreInfo);
|
|
24817
|
-
} else if (rawScore >= 7) {
|
|
24818
|
-
hits.push(scoreInfo);
|
|
24819
|
-
} else {
|
|
24820
|
-
misses.push(scoreInfo);
|
|
24821
24879
|
}
|
|
24880
|
+
assertions.push({
|
|
24881
|
+
text: `[${rubric.id}] ${criterionLabel} - Score: ${rawScore}/10 (${rangeDescription})`,
|
|
24882
|
+
passed,
|
|
24883
|
+
evidence: check.reasoning
|
|
24884
|
+
});
|
|
24822
24885
|
}
|
|
24823
24886
|
const score = totalWeight > 0 ? Math.min(1, Math.max(0, weightedScoreSum / totalWeight)) : 0;
|
|
24824
24887
|
const verdict = failedRequired ? "fail" : scoreToVerdict(score);
|
|
24825
24888
|
return {
|
|
24826
24889
|
score,
|
|
24827
24890
|
verdict,
|
|
24828
|
-
|
|
24829
|
-
misses,
|
|
24891
|
+
assertions,
|
|
24830
24892
|
details: {
|
|
24831
24893
|
raw_scores: rawScores,
|
|
24832
24894
|
normalization: "score / 10",
|
|
@@ -25000,9 +25062,7 @@ var CompositeEvaluator = class {
|
|
|
25000
25062
|
let totalWeight = 0;
|
|
25001
25063
|
let weightedSum = 0;
|
|
25002
25064
|
let evaluatedCount = 0;
|
|
25003
|
-
const
|
|
25004
|
-
const allMisses = [];
|
|
25005
|
-
const reasoningParts = [];
|
|
25065
|
+
const allAssertions = [];
|
|
25006
25066
|
const scores = [];
|
|
25007
25067
|
for (const member of results) {
|
|
25008
25068
|
const weight = weights?.[member.id] ?? 1;
|
|
@@ -25012,9 +25072,7 @@ var CompositeEvaluator = class {
|
|
|
25012
25072
|
score: member.result.score,
|
|
25013
25073
|
weight,
|
|
25014
25074
|
verdict: member.result.verdict,
|
|
25015
|
-
|
|
25016
|
-
misses: [...member.result.misses],
|
|
25017
|
-
reasoning: member.result.reasoning,
|
|
25075
|
+
assertions: [...member.result.assertions],
|
|
25018
25076
|
evaluatorRawRequest: member.result.evaluatorRawRequest,
|
|
25019
25077
|
scores: member.result.scores,
|
|
25020
25078
|
details: member.result.details,
|
|
@@ -25026,20 +25084,16 @@ var CompositeEvaluator = class {
|
|
|
25026
25084
|
evaluatedCount++;
|
|
25027
25085
|
totalWeight += weight;
|
|
25028
25086
|
weightedSum += member.result.score * weight;
|
|
25029
|
-
|
|
25030
|
-
|
|
25031
|
-
|
|
25032
|
-
reasoningParts.push(`${member.id}: ${member.result.reasoning}`);
|
|
25033
|
-
}
|
|
25087
|
+
allAssertions.push(
|
|
25088
|
+
...member.result.assertions.map((a) => ({ ...a, text: `[${member.id}] ${a.text}` }))
|
|
25089
|
+
);
|
|
25034
25090
|
}
|
|
25035
25091
|
if (evaluatedCount === 0 && results.length > 0) {
|
|
25036
25092
|
return {
|
|
25037
25093
|
score: 0,
|
|
25038
25094
|
verdict: "skip",
|
|
25039
|
-
|
|
25040
|
-
misses: [],
|
|
25095
|
+
assertions: [{ text: "All evaluators skipped (infrastructure failure)", passed: false }],
|
|
25041
25096
|
expectedAspectCount: 1,
|
|
25042
|
-
reasoning: "All evaluators skipped (infrastructure failure)",
|
|
25043
25097
|
evaluatorRawRequest: {
|
|
25044
25098
|
aggregator: "weighted_average",
|
|
25045
25099
|
...weights ? { weights } : {}
|
|
@@ -25051,10 +25105,8 @@ var CompositeEvaluator = class {
|
|
|
25051
25105
|
return {
|
|
25052
25106
|
score: clampScore(finalScore),
|
|
25053
25107
|
verdict: scoreToVerdict(finalScore),
|
|
25054
|
-
|
|
25055
|
-
|
|
25056
|
-
expectedAspectCount: Math.max(allHits.length + allMisses.length, 1),
|
|
25057
|
-
reasoning: reasoningParts.length > 0 ? reasoningParts.join("; ") : void 0,
|
|
25108
|
+
assertions: allAssertions,
|
|
25109
|
+
expectedAspectCount: allAssertions.length || 1,
|
|
25058
25110
|
evaluatorRawRequest: {
|
|
25059
25111
|
aggregator: "weighted_average",
|
|
25060
25112
|
...weights ? { weights } : {}
|
|
@@ -25064,11 +25116,8 @@ var CompositeEvaluator = class {
|
|
|
25064
25116
|
}
|
|
25065
25117
|
runThreshold(results, threshold) {
|
|
25066
25118
|
const scores = [];
|
|
25067
|
-
const
|
|
25068
|
-
const allMisses = [];
|
|
25069
|
-
const reasoningParts = [];
|
|
25119
|
+
const allAssertions = [];
|
|
25070
25120
|
let passingCount = 0;
|
|
25071
|
-
let borderlineCount = 0;
|
|
25072
25121
|
let evaluatedCount = 0;
|
|
25073
25122
|
for (const member of results) {
|
|
25074
25123
|
scores.push({
|
|
@@ -25076,9 +25125,7 @@ var CompositeEvaluator = class {
|
|
|
25076
25125
|
type: member.type,
|
|
25077
25126
|
score: member.result.score,
|
|
25078
25127
|
verdict: member.result.verdict,
|
|
25079
|
-
|
|
25080
|
-
misses: [...member.result.misses],
|
|
25081
|
-
reasoning: member.result.reasoning,
|
|
25128
|
+
assertions: [...member.result.assertions],
|
|
25082
25129
|
evaluatorRawRequest: member.result.evaluatorRawRequest,
|
|
25083
25130
|
scores: member.result.scores,
|
|
25084
25131
|
details: member.result.details,
|
|
@@ -25091,24 +25138,17 @@ var CompositeEvaluator = class {
|
|
|
25091
25138
|
const isPassing = member.result.verdict === "pass" || member.result.verdict === "borderline";
|
|
25092
25139
|
if (isPassing) {
|
|
25093
25140
|
passingCount++;
|
|
25094
|
-
if (member.result.verdict === "borderline") {
|
|
25095
|
-
borderlineCount++;
|
|
25096
|
-
}
|
|
25097
|
-
}
|
|
25098
|
-
allHits.push(...member.result.hits.map((h) => `[${member.id}] ${h}`));
|
|
25099
|
-
allMisses.push(...member.result.misses.map((m) => `[${member.id}] ${m}`));
|
|
25100
|
-
if (member.result.reasoning) {
|
|
25101
|
-
reasoningParts.push(`${member.id}: ${member.result.reasoning}`);
|
|
25102
25141
|
}
|
|
25142
|
+
allAssertions.push(
|
|
25143
|
+
...member.result.assertions.map((a) => ({ ...a, text: `[${member.id}] ${a.text}` }))
|
|
25144
|
+
);
|
|
25103
25145
|
}
|
|
25104
25146
|
if (evaluatedCount === 0 && results.length > 0) {
|
|
25105
25147
|
return {
|
|
25106
25148
|
score: 0,
|
|
25107
25149
|
verdict: "skip",
|
|
25108
|
-
|
|
25109
|
-
misses: [],
|
|
25150
|
+
assertions: [{ text: "All evaluators skipped (infrastructure failure)", passed: false }],
|
|
25110
25151
|
expectedAspectCount: 1,
|
|
25111
|
-
reasoning: "All evaluators skipped (infrastructure failure)",
|
|
25112
25152
|
evaluatorRawRequest: {
|
|
25113
25153
|
aggregator: "threshold",
|
|
25114
25154
|
threshold
|
|
@@ -25119,19 +25159,15 @@ var CompositeEvaluator = class {
|
|
|
25119
25159
|
const totalCount = evaluatedCount;
|
|
25120
25160
|
const score = totalCount > 0 ? passingCount / totalCount : 0;
|
|
25121
25161
|
const pass = score >= threshold;
|
|
25122
|
-
|
|
25123
|
-
|
|
25124
|
-
|
|
25125
|
-
|
|
25126
|
-
`${passingCount}/${totalCount} evaluators passed (threshold: ${threshold})`
|
|
25127
|
-
);
|
|
25162
|
+
allAssertions.unshift({
|
|
25163
|
+
text: `${passingCount}/${totalCount} evaluators passed (threshold: ${threshold})`,
|
|
25164
|
+
passed: pass
|
|
25165
|
+
});
|
|
25128
25166
|
return {
|
|
25129
25167
|
score: clampScore(score),
|
|
25130
25168
|
verdict: pass ? "pass" : "fail",
|
|
25131
|
-
|
|
25132
|
-
|
|
25133
|
-
expectedAspectCount: Math.max(allHits.length + allMisses.length, 1),
|
|
25134
|
-
reasoning: reasoningParts.join("; "),
|
|
25169
|
+
assertions: allAssertions,
|
|
25170
|
+
expectedAspectCount: allAssertions.length || 1,
|
|
25135
25171
|
evaluatorRawRequest: {
|
|
25136
25172
|
aggregator: "threshold",
|
|
25137
25173
|
threshold
|
|
@@ -25148,9 +25184,7 @@ var CompositeEvaluator = class {
|
|
|
25148
25184
|
score: member.result.score,
|
|
25149
25185
|
weight: weights?.[member.id] ?? 1,
|
|
25150
25186
|
verdict: member.result.verdict,
|
|
25151
|
-
|
|
25152
|
-
misses: [...member.result.misses],
|
|
25153
|
-
reasoning: member.result.reasoning,
|
|
25187
|
+
assertions: [...member.result.assertions],
|
|
25154
25188
|
evaluatorRawRequest: member.result.evaluatorRawRequest,
|
|
25155
25189
|
scores: member.result.scores,
|
|
25156
25190
|
details: member.result.details
|
|
@@ -25159,17 +25193,19 @@ var CompositeEvaluator = class {
|
|
|
25159
25193
|
const stdout = await executeScript(scriptPath, inputPayload, void 0, cwd);
|
|
25160
25194
|
const parsed = parseJsonSafe(stdout);
|
|
25161
25195
|
const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
|
|
25162
|
-
const
|
|
25163
|
-
|
|
25164
|
-
|
|
25196
|
+
const assertions = Array.isArray(parsed?.assertions) ? parsed.assertions.filter(
|
|
25197
|
+
(a) => typeof a === "object" && a !== null && typeof a.text === "string"
|
|
25198
|
+
).map((a) => ({
|
|
25199
|
+
text: String(a.text),
|
|
25200
|
+
passed: Boolean(a.passed),
|
|
25201
|
+
...typeof a.evidence === "string" ? { evidence: a.evidence } : {}
|
|
25202
|
+
})) : [];
|
|
25165
25203
|
const verdict = typeof parsed?.verdict === "string" && (parsed.verdict === "pass" || parsed.verdict === "fail" || parsed.verdict === "borderline") ? parsed.verdict : scoreToVerdict(score);
|
|
25166
25204
|
return {
|
|
25167
25205
|
score,
|
|
25168
25206
|
verdict,
|
|
25169
|
-
|
|
25170
|
-
|
|
25171
|
-
expectedAspectCount: hits.length + misses.length || 1,
|
|
25172
|
-
reasoning,
|
|
25207
|
+
assertions,
|
|
25208
|
+
expectedAspectCount: assertions.length || 1,
|
|
25173
25209
|
evaluatorRawRequest: {
|
|
25174
25210
|
aggregator: "code-grader",
|
|
25175
25211
|
script: scriptPath
|
|
@@ -25181,10 +25217,8 @@ var CompositeEvaluator = class {
|
|
|
25181
25217
|
return {
|
|
25182
25218
|
score: 0,
|
|
25183
25219
|
verdict: "fail",
|
|
25184
|
-
|
|
25185
|
-
misses: [`Code aggregator failed: ${message}`],
|
|
25220
|
+
assertions: [{ text: `Code aggregator failed: ${message}`, passed: false }],
|
|
25186
25221
|
expectedAspectCount: 1,
|
|
25187
|
-
reasoning: message,
|
|
25188
25222
|
evaluatorRawRequest: {
|
|
25189
25223
|
aggregator: "code-grader",
|
|
25190
25224
|
script: scriptPath,
|
|
@@ -25206,9 +25240,7 @@ var CompositeEvaluator = class {
|
|
|
25206
25240
|
type: member.type,
|
|
25207
25241
|
score: member.result.score,
|
|
25208
25242
|
verdict: member.result.verdict,
|
|
25209
|
-
|
|
25210
|
-
misses: [...member.result.misses],
|
|
25211
|
-
reasoning: member.result.reasoning,
|
|
25243
|
+
assertions: [...member.result.assertions],
|
|
25212
25244
|
evaluatorRawRequest: member.result.evaluatorRawRequest,
|
|
25213
25245
|
scores: member.result.scores,
|
|
25214
25246
|
details: member.result.details
|
|
@@ -25232,16 +25264,12 @@ var CompositeEvaluator = class {
|
|
|
25232
25264
|
});
|
|
25233
25265
|
const data2 = freeformEvaluationSchema.parse(parseJsonFromText(text2));
|
|
25234
25266
|
const score2 = clampScore(data2.score);
|
|
25235
|
-
const
|
|
25236
|
-
const misses2 = Array.isArray(data2.misses) ? data2.misses.filter(isNonEmptyString).slice(0, 4) : [];
|
|
25237
|
-
const reasoning2 = data2.reasoning;
|
|
25267
|
+
const assertions2 = Array.isArray(data2.assertions) ? data2.assertions.slice(0, 8) : [];
|
|
25238
25268
|
return {
|
|
25239
25269
|
score: score2,
|
|
25240
25270
|
verdict: scoreToVerdict(score2),
|
|
25241
|
-
|
|
25242
|
-
|
|
25243
|
-
expectedAspectCount: Math.max(hits2.length + misses2.length, 1),
|
|
25244
|
-
reasoning: reasoning2,
|
|
25271
|
+
assertions: assertions2,
|
|
25272
|
+
expectedAspectCount: Math.max(assertions2.length, 1),
|
|
25245
25273
|
evaluatorRawRequest,
|
|
25246
25274
|
scores
|
|
25247
25275
|
};
|
|
@@ -25256,16 +25284,12 @@ var CompositeEvaluator = class {
|
|
|
25256
25284
|
parseJsonFromText(extractLastAssistantContent(response.output))
|
|
25257
25285
|
);
|
|
25258
25286
|
const score = clampScore(data.score);
|
|
25259
|
-
const
|
|
25260
|
-
const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
|
|
25261
|
-
const reasoning = data.reasoning;
|
|
25287
|
+
const assertions = Array.isArray(data.assertions) ? data.assertions.slice(0, 8) : [];
|
|
25262
25288
|
return {
|
|
25263
25289
|
score,
|
|
25264
25290
|
verdict: scoreToVerdict(score),
|
|
25265
|
-
|
|
25266
|
-
|
|
25267
|
-
expectedAspectCount: Math.max(hits.length + misses.length, 1),
|
|
25268
|
-
reasoning,
|
|
25291
|
+
assertions,
|
|
25292
|
+
expectedAspectCount: Math.max(assertions.length, 1),
|
|
25269
25293
|
evaluatorRawRequest,
|
|
25270
25294
|
scores
|
|
25271
25295
|
};
|
|
@@ -25273,8 +25297,7 @@ var CompositeEvaluator = class {
|
|
|
25273
25297
|
return {
|
|
25274
25298
|
score: 0,
|
|
25275
25299
|
verdict: "fail",
|
|
25276
|
-
|
|
25277
|
-
misses: [],
|
|
25300
|
+
assertions: [{ text: "LLM aggregator failed", passed: false }],
|
|
25278
25301
|
expectedAspectCount: 1,
|
|
25279
25302
|
evaluatorRawRequest,
|
|
25280
25303
|
scores
|
|
@@ -25295,10 +25318,8 @@ var CostEvaluator = class {
|
|
|
25295
25318
|
return {
|
|
25296
25319
|
score: 0,
|
|
25297
25320
|
verdict: "fail",
|
|
25298
|
-
|
|
25299
|
-
misses: ["No cost data available in trace"],
|
|
25321
|
+
assertions: [{ text: "No cost data available in trace", passed: false }],
|
|
25300
25322
|
expectedAspectCount: 1,
|
|
25301
|
-
reasoning: "Execution cost not reported by provider",
|
|
25302
25323
|
evaluatorRawRequest: {
|
|
25303
25324
|
type: "cost",
|
|
25304
25325
|
budget,
|
|
@@ -25312,10 +25333,10 @@ var CostEvaluator = class {
|
|
|
25312
25333
|
return {
|
|
25313
25334
|
score,
|
|
25314
25335
|
verdict: passed ? "pass" : "fail",
|
|
25315
|
-
|
|
25316
|
-
|
|
25336
|
+
assertions: [
|
|
25337
|
+
passed ? { text: `Cost ${formatCost(costUsd)} <= ${formatCost(budget)} budget`, passed: true } : { text: `Cost ${formatCost(costUsd)} > ${formatCost(budget)} budget`, passed: false }
|
|
25338
|
+
],
|
|
25317
25339
|
expectedAspectCount: 1,
|
|
25318
|
-
reasoning: `Execution cost ${formatCost(costUsd)} (budget: ${formatCost(budget)})`,
|
|
25319
25340
|
evaluatorRawRequest: {
|
|
25320
25341
|
type: "cost",
|
|
25321
25342
|
budget,
|
|
@@ -25346,10 +25367,8 @@ var ExecutionMetricsEvaluator = class {
|
|
|
25346
25367
|
return {
|
|
25347
25368
|
score: 0,
|
|
25348
25369
|
verdict: "fail",
|
|
25349
|
-
|
|
25350
|
-
misses: ["No trace summary available"],
|
|
25370
|
+
assertions: [{ text: "No trace summary available", passed: false }],
|
|
25351
25371
|
expectedAspectCount: 1,
|
|
25352
|
-
reasoning: "Execution metrics not available - no trace summary provided",
|
|
25353
25372
|
evaluatorRawRequest: {
|
|
25354
25373
|
type: "execution-metrics",
|
|
25355
25374
|
config: this.extractConfiguredThresholds(),
|
|
@@ -25358,116 +25377,114 @@ var ExecutionMetricsEvaluator = class {
|
|
|
25358
25377
|
};
|
|
25359
25378
|
}
|
|
25360
25379
|
const narrowedTrace = trace2;
|
|
25361
|
-
const
|
|
25362
|
-
const misses = [];
|
|
25380
|
+
const assertions = [];
|
|
25363
25381
|
const actualMetrics = {};
|
|
25364
25382
|
if (max_tool_calls !== void 0 && narrowedTrace) {
|
|
25365
25383
|
const toolCalls = narrowedTrace.eventCount;
|
|
25366
25384
|
actualMetrics.tool_calls = toolCalls;
|
|
25367
25385
|
if (toolCalls <= max_tool_calls) {
|
|
25368
|
-
|
|
25386
|
+
assertions.push({ text: `Tool calls ${toolCalls} <= ${max_tool_calls} max`, passed: true });
|
|
25369
25387
|
} else {
|
|
25370
|
-
|
|
25388
|
+
assertions.push({ text: `Tool calls ${toolCalls} > ${max_tool_calls} max`, passed: false });
|
|
25371
25389
|
}
|
|
25372
25390
|
}
|
|
25373
25391
|
if (max_llm_calls !== void 0 && narrowedTrace) {
|
|
25374
25392
|
const llmCalls = narrowedTrace.llmCallCount;
|
|
25375
25393
|
if (llmCalls === void 0) {
|
|
25376
|
-
|
|
25394
|
+
assertions.push({ text: "LLM call count data not available", passed: false });
|
|
25377
25395
|
} else {
|
|
25378
25396
|
actualMetrics.llm_calls = llmCalls;
|
|
25379
25397
|
if (llmCalls <= max_llm_calls) {
|
|
25380
|
-
|
|
25398
|
+
assertions.push({ text: `LLM calls ${llmCalls} <= ${max_llm_calls} max`, passed: true });
|
|
25381
25399
|
} else {
|
|
25382
|
-
|
|
25400
|
+
assertions.push({ text: `LLM calls ${llmCalls} > ${max_llm_calls} max`, passed: false });
|
|
25383
25401
|
}
|
|
25384
25402
|
}
|
|
25385
25403
|
}
|
|
25386
25404
|
if (max_tokens !== void 0) {
|
|
25387
25405
|
if (!tokenUsage) {
|
|
25388
|
-
|
|
25406
|
+
assertions.push({ text: "Token usage data not available", passed: false });
|
|
25389
25407
|
} else {
|
|
25390
25408
|
const totalTokens = tokenUsage.input + tokenUsage.output;
|
|
25391
25409
|
actualMetrics.tokens = totalTokens;
|
|
25392
25410
|
if (totalTokens <= max_tokens) {
|
|
25393
|
-
|
|
25411
|
+
assertions.push({
|
|
25412
|
+
text: `Total tokens ${totalTokens} <= ${max_tokens} max`,
|
|
25413
|
+
passed: true
|
|
25414
|
+
});
|
|
25394
25415
|
} else {
|
|
25395
|
-
|
|
25416
|
+
assertions.push({
|
|
25417
|
+
text: `Total tokens ${totalTokens} > ${max_tokens} max`,
|
|
25418
|
+
passed: false
|
|
25419
|
+
});
|
|
25396
25420
|
}
|
|
25397
25421
|
}
|
|
25398
25422
|
}
|
|
25399
25423
|
if (max_cost_usd !== void 0) {
|
|
25400
25424
|
if (costUsd === void 0) {
|
|
25401
|
-
|
|
25425
|
+
assertions.push({ text: "Cost data not available", passed: false });
|
|
25402
25426
|
} else {
|
|
25403
25427
|
actualMetrics.cost_usd = costUsd;
|
|
25404
25428
|
const formatCost = (n) => `$${n.toFixed(4)}`;
|
|
25405
25429
|
if (costUsd <= max_cost_usd) {
|
|
25406
|
-
|
|
25430
|
+
assertions.push({
|
|
25431
|
+
text: `Cost ${formatCost(costUsd)} <= ${formatCost(max_cost_usd)} max`,
|
|
25432
|
+
passed: true
|
|
25433
|
+
});
|
|
25407
25434
|
} else {
|
|
25408
|
-
|
|
25435
|
+
assertions.push({
|
|
25436
|
+
text: `Cost ${formatCost(costUsd)} > ${formatCost(max_cost_usd)} max`,
|
|
25437
|
+
passed: false
|
|
25438
|
+
});
|
|
25409
25439
|
}
|
|
25410
25440
|
}
|
|
25411
25441
|
}
|
|
25412
25442
|
if (max_duration_ms !== void 0) {
|
|
25413
25443
|
if (durationMs === void 0) {
|
|
25414
|
-
|
|
25444
|
+
assertions.push({ text: "Duration data not available", passed: false });
|
|
25415
25445
|
} else {
|
|
25416
25446
|
actualMetrics.duration_ms = durationMs;
|
|
25417
25447
|
if (durationMs <= max_duration_ms) {
|
|
25418
|
-
|
|
25448
|
+
assertions.push({
|
|
25449
|
+
text: `Duration ${durationMs}ms <= ${max_duration_ms}ms max`,
|
|
25450
|
+
passed: true
|
|
25451
|
+
});
|
|
25419
25452
|
} else {
|
|
25420
|
-
|
|
25453
|
+
assertions.push({
|
|
25454
|
+
text: `Duration ${durationMs}ms > ${max_duration_ms}ms max`,
|
|
25455
|
+
passed: false
|
|
25456
|
+
});
|
|
25421
25457
|
}
|
|
25422
25458
|
}
|
|
25423
25459
|
}
|
|
25424
25460
|
if (target_exploration_ratio !== void 0 && narrowedTrace) {
|
|
25425
25461
|
const ratio = explorationRatio(narrowedTrace);
|
|
25426
25462
|
if (ratio === void 0) {
|
|
25427
|
-
|
|
25463
|
+
assertions.push({ text: "Exploration ratio not available (no tool calls)", passed: false });
|
|
25428
25464
|
} else {
|
|
25429
25465
|
actualMetrics.exploration_ratio = ratio;
|
|
25430
25466
|
const diff = Math.abs(ratio - target_exploration_ratio);
|
|
25431
25467
|
if (diff <= exploration_tolerance) {
|
|
25432
|
-
|
|
25433
|
-
`Exploration ratio ${ratio.toFixed(2)} within tolerance of target ${target_exploration_ratio}
|
|
25434
|
-
|
|
25468
|
+
assertions.push({
|
|
25469
|
+
text: `Exploration ratio ${ratio.toFixed(2)} within tolerance of target ${target_exploration_ratio}`,
|
|
25470
|
+
passed: true
|
|
25471
|
+
});
|
|
25435
25472
|
} else {
|
|
25436
|
-
|
|
25437
|
-
`Exploration ratio ${ratio.toFixed(2)} outside tolerance of target ${target_exploration_ratio} (diff: ${diff.toFixed(2)}, tolerance: ${exploration_tolerance})
|
|
25438
|
-
|
|
25473
|
+
assertions.push({
|
|
25474
|
+
text: `Exploration ratio ${ratio.toFixed(2)} outside tolerance of target ${target_exploration_ratio} (diff: ${diff.toFixed(2)}, tolerance: ${exploration_tolerance})`,
|
|
25475
|
+
passed: false
|
|
25476
|
+
});
|
|
25439
25477
|
}
|
|
25440
25478
|
}
|
|
25441
25479
|
}
|
|
25442
|
-
const totalChecks =
|
|
25443
|
-
const
|
|
25444
|
-
const
|
|
25445
|
-
if (actualMetrics.tool_calls !== void 0) {
|
|
25446
|
-
reasoningParts.push(`tool_calls=${actualMetrics.tool_calls}`);
|
|
25447
|
-
}
|
|
25448
|
-
if (actualMetrics.llm_calls !== void 0) {
|
|
25449
|
-
reasoningParts.push(`llm_calls=${actualMetrics.llm_calls}`);
|
|
25450
|
-
}
|
|
25451
|
-
if (actualMetrics.tokens !== void 0) {
|
|
25452
|
-
reasoningParts.push(`tokens=${actualMetrics.tokens}`);
|
|
25453
|
-
}
|
|
25454
|
-
if (actualMetrics.cost_usd !== void 0) {
|
|
25455
|
-
reasoningParts.push(`cost=$${actualMetrics.cost_usd.toFixed(4)}`);
|
|
25456
|
-
}
|
|
25457
|
-
if (actualMetrics.duration_ms !== void 0) {
|
|
25458
|
-
reasoningParts.push(`duration=${actualMetrics.duration_ms}ms`);
|
|
25459
|
-
}
|
|
25460
|
-
if (actualMetrics.exploration_ratio !== void 0) {
|
|
25461
|
-
reasoningParts.push(`exploration_ratio=${actualMetrics.exploration_ratio.toFixed(2)}`);
|
|
25462
|
-
}
|
|
25463
|
-
const reasoning = reasoningParts.length > 0 ? `execution-metrics ${reasoningParts.join(", ")}` : "No metrics evaluated";
|
|
25480
|
+
const totalChecks = assertions.length;
|
|
25481
|
+
const passedCount = assertions.filter((a) => a.passed).length;
|
|
25482
|
+
const score = totalChecks > 0 ? passedCount / totalChecks : 0;
|
|
25464
25483
|
return {
|
|
25465
25484
|
score,
|
|
25466
25485
|
verdict: scoreToVerdict(score),
|
|
25467
|
-
|
|
25468
|
-
misses,
|
|
25486
|
+
assertions,
|
|
25469
25487
|
expectedAspectCount: totalChecks || 1,
|
|
25470
|
-
reasoning,
|
|
25471
25488
|
evaluatorRawRequest: {
|
|
25472
25489
|
type: "execution-metrics",
|
|
25473
25490
|
config: this.extractConfiguredThresholds(),
|
|
@@ -25569,10 +25586,8 @@ var FieldAccuracyEvaluator = class {
|
|
|
25569
25586
|
return {
|
|
25570
25587
|
score: 0,
|
|
25571
25588
|
verdict: "fail",
|
|
25572
|
-
|
|
25573
|
-
|
|
25574
|
-
expectedAspectCount: this.config.fields.length,
|
|
25575
|
-
reasoning: "Candidate answer is not valid JSON"
|
|
25589
|
+
assertions: [{ text: "Failed to parse candidate answer as JSON", passed: false }],
|
|
25590
|
+
expectedAspectCount: this.config.fields.length
|
|
25576
25591
|
};
|
|
25577
25592
|
}
|
|
25578
25593
|
const expectedData = this.extractExpectedData(evalCase.expected_output);
|
|
@@ -25580,10 +25595,8 @@ var FieldAccuracyEvaluator = class {
|
|
|
25580
25595
|
return {
|
|
25581
25596
|
score: 0,
|
|
25582
25597
|
verdict: "fail",
|
|
25583
|
-
|
|
25584
|
-
|
|
25585
|
-
expectedAspectCount: this.config.fields.length,
|
|
25586
|
-
reasoning: "Could not extract expected data from expected_output"
|
|
25598
|
+
assertions: [{ text: "No expected data found in expected_output", passed: false }],
|
|
25599
|
+
expectedAspectCount: this.config.fields.length
|
|
25587
25600
|
};
|
|
25588
25601
|
}
|
|
25589
25602
|
const fieldResults = [];
|
|
@@ -25709,8 +25722,8 @@ var FieldAccuracyEvaluator = class {
|
|
|
25709
25722
|
*/
|
|
25710
25723
|
compareNumericTolerance(path46, candidateValue, expectedValue, fieldConfig, weight) {
|
|
25711
25724
|
const { tolerance = 0, relative = false } = fieldConfig;
|
|
25712
|
-
const candidateNum =
|
|
25713
|
-
const expectedNum =
|
|
25725
|
+
const candidateNum = toNumber(candidateValue);
|
|
25726
|
+
const expectedNum = toNumber(expectedValue);
|
|
25714
25727
|
if (candidateNum === null || expectedNum === null) {
|
|
25715
25728
|
return {
|
|
25716
25729
|
path: path46,
|
|
@@ -25801,18 +25814,14 @@ var FieldAccuracyEvaluator = class {
|
|
|
25801
25814
|
*/
|
|
25802
25815
|
aggregateResults(results) {
|
|
25803
25816
|
const aggregation = this.config.aggregation ?? "weighted_average";
|
|
25804
|
-
const
|
|
25805
|
-
const misses = [];
|
|
25817
|
+
const assertions = [];
|
|
25806
25818
|
for (const result of results) {
|
|
25807
|
-
|
|
25808
|
-
hits.push(result.message);
|
|
25809
|
-
} else {
|
|
25810
|
-
misses.push(result.message);
|
|
25811
|
-
}
|
|
25819
|
+
assertions.push({ text: result.message, passed: result.hit });
|
|
25812
25820
|
}
|
|
25813
25821
|
let score;
|
|
25814
25822
|
if (aggregation === "all_or_nothing") {
|
|
25815
|
-
|
|
25823
|
+
const hasFailed = assertions.some((a) => !a.passed);
|
|
25824
|
+
score = hasFailed ? 0 : 1;
|
|
25816
25825
|
} else {
|
|
25817
25826
|
const totalWeight = results.reduce((sum, r) => sum + r.weight, 0);
|
|
25818
25827
|
if (totalWeight === 0) {
|
|
@@ -25822,15 +25831,11 @@ var FieldAccuracyEvaluator = class {
|
|
|
25822
25831
|
score = weightedSum / totalWeight;
|
|
25823
25832
|
}
|
|
25824
25833
|
}
|
|
25825
|
-
const reasoning = `${hits.length}/${results.length} fields matched`;
|
|
25826
25834
|
return {
|
|
25827
25835
|
score: clampScore(score),
|
|
25828
25836
|
verdict: scoreToVerdict(score),
|
|
25829
|
-
|
|
25830
|
-
|
|
25831
|
-
misses: misses.slice(0, 4),
|
|
25832
|
-
expectedAspectCount: results.length,
|
|
25833
|
-
reasoning
|
|
25837
|
+
assertions,
|
|
25838
|
+
expectedAspectCount: results.length
|
|
25834
25839
|
};
|
|
25835
25840
|
}
|
|
25836
25841
|
};
|
|
@@ -25856,7 +25861,7 @@ function resolvePath(obj, path46) {
|
|
|
25856
25861
|
}
|
|
25857
25862
|
return current;
|
|
25858
25863
|
}
|
|
25859
|
-
function
|
|
25864
|
+
function toNumber(value) {
|
|
25860
25865
|
if (typeof value === "number") {
|
|
25861
25866
|
return value;
|
|
25862
25867
|
}
|
|
@@ -25937,10 +25942,8 @@ var LatencyEvaluator = class {
|
|
|
25937
25942
|
return {
|
|
25938
25943
|
score: 0,
|
|
25939
25944
|
verdict: "fail",
|
|
25940
|
-
|
|
25941
|
-
misses: ["No duration data available in trace"],
|
|
25945
|
+
assertions: [{ text: "No duration data available in trace", passed: false }],
|
|
25942
25946
|
expectedAspectCount: 1,
|
|
25943
|
-
reasoning: "Execution duration not reported by provider",
|
|
25944
25947
|
evaluatorRawRequest: {
|
|
25945
25948
|
type: "latency",
|
|
25946
25949
|
threshold,
|
|
@@ -25953,10 +25956,10 @@ var LatencyEvaluator = class {
|
|
|
25953
25956
|
return {
|
|
25954
25957
|
score,
|
|
25955
25958
|
verdict: passed ? "pass" : "fail",
|
|
25956
|
-
|
|
25957
|
-
|
|
25959
|
+
assertions: [
|
|
25960
|
+
passed ? { text: `Duration ${durationMs}ms <= ${threshold}ms threshold`, passed: true } : { text: `Duration ${durationMs}ms > ${threshold}ms threshold`, passed: false }
|
|
25961
|
+
],
|
|
25958
25962
|
expectedAspectCount: 1,
|
|
25959
|
-
reasoning: `Execution took ${durationMs}ms (threshold: ${threshold}ms)`,
|
|
25960
25963
|
evaluatorRawRequest: {
|
|
25961
25964
|
type: "latency",
|
|
25962
25965
|
threshold,
|
|
@@ -26030,23 +26033,25 @@ var SkillTriggerEvaluator = class {
|
|
|
26030
26033
|
return {
|
|
26031
26034
|
score: 1,
|
|
26032
26035
|
verdict: "pass",
|
|
26033
|
-
|
|
26034
|
-
|
|
26036
|
+
assertions: [
|
|
26037
|
+
{
|
|
26038
|
+
text: shouldTrigger ? evidence || `Skill "${skillName}" triggered as expected` : `Skill "${skillName}" correctly did not trigger`,
|
|
26039
|
+
passed: true
|
|
26040
|
+
}
|
|
26035
26041
|
],
|
|
26036
|
-
|
|
26037
|
-
expectedAspectCount: 1,
|
|
26038
|
-
reasoning: shouldTrigger ? "Skill triggered correctly" : "No false trigger"
|
|
26042
|
+
expectedAspectCount: 1
|
|
26039
26043
|
};
|
|
26040
26044
|
}
|
|
26041
26045
|
return {
|
|
26042
26046
|
score: 0,
|
|
26043
26047
|
verdict: "fail",
|
|
26044
|
-
|
|
26045
|
-
|
|
26046
|
-
|
|
26048
|
+
assertions: [
|
|
26049
|
+
{
|
|
26050
|
+
text: shouldTrigger ? firstTool ? `First tool was "${firstTool.tool}" \u2014 not a skill/read tool for "${skillName}"` : "No tool calls recorded" : evidence || `Skill "${skillName}" triggered unexpectedly`,
|
|
26051
|
+
passed: false
|
|
26052
|
+
}
|
|
26047
26053
|
],
|
|
26048
|
-
expectedAspectCount: 1
|
|
26049
|
-
reasoning: shouldTrigger ? `Skill "${skillName}" was not triggered` : "False trigger: skill fired when it should not have"
|
|
26054
|
+
expectedAspectCount: 1
|
|
26050
26055
|
};
|
|
26051
26056
|
}
|
|
26052
26057
|
};
|
|
@@ -26211,10 +26216,8 @@ var TokenUsageEvaluator = class {
|
|
|
26211
26216
|
return {
|
|
26212
26217
|
score: 0,
|
|
26213
26218
|
verdict: "fail",
|
|
26214
|
-
|
|
26215
|
-
misses: ["No token usage data available in trace"],
|
|
26219
|
+
assertions: [{ text: "No token usage data available in trace", passed: false }],
|
|
26216
26220
|
expectedAspectCount,
|
|
26217
|
-
reasoning: "Token usage not reported by provider",
|
|
26218
26221
|
evaluatorRawRequest: {
|
|
26219
26222
|
type: "token-usage",
|
|
26220
26223
|
max_total: maxTotal ?? null,
|
|
@@ -26228,37 +26231,34 @@ var TokenUsageEvaluator = class {
|
|
|
26228
26231
|
const output = usage.output;
|
|
26229
26232
|
const cached = usage.cached ?? 0;
|
|
26230
26233
|
const total = input + output + cached;
|
|
26231
|
-
const
|
|
26232
|
-
const misses = [];
|
|
26234
|
+
const assertions = [];
|
|
26233
26235
|
if (typeof maxInput === "number") {
|
|
26234
26236
|
if (input <= maxInput) {
|
|
26235
|
-
|
|
26237
|
+
assertions.push({ text: `Input tokens ${input} <= ${maxInput}`, passed: true });
|
|
26236
26238
|
} else {
|
|
26237
|
-
|
|
26239
|
+
assertions.push({ text: `Input tokens ${input} > ${maxInput}`, passed: false });
|
|
26238
26240
|
}
|
|
26239
26241
|
}
|
|
26240
26242
|
if (typeof maxOutput === "number") {
|
|
26241
26243
|
if (output <= maxOutput) {
|
|
26242
|
-
|
|
26244
|
+
assertions.push({ text: `Output tokens ${output} <= ${maxOutput}`, passed: true });
|
|
26243
26245
|
} else {
|
|
26244
|
-
|
|
26246
|
+
assertions.push({ text: `Output tokens ${output} > ${maxOutput}`, passed: false });
|
|
26245
26247
|
}
|
|
26246
26248
|
}
|
|
26247
26249
|
if (typeof maxTotal === "number") {
|
|
26248
26250
|
if (total <= maxTotal) {
|
|
26249
|
-
|
|
26251
|
+
assertions.push({ text: `Total tokens ${total} <= ${maxTotal}`, passed: true });
|
|
26250
26252
|
} else {
|
|
26251
|
-
|
|
26253
|
+
assertions.push({ text: `Total tokens ${total} > ${maxTotal}`, passed: false });
|
|
26252
26254
|
}
|
|
26253
26255
|
}
|
|
26254
|
-
const passed =
|
|
26256
|
+
const passed = assertions.every((a) => a.passed);
|
|
26255
26257
|
return {
|
|
26256
26258
|
score: passed ? 1 : 0,
|
|
26257
26259
|
verdict: passed ? "pass" : "fail",
|
|
26258
|
-
|
|
26259
|
-
misses,
|
|
26260
|
+
assertions,
|
|
26260
26261
|
expectedAspectCount,
|
|
26261
|
-
reasoning: `token-usage input=${input}, output=${output}, cached=${cached}, total=${total}`,
|
|
26262
26262
|
evaluatorRawRequest: {
|
|
26263
26263
|
type: "token-usage",
|
|
26264
26264
|
max_total: maxTotal ?? null,
|
|
@@ -26356,8 +26356,7 @@ var ToolTrajectoryEvaluator = class {
|
|
|
26356
26356
|
return {
|
|
26357
26357
|
score: 0,
|
|
26358
26358
|
verdict: "fail",
|
|
26359
|
-
|
|
26360
|
-
misses: ["No trace available for evaluation"],
|
|
26359
|
+
assertions: [{ text: "No trace available for evaluation", passed: false }],
|
|
26361
26360
|
expectedAspectCount: 1
|
|
26362
26361
|
};
|
|
26363
26362
|
}
|
|
@@ -26368,8 +26367,7 @@ var ToolTrajectoryEvaluator = class {
|
|
|
26368
26367
|
return {
|
|
26369
26368
|
score: 0,
|
|
26370
26369
|
verdict: "fail",
|
|
26371
|
-
|
|
26372
|
-
misses: ["No trace available for evaluation"],
|
|
26370
|
+
assertions: [{ text: "No trace available for evaluation", passed: false }],
|
|
26373
26371
|
expectedAspectCount: 1
|
|
26374
26372
|
};
|
|
26375
26373
|
}
|
|
@@ -26387,8 +26385,7 @@ var ToolTrajectoryEvaluator = class {
|
|
|
26387
26385
|
return {
|
|
26388
26386
|
score: 0,
|
|
26389
26387
|
verdict: "fail",
|
|
26390
|
-
|
|
26391
|
-
misses: [`Unknown mode: ${this.config.mode}`],
|
|
26388
|
+
assertions: [{ text: `Unknown mode: ${this.config.mode}`, passed: false }],
|
|
26392
26389
|
expectedAspectCount: 1
|
|
26393
26390
|
};
|
|
26394
26391
|
}
|
|
@@ -26437,28 +26434,32 @@ var ToolTrajectoryEvaluator = class {
|
|
|
26437
26434
|
return {
|
|
26438
26435
|
score: 1,
|
|
26439
26436
|
verdict: "pass",
|
|
26440
|
-
|
|
26441
|
-
misses: [],
|
|
26437
|
+
assertions: [{ text: "No tool requirements specified", passed: true }],
|
|
26442
26438
|
expectedAspectCount: 0
|
|
26443
26439
|
};
|
|
26444
26440
|
}
|
|
26445
|
-
const
|
|
26446
|
-
const misses = [];
|
|
26441
|
+
const assertions = [];
|
|
26447
26442
|
for (const toolName of toolNames) {
|
|
26448
26443
|
const required = minimums[toolName];
|
|
26449
26444
|
const actual = summary.toolCallsByName[toolName] ?? 0;
|
|
26450
26445
|
if (actual >= required) {
|
|
26451
|
-
|
|
26446
|
+
assertions.push({
|
|
26447
|
+
text: `${toolName}: called ${actual} times (required >=${required})`,
|
|
26448
|
+
passed: true
|
|
26449
|
+
});
|
|
26452
26450
|
} else {
|
|
26453
|
-
|
|
26451
|
+
assertions.push({
|
|
26452
|
+
text: `${toolName}: called ${actual} times (required >=${required})`,
|
|
26453
|
+
passed: false
|
|
26454
|
+
});
|
|
26454
26455
|
}
|
|
26455
26456
|
}
|
|
26456
|
-
const
|
|
26457
|
+
const passedCount = assertions.filter((a) => a.passed).length;
|
|
26458
|
+
const score = passedCount / toolNames.length;
|
|
26457
26459
|
return {
|
|
26458
26460
|
score,
|
|
26459
26461
|
verdict: scoreToVerdict(score),
|
|
26460
|
-
|
|
26461
|
-
misses,
|
|
26462
|
+
assertions,
|
|
26462
26463
|
expectedAspectCount: toolNames.length
|
|
26463
26464
|
};
|
|
26464
26465
|
}
|
|
@@ -26468,13 +26469,11 @@ var ToolTrajectoryEvaluator = class {
|
|
|
26468
26469
|
return {
|
|
26469
26470
|
score: 1,
|
|
26470
26471
|
verdict: "pass",
|
|
26471
|
-
|
|
26472
|
-
misses: [],
|
|
26472
|
+
assertions: [{ text: "No tool sequence specified", passed: true }],
|
|
26473
26473
|
expectedAspectCount: 0
|
|
26474
26474
|
};
|
|
26475
26475
|
}
|
|
26476
|
-
const
|
|
26477
|
-
const misses = [];
|
|
26476
|
+
const assertions = [];
|
|
26478
26477
|
const warnings = [];
|
|
26479
26478
|
let actualIndex = 0;
|
|
26480
26479
|
let sequenceHits = 0;
|
|
@@ -26494,16 +26493,20 @@ var ToolTrajectoryEvaluator = class {
|
|
|
26494
26493
|
const actualCall = toolCalls[actualIndex];
|
|
26495
26494
|
if (actualCall.name === expectedTool) {
|
|
26496
26495
|
if (argsMatch(expectedItem.args, actualCall.args, mode)) {
|
|
26497
|
-
|
|
26496
|
+
assertions.push({
|
|
26497
|
+
text: `Found ${expectedTool} at position ${actualIndex}`,
|
|
26498
|
+
passed: true
|
|
26499
|
+
});
|
|
26498
26500
|
sequenceHits++;
|
|
26499
26501
|
matchedCall = actualCall;
|
|
26500
26502
|
actualIndex++;
|
|
26501
26503
|
found = true;
|
|
26502
26504
|
break;
|
|
26503
26505
|
}
|
|
26504
|
-
|
|
26505
|
-
`Expected ${expectedTool} at position ${i}: tool found at ${actualIndex} but args mismatch
|
|
26506
|
-
|
|
26506
|
+
assertions.push({
|
|
26507
|
+
text: `Expected ${expectedTool} at position ${i}: tool found at ${actualIndex} but args mismatch`,
|
|
26508
|
+
passed: false
|
|
26509
|
+
});
|
|
26507
26510
|
actualIndex++;
|
|
26508
26511
|
argsMismatch = true;
|
|
26509
26512
|
break;
|
|
@@ -26511,7 +26514,10 @@ var ToolTrajectoryEvaluator = class {
|
|
|
26511
26514
|
actualIndex++;
|
|
26512
26515
|
}
|
|
26513
26516
|
if (!found && !argsMismatch) {
|
|
26514
|
-
|
|
26517
|
+
assertions.push({
|
|
26518
|
+
text: `Expected ${expectedTool} at position ${i}, not found in remaining trace`,
|
|
26519
|
+
passed: false
|
|
26520
|
+
});
|
|
26515
26521
|
}
|
|
26516
26522
|
if (found && matchedCall) {
|
|
26517
26523
|
const latencyResult = checkLatency(
|
|
@@ -26520,10 +26526,10 @@ var ToolTrajectoryEvaluator = class {
|
|
|
26520
26526
|
matchedCall.durationMs
|
|
26521
26527
|
);
|
|
26522
26528
|
if (latencyResult.status === "pass") {
|
|
26523
|
-
|
|
26529
|
+
assertions.push({ text: latencyResult.message, passed: true });
|
|
26524
26530
|
latencyHits++;
|
|
26525
26531
|
} else if (latencyResult.status === "fail") {
|
|
26526
|
-
|
|
26532
|
+
assertions.push({ text: latencyResult.message, passed: false });
|
|
26527
26533
|
} else if (latencyResult.message) {
|
|
26528
26534
|
warnings.push(latencyResult.message);
|
|
26529
26535
|
latencySkips++;
|
|
@@ -26539,8 +26545,7 @@ var ToolTrajectoryEvaluator = class {
|
|
|
26539
26545
|
return {
|
|
26540
26546
|
score,
|
|
26541
26547
|
verdict: scoreToVerdict(score),
|
|
26542
|
-
|
|
26543
|
-
misses,
|
|
26548
|
+
assertions,
|
|
26544
26549
|
expectedAspectCount: totalAssertions
|
|
26545
26550
|
};
|
|
26546
26551
|
}
|
|
@@ -26550,13 +26555,11 @@ var ToolTrajectoryEvaluator = class {
|
|
|
26550
26555
|
return {
|
|
26551
26556
|
score: 1,
|
|
26552
26557
|
verdict: "pass",
|
|
26553
|
-
|
|
26554
|
-
misses: [],
|
|
26558
|
+
assertions: [{ text: "No tool sequence specified", passed: true }],
|
|
26555
26559
|
expectedAspectCount: 0
|
|
26556
26560
|
};
|
|
26557
26561
|
}
|
|
26558
|
-
const
|
|
26559
|
-
const misses = [];
|
|
26562
|
+
const assertions = [];
|
|
26560
26563
|
const warnings = [];
|
|
26561
26564
|
let sequenceHits = 0;
|
|
26562
26565
|
let latencyHits = 0;
|
|
@@ -26565,7 +26568,10 @@ var ToolTrajectoryEvaluator = class {
|
|
|
26565
26568
|
(item) => item.maxDurationMs !== void 0
|
|
26566
26569
|
).length;
|
|
26567
26570
|
if (toolCalls.length !== expected.length) {
|
|
26568
|
-
|
|
26571
|
+
assertions.push({
|
|
26572
|
+
text: `Expected ${expected.length} tool calls, got ${toolCalls.length}`,
|
|
26573
|
+
passed: false
|
|
26574
|
+
});
|
|
26569
26575
|
}
|
|
26570
26576
|
const checkLength = Math.min(expected.length, toolCalls.length);
|
|
26571
26577
|
for (let i = 0; i < checkLength; i++) {
|
|
@@ -26577,14 +26583,17 @@ var ToolTrajectoryEvaluator = class {
|
|
|
26577
26583
|
let sequenceMatched = false;
|
|
26578
26584
|
if (actualTool === expectedTool) {
|
|
26579
26585
|
if (argsMatch(expectedItem.args, actualCall.args, mode)) {
|
|
26580
|
-
|
|
26586
|
+
assertions.push({ text: `Position ${i}: ${expectedTool}`, passed: true });
|
|
26581
26587
|
sequenceHits++;
|
|
26582
26588
|
sequenceMatched = true;
|
|
26583
26589
|
} else {
|
|
26584
|
-
|
|
26590
|
+
assertions.push({ text: `Position ${i}: ${expectedTool} args mismatch`, passed: false });
|
|
26585
26591
|
}
|
|
26586
26592
|
} else {
|
|
26587
|
-
|
|
26593
|
+
assertions.push({
|
|
26594
|
+
text: `Position ${i}: expected ${expectedTool}, got ${actualTool}`,
|
|
26595
|
+
passed: false
|
|
26596
|
+
});
|
|
26588
26597
|
}
|
|
26589
26598
|
if (sequenceMatched) {
|
|
26590
26599
|
const latencyResult = checkLatency(
|
|
@@ -26593,10 +26602,10 @@ var ToolTrajectoryEvaluator = class {
|
|
|
26593
26602
|
actualCall.durationMs
|
|
26594
26603
|
);
|
|
26595
26604
|
if (latencyResult.status === "pass") {
|
|
26596
|
-
|
|
26605
|
+
assertions.push({ text: latencyResult.message, passed: true });
|
|
26597
26606
|
latencyHits++;
|
|
26598
26607
|
} else if (latencyResult.status === "fail") {
|
|
26599
|
-
|
|
26608
|
+
assertions.push({ text: latencyResult.message, passed: false });
|
|
26600
26609
|
} else if (latencyResult.message) {
|
|
26601
26610
|
warnings.push(latencyResult.message);
|
|
26602
26611
|
latencySkips++;
|
|
@@ -26604,7 +26613,10 @@ var ToolTrajectoryEvaluator = class {
|
|
|
26604
26613
|
}
|
|
26605
26614
|
}
|
|
26606
26615
|
for (let i = checkLength; i < expected.length; i++) {
|
|
26607
|
-
|
|
26616
|
+
assertions.push({
|
|
26617
|
+
text: `Position ${i}: expected ${expected[i].tool}, got nothing`,
|
|
26618
|
+
passed: false
|
|
26619
|
+
});
|
|
26608
26620
|
}
|
|
26609
26621
|
for (const warning of warnings) {
|
|
26610
26622
|
console.warn(`[tool-trajectory] ${warning}`);
|
|
@@ -26615,8 +26627,7 @@ var ToolTrajectoryEvaluator = class {
|
|
|
26615
26627
|
return {
|
|
26616
26628
|
score,
|
|
26617
26629
|
verdict: scoreToVerdict(score),
|
|
26618
|
-
|
|
26619
|
-
misses,
|
|
26630
|
+
assertions,
|
|
26620
26631
|
expectedAspectCount: totalAssertions
|
|
26621
26632
|
};
|
|
26622
26633
|
}
|
|
@@ -26631,13 +26642,11 @@ var ToolTrajectoryEvaluator = class {
|
|
|
26631
26642
|
return {
|
|
26632
26643
|
score: 1,
|
|
26633
26644
|
verdict: "pass",
|
|
26634
|
-
|
|
26635
|
-
misses: [],
|
|
26645
|
+
assertions: [{ text: "No expected tools specified", passed: true }],
|
|
26636
26646
|
expectedAspectCount: 0
|
|
26637
26647
|
};
|
|
26638
26648
|
}
|
|
26639
|
-
const
|
|
26640
|
-
const misses = [];
|
|
26649
|
+
const assertions = [];
|
|
26641
26650
|
const consumed = /* @__PURE__ */ new Set();
|
|
26642
26651
|
for (let i = 0; i < expected.length; i++) {
|
|
26643
26652
|
const expectedItem = expected[i];
|
|
@@ -26648,22 +26657,25 @@ var ToolTrajectoryEvaluator = class {
|
|
|
26648
26657
|
if (consumed.has(j)) continue;
|
|
26649
26658
|
const actualCall = toolCalls[j];
|
|
26650
26659
|
if (actualCall.name === expectedTool && argsMatch(expectedItem.args, actualCall.args, mode)) {
|
|
26651
|
-
|
|
26660
|
+
assertions.push({ text: `Found ${expectedTool} at position ${j}`, passed: true });
|
|
26652
26661
|
consumed.add(j);
|
|
26653
26662
|
found = true;
|
|
26654
26663
|
break;
|
|
26655
26664
|
}
|
|
26656
26665
|
}
|
|
26657
26666
|
if (!found) {
|
|
26658
|
-
|
|
26667
|
+
assertions.push({
|
|
26668
|
+
text: `Expected ${expectedTool} not found in actual trajectory`,
|
|
26669
|
+
passed: false
|
|
26670
|
+
});
|
|
26659
26671
|
}
|
|
26660
26672
|
}
|
|
26661
|
-
const
|
|
26673
|
+
const passedCount = assertions.filter((a) => a.passed).length;
|
|
26674
|
+
const score = expected.length > 0 ? passedCount / expected.length : 1;
|
|
26662
26675
|
return {
|
|
26663
26676
|
score,
|
|
26664
26677
|
verdict: scoreToVerdict(score),
|
|
26665
|
-
|
|
26666
|
-
misses,
|
|
26678
|
+
assertions,
|
|
26667
26679
|
expectedAspectCount: expected.length
|
|
26668
26680
|
};
|
|
26669
26681
|
}
|
|
@@ -26679,16 +26691,19 @@ var ToolTrajectoryEvaluator = class {
|
|
|
26679
26691
|
return {
|
|
26680
26692
|
score: 1,
|
|
26681
26693
|
verdict: "pass",
|
|
26682
|
-
|
|
26683
|
-
misses: [],
|
|
26694
|
+
assertions: [{ text: "No tool calls and no expected tools", passed: true }],
|
|
26684
26695
|
expectedAspectCount: 0
|
|
26685
26696
|
};
|
|
26686
26697
|
}
|
|
26687
26698
|
return {
|
|
26688
26699
|
score: 0,
|
|
26689
26700
|
verdict: "fail",
|
|
26690
|
-
|
|
26691
|
-
|
|
26701
|
+
assertions: [
|
|
26702
|
+
{
|
|
26703
|
+
text: `${toolCalls.length} unexpected tool call(s) with empty allowed list`,
|
|
26704
|
+
passed: false
|
|
26705
|
+
}
|
|
26706
|
+
],
|
|
26692
26707
|
expectedAspectCount: toolCalls.length
|
|
26693
26708
|
};
|
|
26694
26709
|
}
|
|
@@ -26696,13 +26711,11 @@ var ToolTrajectoryEvaluator = class {
|
|
|
26696
26711
|
return {
|
|
26697
26712
|
score: 1,
|
|
26698
26713
|
verdict: "pass",
|
|
26699
|
-
|
|
26700
|
-
misses: [],
|
|
26714
|
+
assertions: [{ text: "No actual tool calls (trivially a subset)", passed: true }],
|
|
26701
26715
|
expectedAspectCount: 0
|
|
26702
26716
|
};
|
|
26703
26717
|
}
|
|
26704
|
-
const
|
|
26705
|
-
const misses = [];
|
|
26718
|
+
const assertions = [];
|
|
26706
26719
|
for (let i = 0; i < toolCalls.length; i++) {
|
|
26707
26720
|
const actualCall = toolCalls[i];
|
|
26708
26721
|
let allowed = false;
|
|
@@ -26714,17 +26727,23 @@ var ToolTrajectoryEvaluator = class {
|
|
|
26714
26727
|
}
|
|
26715
26728
|
}
|
|
26716
26729
|
if (allowed) {
|
|
26717
|
-
|
|
26730
|
+
assertions.push({
|
|
26731
|
+
text: `Position ${i}: ${actualCall.name} is in allowed set`,
|
|
26732
|
+
passed: true
|
|
26733
|
+
});
|
|
26718
26734
|
} else {
|
|
26719
|
-
|
|
26735
|
+
assertions.push({
|
|
26736
|
+
text: `Position ${i}: ${actualCall.name} is not in allowed set`,
|
|
26737
|
+
passed: false
|
|
26738
|
+
});
|
|
26720
26739
|
}
|
|
26721
26740
|
}
|
|
26722
|
-
const
|
|
26741
|
+
const passedCount = assertions.filter((a) => a.passed).length;
|
|
26742
|
+
const score = toolCalls.length > 0 ? passedCount / toolCalls.length : 1;
|
|
26723
26743
|
return {
|
|
26724
26744
|
score,
|
|
26725
26745
|
verdict: scoreToVerdict(score),
|
|
26726
|
-
|
|
26727
|
-
misses,
|
|
26746
|
+
assertions,
|
|
26728
26747
|
expectedAspectCount: toolCalls.length
|
|
26729
26748
|
};
|
|
26730
26749
|
}
|
|
@@ -26733,8 +26752,12 @@ function runContainsAssertion(output, value) {
|
|
|
26733
26752
|
const passed = output.includes(value);
|
|
26734
26753
|
return {
|
|
26735
26754
|
score: passed ? 1 : 0,
|
|
26736
|
-
|
|
26737
|
-
|
|
26755
|
+
assertions: [
|
|
26756
|
+
{
|
|
26757
|
+
text: passed ? `Output contains "${value}"` : `Output does not contain "${value}"`,
|
|
26758
|
+
passed
|
|
26759
|
+
}
|
|
26760
|
+
]
|
|
26738
26761
|
};
|
|
26739
26762
|
}
|
|
26740
26763
|
function runContainsAnyAssertion(output, values) {
|
|
@@ -26742,8 +26765,12 @@ function runContainsAnyAssertion(output, values) {
|
|
|
26742
26765
|
const passed = matched.length > 0;
|
|
26743
26766
|
return {
|
|
26744
26767
|
score: passed ? 1 : 0,
|
|
26745
|
-
|
|
26746
|
-
|
|
26768
|
+
assertions: [
|
|
26769
|
+
{
|
|
26770
|
+
text: passed ? `Output contains "${matched[0]}"` : `Output does not contain any of: ${values.map((v) => `"${v}"`).join(", ")}`,
|
|
26771
|
+
passed
|
|
26772
|
+
}
|
|
26773
|
+
]
|
|
26747
26774
|
};
|
|
26748
26775
|
}
|
|
26749
26776
|
function runContainsAllAssertion(output, values) {
|
|
@@ -26751,16 +26778,24 @@ function runContainsAllAssertion(output, values) {
|
|
|
26751
26778
|
const passed = missing.length === 0;
|
|
26752
26779
|
return {
|
|
26753
26780
|
score: passed ? 1 : 0,
|
|
26754
|
-
|
|
26755
|
-
|
|
26781
|
+
assertions: [
|
|
26782
|
+
{
|
|
26783
|
+
text: passed ? `Output contains all ${values.length} expected strings` : `Output missing: ${missing.map((v) => `"${v}"`).join(", ")}`,
|
|
26784
|
+
passed
|
|
26785
|
+
}
|
|
26786
|
+
]
|
|
26756
26787
|
};
|
|
26757
26788
|
}
|
|
26758
26789
|
function runIcontainsAssertion(output, value) {
|
|
26759
26790
|
const passed = output.toLowerCase().includes(value.toLowerCase());
|
|
26760
26791
|
return {
|
|
26761
26792
|
score: passed ? 1 : 0,
|
|
26762
|
-
|
|
26763
|
-
|
|
26793
|
+
assertions: [
|
|
26794
|
+
{
|
|
26795
|
+
text: passed ? `Output contains "${value}" (case-insensitive)` : `Output does not contain "${value}" (case-insensitive)`,
|
|
26796
|
+
passed
|
|
26797
|
+
}
|
|
26798
|
+
]
|
|
26764
26799
|
};
|
|
26765
26800
|
}
|
|
26766
26801
|
function runIcontainsAnyAssertion(output, values) {
|
|
@@ -26769,9 +26804,11 @@ function runIcontainsAnyAssertion(output, values) {
|
|
|
26769
26804
|
const passed = matched.length > 0;
|
|
26770
26805
|
return {
|
|
26771
26806
|
score: passed ? 1 : 0,
|
|
26772
|
-
|
|
26773
|
-
|
|
26774
|
-
|
|
26807
|
+
assertions: [
|
|
26808
|
+
{
|
|
26809
|
+
text: passed ? `Output contains "${matched[0]}" (case-insensitive)` : `Output does not contain any of: ${values.map((v) => `"${v}"`).join(", ")} (case-insensitive)`,
|
|
26810
|
+
passed
|
|
26811
|
+
}
|
|
26775
26812
|
]
|
|
26776
26813
|
};
|
|
26777
26814
|
}
|
|
@@ -26781,24 +26818,36 @@ function runIcontainsAllAssertion(output, values) {
|
|
|
26781
26818
|
const passed = missing.length === 0;
|
|
26782
26819
|
return {
|
|
26783
26820
|
score: passed ? 1 : 0,
|
|
26784
|
-
|
|
26785
|
-
|
|
26821
|
+
assertions: [
|
|
26822
|
+
{
|
|
26823
|
+
text: passed ? `Output contains all ${values.length} expected strings (case-insensitive)` : `Output missing (case-insensitive): ${missing.map((v) => `"${v}"`).join(", ")}`,
|
|
26824
|
+
passed
|
|
26825
|
+
}
|
|
26826
|
+
]
|
|
26786
26827
|
};
|
|
26787
26828
|
}
|
|
26788
26829
|
function runStartsWithAssertion(output, value) {
|
|
26789
26830
|
const passed = output.trim().startsWith(value.trim());
|
|
26790
26831
|
return {
|
|
26791
26832
|
score: passed ? 1 : 0,
|
|
26792
|
-
|
|
26793
|
-
|
|
26833
|
+
assertions: [
|
|
26834
|
+
{
|
|
26835
|
+
text: passed ? `Output starts with "${value}"` : `Output does not start with "${value}"`,
|
|
26836
|
+
passed
|
|
26837
|
+
}
|
|
26838
|
+
]
|
|
26794
26839
|
};
|
|
26795
26840
|
}
|
|
26796
26841
|
function runEndsWithAssertion(output, value) {
|
|
26797
26842
|
const passed = output.trim().endsWith(value.trim());
|
|
26798
26843
|
return {
|
|
26799
26844
|
score: passed ? 1 : 0,
|
|
26800
|
-
|
|
26801
|
-
|
|
26845
|
+
assertions: [
|
|
26846
|
+
{
|
|
26847
|
+
text: passed ? `Output ends with "${value}"` : `Output does not end with "${value}"`,
|
|
26848
|
+
passed
|
|
26849
|
+
}
|
|
26850
|
+
]
|
|
26802
26851
|
};
|
|
26803
26852
|
}
|
|
26804
26853
|
function runRegexAssertion(output, pattern, flags) {
|
|
@@ -26807,8 +26856,12 @@ function runRegexAssertion(output, pattern, flags) {
|
|
|
26807
26856
|
const flagsLabel = flags ? ` (flags: ${flags})` : "";
|
|
26808
26857
|
return {
|
|
26809
26858
|
score: passed ? 1 : 0,
|
|
26810
|
-
|
|
26811
|
-
|
|
26859
|
+
assertions: [
|
|
26860
|
+
{
|
|
26861
|
+
text: passed ? `Output matches pattern /${pattern}/${flags ?? ""}${flagsLabel}` : `Output does not match pattern /${pattern}/${flags ?? ""}${flagsLabel}`,
|
|
26862
|
+
passed
|
|
26863
|
+
}
|
|
26864
|
+
]
|
|
26812
26865
|
};
|
|
26813
26866
|
}
|
|
26814
26867
|
function runIsJsonAssertion(output) {
|
|
@@ -26820,16 +26873,24 @@ function runIsJsonAssertion(output) {
|
|
|
26820
26873
|
}
|
|
26821
26874
|
return {
|
|
26822
26875
|
score: passed ? 1 : 0,
|
|
26823
|
-
|
|
26824
|
-
|
|
26876
|
+
assertions: [
|
|
26877
|
+
{
|
|
26878
|
+
text: passed ? "Output is valid JSON" : "Output is not valid JSON",
|
|
26879
|
+
passed
|
|
26880
|
+
}
|
|
26881
|
+
]
|
|
26825
26882
|
};
|
|
26826
26883
|
}
|
|
26827
26884
|
function runEqualsAssertion(output, value) {
|
|
26828
26885
|
const passed = output.trim() === value.trim();
|
|
26829
26886
|
return {
|
|
26830
26887
|
score: passed ? 1 : 0,
|
|
26831
|
-
|
|
26832
|
-
|
|
26888
|
+
assertions: [
|
|
26889
|
+
{
|
|
26890
|
+
text: passed ? `Output equals "${value}"` : `Output does not equal "${value}"`,
|
|
26891
|
+
passed
|
|
26892
|
+
}
|
|
26893
|
+
]
|
|
26833
26894
|
};
|
|
26834
26895
|
}
|
|
26835
26896
|
var Node = class {
|
|
@@ -27028,10 +27089,8 @@ var InlineAssertEvaluator = class {
|
|
|
27028
27089
|
return {
|
|
27029
27090
|
score,
|
|
27030
27091
|
verdict: scoreToVerdict(score),
|
|
27031
|
-
|
|
27032
|
-
misses: score < 0.5 ? [result.name] : [],
|
|
27092
|
+
assertions: [{ text: result.name, passed: score >= 0.5 }],
|
|
27033
27093
|
expectedAspectCount: 1,
|
|
27034
|
-
reasoning: void 0,
|
|
27035
27094
|
details: result.metadata ? result.metadata : void 0
|
|
27036
27095
|
};
|
|
27037
27096
|
}
|
|
@@ -27219,9 +27278,7 @@ var containsFactory = (config) => {
|
|
|
27219
27278
|
return {
|
|
27220
27279
|
score: result.score,
|
|
27221
27280
|
verdict: result.score === 1 ? "pass" : "fail",
|
|
27222
|
-
|
|
27223
|
-
misses: result.misses,
|
|
27224
|
-
reasoning: result.score === 1 ? `Output contains "${c.value}"` : `Output does not contain "${c.value}"`,
|
|
27281
|
+
assertions: result.assertions,
|
|
27225
27282
|
expectedAspectCount: 1
|
|
27226
27283
|
};
|
|
27227
27284
|
});
|
|
@@ -27233,9 +27290,7 @@ var regexFactory = (config) => {
|
|
|
27233
27290
|
return {
|
|
27234
27291
|
score: result.score,
|
|
27235
27292
|
verdict: result.score === 1 ? "pass" : "fail",
|
|
27236
|
-
|
|
27237
|
-
misses: result.misses,
|
|
27238
|
-
reasoning: result.score === 1 ? `Output matches pattern /${c.value}/${c.flags ?? ""}` : `Output does not match pattern /${c.value}/${c.flags ?? ""}`,
|
|
27293
|
+
assertions: result.assertions,
|
|
27239
27294
|
expectedAspectCount: 1
|
|
27240
27295
|
};
|
|
27241
27296
|
});
|
|
@@ -27246,9 +27301,7 @@ var isJsonFactory = () => {
|
|
|
27246
27301
|
return {
|
|
27247
27302
|
score: result.score,
|
|
27248
27303
|
verdict: result.score === 1 ? "pass" : "fail",
|
|
27249
|
-
|
|
27250
|
-
misses: result.misses,
|
|
27251
|
-
reasoning: result.score === 1 ? "Output is valid JSON" : "Output is not valid JSON",
|
|
27304
|
+
assertions: result.assertions,
|
|
27252
27305
|
expectedAspectCount: 1
|
|
27253
27306
|
};
|
|
27254
27307
|
});
|
|
@@ -27260,9 +27313,7 @@ var equalsFactory = (config) => {
|
|
|
27260
27313
|
return {
|
|
27261
27314
|
score: result.score,
|
|
27262
27315
|
verdict: result.score === 1 ? "pass" : "fail",
|
|
27263
|
-
|
|
27264
|
-
misses: result.misses,
|
|
27265
|
-
reasoning: result.score === 1 ? `Output equals "${c.value}"` : `Output does not equal "${c.value}"`,
|
|
27316
|
+
assertions: result.assertions,
|
|
27266
27317
|
expectedAspectCount: 1
|
|
27267
27318
|
};
|
|
27268
27319
|
});
|
|
@@ -27274,9 +27325,7 @@ var containsAnyFactory = (config) => {
|
|
|
27274
27325
|
return {
|
|
27275
27326
|
score: result.score,
|
|
27276
27327
|
verdict: result.score === 1 ? "pass" : "fail",
|
|
27277
|
-
|
|
27278
|
-
misses: result.misses,
|
|
27279
|
-
reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
|
|
27328
|
+
assertions: result.assertions,
|
|
27280
27329
|
expectedAspectCount: 1
|
|
27281
27330
|
};
|
|
27282
27331
|
});
|
|
@@ -27288,9 +27337,7 @@ var containsAllFactory = (config) => {
|
|
|
27288
27337
|
return {
|
|
27289
27338
|
score: result.score,
|
|
27290
27339
|
verdict: result.score === 1 ? "pass" : "fail",
|
|
27291
|
-
|
|
27292
|
-
misses: result.misses,
|
|
27293
|
-
reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
|
|
27340
|
+
assertions: result.assertions,
|
|
27294
27341
|
expectedAspectCount: 1
|
|
27295
27342
|
};
|
|
27296
27343
|
});
|
|
@@ -27302,9 +27349,7 @@ var icontainsFactory = (config) => {
|
|
|
27302
27349
|
return {
|
|
27303
27350
|
score: result.score,
|
|
27304
27351
|
verdict: result.score === 1 ? "pass" : "fail",
|
|
27305
|
-
|
|
27306
|
-
misses: result.misses,
|
|
27307
|
-
reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
|
|
27352
|
+
assertions: result.assertions,
|
|
27308
27353
|
expectedAspectCount: 1
|
|
27309
27354
|
};
|
|
27310
27355
|
});
|
|
@@ -27316,9 +27361,7 @@ var icontainsAnyFactory = (config) => {
|
|
|
27316
27361
|
return {
|
|
27317
27362
|
score: result.score,
|
|
27318
27363
|
verdict: result.score === 1 ? "pass" : "fail",
|
|
27319
|
-
|
|
27320
|
-
misses: result.misses,
|
|
27321
|
-
reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
|
|
27364
|
+
assertions: result.assertions,
|
|
27322
27365
|
expectedAspectCount: 1
|
|
27323
27366
|
};
|
|
27324
27367
|
});
|
|
@@ -27330,9 +27373,7 @@ var icontainsAllFactory = (config) => {
|
|
|
27330
27373
|
return {
|
|
27331
27374
|
score: result.score,
|
|
27332
27375
|
verdict: result.score === 1 ? "pass" : "fail",
|
|
27333
|
-
|
|
27334
|
-
misses: result.misses,
|
|
27335
|
-
reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
|
|
27376
|
+
assertions: result.assertions,
|
|
27336
27377
|
expectedAspectCount: 1
|
|
27337
27378
|
};
|
|
27338
27379
|
});
|
|
@@ -27344,9 +27385,7 @@ var startsWithFactory = (config) => {
|
|
|
27344
27385
|
return {
|
|
27345
27386
|
score: result.score,
|
|
27346
27387
|
verdict: result.score === 1 ? "pass" : "fail",
|
|
27347
|
-
|
|
27348
|
-
misses: result.misses,
|
|
27349
|
-
reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
|
|
27388
|
+
assertions: result.assertions,
|
|
27350
27389
|
expectedAspectCount: 1
|
|
27351
27390
|
};
|
|
27352
27391
|
});
|
|
@@ -27358,9 +27397,7 @@ var endsWithFactory = (config) => {
|
|
|
27358
27397
|
return {
|
|
27359
27398
|
score: result.score,
|
|
27360
27399
|
verdict: result.score === 1 ? "pass" : "fail",
|
|
27361
|
-
|
|
27362
|
-
misses: result.misses,
|
|
27363
|
-
reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
|
|
27400
|
+
assertions: result.assertions,
|
|
27364
27401
|
expectedAspectCount: 1
|
|
27365
27402
|
};
|
|
27366
27403
|
});
|
|
@@ -28389,7 +28426,7 @@ async function runEvaluation(options) {
|
|
|
28389
28426
|
if (!cliModel) {
|
|
28390
28427
|
throw new Error('--grader-target "agentv" requires --model (e.g., "openai:gpt-5-mini")');
|
|
28391
28428
|
}
|
|
28392
|
-
const { AgentvProvider: AgentvProvider2 } = await import("./agentv-provider-
|
|
28429
|
+
const { AgentvProvider: AgentvProvider2 } = await import("./agentv-provider-NFFLXG5M-TJAWCWCX.js");
|
|
28393
28430
|
return new AgentvProvider2("agentv", { model: cliModel, temperature: 0 });
|
|
28394
28431
|
}
|
|
28395
28432
|
const overrideTarget = resolveTargetByName(cliGraderTarget);
|
|
@@ -28724,8 +28761,7 @@ async function runEvaluation(options) {
|
|
|
28724
28761
|
testId: evalCase.id,
|
|
28725
28762
|
dataset: evalCase.dataset,
|
|
28726
28763
|
score: 0,
|
|
28727
|
-
|
|
28728
|
-
misses: [],
|
|
28764
|
+
assertions: [],
|
|
28729
28765
|
answer: "",
|
|
28730
28766
|
target: target.name,
|
|
28731
28767
|
error: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${totalBudgetUsd.toFixed(4)})`,
|
|
@@ -28744,7 +28780,9 @@ async function runEvaluation(options) {
|
|
|
28744
28780
|
testId: evalCase.id,
|
|
28745
28781
|
status: "failed",
|
|
28746
28782
|
completedAt: Date.now(),
|
|
28747
|
-
error: budgetResult.error
|
|
28783
|
+
error: budgetResult.error,
|
|
28784
|
+
score: budgetResult.score,
|
|
28785
|
+
executionStatus: budgetResult.executionStatus
|
|
28748
28786
|
});
|
|
28749
28787
|
}
|
|
28750
28788
|
if (onResult) {
|
|
@@ -28759,8 +28797,7 @@ async function runEvaluation(options) {
|
|
|
28759
28797
|
testId: evalCase.id,
|
|
28760
28798
|
dataset: evalCase.dataset,
|
|
28761
28799
|
score: 0,
|
|
28762
|
-
|
|
28763
|
-
misses: [],
|
|
28800
|
+
assertions: [],
|
|
28764
28801
|
answer: "",
|
|
28765
28802
|
target: target.name,
|
|
28766
28803
|
error: errorMsg,
|
|
@@ -28775,7 +28812,9 @@ async function runEvaluation(options) {
|
|
|
28775
28812
|
testId: evalCase.id,
|
|
28776
28813
|
status: "failed",
|
|
28777
28814
|
completedAt: Date.now(),
|
|
28778
|
-
error: haltResult.error
|
|
28815
|
+
error: haltResult.error,
|
|
28816
|
+
score: haltResult.score,
|
|
28817
|
+
executionStatus: haltResult.executionStatus
|
|
28779
28818
|
});
|
|
28780
28819
|
}
|
|
28781
28820
|
if (onResult) {
|
|
@@ -28855,7 +28894,9 @@ async function runEvaluation(options) {
|
|
|
28855
28894
|
startedAt: 0,
|
|
28856
28895
|
// Not used for completed status
|
|
28857
28896
|
completedAt: Date.now(),
|
|
28858
|
-
error: result.error
|
|
28897
|
+
error: result.error,
|
|
28898
|
+
score: result.score,
|
|
28899
|
+
executionStatus: result.executionStatus
|
|
28859
28900
|
});
|
|
28860
28901
|
}
|
|
28861
28902
|
if (onResult) {
|
|
@@ -29026,7 +29067,9 @@ async function runBatchEvaluation(options) {
|
|
|
29026
29067
|
const merged = computed ? mergeExecutionMetrics(computed, {
|
|
29027
29068
|
tokenUsage: providerResponse.tokenUsage,
|
|
29028
29069
|
costUsd: providerResponse.costUsd,
|
|
29029
|
-
durationMs: providerResponse.durationMs
|
|
29070
|
+
durationMs: providerResponse.durationMs,
|
|
29071
|
+
startTime: providerResponse.startTime,
|
|
29072
|
+
endTime: providerResponse.endTime
|
|
29030
29073
|
}) : void 0;
|
|
29031
29074
|
const trace2 = merged?.trace;
|
|
29032
29075
|
const costUsd = merged?.costUsd;
|
|
@@ -29091,7 +29134,9 @@ async function runBatchEvaluation(options) {
|
|
|
29091
29134
|
testId: evalCase.id,
|
|
29092
29135
|
status: "failed",
|
|
29093
29136
|
completedAt: Date.now(),
|
|
29094
|
-
error: error instanceof Error ? error.message : String(error)
|
|
29137
|
+
error: error instanceof Error ? error.message : String(error),
|
|
29138
|
+
score: errorResult.score,
|
|
29139
|
+
executionStatus: errorResult.executionStatus
|
|
29095
29140
|
});
|
|
29096
29141
|
}
|
|
29097
29142
|
continue;
|
|
@@ -29107,7 +29152,9 @@ async function runBatchEvaluation(options) {
|
|
|
29107
29152
|
status: result.error ? "failed" : "completed",
|
|
29108
29153
|
startedAt: 0,
|
|
29109
29154
|
completedAt: Date.now(),
|
|
29110
|
-
error: result.error
|
|
29155
|
+
error: result.error,
|
|
29156
|
+
score: result.score,
|
|
29157
|
+
executionStatus: result.executionStatus
|
|
29111
29158
|
});
|
|
29112
29159
|
}
|
|
29113
29160
|
}
|
|
@@ -29417,7 +29464,9 @@ async function runEvalCase(options) {
|
|
|
29417
29464
|
const merged = computed ? mergeExecutionMetrics(computed, {
|
|
29418
29465
|
tokenUsage: providerResponse.tokenUsage,
|
|
29419
29466
|
costUsd: providerResponse.costUsd,
|
|
29420
|
-
durationMs: providerResponse.durationMs
|
|
29467
|
+
durationMs: providerResponse.durationMs,
|
|
29468
|
+
startTime: providerResponse.startTime,
|
|
29469
|
+
endTime: providerResponse.endTime
|
|
29421
29470
|
}) : void 0;
|
|
29422
29471
|
const trace2 = merged?.trace;
|
|
29423
29472
|
const costUsd = merged?.costUsd;
|
|
@@ -29715,11 +29764,9 @@ async function evaluateCandidate(options) {
|
|
|
29715
29764
|
dataset: evalCase.dataset,
|
|
29716
29765
|
conversationId: evalCase.conversation_id,
|
|
29717
29766
|
score: score.score,
|
|
29718
|
-
|
|
29719
|
-
misses: score.misses,
|
|
29767
|
+
assertions: score.assertions,
|
|
29720
29768
|
answer: candidate,
|
|
29721
29769
|
target: target.name,
|
|
29722
|
-
reasoning: score.reasoning,
|
|
29723
29770
|
tokenUsage,
|
|
29724
29771
|
costUsd,
|
|
29725
29772
|
durationMs,
|
|
@@ -29893,9 +29940,7 @@ async function runEvaluatorList(options) {
|
|
|
29893
29940
|
score: score2.score,
|
|
29894
29941
|
weight,
|
|
29895
29942
|
verdict: score2.verdict,
|
|
29896
|
-
|
|
29897
|
-
misses: score2.misses,
|
|
29898
|
-
reasoning: score2.reasoning,
|
|
29943
|
+
assertions: score2.assertions,
|
|
29899
29944
|
evaluatorProviderRequest: score2.evaluatorRawRequest,
|
|
29900
29945
|
details: score2.details,
|
|
29901
29946
|
scores: mapChildResults(score2.scores),
|
|
@@ -29910,10 +29955,10 @@ async function runEvaluatorList(options) {
|
|
|
29910
29955
|
const fallbackScore = {
|
|
29911
29956
|
score: 0,
|
|
29912
29957
|
verdict: "fail",
|
|
29913
|
-
|
|
29914
|
-
|
|
29915
|
-
|
|
29916
|
-
|
|
29958
|
+
assertions: [
|
|
29959
|
+
{ text: `Evaluator '${evaluatorConfig.name}' failed: ${message}`, passed: false }
|
|
29960
|
+
],
|
|
29961
|
+
expectedAspectCount: 1
|
|
29917
29962
|
};
|
|
29918
29963
|
const weight = evaluatorConfig.weight ?? 1;
|
|
29919
29964
|
scored.push({
|
|
@@ -29929,9 +29974,12 @@ async function runEvaluatorList(options) {
|
|
|
29929
29974
|
score: 0,
|
|
29930
29975
|
weight,
|
|
29931
29976
|
verdict: "fail",
|
|
29932
|
-
|
|
29933
|
-
|
|
29934
|
-
|
|
29977
|
+
assertions: [
|
|
29978
|
+
{
|
|
29979
|
+
text: `Evaluator '${evaluatorConfig.name ?? "unknown"}' failed: ${message}`,
|
|
29980
|
+
passed: false
|
|
29981
|
+
}
|
|
29982
|
+
],
|
|
29935
29983
|
durationMs: endedAt.getTime() - startedAt.getTime(),
|
|
29936
29984
|
startedAt: startedAt.toISOString(),
|
|
29937
29985
|
endedAt: endedAt.toISOString()
|
|
@@ -29947,9 +29995,7 @@ async function runEvaluatorList(options) {
|
|
|
29947
29995
|
...scores[lastScoresIdx],
|
|
29948
29996
|
score: negated.score,
|
|
29949
29997
|
verdict: negated.verdict,
|
|
29950
|
-
|
|
29951
|
-
misses: [...negated.misses],
|
|
29952
|
-
reasoning: negated.reasoning
|
|
29998
|
+
assertions: [...negated.assertions]
|
|
29953
29999
|
};
|
|
29954
30000
|
}
|
|
29955
30001
|
}
|
|
@@ -29964,21 +30010,13 @@ async function runEvaluatorList(options) {
|
|
|
29964
30010
|
const aggregateScore = hasRequiredFailure ? 0 : scorable.length > 0 ? computeWeightedMean(
|
|
29965
30011
|
scorable.map((entry) => ({ score: entry.score.score, weight: entry.weight }))
|
|
29966
30012
|
) : 0;
|
|
29967
|
-
const
|
|
29968
|
-
const
|
|
29969
|
-
const expectedAspectCount = scored.reduce(
|
|
29970
|
-
(total, entry) => total + (entry.score.expectedAspectCount ?? 0),
|
|
29971
|
-
0
|
|
29972
|
-
);
|
|
29973
|
-
const reasoningParts = scored.map((entry) => entry.score.reasoning ? `${entry.name}: ${entry.score.reasoning}` : void 0).filter(isNonEmptyString);
|
|
29974
|
-
const reasoning = reasoningParts.length > 0 ? reasoningParts.join(" | ") : void 0;
|
|
30013
|
+
const assertions = scored.flatMap((entry) => entry.score.assertions);
|
|
30014
|
+
const expectedAspectCount = assertions.length || 1;
|
|
29975
30015
|
const score = {
|
|
29976
30016
|
score: aggregateScore,
|
|
29977
30017
|
verdict: scoreToVerdict(aggregateScore),
|
|
29978
|
-
|
|
29979
|
-
|
|
29980
|
-
expectedAspectCount,
|
|
29981
|
-
reasoning
|
|
30018
|
+
assertions,
|
|
30019
|
+
expectedAspectCount
|
|
29982
30020
|
};
|
|
29983
30021
|
return { score, scores };
|
|
29984
30022
|
}
|
|
@@ -30082,8 +30120,7 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
|
|
|
30082
30120
|
dataset: evalCase.dataset,
|
|
30083
30121
|
conversationId: evalCase.conversation_id,
|
|
30084
30122
|
score: 0,
|
|
30085
|
-
|
|
30086
|
-
misses: [`Error: ${message}`],
|
|
30123
|
+
assertions: [{ text: `Error: ${message}`, passed: false }],
|
|
30087
30124
|
answer: `Error occurred: ${message}`,
|
|
30088
30125
|
target: targetName,
|
|
30089
30126
|
requests,
|
|
@@ -30193,9 +30230,7 @@ function mapChildResults(children) {
|
|
|
30193
30230
|
score: child.score,
|
|
30194
30231
|
weight: child.weight,
|
|
30195
30232
|
verdict: child.verdict,
|
|
30196
|
-
|
|
30197
|
-
misses: child.misses,
|
|
30198
|
-
reasoning: child.reasoning,
|
|
30233
|
+
assertions: child.assertions,
|
|
30199
30234
|
evaluatorProviderRequest: child.evaluatorRawRequest,
|
|
30200
30235
|
scores: mapChildResults(child.scores),
|
|
30201
30236
|
details: child.details,
|
|
@@ -31063,7 +31098,6 @@ export {
|
|
|
31063
31098
|
isJsonValue,
|
|
31064
31099
|
isTestMessage,
|
|
31065
31100
|
isEvaluatorKind,
|
|
31066
|
-
getHitCount,
|
|
31067
31101
|
fileExists,
|
|
31068
31102
|
normalizeLineEndings,
|
|
31069
31103
|
readTextFile,
|
|
@@ -31203,4 +31237,4 @@ export {
|
|
|
31203
31237
|
OtelStreamingObserver,
|
|
31204
31238
|
createAgentKernel
|
|
31205
31239
|
};
|
|
31206
|
-
//# sourceMappingURL=chunk-
|
|
31240
|
+
//# sourceMappingURL=chunk-D6G4N2H2.js.map
|