@agentv/core 2.0.1 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -10,7 +10,7 @@ import {
10
10
  readTextFile,
11
11
  resolveFileReference,
12
12
  resolveTargetDefinition
13
- } from "./chunk-IBTKEEOT.js";
13
+ } from "./chunk-KDEP4I7G.js";
14
14
 
15
15
  // src/evaluation/types.ts
16
16
  var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
@@ -150,85 +150,6 @@ import { readFile as readFile5 } from "node:fs/promises";
150
150
  import path6 from "node:path";
151
151
  import { parse as parse2 } from "yaml";
152
152
 
153
- // src/evaluation/formatting/segment-formatter.ts
154
- function extractCodeBlocks(segments) {
155
- const CODE_BLOCK_PATTERN = /```[\s\S]*?```/g;
156
- const codeBlocks = [];
157
- for (const segment of segments) {
158
- const typeValue = segment.type;
159
- if (typeof typeValue !== "string" || typeValue !== "text") {
160
- continue;
161
- }
162
- const textValue = segment.value;
163
- if (typeof textValue !== "string") {
164
- continue;
165
- }
166
- const matches = textValue.match(CODE_BLOCK_PATTERN);
167
- if (matches) {
168
- codeBlocks.push(...matches);
169
- }
170
- }
171
- return codeBlocks;
172
- }
173
- function formatFileContents(parts) {
174
- const fileCount = parts.filter((p) => p.isFile).length;
175
- if (fileCount > 0) {
176
- return parts.map((part) => {
177
- if (part.isFile && part.displayPath) {
178
- return `<file path="${part.displayPath}">
179
- ${part.content}
180
- </file>`;
181
- }
182
- return part.content;
183
- }).join("\n\n");
184
- }
185
- return parts.map((p) => p.content).join(" ");
186
- }
187
- function formatSegment(segment, mode = "lm") {
188
- const type = asString(segment.type);
189
- if (type === "text") {
190
- return asString(segment.value);
191
- }
192
- if (type === "guideline_ref") {
193
- const refPath = asString(segment.path);
194
- return refPath ? `<Attached: ${refPath}>` : void 0;
195
- }
196
- if (type === "file") {
197
- const filePath = asString(segment.path);
198
- if (!filePath) {
199
- return void 0;
200
- }
201
- if (mode === "agent") {
202
- return `<file: path="${filePath}">`;
203
- }
204
- const text = asString(segment.text);
205
- if (text && filePath) {
206
- return formatFileContents([{ content: text.trim(), isFile: true, displayPath: filePath }]);
207
- }
208
- }
209
- return void 0;
210
- }
211
- function hasVisibleContent(segments) {
212
- return segments.some((segment) => {
213
- const type = asString(segment.type);
214
- if (type === "text") {
215
- const value = asString(segment.value);
216
- return value !== void 0 && value.trim().length > 0;
217
- }
218
- if (type === "guideline_ref") {
219
- return false;
220
- }
221
- if (type === "file") {
222
- const text = asString(segment.text);
223
- return text !== void 0 && text.trim().length > 0;
224
- }
225
- return false;
226
- });
227
- }
228
- function asString(value) {
229
- return typeof value === "string" ? value : void 0;
230
- }
231
-
232
153
  // src/evaluation/loaders/config-loader.ts
233
154
  import { readFile } from "node:fs/promises";
234
155
  import path2 from "node:path";
@@ -483,7 +404,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
483
404
  logWarning2(`Skipping invalid evaluator entry for '${evalId}' (expected object)`);
484
405
  continue;
485
406
  }
486
- const name = asString2(rawEvaluator.name);
407
+ const name = asString(rawEvaluator.name);
487
408
  const typeValue = rawEvaluator.type;
488
409
  if (!name || !isEvaluatorKind(typeValue)) {
489
410
  logWarning2(`Skipping evaluator with invalid name/type in '${evalId}'`);
@@ -511,7 +432,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
511
432
  continue;
512
433
  }
513
434
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
514
- const cwd = asString2(rawEvaluator.cwd);
435
+ const cwd = asString(rawEvaluator.cwd);
515
436
  let resolvedCwd;
516
437
  if (cwd) {
517
438
  const resolved = await resolveFileReference2(cwd, searchRoots);
@@ -526,7 +447,29 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
526
447
  } else {
527
448
  resolvedCwd = searchRoots[0];
528
449
  }
529
- const knownProps = /* @__PURE__ */ new Set(["name", "type", "script", "cwd", "weight"]);
450
+ const rawTarget = rawEvaluator.target;
451
+ let targetConfig;
452
+ if (rawTarget !== void 0) {
453
+ if (isJsonObject2(rawTarget)) {
454
+ const maxCalls = rawTarget.max_calls;
455
+ if (maxCalls !== void 0 && (typeof maxCalls !== "number" || maxCalls < 0)) {
456
+ logWarning2(
457
+ `Invalid target.max_calls for evaluator '${name}' in '${evalId}': must be a non-negative number`
458
+ );
459
+ } else {
460
+ targetConfig = {
461
+ ...typeof maxCalls === "number" ? { max_calls: maxCalls } : {}
462
+ };
463
+ }
464
+ } else if (rawTarget === true) {
465
+ targetConfig = {};
466
+ } else {
467
+ logWarning2(
468
+ `Invalid target config for evaluator '${name}' in '${evalId}': expected object or true`
469
+ );
470
+ }
471
+ }
472
+ const knownProps = /* @__PURE__ */ new Set(["name", "type", "script", "cwd", "weight", "target"]);
530
473
  const config = {};
531
474
  for (const [key, value] of Object.entries(rawEvaluator)) {
532
475
  if (!knownProps.has(key) && value !== void 0) {
@@ -540,7 +483,8 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
540
483
  cwd,
541
484
  resolvedCwd,
542
485
  ...weight2 !== void 0 ? { weight: weight2 } : {},
543
- ...Object.keys(config).length > 0 ? { config } : {}
486
+ ...Object.keys(config).length > 0 ? { config } : {},
487
+ ...targetConfig !== void 0 ? { target: targetConfig } : {}
544
488
  });
545
489
  continue;
546
490
  }
@@ -557,7 +501,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
557
501
  logWarning2(`Skipping composite evaluator '${name}' in '${evalId}': missing aggregator`);
558
502
  continue;
559
503
  }
560
- const aggregatorType = asString2(rawAggregator.type);
504
+ const aggregatorType = asString(rawAggregator.type);
561
505
  if (aggregatorType !== "weighted_average" && aggregatorType !== "code_judge" && aggregatorType !== "llm_judge") {
562
506
  logWarning2(
563
507
  `Skipping composite evaluator '${name}' in '${evalId}': invalid aggregator type '${aggregatorType}'`
@@ -570,7 +514,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
570
514
  logWarning2(`Skipping invalid member evaluator in composite '${name}' (expected object)`);
571
515
  continue;
572
516
  }
573
- const memberName = asString2(rawMember.name);
517
+ const memberName = asString(rawMember.name);
574
518
  const memberType = rawMember.type;
575
519
  if (!memberName || !isEvaluatorKind(memberType)) {
576
520
  logWarning2(`Skipping member evaluator with invalid name/type in composite '${name}'`);
@@ -608,7 +552,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
608
552
  ...Object.keys(parsedWeights).length > 0 ? { weights: parsedWeights } : {}
609
553
  };
610
554
  } else if (aggregatorType === "code_judge") {
611
- const aggregatorPath = asString2(rawAggregator.path);
555
+ const aggregatorPath = asString(rawAggregator.path);
612
556
  if (!aggregatorPath) {
613
557
  logWarning2(
614
558
  `Skipping composite evaluator '${name}' in '${evalId}': code_judge aggregator missing path`
@@ -621,7 +565,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
621
565
  cwd: searchRoots[0]
622
566
  };
623
567
  } else {
624
- const aggregatorPrompt = asString2(rawAggregator.prompt);
568
+ const aggregatorPrompt = asString(rawAggregator.prompt);
625
569
  let promptPath2;
626
570
  if (aggregatorPrompt) {
627
571
  const resolved = await resolveFileReference2(aggregatorPrompt, searchRoots);
@@ -646,7 +590,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
646
590
  continue;
647
591
  }
648
592
  if (typeValue === "tool_trajectory") {
649
- const mode = asString2(rawEvaluator.mode);
593
+ const mode = asString(rawEvaluator.mode);
650
594
  if (mode !== "any_order" && mode !== "in_order" && mode !== "exact") {
651
595
  logWarning2(
652
596
  `Skipping tool_trajectory evaluator '${name}' in '${evalId}': invalid mode '${mode}' (must be any_order, in_order, or exact)`
@@ -737,8 +681,8 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
737
681
  );
738
682
  continue;
739
683
  }
740
- const fieldPath = asString2(rawField.path);
741
- const match = asString2(rawField.match);
684
+ const fieldPath = asString(rawField.path);
685
+ const match = asString(rawField.match);
742
686
  if (!fieldPath) {
743
687
  logWarning2(
744
688
  `Skipping field without path in field_accuracy evaluator '${name}' in '${evalId}'`
@@ -768,7 +712,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
768
712
  );
769
713
  continue;
770
714
  }
771
- const aggregation = asString2(rawEvaluator.aggregation);
715
+ const aggregation = asString(rawEvaluator.aggregation);
772
716
  const validAggregation = isValidFieldAggregationType(aggregation) ? aggregation : void 0;
773
717
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
774
718
  evaluators.push({
@@ -849,7 +793,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
849
793
  });
850
794
  continue;
851
795
  }
852
- const prompt = asString2(rawEvaluator.prompt);
796
+ const prompt = asString(rawEvaluator.prompt);
853
797
  let promptPath;
854
798
  if (prompt) {
855
799
  const resolved = await resolveFileReference2(prompt, searchRoots);
@@ -868,11 +812,11 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
868
812
  );
869
813
  }
870
814
  }
871
- const _model = asString2(rawEvaluator.model);
815
+ const _model = asString(rawEvaluator.model);
872
816
  const rawRubrics = rawEvaluator.rubrics;
873
817
  const parsedRubrics = Array.isArray(rawRubrics) ? rawRubrics.filter((r) => isJsonObject2(r)).map((rubric, index) => ({
874
- id: asString2(rubric.id) ?? `rubric-${index + 1}`,
875
- description: asString2(rubric.description) ?? "",
818
+ id: asString(rubric.id) ?? `rubric-${index + 1}`,
819
+ description: asString(rubric.description) ?? "",
876
820
  weight: typeof rubric.weight === "number" ? rubric.weight : 1,
877
821
  required: typeof rubric.required === "boolean" ? rubric.required : true
878
822
  })).filter((r) => r.description.length > 0) : void 0;
@@ -916,7 +860,7 @@ function coerceEvaluator(candidate, contextId) {
916
860
  logWarning2(`Unknown evaluator '${candidate}' in ${contextId}, falling back to default`);
917
861
  return void 0;
918
862
  }
919
- function asString2(value) {
863
+ function asString(value) {
920
864
  return typeof value === "string" ? value : void 0;
921
865
  }
922
866
  function asStringArray(value, description) {
@@ -992,6 +936,68 @@ function isValidFieldAggregationType(value) {
992
936
  // src/evaluation/loaders/message-processor.ts
993
937
  import { readFile as readFile3 } from "node:fs/promises";
994
938
  import path4 from "node:path";
939
+
940
+ // src/evaluation/formatting/segment-formatter.ts
941
/**
 * Joins content parts into one prompt string.
 *
 * When at least one part is flagged `isFile`, parts are separated by a blank
 * line and every file part that has a `displayPath` is wrapped in a
 * `<file path="...">` element. Otherwise the contents are joined by one space.
 *
 * @param {Array<{content: string, isFile?: boolean, displayPath?: string}>} parts
 * @returns {string} The combined text ("" for an empty list).
 */
function formatFileContents(parts) {
  // `some` stops at the first file part instead of building a filtered copy just to count it.
  const hasFilePart = parts.some((part) => part.isFile);
  if (!hasFilePart) {
    return parts.map((part) => part.content).join(" ");
  }
  return parts
    .map((part) =>
      part.isFile && part.displayPath
        ? `<file path="${part.displayPath}">\n${part.content}\n</file>`
        : part.content
    )
    .join("\n\n");
}
955
/**
 * Renders one parsed message segment as prompt text.
 *
 * @param {object} segment - Segment whose string `type` is "text", "guideline_ref" or "file".
 * @param {string} [mode="lm"] - "agent" emits only a file reference; any other value inlines the file text.
 * @returns {string|undefined} The rendered text, or undefined when there is nothing to render.
 */
function formatSegment(segment, mode = "lm") {
  switch (asString2(segment.type)) {
    case "text":
      return asString2(segment.value);
    case "guideline_ref": {
      const refPath = asString2(segment.path);
      return refPath ? `<Attached: ${refPath}>` : undefined;
    }
    case "file": {
      const filePath = asString2(segment.path);
      if (!filePath) {
        return undefined;
      }
      if (mode === "agent") {
        return `<file: path="${filePath}">`;
      }
      const text = asString2(segment.text);
      // filePath is already known to be non-empty here, so only the text needs checking.
      return text
        ? formatFileContents([{ content: text.trim(), isFile: true, displayPath: filePath }])
        : undefined;
    }
    default:
      return undefined;
  }
}
979
/**
 * Reports whether any segment carries non-blank text.
 *
 * Only "text" segments (via `value`) and "file" segments (via `text`) count;
 * every other segment type, including "guideline_ref", is ignored.
 *
 * @param {object[]} segments
 * @returns {boolean}
 */
function hasVisibleContent(segments) {
  return segments.some((segment) => {
    const type = asString2(segment.type);
    let body;
    if (type === "text") {
      body = asString2(segment.value);
    } else if (type === "file") {
      body = asString2(segment.text);
    }
    return body !== undefined && body.trim().length > 0;
  });
}
996
/**
 * Narrows an unknown value to a string.
 *
 * @param {unknown} value
 * @returns {string|undefined} `value` itself when it is a string (including ""), otherwise undefined.
 */
function asString2(value) {
  return typeof value === "string" ? value : undefined;
}
999
+
1000
+ // src/evaluation/loaders/message-processor.ts
995
1001
  var ANSI_YELLOW4 = "\x1B[33m";
996
1002
  var ANSI_RESET4 = "\x1B[0m";
997
1003
  async function processMessages(options) {
@@ -1297,9 +1303,6 @@ ${messageContent}`);
1297
1303
  questionParts.push(formattedContent);
1298
1304
  }
1299
1305
  }
1300
- if (testCase.code_snippets.length > 0) {
1301
- questionParts.push(testCase.code_snippets.join("\n"));
1302
- }
1303
1306
  question = questionParts.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
1304
1307
  }
1305
1308
  const chatPrompt = useRoleMarkers ? buildChatPromptFromSegments({
@@ -1498,7 +1501,6 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
1498
1501
  repoRootPath,
1499
1502
  verbose
1500
1503
  }) : [];
1501
- const codeSnippets = extractCodeBlocks(inputSegments);
1502
1504
  let referenceAnswer = "";
1503
1505
  if (outputSegments.length > 0) {
1504
1506
  const lastMessage = outputSegments[outputSegments.length - 1];
@@ -1571,7 +1573,6 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
1571
1573
  guideline_paths: guidelinePaths.map((guidelinePath) => path6.resolve(guidelinePath)),
1572
1574
  guideline_patterns: guidelinePatterns,
1573
1575
  file_paths: allFilePaths,
1574
- code_snippets: codeSnippets,
1575
1576
  expected_outcome: outcome,
1576
1577
  evaluator: evalCaseEvaluatorKind,
1577
1578
  evaluators
@@ -4084,6 +4085,167 @@ var MockProvider = class {
4084
4085
  }
4085
4086
  };
4086
4087
 
4088
+ // src/evaluation/providers/pi-agent-sdk.ts
4089
// Module-level cache so the optional pi SDK packages are imported at most once.
var piAgentModule = null;
var piAiModule = null;

/**
 * Lazily imports the optional `@mariozechner/pi-agent` and `@mariozechner/pi-ai`
 * packages and returns the pieces the provider needs.
 *
 * @returns {Promise<{Agent: Function, ProviderTransport: Function, getModel: Function, getEnvApiKey: Function}>}
 * @throws {Error} When either package cannot be imported; the original error is
 *   kept on `cause` so its stack and code are not lost.
 */
async function loadPiModules() {
  if (!piAgentModule || !piAiModule) {
    try {
      [piAgentModule, piAiModule] = await Promise.all([
        import("@mariozechner/pi-agent"),
        import("@mariozechner/pi-ai")
      ]);
    } catch (error) {
      throw new Error(
        `Failed to load pi-agent-sdk dependencies. Please install them:
  npm install @mariozechner/pi-agent @mariozechner/pi-ai

Original error: ${error instanceof Error ? error.message : String(error)}`,
        { cause: error }
      );
    }
  }
  return {
    Agent: piAgentModule.Agent,
    ProviderTransport: piAgentModule.ProviderTransport,
    getModel: piAiModule.getModel,
    getEnvApiKey: piAiModule.getEnvApiKey
  };
}
4114
/**
 * Provider that answers a single question in-process through the pi-agent SDK
 * (no tools are registered on the agent).
 *
 * Config fields read here: `provider`, `model`, `systemPrompt`, `apiKey`, `timeoutMs`.
 */
var PiAgentSdkProvider = class {
  id;
  kind = "pi-agent-sdk";
  targetName;
  supportsBatch = false;
  config;

  /**
   * @param {string} targetName - Target name; also used to build `id`.
   * @param {object} config - Resolved target configuration.
   */
  constructor(targetName, config) {
    this.id = `pi-agent-sdk:${targetName}`;
    this.targetName = targetName;
    this.config = config;
  }

  /**
   * Sends `request.question` to a fresh agent and returns its full transcript.
   *
   * @param {{question: string, signal?: {aborted: boolean}}} request
   * @returns {Promise<{raw: object, outputMessages: object[], durationMs: number}>}
   * @throws {Error} If the request is already aborted, the SDK cannot be loaded,
   *   the agent fails, or the run exceeds `config.timeoutMs` (default 120000 ms).
   */
  async invoke(request) {
    if (request.signal?.aborted) {
      throw new Error("Pi agent SDK request was aborted before execution");
    }
    const { Agent, ProviderTransport, getModel, getEnvApiKey } = await loadPiModules();
    const startTime = Date.now();
    const providerName = this.config.provider ?? "anthropic";
    const modelId = this.config.model ?? "claude-sonnet-4-20250514";
    const model = getModel(providerName, modelId);
    const systemPrompt = this.config.systemPrompt ?? "Answer directly and concisely.";
    const transport = new ProviderTransport({
      // An explicitly configured key wins over the provider's environment variable.
      getApiKey: async (provider) => this.config.apiKey ?? getEnvApiKey(provider) ?? undefined
    });
    const agent = new Agent({
      initialState: {
        systemPrompt,
        model,
        tools: [], // No tools for simple Q&A
        messages: []
      },
      transport
    });
    const outputMessages = [];
    // NOTE(review): the last assistant text is tracked but never returned; the
    // result below is rebuilt from agent.state.messages instead.
    let finalAssistantContent = "";
    const unsubscribe = agent.subscribe((event) => {
      if (event.type !== "message_end" || event.message.role !== "assistant") {
        return;
      }
      const content = extractTextContent2(event.message.content);
      if (content) {
        finalAssistantContent = content;
      }
    });
    const timeoutMs = this.config.timeoutMs ?? 12e4;
    let timeoutHandle;
    const timeoutPromise = new Promise((_, reject) => {
      timeoutHandle = setTimeout(
        () => reject(new Error(`Pi agent SDK timed out after ${timeoutMs}ms`)),
        timeoutMs
      );
    });
    try {
      // Both the prompt and the idle wait run under the timeout so a stuck
      // agent cannot hang the caller after prompt() has resolved.
      const run = (async () => {
        await agent.prompt(request.question);
        await agent.waitForIdle();
      })();
      await Promise.race([run, timeoutPromise]);
      const agentMessages = agent.state.messages;
      for (const msg of agentMessages) {
        outputMessages.push(convertAgentMessage(msg));
      }
      const durationMs = Date.now() - startTime;
      return {
        raw: {
          messages: agentMessages,
          systemPrompt,
          model: this.config.model,
          provider: this.config.provider
        },
        outputMessages,
        durationMs
      };
    } finally {
      // Without this the pending timer keeps the event loop alive for the
      // full timeout after every successful call.
      clearTimeout(timeoutHandle);
      unsubscribe();
    }
  }
};
4193
/**
 * Flattens message content into plain text.
 *
 * @param {unknown} content - A string, or an array of content parts.
 * @returns {string|undefined} The string as-is; for arrays, the `text` of every
 *   `{type: "text"}` part joined by newlines; undefined when no text is found
 *   or the content has another shape.
 */
function extractTextContent2(content) {
  if (typeof content === "string") {
    return content;
  }
  if (!Array.isArray(content)) {
    return undefined;
  }
  const textParts = content
    .filter(
      (part) => part && typeof part === "object" && part.type === "text" && typeof part.text === "string"
    )
    .map((part) => part.text);
  return textParts.length > 0 ? textParts.join("\n") : undefined;
}
4212
/**
 * Converts an agent message into the provider-neutral output message shape.
 *
 * @param {unknown} message - Raw agent message.
 * @returns {{role: string, content: (string|undefined), toolCalls?: object[], timestamp?: string}}
 *   Non-object input becomes `{role: "unknown", content: String(message)}`.
 *   `toolCalls` is undefined when the message has none.
 */
function convertAgentMessage(message) {
  if (!message || typeof message !== "object") {
    return { role: "unknown", content: String(message) };
  }
  const role = typeof message.role === "string" ? message.role : "unknown";
  const content = extractTextContent2(message.content);
  const toolCalls = extractToolCalls2(message.content);
  let timestamp;
  if (typeof message.timestamp === "number") {
    // toISOString() throws RangeError for NaN or out-of-range epochs; drop the
    // timestamp rather than fail the whole conversion.
    const date = new Date(message.timestamp);
    timestamp = Number.isNaN(date.getTime()) ? undefined : date.toISOString();
  } else if (typeof message.timestamp === "string") {
    timestamp = message.timestamp;
  }
  return {
    role,
    content,
    toolCalls: toolCalls.length > 0 ? toolCalls : undefined,
    timestamp
  };
}
4228
/**
 * Collects tool invocations from array-shaped message content.
 *
 * @param {unknown} content - Message content; anything but an array yields [].
 * @returns {Array<{tool: string, input: unknown, id: (string|undefined)}>}
 *   One entry per `{type: "tool_use"}` part that has a string `name`.
 */
function extractToolCalls2(content) {
  if (!Array.isArray(content)) {
    return [];
  }
  return content
    .filter(
      (part) => part && typeof part === "object" && part.type === "tool_use" && typeof part.name === "string"
    )
    .map((part) => ({
      tool: part.name,
      input: part.input,
      id: typeof part.id === "string" ? part.id : undefined
    }));
}
4248
+
4087
4249
  // src/evaluation/providers/pi-coding-agent.ts
4088
4250
  import { spawn as spawn3 } from "node:child_process";
4089
4251
  import { randomUUID as randomUUID3 } from "node:crypto";
@@ -4599,8 +4761,8 @@ function convertPiMessage(message) {
4599
4761
  if (typeof role !== "string") {
4600
4762
  return void 0;
4601
4763
  }
4602
- const content = extractTextContent2(msg.content);
4603
- const toolCalls = extractToolCalls2(msg.content);
4764
+ const content = extractTextContent3(msg.content);
4765
+ const toolCalls = extractToolCalls3(msg.content);
4604
4766
  const timestamp = typeof msg.timestamp === "number" ? new Date(msg.timestamp).toISOString() : typeof msg.timestamp === "string" ? msg.timestamp : void 0;
4605
4767
  const metadata = {};
4606
4768
  if (msg.api) metadata.api = msg.api;
@@ -4616,7 +4778,7 @@ function convertPiMessage(message) {
4616
4778
  metadata: Object.keys(metadata).length > 0 ? metadata : void 0
4617
4779
  };
4618
4780
  }
4619
- function extractTextContent2(content) {
4781
+ function extractTextContent3(content) {
4620
4782
  if (typeof content === "string") {
4621
4783
  return content;
4622
4784
  }
@@ -4635,7 +4797,7 @@ function extractTextContent2(content) {
4635
4797
  }
4636
4798
  return textParts.length > 0 ? textParts.join("\n") : void 0;
4637
4799
  }
4638
- function extractToolCalls2(content) {
4800
+ function extractToolCalls3(content) {
4639
4801
  if (!Array.isArray(content)) {
4640
4802
  return [];
4641
4803
  }
@@ -5130,6 +5292,8 @@ function createProvider(target) {
5130
5292
  return new CodexProvider(target.name, target.config);
5131
5293
  case "pi-coding-agent":
5132
5294
  return new PiCodingAgentProvider(target.name, target.config);
5295
+ case "pi-agent-sdk":
5296
+ return new PiAgentSdkProvider(target.name, target.config);
5133
5297
  case "claude-code":
5134
5298
  return new ClaudeCodeProvider(target.name, target.config);
5135
5299
  case "mock":
@@ -5148,25 +5312,80 @@ function resolveAndCreateProvider(definition, env = process.env) {
5148
5312
  return createProvider(resolved);
5149
5313
  }
5150
5314
 
5151
- // src/evaluation/evaluators.ts
5152
- import { generateText as generateText2 } from "ai";
5153
- import { z as z2 } from "zod";
5154
-
5155
- // src/runtime/exec.ts
5156
- function shellEscapePath(value) {
5157
- if (process.platform === "win32") {
5158
- return `"${value.replaceAll('"', '""')}"`;
5315
+ // src/evaluation/evaluators/scoring.ts
5316
+ function scoreToVerdict(score) {
5317
+ if (score >= 0.8) {
5318
+ return "pass";
5159
5319
  }
5160
- return `'${value.replaceAll("'", `'"'"'`)}'`;
5320
+ if (score >= 0.6) {
5321
+ return "borderline";
5322
+ }
5323
+ return "fail";
5161
5324
  }
5162
- async function execFileWithStdin(argv, stdinPayload, options = {}) {
5163
- if (argv.length === 0) {
5164
- throw new Error("Executable argv must include at least one entry");
5325
+ function clampScore(value) {
5326
+ if (Number.isNaN(value) || !Number.isFinite(value)) {
5327
+ return 0;
5165
5328
  }
5166
- if (typeof Bun !== "undefined") {
5167
- return execFileWithStdinBun(argv, stdinPayload, options);
5329
+ if (value < 0) {
5330
+ return 0;
5168
5331
  }
5169
- return execFileWithStdinNode(argv, stdinPayload, options);
5332
+ if (value > 1) {
5333
+ return 1;
5334
+ }
5335
+ return value;
5336
+ }
5337
/**
 * Finds the widest `{...}` span in a piece of text.
 *
 * The match is greedy: it runs from the first `{` to the last `}`, so it is a
 * candidate JSON object, not a validated one.
 *
 * @param {string} text
 * @returns {string|undefined} The matched span, or undefined when there is none.
 */
function extractJsonBlob(text) {
  const found = text.match(/\{[\s\S]*\}/);
  return found?.[0];
}
5341
/**
 * Parses JSON embedded in model output.
 *
 * Markdown code fences are stripped, then the widest `{...}` span (or the
 * whole cleaned text when there is none) is handed to `JSON.parse`.
 *
 * @param {unknown} text - Raw output; non-strings are treated as empty text.
 * @returns {unknown} The parsed value.
 * @throws {SyntaxError} When the extracted text is not valid JSON.
 */
function parseJsonFromText(text) {
  const cleaned = typeof text === "string" ? text.replace(/```json\n?|```/g, "").trim() : "";
  const candidate = extractJsonBlob(cleaned) ?? cleaned;
  return JSON.parse(candidate);
}
5346
/**
 * @param {unknown} value
 * @returns {boolean} True only for strings that contain a non-whitespace character.
 */
function isNonEmptyString(value) {
  return typeof value === "string" && value.trim().length > 0;
}
5349
/**
 * `JSON.parse` that reports failure as a value instead of throwing.
 *
 * @param {string} payload
 * @returns {unknown} The parsed value, or undefined when parsing fails. A
 *   payload of "null" parses to null, which stays distinct from undefined.
 */
function parseJsonSafe(payload) {
  try {
    return JSON.parse(payload);
  } catch {
    // Invalid JSON is an expected input here; the caller checks for undefined.
    return undefined;
  }
}
5356
/**
 * Structural equality over JSON-like values.
 *
 * Primitives compare with `===`, arrays element-wise in order, and plain
 * objects by their own enumerable keys regardless of key order.
 *
 * @param {unknown} a
 * @param {unknown} b
 * @returns {boolean}
 */
function deepEqual(a, b) {
  if (a === b) return true;
  // Past this point the values are not identical, so any null or non-object
  // operand means they differ.
  if (a === null || b === null) return false;
  if (typeof a !== "object" || typeof b !== "object") return false;
  const aIsArray = Array.isArray(a);
  if (aIsArray !== Array.isArray(b)) return false;
  if (aIsArray) {
    return a.length === b.length && a.every((item, index) => deepEqual(item, b[index]));
  }
  const aKeys = Object.keys(a);
  if (aKeys.length !== Object.keys(b).length) return false;
  return aKeys.every((key) => Object.hasOwn(b, key) && deepEqual(a[key], b[key]));
}
5373
+
5374
+ // src/runtime/exec.ts
5375
/**
 * Quotes a path so it can be embedded in a shell command line.
 *
 * @param {string} value
 * @returns {string} On win32: wrapped in double quotes with embedded `"`
 *   doubled. Elsewhere: wrapped in single quotes with each embedded `'`
 *   rewritten as `'"'"'`.
 */
function shellEscapePath(value) {
  if (process.platform === "win32") {
    return `"${value.replaceAll('"', '""')}"`;
  }
  return `'${value.replaceAll("'", `'"'"'`)}'`;
}
5381
+ async function execFileWithStdin(argv, stdinPayload, options = {}) {
5382
+ if (argv.length === 0) {
5383
+ throw new Error("Executable argv must include at least one entry");
5384
+ }
5385
+ if (typeof Bun !== "undefined") {
5386
+ return execFileWithStdinBun(argv, stdinPayload, options);
5387
+ }
5388
+ return execFileWithStdinNode(argv, stdinPayload, options);
5170
5389
  }
5171
5390
  async function execFileWithStdinBun(argv, stdinPayload, options) {
5172
5391
  const command = [...argv];
@@ -5175,7 +5394,9 @@ async function execFileWithStdinBun(argv, stdinPayload, options) {
5175
5394
  cwd: options.cwd,
5176
5395
  stdin: encoder.encode(stdinPayload),
5177
5396
  stdout: "pipe",
5178
- stderr: "pipe"
5397
+ stderr: "pipe",
5398
+ // Merge additional env vars with process.env
5399
+ env: options.env ? { ...process.env, ...options.env } : process.env
5179
5400
  });
5180
5401
  let timedOut = false;
5181
5402
  const timeout = options.timeoutMs !== void 0 ? setTimeout(() => {
@@ -5210,7 +5431,9 @@ async function execFileWithStdinNode(argv, stdinPayload, options) {
5210
5431
  const [cmd, ...args] = argv;
5211
5432
  const child = spawn4(cmd, args, {
5212
5433
  cwd: options.cwd,
5213
- stdio: ["pipe", "pipe", "pipe"]
5434
+ stdio: ["pipe", "pipe", "pipe"],
5435
+ // Merge additional env vars with process.env
5436
+ env: options.env ? { ...process.env, ...options.env } : process.env
5214
5437
  });
5215
5438
  const stdoutChunks = [];
5216
5439
  const stderrChunks = [];
@@ -5263,7 +5486,9 @@ async function execShellWithStdin(command, stdinPayload, options = {}) {
5263
5486
  const child = spawn4(wrappedCommand, {
5264
5487
  shell: true,
5265
5488
  cwd: options.cwd,
5266
- stdio: ["ignore", "ignore", "ignore"]
5489
+ stdio: ["ignore", "ignore", "ignore"],
5490
+ // Merge additional env vars with process.env
5491
+ env: options.env ? { ...process.env, ...options.env } : process.env
5267
5492
  });
5268
5493
  const timeout = options.timeoutMs ? setTimeout(() => {
5269
5494
  child.kill();
@@ -5290,6 +5515,221 @@ async function execShellWithStdin(command, stdinPayload, options = {}) {
5290
5515
  }
5291
5516
  }
5292
5517
 
5518
+ // src/runtime/target-proxy.ts
5519
+ import { randomBytes } from "node:crypto";
5520
+ import { createServer } from "node:http";
5521
// Call budget applied when the caller does not supply `maxCalls`.
var DEFAULT_MAX_CALLS = 50;

/**
 * Starts a loopback HTTP server that lets an external script call a provider.
 *
 * Routes (all require `Authorization: Bearer <token>`):
 *   GET  /info         -> { targetName, maxCalls, callCount, availableTargets }
 *   POST /invoke       -> { outputMessages, rawText } for one `{question, ...}`
 *   POST /invokeBatch  -> { responses: [...] } for `{requests: [...]}`
 *
 * @param {object} options
 * @param {object} options.defaultProvider - Provider used when no target is named.
 * @param {Function} [options.targetResolver] - Maps a target name to a provider (or undefined).
 * @param {string[]} [options.availableTargets] - Names reported by /info and in error messages.
 * @param {number} [options.maxCalls=DEFAULT_MAX_CALLS] - Total provider invocations allowed.
 * @returns {Promise<{url: string, token: string, shutdown: Function, getUsageMetadata: Function}>}
 */
async function createTargetProxy(options) {
  const {
    defaultProvider,
    targetResolver,
    availableTargets,
    maxCalls = DEFAULT_MAX_CALLS
  } = options;
  const token = randomBytes(32).toString("hex");
  let callCount = 0;
  let isShutdown = false;
  const targetsList = availableTargets ?? [defaultProvider.targetName];

  function resolveProvider(targetName) {
    if (targetName === undefined || targetName === defaultProvider.targetName) {
      return defaultProvider;
    }
    return targetResolver ? targetResolver(targetName) : undefined;
  }

  // Distinguishes a malformed body (client error) from a provider failure.
  function parseJsonBody(body) {
    try {
      return { ok: true, value: JSON.parse(body) };
    } catch {
      return { ok: false, value: undefined };
    }
  }

  const hasQuestion = (request) =>
    Boolean(request) && typeof request.question === "string" && request.question.length > 0;

  async function callProvider(provider, request) {
    const response = await provider.invoke({
      question: request.question,
      systemPrompt: request.systemPrompt,
      evalCaseId: request.evalCaseId ?? "proxy",
      attempt: request.attempt ?? 1
    });
    const outputMessages = response.outputMessages ?? [];
    return { outputMessages, rawText: extractLastAssistantContent2(outputMessages) };
  }

  const server = createServer(async (req, res) => {
    res.setHeader("Access-Control-Allow-Origin", "*");
    res.setHeader("Access-Control-Allow-Methods", "GET, POST, OPTIONS");
    res.setHeader("Access-Control-Allow-Headers", "Content-Type, Authorization");
    if (req.method === "OPTIONS") {
      res.writeHead(204);
      res.end();
      return;
    }
    const authHeader = req.headers.authorization;
    if (!authHeader || authHeader !== `Bearer ${token}`) {
      sendJson(res, 401, { error: "Unauthorized" });
      return;
    }
    if (isShutdown) {
      sendJson(res, 503, { error: "Proxy is shutting down" });
      return;
    }
    const requestUrl = req.url ?? "";
    if (req.method === "GET" && requestUrl === "/info") {
      handleInfo(res);
      return;
    }
    if (req.method === "POST" && requestUrl === "/invoke") {
      await handleInvoke(req, res);
      return;
    }
    if (req.method === "POST" && requestUrl === "/invokeBatch") {
      await handleInvokeBatch(req, res);
      return;
    }
    sendJson(res, 404, { error: "Not found" });
  });

  function handleInfo(res) {
    sendJson(res, 200, {
      targetName: defaultProvider.targetName,
      maxCalls,
      callCount,
      availableTargets: targetsList
    });
  }

  async function handleInvoke(req, res) {
    // Cheap early rejection before reading the body.
    if (callCount >= maxCalls) {
      sendJson(res, 429, { error: `Max calls exceeded (limit: ${maxCalls})` });
      return;
    }
    try {
      const parsed = parseJsonBody(await readBody(req));
      if (!parsed.ok) {
        sendJson(res, 400, { error: "Invalid JSON body" });
        return;
      }
      const request = parsed.value;
      if (!hasQuestion(request)) {
        sendJson(res, 400, { error: "Missing required field: question" });
        return;
      }
      const provider = resolveProvider(request.target);
      if (!provider) {
        sendJson(res, 400, {
          error: `Unknown target '${request.target}'. Available: ${targetsList.join(", ")}`
        });
        return;
      }
      // Re-check right before reserving the slot: other requests may have used
      // the remaining budget while this one was awaiting its body.
      if (callCount >= maxCalls) {
        sendJson(res, 429, { error: `Max calls exceeded (limit: ${maxCalls})` });
        return;
      }
      callCount++;
      sendJson(res, 200, await callProvider(provider, request));
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      sendJson(res, 500, { error: message });
    }
  }

  async function handleInvokeBatch(req, res) {
    try {
      const parsed = parseJsonBody(await readBody(req));
      if (!parsed.ok) {
        sendJson(res, 400, { error: "Invalid JSON body" });
        return;
      }
      const requests = parsed.value?.requests;
      if (!Array.isArray(requests)) {
        sendJson(res, 400, { error: "Missing required field: requests (array)" });
        return;
      }
      if (callCount + requests.length > maxCalls) {
        sendJson(res, 429, {
          error: `Batch would exceed max calls (current: ${callCount}, batch: ${requests.length}, limit: ${maxCalls})`
        });
        return;
      }
      const responses = [];
      for (const request of requests) {
        if (!hasQuestion(request)) {
          responses.push({
            outputMessages: [],
            rawText: "Error: Missing required field: question"
          });
          continue;
        }
        const provider = resolveProvider(request.target);
        if (!provider) {
          responses.push({
            outputMessages: [],
            rawText: `Error: Unknown target '${request.target}'. Available: ${targetsList.join(", ")}`
          });
          continue;
        }
        // The up-front check can go stale while earlier items are awaited, so
        // the budget is enforced again per item.
        if (callCount >= maxCalls) {
          responses.push({
            outputMessages: [],
            rawText: `Error: Max calls exceeded (limit: ${maxCalls})`
          });
          continue;
        }
        callCount++;
        try {
          responses.push(await callProvider(provider, request));
        } catch (error) {
          const message = error instanceof Error ? error.message : String(error);
          responses.push({
            outputMessages: [],
            rawText: `Error: ${message}`
          });
        }
      }
      sendJson(res, 200, { responses });
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      sendJson(res, 500, { error: message });
    }
  }

  // Port 0 asks the OS for a free port; binding to 127.0.0.1 keeps the proxy local.
  await new Promise((resolve, reject) => {
    server.once("error", reject);
    server.listen(0, "127.0.0.1", () => {
      server.removeListener("error", reject);
      resolve();
    });
  });
  const address = server.address();
  const url = `http://127.0.0.1:${address.port}`;
  return {
    url,
    token,
    shutdown: async () => {
      isShutdown = true;
      return new Promise((resolve, reject) => {
        server.close((err) => {
          if (err) reject(err);
          else resolve();
        });
      });
    },
    getUsageMetadata: () => ({
      callCount,
      maxCalls
    })
  };
}
5702
/**
 * Writes a JSON response and ends it.
 *
 * @param {object} res - HTTP response exposing `writeHead` and `end`.
 * @param {number} statusCode
 * @param {unknown} body - Value serialized with `JSON.stringify`.
 */
function sendJson(res, statusCode, body) {
  res.writeHead(statusCode, { "Content-Type": "application/json" });
  res.end(JSON.stringify(body));
}
5706
/**
 * Buffers an incoming request body and decodes it as UTF-8.
 *
 * @param {object} req - Stream emitting "data" (Buffer chunks), "end" and "error".
 * @returns {Promise<string>} Resolves with the full body on "end"; rejects with
 *   the emitted error on "error".
 */
function readBody(req) {
  return new Promise((resolve, reject) => {
    const chunks = [];
    req.on("data", (chunk) => chunks.push(chunk));
    // Decode once at the end so multi-byte characters split across chunks survive.
    req.on("end", () => resolve(Buffer.concat(chunks).toString("utf8")));
    req.on("error", reject);
  });
}
5714
/**
 * Returns the text of the most recent assistant message that has any.
 *
 * Messages are scanned from last to first. String content is returned as-is;
 * for array content the first part exposing a `text` property is returned
 * (stringified). Assistant messages without usable text are skipped.
 *
 * @param {Array<{role: string, content?: unknown}>} messages
 * @returns {string|undefined} The text, or undefined when no assistant message has any.
 */
function extractLastAssistantContent2(messages) {
  for (let index = messages.length - 1; index >= 0; index--) {
    const message = messages[index];
    if (message.role !== "assistant" || message.content === undefined) {
      continue;
    }
    if (typeof message.content === "string") {
      return message.content;
    }
    if (Array.isArray(message.content)) {
      const textPart = message.content.find(
        (part) => typeof part === "object" && part !== null && "text" in part
      );
      if (textPart) {
        return String(textPart.text);
      }
    }
  }
  return undefined;
}
5732
+
5293
5733
  // src/evaluation/case-conversion.ts
5294
5734
  function toSnakeCase(str) {
5295
5735
  if (/^[A-Z]/.test(str)) {
@@ -5297,12 +5737,6 @@ function toSnakeCase(str) {
5297
5737
  }
5298
5738
  return str.replace(/[A-Z]/g, (letter) => `_${letter.toLowerCase()}`);
5299
5739
  }
5300
- function toCamelCase(str) {
5301
- if (/^[A-Z]/.test(str)) {
5302
- return str;
5303
- }
5304
- return str.replace(/_([a-z0-9])/g, (_, letter) => letter.toUpperCase());
5305
- }
5306
5740
  function toSnakeCaseDeep(obj) {
5307
5741
  if (obj === null || obj === void 0) {
5308
5742
  return obj;
@@ -5320,25 +5754,148 @@ function toSnakeCaseDeep(obj) {
5320
5754
  }
5321
5755
  return obj;
5322
5756
  }
5323
- function toCamelCaseDeep(obj) {
5324
- if (obj === null || obj === void 0) {
5325
- return obj;
5326
- }
5327
- if (Array.isArray(obj)) {
5328
- return obj.map((item) => toCamelCaseDeep(item));
5757
+
5758
+ // src/evaluation/evaluators/code-evaluator.ts
5759
+ var CodeEvaluator = class {
5760
+ kind = "code";
5761
+ script;
5762
+ cwd;
5763
+ agentTimeoutMs;
5764
+ config;
5765
+ target;
5766
+ constructor(options) {
5767
+ this.script = options.script;
5768
+ this.cwd = options.cwd;
5769
+ this.agentTimeoutMs = options.agentTimeoutMs;
5770
+ this.config = options.config;
5771
+ this.target = options.target;
5329
5772
  }
5330
- if (typeof obj === "object") {
5331
- const result = {};
5332
- for (const [key, value] of Object.entries(obj)) {
5333
- const camelKey = toCamelCase(key);
5334
- result[camelKey] = toCamelCaseDeep(value);
5773
+ async evaluate(context) {
5774
+ const payload = {
5775
+ question: context.evalCase.question,
5776
+ expectedOutcome: context.evalCase.expected_outcome,
5777
+ expectedMessages: context.evalCase.expected_messages,
5778
+ referenceAnswer: context.evalCase.reference_answer,
5779
+ candidateAnswer: context.candidate,
5780
+ outputMessages: context.outputMessages ?? null,
5781
+ guidelineFiles: context.evalCase.guideline_paths,
5782
+ inputFiles: context.evalCase.file_paths.filter(
5783
+ (path15) => !context.evalCase.guideline_paths.includes(path15)
5784
+ ),
5785
+ inputMessages: context.evalCase.input_messages,
5786
+ traceSummary: context.traceSummary ?? null,
5787
+ config: this.config ?? null
5788
+ };
5789
+ const inputPayload = JSON.stringify(toSnakeCaseDeep(payload), null, 2);
5790
+ let proxyEnv;
5791
+ let proxyShutdown;
5792
+ let getProxyUsage;
5793
+ if (this.target !== void 0 && context.judgeProvider) {
5794
+ const maxCalls = this.target.max_calls ?? DEFAULT_MAX_CALLS;
5795
+ const proxy = await createTargetProxy({
5796
+ defaultProvider: context.judgeProvider,
5797
+ targetResolver: context.targetResolver,
5798
+ availableTargets: context.availableTargets,
5799
+ maxCalls
5800
+ });
5801
+ proxyEnv = {
5802
+ AGENTV_TARGET_PROXY_URL: proxy.url,
5803
+ AGENTV_TARGET_PROXY_TOKEN: proxy.token
5804
+ };
5805
+ proxyShutdown = proxy.shutdown;
5806
+ getProxyUsage = proxy.getUsageMetadata;
5807
+ }
5808
+ try {
5809
+ const stdout = await executeScript(
5810
+ this.script,
5811
+ inputPayload,
5812
+ this.agentTimeoutMs,
5813
+ this.cwd,
5814
+ proxyEnv
5815
+ );
5816
+ const parsed = parseJsonSafe(stdout);
5817
+ const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
5818
+ const hits = Array.isArray(parsed?.hits) ? parsed.hits.filter(isNonEmptyString) : [];
5819
+ const misses = Array.isArray(parsed?.misses) ? parsed.misses.filter(isNonEmptyString) : [];
5820
+ const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : void 0;
5821
+ const details = parsed?.details && typeof parsed.details === "object" && !Array.isArray(parsed.details) ? parsed.details : void 0;
5822
+ const proxyUsage = getProxyUsage?.();
5823
+ const evaluatorRawRequest = {
5824
+ script: this.script,
5825
+ ...this.cwd ? { cwd: this.cwd } : {},
5826
+ ...proxyUsage ? {
5827
+ target_proxy: {
5828
+ call_count: proxyUsage.callCount,
5829
+ max_calls: proxyUsage.maxCalls
5830
+ }
5831
+ } : {}
5832
+ };
5833
+ return {
5834
+ score,
5835
+ verdict: scoreToVerdict(score),
5836
+ hits,
5837
+ misses,
5838
+ expectedAspectCount: hits.length + misses.length || 1,
5839
+ reasoning,
5840
+ evaluatorRawRequest,
5841
+ ...details ? { details } : {}
5842
+ };
5843
+ } catch (error) {
5844
+ const message = error instanceof Error ? error.message : String(error);
5845
+ const proxyUsage = getProxyUsage?.();
5846
+ return {
5847
+ score: 0,
5848
+ verdict: "fail",
5849
+ hits: [],
5850
+ misses: [`Code evaluator failed: ${message}`],
5851
+ expectedAspectCount: 1,
5852
+ reasoning: message,
5853
+ evaluatorRawRequest: {
5854
+ script: this.script,
5855
+ ...this.cwd ? { cwd: this.cwd } : {},
5856
+ ...proxyUsage ? {
5857
+ target_proxy: {
5858
+ call_count: proxyUsage.callCount,
5859
+ max_calls: proxyUsage.maxCalls
5860
+ }
5861
+ } : {},
5862
+ error: message
5863
+ }
5864
+ };
5865
+ } finally {
5866
+ if (proxyShutdown) {
5867
+ await proxyShutdown();
5868
+ }
5335
5869
  }
5336
- return result;
5337
5870
  }
5338
- return obj;
5871
+ };
5872
/**
 * Run an evaluator script, feeding it `input` on stdin, and return its
 * trimmed stdout.
 *
 * A string `scriptPath` is run through `execShellWithStdin`; any other value
 * is handed to `execFileWithStdin`.
 *
 * @param {string|*} scriptPath - Script to run (string, or a value accepted by `execFileWithStdin`).
 * @param {string} input - Payload written to the script's stdin.
 * @param {number} [agentTimeoutMs] - Forwarded as `timeoutMs`.
 * @param {string} [cwd] - Working directory for the script.
 * @param {object} [env] - Forwarded as `env` to the exec helper.
 * @returns {Promise<string>} Trimmed stdout of the script.
 * @throws {Error} When the script's exit code is not 0; the message carries
 *   the (possibly truncated) stderr when there is any.
 */
async function executeScript(scriptPath, input, agentTimeoutMs, cwd, env) {
  const execOptions = { cwd, timeoutMs: agentTimeoutMs, env };
  const runner = typeof scriptPath === "string" ? execShellWithStdin : execFileWithStdin;
  const outcome = await runner(scriptPath, input, execOptions);
  if (outcome.exitCode === 0) {
    return outcome.stdout.trim();
  }
  const failurePrefix = `Code evaluator exited with code ${outcome.exitCode}`;
  const stderrText = formatStderr(outcome.stderr);
  if (stderrText.length > 0) {
    throw new Error(`${failurePrefix}: ${stderrText}`);
  }
  throw new Error(failurePrefix);
}
5882
/**
 * Trim stderr output and cap its length for use in error messages.
 *
 * Output longer than `maxLength` keeps only its tail, prefixed by a marker
 * line. The tail is kept because a failing script's closing lines are what
 * the error message reports.
 *
 * @param {*} stderr - Captured stderr. `null`/`undefined` are treated as
 *   empty; other non-string values are coerced with `String`.
 * @param {number} [maxLength=2000] - Maximum number of characters to keep.
 * @returns {string} The trimmed, possibly truncated text.
 */
function formatStderr(stderr, maxLength = 2e3) {
  const trimmed = String(stderr ?? "").trim();
  if (trimmed.length <= maxLength) {
    return trimmed;
  }
  // Explicit start index: `slice(-0)` would return the whole string.
  const tail = trimmed.slice(trimmed.length - maxLength);
  return `...(truncated, last ${maxLength} chars)\n${tail}`;
}
5340
5892
 
5341
- // src/evaluation/evaluators.ts
5893
+ // src/evaluation/evaluators/composite.ts
5894
+ import { generateText as generateText3 } from "ai";
5895
+
5896
+ // src/evaluation/evaluators/llm-judge.ts
5897
+ import { generateText as generateText2 } from "ai";
5898
+ import { z as z2 } from "zod";
5342
5899
  var DEFAULT_EVALUATOR_TEMPLATE = `You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.
5343
5900
 
5344
5901
  Use the reference_answer as a gold standard for a high-quality response (if provided). The reference_answer may be a simple text response, or it may contain a sequence of expected agent messages including tool calls. When it contains multiple messages, the last message represents the final expected answer. The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.
@@ -5418,7 +5975,7 @@ var LlmJudgeEvaluator = class {
5418
5975
  target: judgeProvider.targetName
5419
5976
  };
5420
5977
  try {
5421
- const { data, providerResponse } = await this.runWithRetry({
5978
+ const { data } = await this.runWithRetry({
5422
5979
  context,
5423
5980
  judgeProvider,
5424
5981
  systemPrompt,
@@ -5567,105 +6124,11 @@ You must return a valid JSON object matching this schema:
5567
6124
  "overall_reasoning": "string (summary)"
5568
6125
  }`;
5569
6126
  }
5570
- function scoreToVerdict(score) {
5571
- if (score >= 0.8) {
5572
- return "pass";
5573
- }
5574
- if (score >= 0.6) {
5575
- return "borderline";
5576
- }
5577
- return "fail";
6127
/**
 * Replace `{{ name }}` placeholders in a template with values from `variables`.
 *
 * Only the object's own properties are consulted. A plain `variables[name]`
 * lookup would also resolve inherited members, so a template containing
 * `{{constructor}}` or `{{toString}}` would be filled with a stringified
 * built-in function. Placeholders with no own, non-nullish value are left
 * exactly as written.
 *
 * @param {string} template - Text containing `{{name}}` placeholders
 *   (letters, digits and underscores; surrounding whitespace allowed).
 * @param {object} variables - Map of placeholder name to replacement value.
 * @returns {string} The template with known placeholders substituted.
 */
function substituteVariables(template, variables) {
  return template.replace(/\{\{\s*([a-zA-Z0-9_]+)\s*\}\}/g, (placeholder, varName) => {
    if (!Object.hasOwn(variables, varName)) {
      return placeholder;
    }
    return variables[varName] ?? placeholder;
  });
}
5579
- function clampScore(value) {
5580
- if (Number.isNaN(value) || !Number.isFinite(value)) {
5581
- return 0;
5582
- }
5583
- if (value < 0) {
5584
- return 0;
5585
- }
5586
- if (value > 1) {
5587
- return 1;
5588
- }
5589
- return value;
5590
- }
5591
- function extractJsonBlob(text) {
5592
- const match = text.match(/\{[\s\S]*\}/);
5593
- return match?.[0];
5594
- }
5595
- function parseJsonFromText(text) {
5596
- const cleaned = typeof text === "string" ? text.replace(/```json\n?|```/g, "").trim() : "";
5597
- const blob = extractJsonBlob(cleaned) ?? cleaned;
5598
- return JSON.parse(blob);
5599
- }
5600
- function isNonEmptyString(value) {
5601
- return typeof value === "string" && value.trim().length > 0;
5602
- }
5603
- var CodeEvaluator = class {
5604
- kind = "code";
5605
- script;
5606
- cwd;
5607
- agentTimeoutMs;
5608
- config;
5609
- constructor(options) {
5610
- this.script = options.script;
5611
- this.cwd = options.cwd;
5612
- this.agentTimeoutMs = options.agentTimeoutMs;
5613
- this.config = options.config;
5614
- }
5615
- async evaluate(context) {
5616
- const payload = {
5617
- question: context.evalCase.question,
5618
- expectedOutcome: context.evalCase.expected_outcome,
5619
- expectedMessages: context.evalCase.expected_messages,
5620
- referenceAnswer: context.evalCase.reference_answer,
5621
- candidateAnswer: context.candidate,
5622
- outputMessages: context.outputMessages ?? null,
5623
- guidelineFiles: context.evalCase.guideline_paths,
5624
- inputFiles: context.evalCase.file_paths.filter(
5625
- (path15) => !context.evalCase.guideline_paths.includes(path15)
5626
- ),
5627
- inputMessages: context.evalCase.input_messages,
5628
- traceSummary: context.traceSummary ?? null,
5629
- config: this.config ?? null
5630
- };
5631
- const inputPayload = JSON.stringify(toSnakeCaseDeep(payload), null, 2);
5632
- try {
5633
- const stdout = await executeScript(this.script, inputPayload, this.agentTimeoutMs, this.cwd);
5634
- const parsed = parseJsonSafe(stdout);
5635
- const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
5636
- const hits = Array.isArray(parsed?.hits) ? parsed.hits.filter(isNonEmptyString) : [];
5637
- const misses = Array.isArray(parsed?.misses) ? parsed.misses.filter(isNonEmptyString) : [];
5638
- const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : void 0;
5639
- return {
5640
- score,
5641
- verdict: scoreToVerdict(score),
5642
- hits,
5643
- misses,
5644
- expectedAspectCount: hits.length + misses.length || 1,
5645
- reasoning,
5646
- evaluatorRawRequest: {
5647
- script: this.script,
5648
- ...this.cwd ? { cwd: this.cwd } : {}
5649
- }
5650
- };
5651
- } catch (error) {
5652
- const message = error instanceof Error ? error.message : String(error);
5653
- return {
5654
- score: 0,
5655
- verdict: "fail",
5656
- hits: [],
5657
- misses: [`Code evaluator failed: ${message}`],
5658
- expectedAspectCount: 1,
5659
- reasoning: message,
5660
- evaluatorRawRequest: {
5661
- script: this.script,
5662
- ...this.cwd ? { cwd: this.cwd } : {},
5663
- error: message
5664
- }
5665
- };
5666
- }
5667
- }
5668
- };
5669
6132
  function calculateRubricScore(result, rubrics) {
5670
6133
  const rubricMap = new Map(rubrics.map((rubric) => [rubric.id, rubric]));
5671
6134
  const hits = [];
@@ -5693,273 +6156,281 @@ function calculateRubricScore(result, rubrics) {
5693
6156
  const verdict = failedRequired ? "fail" : scoreToVerdict(score);
5694
6157
  return { score, verdict, hits, misses };
5695
6158
  }
5696
- async function executeScript(scriptPath, input, agentTimeoutMs, cwd) {
5697
- const { stdout, stderr, exitCode } = typeof scriptPath === "string" ? await execShellWithStdin(scriptPath, input, { cwd, timeoutMs: agentTimeoutMs }) : await execFileWithStdin(scriptPath, input, { cwd, timeoutMs: agentTimeoutMs });
5698
- if (exitCode !== 0) {
5699
- const trimmedErr = formatStderr(stderr);
5700
- throw new Error(
5701
- trimmedErr.length > 0 ? `Code evaluator exited with code ${exitCode}: ${trimmedErr}` : `Code evaluator exited with code ${exitCode}`
5702
- );
5703
- }
5704
- return stdout.trim();
5705
- }
5706
- function formatStderr(stderr) {
5707
- const trimmed = stderr.trim();
5708
- const maxLength = 2e3;
5709
- if (trimmed.length <= maxLength) {
5710
- return trimmed;
5711
- }
5712
- const tail = trimmed.slice(-maxLength);
5713
- return `...(truncated, last ${maxLength} chars)
5714
- ${tail}`;
5715
- }
5716
- function parseJsonSafe(payload) {
5717
- try {
5718
- return JSON.parse(payload);
5719
- } catch {
5720
- return void 0;
5721
- }
5722
- }
5723
- function substituteVariables(template, variables) {
5724
- return template.replace(/\{\{\s*([a-zA-Z0-9_]+)\s*\}\}/g, (match, varName) => {
5725
- return variables[varName] ?? match;
5726
- });
5727
- }
5728
- function deepEqual(a, b) {
5729
- if (a === b) return true;
5730
- if (a === null || b === null) return a === b;
5731
- if (typeof a !== typeof b) return false;
5732
- if (typeof a !== "object") return a === b;
5733
- if (Array.isArray(a) !== Array.isArray(b)) return false;
5734
- if (Array.isArray(a) && Array.isArray(b)) {
5735
- if (a.length !== b.length) return false;
5736
- return a.every((val, i) => deepEqual(val, b[i]));
5737
- }
5738
- const aObj = a;
5739
- const bObj = b;
5740
- const aKeys = Object.keys(aObj);
5741
- const bKeys = Object.keys(bObj);
5742
- if (aKeys.length !== bKeys.length) return false;
5743
- return aKeys.every((key) => Object.hasOwn(bObj, key) && deepEqual(aObj[key], bObj[key]));
5744
- }
5745
- function argsMatch(expected, actual) {
5746
- if (expected === void 0) return true;
5747
- if (expected === "any") return true;
5748
- if (actual === void 0) return false;
5749
- for (const key of Object.keys(expected)) {
5750
- if (!Object.hasOwn(actual, key)) return false;
5751
- if (!deepEqual(expected[key], actual[key])) return false;
5752
- }
5753
- return true;
5754
- }
5755
- var ToolTrajectoryEvaluator = class {
5756
- kind = "tool_trajectory";
6159
+
6160
+ // src/evaluation/evaluators/composite.ts
6161
+ var DEFAULT_COMPOSITE_AGGREGATOR_PROMPT = `Review the following evaluation results:
6162
+ {{EVALUATOR_RESULTS_JSON}}
6163
+
6164
+ Decide the final score and verdict based on all evaluator results.
6165
+ Return a JSON object with: score (0.0-1.0), verdict (pass/fail/borderline), and reasoning.`;
6166
+ var CompositeEvaluator = class {
6167
+ kind = "composite";
5757
6168
  config;
6169
+ evaluatorFactory;
6170
+ cwd;
5758
6171
  constructor(options) {
5759
6172
  this.config = options.config;
6173
+ this.evaluatorFactory = options.evaluatorFactory;
6174
+ this.cwd = options.cwd;
5760
6175
  }
5761
- evaluate(context) {
5762
- const { outputMessages, traceSummary } = context;
5763
- const toolCalls = this.extractToolCallsFromMessages(outputMessages);
5764
- if (toolCalls.length === 0 && !traceSummary) {
5765
- return {
5766
- score: 0,
5767
- verdict: "fail",
5768
- hits: [],
5769
- misses: ["No trace available for evaluation"],
5770
- expectedAspectCount: 1
5771
- };
5772
- }
5773
- const summary = toolCalls.length > 0 ? this.buildSummary(toolCalls) : traceSummary;
5774
- if (!summary) {
5775
- return {
5776
- score: 0,
5777
- verdict: "fail",
5778
- hits: [],
5779
- misses: ["No trace available for evaluation"],
5780
- expectedAspectCount: 1
5781
- };
5782
- }
5783
- switch (this.config.mode) {
5784
- case "any_order":
5785
- return this.evaluateAnyOrder(summary);
5786
- case "in_order":
5787
- return this.evaluateInOrder(toolCalls);
5788
- case "exact":
5789
- return this.evaluateExact(toolCalls);
5790
- default:
6176
+ async evaluate(context) {
6177
+ const memberResults = await Promise.all(
6178
+ this.config.evaluators.map(async (memberConfig) => {
6179
+ const evaluator = this.evaluatorFactory.create(memberConfig, context);
5791
6180
  return {
5792
- score: 0,
5793
- verdict: "fail",
5794
- hits: [],
5795
- misses: [`Unknown mode: ${this.config.mode}`],
5796
- expectedAspectCount: 1
6181
+ id: memberConfig.name,
6182
+ type: memberConfig.type,
6183
+ result: await evaluator.evaluate(context)
5797
6184
  };
5798
- }
6185
+ })
6186
+ );
6187
+ return this.aggregate(memberResults, context);
5799
6188
  }
5800
- /**
5801
- * Extract tool calls from output messages.
5802
- */
5803
- extractToolCallsFromMessages(messages) {
5804
- if (!messages) {
5805
- return [];
6189
+ async aggregate(results, context) {
6190
+ const aggregator = this.config.aggregator;
6191
+ switch (aggregator.type) {
6192
+ case "code_judge":
6193
+ return this.runCodeAggregator(results, aggregator.path, aggregator.cwd ?? this.cwd);
6194
+ case "llm_judge":
6195
+ return this.runLlmAggregator(results, context, aggregator);
6196
+ default:
6197
+ return this.runWeightedAverage(results, aggregator.weights);
5806
6198
  }
5807
- const toolCalls = [];
5808
- for (const message of messages) {
5809
- if (message.toolCalls) {
5810
- for (const call of message.toolCalls) {
5811
- toolCalls.push({
5812
- name: call.tool,
5813
- args: call.input
5814
- });
5815
- }
6199
+ }
6200
+ runWeightedAverage(results, weights) {
6201
+ let totalWeight = 0;
6202
+ let weightedSum = 0;
6203
+ const allHits = [];
6204
+ const allMisses = [];
6205
+ const reasoningParts = [];
6206
+ const evaluatorResults = [];
6207
+ for (const member of results) {
6208
+ const weight = weights?.[member.id] ?? 1;
6209
+ totalWeight += weight;
6210
+ weightedSum += member.result.score * weight;
6211
+ allHits.push(...member.result.hits.map((h) => `[${member.id}] ${h}`));
6212
+ allMisses.push(...member.result.misses.map((m) => `[${member.id}] ${m}`));
6213
+ if (member.result.reasoning) {
6214
+ reasoningParts.push(`${member.id}: ${member.result.reasoning}`);
5816
6215
  }
6216
+ evaluatorResults.push({
6217
+ name: member.id,
6218
+ type: member.type,
6219
+ score: member.result.score,
6220
+ weight,
6221
+ verdict: member.result.verdict,
6222
+ hits: [...member.result.hits],
6223
+ misses: [...member.result.misses],
6224
+ reasoning: member.result.reasoning,
6225
+ evaluatorRawRequest: member.result.evaluatorRawRequest,
6226
+ evaluatorResults: member.result.evaluatorResults,
6227
+ details: member.result.details
6228
+ });
5817
6229
  }
5818
- return toolCalls;
5819
- }
5820
- /**
5821
- * Build a summary from extracted tool calls.
5822
- */
5823
- buildSummary(toolCalls) {
5824
- const toolCallsByName = {};
5825
- for (const call of toolCalls) {
5826
- toolCallsByName[call.name] = (toolCallsByName[call.name] ?? 0) + 1;
5827
- }
5828
- const toolNames = Object.keys(toolCallsByName).sort();
6230
+ const finalScore = totalWeight > 0 ? weightedSum / totalWeight : 0;
5829
6231
  return {
5830
- eventCount: toolCalls.length,
5831
- toolNames,
5832
- toolCallsByName,
5833
- errorCount: 0
6232
+ score: clampScore(finalScore),
6233
+ verdict: scoreToVerdict(finalScore),
6234
+ hits: allHits,
6235
+ misses: allMisses,
6236
+ expectedAspectCount: Math.max(allHits.length + allMisses.length, 1),
6237
+ reasoning: reasoningParts.length > 0 ? reasoningParts.join("; ") : void 0,
6238
+ evaluatorRawRequest: {
6239
+ aggregator: "weighted_average",
6240
+ ...weights ? { weights } : {}
6241
+ },
6242
+ evaluatorResults
5834
6243
  };
5835
6244
  }
5836
- evaluateAnyOrder(summary) {
5837
- const minimums = this.config.minimums ?? {};
5838
- const toolNames = Object.keys(minimums);
5839
- if (toolNames.length === 0) {
6245
+ async runCodeAggregator(results, scriptPath, cwd, weights) {
6246
+ const resultsObject = Object.fromEntries(results.map((r) => [r.id, r.result]));
6247
+ const inputPayload = JSON.stringify({ results: resultsObject }, null, 2);
6248
+ const evaluatorResults = results.map((member) => ({
6249
+ name: member.id,
6250
+ type: member.type,
6251
+ score: member.result.score,
6252
+ weight: weights?.[member.id] ?? 1,
6253
+ verdict: member.result.verdict,
6254
+ hits: [...member.result.hits],
6255
+ misses: [...member.result.misses],
6256
+ reasoning: member.result.reasoning,
6257
+ evaluatorRawRequest: member.result.evaluatorRawRequest,
6258
+ evaluatorResults: member.result.evaluatorResults,
6259
+ details: member.result.details
6260
+ }));
6261
+ try {
6262
+ const stdout = await executeScript(scriptPath, inputPayload, void 0, cwd);
6263
+ const parsed = parseJsonSafe(stdout);
6264
+ const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
6265
+ const hits = Array.isArray(parsed?.hits) ? parsed.hits.filter(isNonEmptyString) : [];
6266
+ const misses = Array.isArray(parsed?.misses) ? parsed.misses.filter(isNonEmptyString) : [];
6267
+ const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : void 0;
6268
+ const verdict = typeof parsed?.verdict === "string" && (parsed.verdict === "pass" || parsed.verdict === "fail" || parsed.verdict === "borderline") ? parsed.verdict : scoreToVerdict(score);
5840
6269
  return {
5841
- score: 1,
5842
- verdict: "pass",
5843
- hits: ["No tool requirements specified"],
5844
- misses: [],
5845
- expectedAspectCount: 0
6270
+ score,
6271
+ verdict,
6272
+ hits,
6273
+ misses,
6274
+ expectedAspectCount: hits.length + misses.length || 1,
6275
+ reasoning,
6276
+ evaluatorRawRequest: {
6277
+ aggregator: "code_judge",
6278
+ script: scriptPath
6279
+ },
6280
+ evaluatorResults
6281
+ };
6282
+ } catch (error) {
6283
+ const message = error instanceof Error ? error.message : String(error);
6284
+ return {
6285
+ score: 0,
6286
+ verdict: "fail",
6287
+ hits: [],
6288
+ misses: [`Code aggregator failed: ${message}`],
6289
+ expectedAspectCount: 1,
6290
+ reasoning: message,
6291
+ evaluatorRawRequest: {
6292
+ aggregator: "code_judge",
6293
+ script: scriptPath,
6294
+ error: message
6295
+ },
6296
+ evaluatorResults
5846
6297
  };
5847
6298
  }
5848
- const hits = [];
5849
- const misses = [];
5850
- for (const toolName of toolNames) {
5851
- const required = minimums[toolName];
5852
- const actual = summary.toolCallsByName[toolName] ?? 0;
5853
- if (actual >= required) {
5854
- hits.push(`${toolName}: called ${actual} times (required \u2265${required})`);
5855
- } else {
5856
- misses.push(`${toolName}: called ${actual} times (required \u2265${required})`);
5857
- }
6299
+ }
6300
+ async runLlmAggregator(results, context, config) {
6301
+ const judgeProvider = context.judgeProvider;
6302
+ if (!judgeProvider) {
6303
+ throw new Error("No judge provider available for LLM aggregation");
5858
6304
  }
5859
- const score = hits.length / toolNames.length;
5860
- return {
5861
- score,
5862
- verdict: scoreToVerdict(score),
5863
- hits,
5864
- misses,
5865
- expectedAspectCount: toolNames.length
6305
+ const resultsObject = Object.fromEntries(results.map((r) => [r.id, r.result]));
6306
+ const resultsJson = JSON.stringify(resultsObject, null, 2);
6307
+ const evaluatorResults = results.map((member) => ({
6308
+ name: member.id,
6309
+ type: member.type,
6310
+ score: member.result.score,
6311
+ verdict: member.result.verdict,
6312
+ hits: [...member.result.hits],
6313
+ misses: [...member.result.misses],
6314
+ reasoning: member.result.reasoning,
6315
+ evaluatorRawRequest: member.result.evaluatorRawRequest,
6316
+ evaluatorResults: member.result.evaluatorResults,
6317
+ details: member.result.details
6318
+ }));
6319
+ const promptTemplate = config.prompt ?? DEFAULT_COMPOSITE_AGGREGATOR_PROMPT;
6320
+ const userPrompt = promptTemplate.replace(/\{\{EVALUATOR_RESULTS_JSON\}\}/g, resultsJson);
6321
+ const systemPrompt = buildOutputSchema();
6322
+ const evaluatorRawRequest = {
6323
+ aggregator: "llm_judge",
6324
+ userPrompt,
6325
+ systemPrompt,
6326
+ target: judgeProvider.targetName
5866
6327
  };
5867
- }
5868
- evaluateInOrder(toolCalls) {
5869
- const expected = this.config.expected ?? [];
5870
- if (expected.length === 0) {
6328
+ try {
6329
+ const model = judgeProvider.asLanguageModel?.();
6330
+ if (model) {
6331
+ const { text } = await generateText3({
6332
+ model,
6333
+ system: systemPrompt,
6334
+ prompt: userPrompt
6335
+ });
6336
+ const data2 = freeformEvaluationSchema.parse(parseJsonFromText(text));
6337
+ const score2 = clampScore(data2.score);
6338
+ const hits2 = Array.isArray(data2.hits) ? data2.hits.filter(isNonEmptyString).slice(0, 4) : [];
6339
+ const misses2 = Array.isArray(data2.misses) ? data2.misses.filter(isNonEmptyString).slice(0, 4) : [];
6340
+ const reasoning2 = data2.reasoning;
6341
+ return {
6342
+ score: score2,
6343
+ verdict: scoreToVerdict(score2),
6344
+ hits: hits2,
6345
+ misses: misses2,
6346
+ expectedAspectCount: Math.max(hits2.length + misses2.length, 1),
6347
+ reasoning: reasoning2,
6348
+ evaluatorRawRequest,
6349
+ evaluatorResults
6350
+ };
6351
+ }
6352
+ const response = await judgeProvider.invoke({
6353
+ question: userPrompt,
6354
+ systemPrompt,
6355
+ evalCaseId: context.evalCase.id,
6356
+ attempt: context.attempt
6357
+ });
6358
+ const data = freeformEvaluationSchema.parse(
6359
+ parseJsonFromText(extractLastAssistantContent(response.outputMessages))
6360
+ );
6361
+ const score = clampScore(data.score);
6362
+ const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
6363
+ const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
6364
+ const reasoning = data.reasoning;
5871
6365
  return {
5872
- score: 1,
5873
- verdict: "pass",
5874
- hits: ["No tool sequence specified"],
6366
+ score,
6367
+ verdict: scoreToVerdict(score),
6368
+ hits,
6369
+ misses,
6370
+ expectedAspectCount: Math.max(hits.length + misses.length, 1),
6371
+ reasoning,
6372
+ evaluatorRawRequest,
6373
+ evaluatorResults
6374
+ };
6375
+ } catch {
6376
+ return {
6377
+ score: 0,
6378
+ verdict: "fail",
6379
+ hits: [],
5875
6380
  misses: [],
5876
- expectedAspectCount: 0
6381
+ expectedAspectCount: 1,
6382
+ evaluatorRawRequest,
6383
+ evaluatorResults
5877
6384
  };
5878
6385
  }
5879
- const hits = [];
5880
- const misses = [];
5881
- let actualIndex = 0;
5882
- for (let i = 0; i < expected.length; i++) {
5883
- const expectedItem = expected[i];
5884
- const expectedTool = expectedItem.tool;
5885
- let found = false;
5886
- let argsMismatch = false;
5887
- while (actualIndex < toolCalls.length) {
5888
- const actualCall = toolCalls[actualIndex];
5889
- if (actualCall.name === expectedTool) {
5890
- if (argsMatch(expectedItem.args, actualCall.args)) {
5891
- hits.push(`Found ${expectedTool} at position ${actualIndex}`);
5892
- actualIndex++;
5893
- found = true;
5894
- break;
5895
- }
5896
- misses.push(
5897
- `Expected ${expectedTool} at position ${i}: tool found at ${actualIndex} but args mismatch`
5898
- );
5899
- actualIndex++;
5900
- argsMismatch = true;
5901
- break;
6386
+ }
6387
+ };
6388
+
6389
+ // src/evaluation/evaluators/cost.ts
6390
+ var CostEvaluator = class {
6391
+ kind = "cost";
6392
+ config;
6393
+ constructor(options) {
6394
+ this.config = options.config;
6395
+ }
6396
+ evaluate(context) {
6397
+ const { budget } = this.config;
6398
+ const costUsd = context.traceSummary?.costUsd;
6399
+ if (costUsd === void 0) {
6400
+ return {
6401
+ score: 0,
6402
+ verdict: "fail",
6403
+ hits: [],
6404
+ misses: ["No cost data available in trace"],
6405
+ expectedAspectCount: 1,
6406
+ reasoning: "Execution cost not reported by provider",
6407
+ evaluatorRawRequest: {
6408
+ type: "cost",
6409
+ budget,
6410
+ costUsd: null
5902
6411
  }
5903
- actualIndex++;
5904
- }
5905
- if (!found && !argsMismatch) {
5906
- misses.push(`Expected ${expectedTool} at position ${i}, not found in remaining trace`);
5907
- }
6412
+ };
5908
6413
  }
5909
- const score = hits.length / expected.length;
6414
+ const passed = costUsd <= budget;
6415
+ const score = passed ? 1 : 0;
6416
+ const formatCost = (n) => `$${n.toFixed(4)}`;
5910
6417
  return {
5911
6418
  score,
5912
- verdict: scoreToVerdict(score),
5913
- hits,
5914
- misses,
5915
- expectedAspectCount: expected.length
5916
- };
5917
- }
5918
- evaluateExact(toolCalls) {
5919
- const expected = this.config.expected ?? [];
5920
- if (expected.length === 0) {
5921
- return {
5922
- score: 1,
5923
- verdict: "pass",
5924
- hits: ["No tool sequence specified"],
5925
- misses: [],
5926
- expectedAspectCount: 0
5927
- };
5928
- }
5929
- const hits = [];
5930
- const misses = [];
5931
- if (toolCalls.length !== expected.length) {
5932
- misses.push(`Expected ${expected.length} tool calls, got ${toolCalls.length}`);
5933
- }
5934
- const checkLength = Math.min(expected.length, toolCalls.length);
5935
- for (let i = 0; i < checkLength; i++) {
5936
- const expectedItem = expected[i];
5937
- const expectedTool = expectedItem.tool;
5938
- const actualCall = toolCalls[i];
5939
- const actualTool = actualCall.name;
5940
- if (actualTool === expectedTool) {
5941
- if (argsMatch(expectedItem.args, actualCall.args)) {
5942
- hits.push(`Position ${i}: ${expectedTool}`);
5943
- } else {
5944
- misses.push(`Position ${i}: ${expectedTool} args mismatch`);
5945
- }
5946
- } else {
5947
- misses.push(`Position ${i}: expected ${expectedTool}, got ${actualTool}`);
5948
- }
5949
- }
5950
- for (let i = checkLength; i < expected.length; i++) {
5951
- misses.push(`Position ${i}: expected ${expected[i].tool}, got nothing`);
5952
- }
5953
- const score = hits.length / expected.length;
5954
- return {
5955
- score,
5956
- verdict: scoreToVerdict(score),
5957
- hits,
5958
- misses,
5959
- expectedAspectCount: expected.length
6419
+ verdict: passed ? "pass" : "fail",
6420
+ hits: passed ? [`Cost ${formatCost(costUsd)} <= ${formatCost(budget)} budget`] : [],
6421
+ misses: passed ? [] : [`Cost ${formatCost(costUsd)} > ${formatCost(budget)} budget`],
6422
+ expectedAspectCount: 1,
6423
+ reasoning: `Execution cost ${formatCost(costUsd)} (budget: ${formatCost(budget)})`,
6424
+ evaluatorRawRequest: {
6425
+ type: "cost",
6426
+ budget,
6427
+ costUsd
6428
+ }
5960
6429
  };
5961
6430
  }
5962
6431
  };
6432
+
6433
+ // src/evaluation/evaluators/field-accuracy.ts
5963
6434
  var DEFAULT_DATE_FORMATS = [
5964
6435
  "YYYY-MM-DDTHH:mm:ssZ",
5965
6436
  // ISO with timezone
@@ -6168,438 +6639,213 @@ var FieldAccuracyEvaluator = class {
6168
6639
  weight,
6169
6640
  hit: false,
6170
6641
  message: `${path15} (non-numeric value)`
6171
- };
6172
- }
6173
- if (!Number.isFinite(candidateNum) || !Number.isFinite(expectedNum)) {
6174
- return {
6175
- path: path15,
6176
- score: 0,
6177
- weight,
6178
- hit: false,
6179
- message: `${path15} (invalid numeric value)`
6180
- };
6181
- }
6182
- const diff = Math.abs(candidateNum - expectedNum);
6183
- let withinTolerance;
6184
- if (relative) {
6185
- const relativeDiff = expectedNum === 0 ? diff : diff / Math.abs(expectedNum);
6186
- withinTolerance = relativeDiff <= tolerance;
6187
- } else {
6188
- withinTolerance = diff <= tolerance;
6189
- }
6190
- if (withinTolerance) {
6191
- return {
6192
- path: path15,
6193
- score: 1,
6194
- weight,
6195
- hit: true,
6196
- message: `${path15} (within tolerance: diff=${diff.toFixed(2)})`
6197
- };
6198
- }
6199
- return {
6200
- path: path15,
6201
- score: 0,
6202
- weight,
6203
- hit: false,
6204
- message: `${path15} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
6205
- };
6206
- }
6207
- /**
6208
- * Date comparison with format normalization.
6209
- */
6210
- compareDate(path15, candidateValue, expectedValue, fieldConfig, weight) {
6211
- const formats = fieldConfig.formats ?? DEFAULT_DATE_FORMATS;
6212
- const candidateDate = parseDate(String(candidateValue), formats);
6213
- const expectedDate = parseDate(String(expectedValue), formats);
6214
- if (candidateDate === null) {
6215
- return {
6216
- path: path15,
6217
- score: 0,
6218
- weight,
6219
- hit: false,
6220
- message: `${path15} (unparseable candidate date)`
6221
- };
6222
- }
6223
- if (expectedDate === null) {
6224
- return {
6225
- path: path15,
6226
- score: 0,
6227
- weight,
6228
- hit: false,
6229
- message: `${path15} (unparseable expected date)`
6230
- };
6231
- }
6232
- if (candidateDate.getFullYear() === expectedDate.getFullYear() && candidateDate.getMonth() === expectedDate.getMonth() && candidateDate.getDate() === expectedDate.getDate()) {
6233
- return {
6234
- path: path15,
6235
- score: 1,
6236
- weight,
6237
- hit: true,
6238
- message: path15
6239
- };
6240
- }
6241
- return {
6242
- path: path15,
6243
- score: 0,
6244
- weight,
6245
- hit: false,
6246
- message: `${path15} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
6247
- };
6248
- }
6249
- /**
6250
- * Aggregate field results using configured strategy.
6251
- */
6252
- aggregateResults(results) {
6253
- const aggregation = this.config.aggregation ?? "weighted_average";
6254
- const hits = [];
6255
- const misses = [];
6256
- for (const result of results) {
6257
- if (result.hit) {
6258
- hits.push(result.message);
6259
- } else {
6260
- misses.push(result.message);
6261
- }
6262
- }
6263
- let score;
6264
- if (aggregation === "all_or_nothing") {
6265
- score = misses.length === 0 ? 1 : 0;
6266
- } else {
6267
- const totalWeight = results.reduce((sum, r) => sum + r.weight, 0);
6268
- if (totalWeight === 0) {
6269
- score = results.length === 0 ? 1 : 0;
6270
- } else {
6271
- const weightedSum = results.reduce((sum, r) => sum + r.score * r.weight, 0);
6272
- score = weightedSum / totalWeight;
6273
- }
6274
- }
6275
- const reasoning = `${hits.length}/${results.length} fields matched`;
6276
- return {
6277
- score: clampScore(score),
6278
- verdict: scoreToVerdict(score),
6279
- hits: hits.slice(0, 4),
6280
- misses: misses.slice(0, 4),
6281
- expectedAspectCount: results.length,
6282
- reasoning
6283
- };
6284
- }
6285
- };
6286
- function resolvePath(obj, path15) {
6287
- if (!path15 || !obj) {
6288
- return void 0;
6289
- }
6290
- const parts = path15.split(/\.|\[|\]/).filter((p) => p.length > 0);
6291
- let current = obj;
6292
- for (const part of parts) {
6293
- if (current === null || current === void 0) {
6294
- return void 0;
6295
- }
6296
- if (typeof current !== "object") {
6297
- return void 0;
6298
- }
6299
- const isIndex = /^\d+$/.test(part);
6300
- if (isIndex && Array.isArray(current)) {
6301
- current = current[Number.parseInt(part, 10)];
6302
- } else {
6303
- current = current[part];
6304
- }
6305
- }
6306
- return current;
6307
- }
6308
- function toNumber(value) {
6309
- if (typeof value === "number") {
6310
- return value;
6311
- }
6312
- if (typeof value === "string") {
6313
- const num = Number.parseFloat(value);
6314
- return Number.isNaN(num) ? null : num;
6315
- }
6316
- return null;
6317
- }
6318
- function parseDate(dateStr, formats) {
6319
- if (!dateStr) return null;
6320
- const trimmed = dateStr.trim();
6321
- const isoDate = new Date(trimmed);
6322
- if (!Number.isNaN(isoDate.getTime())) {
6323
- return isoDate;
6324
- }
6325
- const localizedMatch = trimmed.match(/^(\d{1,2})-([A-Za-z]{3,9})-(\d{4})$/);
6326
- if (localizedMatch) {
6327
- const day = Number.parseInt(localizedMatch[1], 10);
6328
- const monthName = localizedMatch[2].toLowerCase();
6329
- const year = Number.parseInt(localizedMatch[3], 10);
6330
- const month = MONTH_NAMES[monthName];
6331
- if (month !== void 0) {
6332
- return new Date(year, month, day);
6333
- }
6334
- }
6335
- const usMatch = trimmed.match(/^(\d{1,2})[\/\-](\d{1,2})[\/\-](\d{4})$/);
6336
- if (usMatch) {
6337
- const hasUSFormat = formats.some((f) => f.includes("MM/DD") || f.includes("MM-DD"));
6338
- const hasEUFormat = formats.some((f) => f.includes("DD/MM") || f.includes("DD-MM"));
6339
- if (hasUSFormat && !hasEUFormat) {
6340
- const month = Number.parseInt(usMatch[1], 10) - 1;
6341
- const day = Number.parseInt(usMatch[2], 10);
6342
- const year = Number.parseInt(usMatch[3], 10);
6343
- if (month >= 0 && month <= 11 && day >= 1 && day <= 31) {
6344
- return new Date(year, month, day);
6345
- }
6346
- } else if (hasEUFormat && !hasUSFormat) {
6347
- const day = Number.parseInt(usMatch[1], 10);
6348
- const month = Number.parseInt(usMatch[2], 10) - 1;
6349
- const year = Number.parseInt(usMatch[3], 10);
6350
- if (month >= 0 && month <= 11 && day >= 1 && day <= 31) {
6351
- return new Date(year, month, day);
6352
- }
6353
- } else {
6354
- const num1 = Number.parseInt(usMatch[1], 10);
6355
- const num2 = Number.parseInt(usMatch[2], 10);
6356
- const year = Number.parseInt(usMatch[3], 10);
6357
- if (num1 > 12 && num2 <= 12) {
6358
- return new Date(year, num2 - 1, num1);
6359
- }
6360
- if (num2 > 12 && num1 <= 12) {
6361
- return new Date(year, num1 - 1, num2);
6362
- }
6363
- if (num1 <= 12 && num2 <= 31) {
6364
- return new Date(year, num1 - 1, num2);
6365
- }
6366
- }
6367
- }
6368
- return null;
6369
- }
6370
- function formatDateISO(date) {
6371
- return date.toISOString().split("T")[0];
6372
- }
6373
- function parseJsonFromTextSafe(text) {
6374
- const cleaned = typeof text === "string" ? text.replace(/```json\n?|```/g, "").trim() : "";
6375
- const match = cleaned.match(/\{[\s\S]*\}/);
6376
- const blob = match?.[0] ?? cleaned;
6377
- return JSON.parse(blob);
6378
- }
6379
- var DEFAULT_COMPOSITE_AGGREGATOR_PROMPT = `Review the following evaluation results:
6380
- {{EVALUATOR_RESULTS_JSON}}
6381
-
6382
- Decide the final score and verdict based on all evaluator results.
6383
- Return a JSON object with: score (0.0-1.0), verdict (pass/fail/borderline), and reasoning.`;
6384
- var CompositeEvaluator = class {
6385
- kind = "composite";
6386
- config;
6387
- evaluatorFactory;
6388
- cwd;
6389
- constructor(options) {
6390
- this.config = options.config;
6391
- this.evaluatorFactory = options.evaluatorFactory;
6392
- this.cwd = options.cwd;
6393
- }
6394
- async evaluate(context) {
6395
- const memberResults = await Promise.all(
6396
- this.config.evaluators.map(async (memberConfig) => {
6397
- const evaluator = this.evaluatorFactory.create(memberConfig, context);
6398
- return {
6399
- id: memberConfig.name,
6400
- type: memberConfig.type,
6401
- result: await evaluator.evaluate(context)
6402
- };
6403
- })
6404
- );
6405
- return this.aggregate(memberResults, context);
6406
- }
6407
- async aggregate(results, context) {
6408
- const aggregator = this.config.aggregator;
6409
- switch (aggregator.type) {
6410
- case "code_judge":
6411
- return this.runCodeAggregator(results, aggregator.path, aggregator.cwd ?? this.cwd);
6412
- case "llm_judge":
6413
- return this.runLlmAggregator(results, context, aggregator);
6414
- default:
6415
- return this.runWeightedAverage(results, aggregator.weights);
6416
- }
6417
- }
6418
- runWeightedAverage(results, weights) {
6419
- let totalWeight = 0;
6420
- let weightedSum = 0;
6421
- const allHits = [];
6422
- const allMisses = [];
6423
- const reasoningParts = [];
6424
- const evaluatorResults = [];
6425
- for (const member of results) {
6426
- const weight = weights?.[member.id] ?? 1;
6427
- totalWeight += weight;
6428
- weightedSum += member.result.score * weight;
6429
- allHits.push(...member.result.hits.map((h) => `[${member.id}] ${h}`));
6430
- allMisses.push(...member.result.misses.map((m) => `[${member.id}] ${m}`));
6431
- if (member.result.reasoning) {
6432
- reasoningParts.push(`${member.id}: ${member.result.reasoning}`);
6433
- }
6434
- evaluatorResults.push({
6435
- name: member.id,
6436
- type: member.type,
6437
- score: member.result.score,
6438
- weight,
6439
- verdict: member.result.verdict,
6440
- hits: [...member.result.hits],
6441
- misses: [...member.result.misses],
6442
- reasoning: member.result.reasoning,
6443
- evaluatorRawRequest: member.result.evaluatorRawRequest,
6444
- evaluatorResults: member.result.evaluatorResults
6445
- });
6446
- }
6447
- const finalScore = totalWeight > 0 ? weightedSum / totalWeight : 0;
6448
- return {
6449
- score: clampScore(finalScore),
6450
- verdict: scoreToVerdict(finalScore),
6451
- hits: allHits,
6452
- misses: allMisses,
6453
- expectedAspectCount: Math.max(allHits.length + allMisses.length, 1),
6454
- reasoning: reasoningParts.length > 0 ? reasoningParts.join("; ") : void 0,
6455
- evaluatorRawRequest: {
6456
- aggregator: "weighted_average",
6457
- ...weights ? { weights } : {}
6458
- },
6459
- evaluatorResults
6460
- };
6461
- }
6462
- async runCodeAggregator(results, scriptPath, cwd, weights) {
6463
- const resultsObject = Object.fromEntries(results.map((r) => [r.id, r.result]));
6464
- const inputPayload = JSON.stringify({ results: resultsObject }, null, 2);
6465
- const evaluatorResults = results.map((member) => ({
6466
- name: member.id,
6467
- type: member.type,
6468
- score: member.result.score,
6469
- weight: weights?.[member.id] ?? 1,
6470
- verdict: member.result.verdict,
6471
- hits: [...member.result.hits],
6472
- misses: [...member.result.misses],
6473
- reasoning: member.result.reasoning,
6474
- evaluatorRawRequest: member.result.evaluatorRawRequest,
6475
- evaluatorResults: member.result.evaluatorResults
6476
- }));
6477
- try {
6478
- const stdout = await executeScript(scriptPath, inputPayload, void 0, cwd);
6479
- const parsed = parseJsonSafe(stdout);
6480
- const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
6481
- const hits = Array.isArray(parsed?.hits) ? parsed.hits.filter(isNonEmptyString) : [];
6482
- const misses = Array.isArray(parsed?.misses) ? parsed.misses.filter(isNonEmptyString) : [];
6483
- const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : void 0;
6484
- const verdict = typeof parsed?.verdict === "string" && (parsed.verdict === "pass" || parsed.verdict === "fail" || parsed.verdict === "borderline") ? parsed.verdict : scoreToVerdict(score);
6485
- return {
6486
- score,
6487
- verdict,
6488
- hits,
6489
- misses,
6490
- expectedAspectCount: hits.length + misses.length || 1,
6491
- reasoning,
6492
- evaluatorRawRequest: {
6493
- aggregator: "code_judge",
6494
- script: scriptPath
6495
- },
6496
- evaluatorResults
6497
- };
6498
- } catch (error) {
6499
- const message = error instanceof Error ? error.message : String(error);
6500
- return {
6501
- score: 0,
6502
- verdict: "fail",
6503
- hits: [],
6504
- misses: [`Code aggregator failed: ${message}`],
6505
- expectedAspectCount: 1,
6506
- reasoning: message,
6507
- evaluatorRawRequest: {
6508
- aggregator: "code_judge",
6509
- script: scriptPath,
6510
- error: message
6511
- },
6512
- evaluatorResults
6513
- };
6514
- }
6515
- }
6516
- async runLlmAggregator(results, context, config) {
6517
- const judgeProvider = context.judgeProvider;
6518
- if (!judgeProvider) {
6519
- throw new Error("No judge provider available for LLM aggregation");
6520
- }
6521
- const resultsObject = Object.fromEntries(results.map((r) => [r.id, r.result]));
6522
- const resultsJson = JSON.stringify(resultsObject, null, 2);
6523
- const evaluatorResults = results.map((member) => ({
6524
- name: member.id,
6525
- type: member.type,
6526
- score: member.result.score,
6527
- verdict: member.result.verdict,
6528
- hits: [...member.result.hits],
6529
- misses: [...member.result.misses],
6530
- reasoning: member.result.reasoning,
6531
- evaluatorRawRequest: member.result.evaluatorRawRequest,
6532
- evaluatorResults: member.result.evaluatorResults
6533
- }));
6534
- const promptTemplate = config.prompt ?? DEFAULT_COMPOSITE_AGGREGATOR_PROMPT;
6535
- const userPrompt = promptTemplate.replace(/\{\{EVALUATOR_RESULTS_JSON\}\}/g, resultsJson);
6536
- const systemPrompt = buildOutputSchema();
6537
- const evaluatorRawRequest = {
6538
- aggregator: "llm_judge",
6539
- userPrompt,
6540
- systemPrompt,
6541
- target: judgeProvider.targetName
6542
- };
6543
- try {
6544
- const model = judgeProvider.asLanguageModel?.();
6545
- if (model) {
6546
- const { text } = await generateText2({
6547
- model,
6548
- system: systemPrompt,
6549
- prompt: userPrompt
6550
- });
6551
- const data2 = freeformEvaluationSchema.parse(parseJsonFromText(text));
6552
- const score2 = clampScore(data2.score);
6553
- const hits2 = Array.isArray(data2.hits) ? data2.hits.filter(isNonEmptyString).slice(0, 4) : [];
6554
- const misses2 = Array.isArray(data2.misses) ? data2.misses.filter(isNonEmptyString).slice(0, 4) : [];
6555
- const reasoning2 = data2.reasoning;
6556
- return {
6557
- score: score2,
6558
- verdict: scoreToVerdict(score2),
6559
- hits: hits2,
6560
- misses: misses2,
6561
- expectedAspectCount: Math.max(hits2.length + misses2.length, 1),
6562
- reasoning: reasoning2,
6563
- evaluatorRawRequest,
6564
- evaluatorResults
6565
- };
6566
- }
6567
- const response = await judgeProvider.invoke({
6568
- question: userPrompt,
6569
- systemPrompt,
6570
- evalCaseId: context.evalCase.id,
6571
- attempt: context.attempt
6572
- });
6573
- const data = freeformEvaluationSchema.parse(
6574
- parseJsonFromText(extractLastAssistantContent(response.outputMessages))
6575
- );
6576
- const score = clampScore(data.score);
6577
- const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
6578
- const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
6579
- const reasoning = data.reasoning;
6642
+ };
6643
+ }
6644
+ if (!Number.isFinite(candidateNum) || !Number.isFinite(expectedNum)) {
6580
6645
  return {
6581
- score,
6582
- verdict: scoreToVerdict(score),
6583
- hits,
6584
- misses,
6585
- expectedAspectCount: Math.max(hits.length + misses.length, 1),
6586
- reasoning,
6587
- evaluatorRawRequest,
6588
- evaluatorResults
6646
+ path: path15,
6647
+ score: 0,
6648
+ weight,
6649
+ hit: false,
6650
+ message: `${path15} (invalid numeric value)`
6589
6651
  };
6590
- } catch {
6652
+ }
6653
+ const diff = Math.abs(candidateNum - expectedNum);
6654
+ let withinTolerance;
6655
+ if (relative) {
6656
+ const relativeDiff = expectedNum === 0 ? diff : diff / Math.abs(expectedNum);
6657
+ withinTolerance = relativeDiff <= tolerance;
6658
+ } else {
6659
+ withinTolerance = diff <= tolerance;
6660
+ }
6661
+ if (withinTolerance) {
6662
+ return {
6663
+ path: path15,
6664
+ score: 1,
6665
+ weight,
6666
+ hit: true,
6667
+ message: `${path15} (within tolerance: diff=${diff.toFixed(2)})`
6668
+ };
6669
+ }
6670
+ return {
6671
+ path: path15,
6672
+ score: 0,
6673
+ weight,
6674
+ hit: false,
6675
+ message: `${path15} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
6676
+ };
6677
+ }
6678
+ /**
6679
+ * Date comparison with format normalization.
6680
+ */
6681
+ compareDate(path15, candidateValue, expectedValue, fieldConfig, weight) {
6682
+ const formats = fieldConfig.formats ?? DEFAULT_DATE_FORMATS;
6683
+ const candidateDate = parseDate(String(candidateValue), formats);
6684
+ const expectedDate = parseDate(String(expectedValue), formats);
6685
+ if (candidateDate === null) {
6591
6686
  return {
6687
+ path: path15,
6592
6688
  score: 0,
6593
- verdict: "fail",
6594
- hits: [],
6595
- misses: [],
6596
- expectedAspectCount: 1,
6597
- evaluatorRawRequest,
6598
- evaluatorResults
6689
+ weight,
6690
+ hit: false,
6691
+ message: `${path15} (unparseable candidate date)`
6692
+ };
6693
+ }
6694
+ if (expectedDate === null) {
6695
+ return {
6696
+ path: path15,
6697
+ score: 0,
6698
+ weight,
6699
+ hit: false,
6700
+ message: `${path15} (unparseable expected date)`
6701
+ };
6702
+ }
6703
+ if (candidateDate.getFullYear() === expectedDate.getFullYear() && candidateDate.getMonth() === expectedDate.getMonth() && candidateDate.getDate() === expectedDate.getDate()) {
6704
+ return {
6705
+ path: path15,
6706
+ score: 1,
6707
+ weight,
6708
+ hit: true,
6709
+ message: path15
6599
6710
  };
6600
6711
  }
6712
+ return {
6713
+ path: path15,
6714
+ score: 0,
6715
+ weight,
6716
+ hit: false,
6717
+ message: `${path15} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
6718
+ };
6719
+ }
6720
+ /**
6721
+ * Aggregate field results using configured strategy.
6722
+ */
6723
+ aggregateResults(results) {
6724
+ const aggregation = this.config.aggregation ?? "weighted_average";
6725
+ const hits = [];
6726
+ const misses = [];
6727
+ for (const result of results) {
6728
+ if (result.hit) {
6729
+ hits.push(result.message);
6730
+ } else {
6731
+ misses.push(result.message);
6732
+ }
6733
+ }
6734
+ let score;
6735
+ if (aggregation === "all_or_nothing") {
6736
+ score = misses.length === 0 ? 1 : 0;
6737
+ } else {
6738
+ const totalWeight = results.reduce((sum, r) => sum + r.weight, 0);
6739
+ if (totalWeight === 0) {
6740
+ score = results.length === 0 ? 1 : 0;
6741
+ } else {
6742
+ const weightedSum = results.reduce((sum, r) => sum + r.score * r.weight, 0);
6743
+ score = weightedSum / totalWeight;
6744
+ }
6745
+ }
6746
+ const reasoning = `${hits.length}/${results.length} fields matched`;
6747
+ return {
6748
+ score: clampScore(score),
6749
+ verdict: scoreToVerdict(score),
6750
+ hits: hits.slice(0, 4),
6751
+ misses: misses.slice(0, 4),
6752
+ expectedAspectCount: results.length,
6753
+ reasoning
6754
+ };
6601
6755
  }
6602
6756
  };
6757
+ function resolvePath(obj, path15) {
6758
+ if (!path15 || !obj) {
6759
+ return void 0;
6760
+ }
6761
+ const parts = path15.split(/\.|\[|\]/).filter((p) => p.length > 0);
6762
+ let current = obj;
6763
+ for (const part of parts) {
6764
+ if (current === null || current === void 0) {
6765
+ return void 0;
6766
+ }
6767
+ if (typeof current !== "object") {
6768
+ return void 0;
6769
+ }
6770
+ const isIndex = /^\d+$/.test(part);
6771
+ if (isIndex && Array.isArray(current)) {
6772
+ current = current[Number.parseInt(part, 10)];
6773
+ } else {
6774
+ current = current[part];
6775
+ }
6776
+ }
6777
+ return current;
6778
+ }
6779
+ function toNumber(value) {
6780
+ if (typeof value === "number") {
6781
+ return value;
6782
+ }
6783
+ if (typeof value === "string") {
6784
+ const num = Number.parseFloat(value);
6785
+ return Number.isNaN(num) ? null : num;
6786
+ }
6787
+ return null;
6788
+ }
6789
+ function parseDate(dateStr, formats) {
6790
+ if (!dateStr) return null;
6791
+ const trimmed = dateStr.trim();
6792
+ const isoDate = new Date(trimmed);
6793
+ if (!Number.isNaN(isoDate.getTime())) {
6794
+ return isoDate;
6795
+ }
6796
+ const localizedMatch = trimmed.match(/^(\d{1,2})-([A-Za-z]{3,9})-(\d{4})$/);
6797
+ if (localizedMatch) {
6798
+ const day = Number.parseInt(localizedMatch[1], 10);
6799
+ const monthName = localizedMatch[2].toLowerCase();
6800
+ const year = Number.parseInt(localizedMatch[3], 10);
6801
+ const month = MONTH_NAMES[monthName];
6802
+ if (month !== void 0) {
6803
+ return new Date(year, month, day);
6804
+ }
6805
+ }
6806
+ const usMatch = trimmed.match(/^(\d{1,2})[\/\-](\d{1,2})[\/\-](\d{4})$/);
6807
+ if (usMatch) {
6808
+ const hasUSFormat = formats.some((f) => f.includes("MM/DD") || f.includes("MM-DD"));
6809
+ const hasEUFormat = formats.some((f) => f.includes("DD/MM") || f.includes("DD-MM"));
6810
+ if (hasUSFormat && !hasEUFormat) {
6811
+ const month = Number.parseInt(usMatch[1], 10) - 1;
6812
+ const day = Number.parseInt(usMatch[2], 10);
6813
+ const year = Number.parseInt(usMatch[3], 10);
6814
+ if (month >= 0 && month <= 11 && day >= 1 && day <= 31) {
6815
+ return new Date(year, month, day);
6816
+ }
6817
+ } else if (hasEUFormat && !hasUSFormat) {
6818
+ const day = Number.parseInt(usMatch[1], 10);
6819
+ const month = Number.parseInt(usMatch[2], 10) - 1;
6820
+ const year = Number.parseInt(usMatch[3], 10);
6821
+ if (month >= 0 && month <= 11 && day >= 1 && day <= 31) {
6822
+ return new Date(year, month, day);
6823
+ }
6824
+ } else {
6825
+ const num1 = Number.parseInt(usMatch[1], 10);
6826
+ const num2 = Number.parseInt(usMatch[2], 10);
6827
+ const year = Number.parseInt(usMatch[3], 10);
6828
+ if (num1 > 12 && num2 <= 12) {
6829
+ return new Date(year, num2 - 1, num1);
6830
+ }
6831
+ if (num2 > 12 && num1 <= 12) {
6832
+ return new Date(year, num1 - 1, num2);
6833
+ }
6834
+ if (num1 <= 12 && num2 <= 31) {
6835
+ return new Date(year, num1 - 1, num2);
6836
+ }
6837
+ }
6838
+ }
6839
+ return null;
6840
+ }
6841
+ function formatDateISO(date) {
6842
+ return date.toISOString().split("T")[0];
6843
+ }
6844
+ function parseJsonFromTextSafe(text) {
6845
+ return parseJsonFromText(text);
6846
+ }
6847
+
6848
+ // src/evaluation/evaluators/latency.ts
6603
6849
  var LatencyEvaluator = class {
6604
6850
  kind = "latency";
6605
6851
  config;
@@ -6635,54 +6881,14 @@ var LatencyEvaluator = class {
6635
6881
  reasoning: `Execution took ${durationMs}ms (threshold: ${threshold}ms)`,
6636
6882
  evaluatorRawRequest: {
6637
6883
  type: "latency",
6638
- threshold,
6639
- durationMs
6640
- }
6641
- };
6642
- }
6643
- };
6644
- var CostEvaluator = class {
6645
- kind = "cost";
6646
- config;
6647
- constructor(options) {
6648
- this.config = options.config;
6649
- }
6650
- evaluate(context) {
6651
- const { budget } = this.config;
6652
- const costUsd = context.traceSummary?.costUsd;
6653
- if (costUsd === void 0) {
6654
- return {
6655
- score: 0,
6656
- verdict: "fail",
6657
- hits: [],
6658
- misses: ["No cost data available in trace"],
6659
- expectedAspectCount: 1,
6660
- reasoning: "Execution cost not reported by provider",
6661
- evaluatorRawRequest: {
6662
- type: "cost",
6663
- budget,
6664
- costUsd: null
6665
- }
6666
- };
6667
- }
6668
- const passed = costUsd <= budget;
6669
- const score = passed ? 1 : 0;
6670
- const formatCost = (n) => `$${n.toFixed(4)}`;
6671
- return {
6672
- score,
6673
- verdict: passed ? "pass" : "fail",
6674
- hits: passed ? [`Cost ${formatCost(costUsd)} <= ${formatCost(budget)} budget`] : [],
6675
- misses: passed ? [] : [`Cost ${formatCost(costUsd)} > ${formatCost(budget)} budget`],
6676
- expectedAspectCount: 1,
6677
- reasoning: `Execution cost ${formatCost(costUsd)} (budget: ${formatCost(budget)})`,
6678
- evaluatorRawRequest: {
6679
- type: "cost",
6680
- budget,
6681
- costUsd
6884
+ threshold,
6885
+ durationMs
6682
6886
  }
6683
6887
  };
6684
6888
  }
6685
6889
  };
6890
+
6891
+ // src/evaluation/evaluators/token-usage.ts
6686
6892
  var TokenUsageEvaluator = class {
6687
6893
  kind = "token_usage";
6688
6894
  config;
@@ -6766,6 +6972,226 @@ var TokenUsageEvaluator = class {
6766
6972
  }
6767
6973
  };
6768
6974
 
6975
+ // src/evaluation/evaluators/tool-trajectory.ts
6976
+ function argsMatch(expected, actual) {
6977
+ if (expected === void 0) return true;
6978
+ if (expected === "any") return true;
6979
+ if (actual === void 0) return false;
6980
+ for (const key of Object.keys(expected)) {
6981
+ if (!Object.hasOwn(actual, key)) return false;
6982
+ if (!deepEqual(expected[key], actual[key])) return false;
6983
+ }
6984
+ return true;
6985
+ }
6986
+ var ToolTrajectoryEvaluator = class {
6987
+ kind = "tool_trajectory";
6988
+ config;
6989
+ constructor(options) {
6990
+ this.config = options.config;
6991
+ }
6992
+ evaluate(context) {
6993
+ const { outputMessages, traceSummary } = context;
6994
+ const toolCalls = this.extractToolCallsFromMessages(outputMessages);
6995
+ if (toolCalls.length === 0 && !traceSummary) {
6996
+ return {
6997
+ score: 0,
6998
+ verdict: "fail",
6999
+ hits: [],
7000
+ misses: ["No trace available for evaluation"],
7001
+ expectedAspectCount: 1
7002
+ };
7003
+ }
7004
+ const summary = toolCalls.length > 0 ? this.buildSummary(toolCalls) : traceSummary;
7005
+ if (!summary) {
7006
+ return {
7007
+ score: 0,
7008
+ verdict: "fail",
7009
+ hits: [],
7010
+ misses: ["No trace available for evaluation"],
7011
+ expectedAspectCount: 1
7012
+ };
7013
+ }
7014
+ switch (this.config.mode) {
7015
+ case "any_order":
7016
+ return this.evaluateAnyOrder(summary);
7017
+ case "in_order":
7018
+ return this.evaluateInOrder(toolCalls);
7019
+ case "exact":
7020
+ return this.evaluateExact(toolCalls);
7021
+ default:
7022
+ return {
7023
+ score: 0,
7024
+ verdict: "fail",
7025
+ hits: [],
7026
+ misses: [`Unknown mode: ${this.config.mode}`],
7027
+ expectedAspectCount: 1
7028
+ };
7029
+ }
7030
+ }
7031
+ /**
7032
+ * Extract tool calls from output messages.
7033
+ */
7034
+ extractToolCallsFromMessages(messages) {
7035
+ if (!messages) {
7036
+ return [];
7037
+ }
7038
+ const toolCalls = [];
7039
+ for (const message of messages) {
7040
+ if (message.toolCalls) {
7041
+ for (const call of message.toolCalls) {
7042
+ toolCalls.push({
7043
+ name: call.tool,
7044
+ args: call.input
7045
+ });
7046
+ }
7047
+ }
7048
+ }
7049
+ return toolCalls;
7050
+ }
7051
+ /**
7052
+ * Build a summary from extracted tool calls.
7053
+ */
7054
+ buildSummary(toolCalls) {
7055
+ const toolCallsByName = {};
7056
+ for (const call of toolCalls) {
7057
+ toolCallsByName[call.name] = (toolCallsByName[call.name] ?? 0) + 1;
7058
+ }
7059
+ const toolNames = Object.keys(toolCallsByName).sort();
7060
+ return {
7061
+ eventCount: toolCalls.length,
7062
+ toolNames,
7063
+ toolCallsByName,
7064
+ errorCount: 0
7065
+ };
7066
+ }
7067
+ evaluateAnyOrder(summary) {
7068
+ const minimums = this.config.minimums ?? {};
7069
+ const toolNames = Object.keys(minimums);
7070
+ if (toolNames.length === 0) {
7071
+ return {
7072
+ score: 1,
7073
+ verdict: "pass",
7074
+ hits: ["No tool requirements specified"],
7075
+ misses: [],
7076
+ expectedAspectCount: 0
7077
+ };
7078
+ }
7079
+ const hits = [];
7080
+ const misses = [];
7081
+ for (const toolName of toolNames) {
7082
+ const required = minimums[toolName];
7083
+ const actual = summary.toolCallsByName[toolName] ?? 0;
7084
+ if (actual >= required) {
7085
+ hits.push(`${toolName}: called ${actual} times (required \u2265${required})`);
7086
+ } else {
7087
+ misses.push(`${toolName}: called ${actual} times (required \u2265${required})`);
7088
+ }
7089
+ }
7090
+ const score = hits.length / toolNames.length;
7091
+ return {
7092
+ score,
7093
+ verdict: scoreToVerdict(score),
7094
+ hits,
7095
+ misses,
7096
+ expectedAspectCount: toolNames.length
7097
+ };
7098
+ }
7099
+ evaluateInOrder(toolCalls) {
7100
+ const expected = this.config.expected ?? [];
7101
+ if (expected.length === 0) {
7102
+ return {
7103
+ score: 1,
7104
+ verdict: "pass",
7105
+ hits: ["No tool sequence specified"],
7106
+ misses: [],
7107
+ expectedAspectCount: 0
7108
+ };
7109
+ }
7110
+ const hits = [];
7111
+ const misses = [];
7112
+ let actualIndex = 0;
7113
+ for (let i = 0; i < expected.length; i++) {
7114
+ const expectedItem = expected[i];
7115
+ const expectedTool = expectedItem.tool;
7116
+ let found = false;
7117
+ let argsMismatch = false;
7118
+ while (actualIndex < toolCalls.length) {
7119
+ const actualCall = toolCalls[actualIndex];
7120
+ if (actualCall.name === expectedTool) {
7121
+ if (argsMatch(expectedItem.args, actualCall.args)) {
7122
+ hits.push(`Found ${expectedTool} at position ${actualIndex}`);
7123
+ actualIndex++;
7124
+ found = true;
7125
+ break;
7126
+ }
7127
+ misses.push(
7128
+ `Expected ${expectedTool} at position ${i}: tool found at ${actualIndex} but args mismatch`
7129
+ );
7130
+ actualIndex++;
7131
+ argsMismatch = true;
7132
+ break;
7133
+ }
7134
+ actualIndex++;
7135
+ }
7136
+ if (!found && !argsMismatch) {
7137
+ misses.push(`Expected ${expectedTool} at position ${i}, not found in remaining trace`);
7138
+ }
7139
+ }
7140
+ const score = hits.length / expected.length;
7141
+ return {
7142
+ score,
7143
+ verdict: scoreToVerdict(score),
7144
+ hits,
7145
+ misses,
7146
+ expectedAspectCount: expected.length
7147
+ };
7148
+ }
7149
+ evaluateExact(toolCalls) {
7150
+ const expected = this.config.expected ?? [];
7151
+ if (expected.length === 0) {
7152
+ return {
7153
+ score: 1,
7154
+ verdict: "pass",
7155
+ hits: ["No tool sequence specified"],
7156
+ misses: [],
7157
+ expectedAspectCount: 0
7158
+ };
7159
+ }
7160
+ const hits = [];
7161
+ const misses = [];
7162
+ if (toolCalls.length !== expected.length) {
7163
+ misses.push(`Expected ${expected.length} tool calls, got ${toolCalls.length}`);
7164
+ }
7165
+ const checkLength = Math.min(expected.length, toolCalls.length);
7166
+ for (let i = 0; i < checkLength; i++) {
7167
+ const expectedItem = expected[i];
7168
+ const expectedTool = expectedItem.tool;
7169
+ const actualCall = toolCalls[i];
7170
+ const actualTool = actualCall.name;
7171
+ if (actualTool === expectedTool) {
7172
+ if (argsMatch(expectedItem.args, actualCall.args)) {
7173
+ hits.push(`Position ${i}: ${expectedTool}`);
7174
+ } else {
7175
+ misses.push(`Position ${i}: ${expectedTool} args mismatch`);
7176
+ }
7177
+ } else {
7178
+ misses.push(`Position ${i}: expected ${expectedTool}, got ${actualTool}`);
7179
+ }
7180
+ }
7181
+ for (let i = checkLength; i < expected.length; i++) {
7182
+ misses.push(`Position ${i}: expected ${expected[i].tool}, got nothing`);
7183
+ }
7184
+ const score = hits.length / expected.length;
7185
+ return {
7186
+ score,
7187
+ verdict: scoreToVerdict(score),
7188
+ hits,
7189
+ misses,
7190
+ expectedAspectCount: expected.length
7191
+ };
7192
+ }
7193
+ };
7194
+
6769
7195
  // src/evaluation/orchestrator.ts
6770
7196
  import { createHash } from "node:crypto";
6771
7197
  import path14 from "node:path";
@@ -6979,6 +7405,17 @@ async function runEvaluation(options) {
6979
7405
  }
6980
7406
  return getOrCreateProvider(resolvedJudge);
6981
7407
  };
7408
+ const targetResolver = (name) => {
7409
+ const resolved = resolveTargetByName(name);
7410
+ if (!resolved) {
7411
+ return void 0;
7412
+ }
7413
+ return getOrCreateProvider(resolved);
7414
+ };
7415
+ const availableTargets = [
7416
+ target.name,
7417
+ ...Array.from(targetDefinitions.keys())
7418
+ ];
6982
7419
  const evaluatorRegistry = buildEvaluatorRegistry(evaluators, resolveJudgeProvider);
6983
7420
  const primaryProvider = getOrCreateProvider(target);
6984
7421
  const providerSupportsBatch = target.providerBatching === true && primaryProvider.supportsBatch === true && typeof primaryProvider.invokeBatch === "function";
@@ -7008,7 +7445,9 @@ async function runEvaluation(options) {
7008
7445
  onResult,
7009
7446
  verbose,
7010
7447
  resolveJudgeProvider,
7011
- agentTimeoutMs
7448
+ agentTimeoutMs,
7449
+ targetResolver,
7450
+ availableTargets
7012
7451
  });
7013
7452
  } catch (error) {
7014
7453
  if (verbose) {
@@ -7047,7 +7486,9 @@ async function runEvaluation(options) {
7047
7486
  cache,
7048
7487
  useCache,
7049
7488
  now,
7050
- judgeProvider
7489
+ judgeProvider,
7490
+ targetResolver,
7491
+ availableTargets
7051
7492
  });
7052
7493
  if (onProgress) {
7053
7494
  await onProgress({
@@ -7114,7 +7555,9 @@ async function runBatchEvaluation(options) {
7114
7555
  onProgress,
7115
7556
  onResult,
7116
7557
  resolveJudgeProvider,
7117
- agentTimeoutMs
7558
+ agentTimeoutMs,
7559
+ targetResolver,
7560
+ availableTargets
7118
7561
  } = options;
7119
7562
  const promptInputsList = [];
7120
7563
  const formattingMode = usesFileReferencePrompt(provider) ? "agent" : "lm";
@@ -7189,7 +7632,9 @@ async function runBatchEvaluation(options) {
7189
7632
  judgeProvider: await resolveJudgeProvider(target),
7190
7633
  agentTimeoutMs,
7191
7634
  outputMessages,
7192
- traceSummary
7635
+ traceSummary,
7636
+ targetResolver,
7637
+ availableTargets
7193
7638
  });
7194
7639
  if (providerError) {
7195
7640
  result = { ...result, error: providerError };
@@ -7247,7 +7692,9 @@ async function runEvalCase(options) {
7247
7692
  cache,
7248
7693
  useCache,
7249
7694
  signal,
7250
- judgeProvider
7695
+ judgeProvider,
7696
+ targetResolver,
7697
+ availableTargets
7251
7698
  } = options;
7252
7699
  const formattingMode = usesFileReferencePrompt(provider) ? "agent" : "lm";
7253
7700
  const promptInputs = await buildPromptInputs(evalCase, formattingMode);
@@ -7321,7 +7768,9 @@ async function runEvalCase(options) {
7321
7768
  judgeProvider,
7322
7769
  agentTimeoutMs,
7323
7770
  outputMessages,
7324
- traceSummary
7771
+ traceSummary,
7772
+ targetResolver,
7773
+ availableTargets
7325
7774
  });
7326
7775
  return providerError ? { ...result, error: providerError } : result;
7327
7776
  } catch (error) {
@@ -7341,7 +7790,9 @@ async function evaluateCandidate(options) {
7341
7790
  judgeProvider,
7342
7791
  agentTimeoutMs,
7343
7792
  outputMessages,
7344
- traceSummary
7793
+ traceSummary,
7794
+ targetResolver,
7795
+ availableTargets
7345
7796
  } = options;
7346
7797
  const gradeTimestamp = nowFn();
7347
7798
  const { score, evaluatorResults } = await runEvaluatorsForCase({
@@ -7356,7 +7807,9 @@ async function evaluateCandidate(options) {
7356
7807
  judgeProvider,
7357
7808
  agentTimeoutMs,
7358
7809
  outputMessages,
7359
- traceSummary
7810
+ traceSummary,
7811
+ targetResolver,
7812
+ availableTargets
7360
7813
  });
7361
7814
  const completedAt = nowFn();
7362
7815
  let agentProviderRequest;
@@ -7409,7 +7862,9 @@ async function runEvaluatorsForCase(options) {
7409
7862
  judgeProvider,
7410
7863
  agentTimeoutMs,
7411
7864
  outputMessages,
7412
- traceSummary
7865
+ traceSummary,
7866
+ targetResolver,
7867
+ availableTargets
7413
7868
  } = options;
7414
7869
  if (evalCase.evaluators && evalCase.evaluators.length > 0) {
7415
7870
  return runEvaluatorList({
@@ -7425,7 +7880,9 @@ async function runEvaluatorsForCase(options) {
7425
7880
  judgeProvider,
7426
7881
  agentTimeoutMs,
7427
7882
  outputMessages,
7428
- traceSummary
7883
+ traceSummary,
7884
+ targetResolver,
7885
+ availableTargets
7429
7886
  });
7430
7887
  }
7431
7888
  const evaluatorKind = evalCase.evaluator ?? "llm_judge";
@@ -7443,7 +7900,9 @@ async function runEvaluatorsForCase(options) {
7443
7900
  now,
7444
7901
  judgeProvider,
7445
7902
  outputMessages,
7446
- traceSummary
7903
+ traceSummary,
7904
+ targetResolver,
7905
+ availableTargets
7447
7906
  });
7448
7907
  return { score };
7449
7908
  }
@@ -7461,7 +7920,9 @@ async function runEvaluatorList(options) {
7461
7920
  judgeProvider,
7462
7921
  agentTimeoutMs,
7463
7922
  outputMessages,
7464
- traceSummary
7923
+ traceSummary,
7924
+ targetResolver,
7925
+ availableTargets
7465
7926
  } = options;
7466
7927
  const scored = [];
7467
7928
  const evaluatorResults = [];
@@ -7499,7 +7960,8 @@ async function runEvaluatorList(options) {
7499
7960
  script: evaluator.script,
7500
7961
  cwd: evaluator.resolvedCwd ?? evaluator.cwd,
7501
7962
  agentTimeoutMs,
7502
- config: evaluator.config
7963
+ config: evaluator.config,
7964
+ target: evaluator.target
7503
7965
  });
7504
7966
  const score2 = await codeEvaluator.evaluate({
7505
7967
  evalCase,
@@ -7509,8 +7971,11 @@ async function runEvaluatorList(options) {
7509
7971
  attempt,
7510
7972
  promptInputs,
7511
7973
  now,
7974
+ judgeProvider,
7512
7975
  outputMessages,
7513
- traceSummary
7976
+ traceSummary,
7977
+ targetResolver,
7978
+ availableTargets
7514
7979
  });
7515
7980
  const weight = evaluator.weight ?? 1;
7516
7981
  scored.push({ score: score2, name: evaluator.name, type: "code_judge", weight });
@@ -7523,7 +7988,8 @@ async function runEvaluatorList(options) {
7523
7988
  hits: score2.hits,
7524
7989
  misses: score2.misses,
7525
7990
  reasoning: score2.reasoning,
7526
- evaluatorProviderRequest: score2.evaluatorRawRequest
7991
+ evaluatorProviderRequest: score2.evaluatorRawRequest,
7992
+ details: score2.details
7527
7993
  });
7528
7994
  }
7529
7995
  if (evaluator.type === "composite") {
@@ -7537,7 +8003,8 @@ async function runEvaluatorList(options) {
7537
8003
  script: memberConfig.script,
7538
8004
  cwd: memberConfig.resolvedCwd ?? memberConfig.cwd,
7539
8005
  agentTimeoutMs,
7540
- config: memberConfig.config
8006
+ config: memberConfig.config,
8007
+ target: memberConfig.target
7541
8008
  });
7542
8009
  case "composite":
7543
8010
  return new CompositeEvaluator({
@@ -7586,7 +8053,9 @@ async function runEvaluatorList(options) {
7586
8053
  now,
7587
8054
  judgeProvider,
7588
8055
  outputMessages,
7589
- traceSummary
8056
+ traceSummary,
8057
+ targetResolver,
8058
+ availableTargets
7590
8059
  });
7591
8060
  const weight = evaluator.weight ?? 1;
7592
8061
  scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
@@ -7782,11 +8251,11 @@ async function runEvaluatorList(options) {
7782
8251
  (total, entry) => total + (entry.score.expectedAspectCount ?? 0),
7783
8252
  0
7784
8253
  );
7785
- const reasoningParts = scored.map((entry) => entry.score.reasoning ? `${entry.name}: ${entry.score.reasoning}` : void 0).filter(isNonEmptyString2);
8254
+ const reasoningParts = scored.map((entry) => entry.score.reasoning ? `${entry.name}: ${entry.score.reasoning}` : void 0).filter(isNonEmptyString);
7786
8255
  const reasoning = reasoningParts.length > 0 ? reasoningParts.join(" | ") : void 0;
7787
8256
  const score = {
7788
8257
  score: aggregateScore,
7789
- verdict: scoreToVerdict2(aggregateScore),
8258
+ verdict: scoreToVerdict(aggregateScore),
7790
8259
  hits,
7791
8260
  misses,
7792
8261
  expectedAspectCount,
@@ -7833,18 +8302,6 @@ async function resolveCustomPrompt(config) {
7833
8302
  }
7834
8303
  return config.prompt;
7835
8304
  }
7836
- function isNonEmptyString2(value) {
7837
- return typeof value === "string" && value.trim().length > 0;
7838
- }
7839
- function scoreToVerdict2(score) {
7840
- if (score >= 0.8) {
7841
- return "pass";
7842
- }
7843
- if (score >= 0.6) {
7844
- return "borderline";
7845
- }
7846
- return "fail";
7847
- }
7848
8305
  function filterEvalCases(evalCases, evalId) {
7849
8306
  if (!evalId) {
7850
8307
  return evalCases;
@@ -7987,7 +8444,8 @@ function mapChildResults(children) {
7987
8444
  misses: child.misses,
7988
8445
  reasoning: child.reasoning,
7989
8446
  evaluatorProviderRequest: child.evaluatorRawRequest,
7990
- evaluatorResults: mapChildResults(child.evaluatorResults)
8447
+ evaluatorResults: mapChildResults(child.evaluatorResults),
8448
+ details: child.details
7991
8449
  }));
7992
8450
  }
7993
8451
  function computeWeightedMean(entries) {
@@ -8002,7 +8460,7 @@ function computeWeightedMean(entries) {
8002
8460
  }
8003
8461
 
8004
8462
  // src/evaluation/generators/rubric-generator.ts
8005
- import { generateText as generateText3 } from "ai";
8463
+ import { generateText as generateText4 } from "ai";
8006
8464
  import { z as z3 } from "zod";
8007
8465
  var rubricItemSchema = z3.object({
8008
8466
  id: z3.string().describe("Short identifier for this rubric (e.g., clarity, completeness)"),
@@ -8036,7 +8494,7 @@ You must return a valid JSON object matching this schema:
8036
8494
  let lastError;
8037
8495
  for (let attempt = 1; attempt <= 3; attempt++) {
8038
8496
  try {
8039
- const { text } = await generateText3({
8497
+ const { text } = await generateText4({
8040
8498
  model,
8041
8499
  system,
8042
8500
  prompt
@@ -8081,17 +8539,6 @@ function buildPrompt(expectedOutcome, question, referenceAnswer) {
8081
8539
  return parts.join("\n");
8082
8540
  }
8083
8541
 
8084
- // src/evaluation/code-judge-sdk.ts
8085
- import { readFileSync } from "node:fs";
8086
- function parseCodeJudgePayload(payload) {
8087
- const parsed = JSON.parse(payload);
8088
- return toCamelCaseDeep(parsed);
8089
- }
8090
- function readCodeJudgePayload() {
8091
- const stdin = readFileSync(0, "utf8");
8092
- return parseCodeJudgePayload(stdin);
8093
- }
8094
-
8095
8542
  // src/index.ts
8096
8543
  function createAgentKernel() {
8097
8544
  return { status: "stub" };
@@ -8109,33 +8556,39 @@ export {
8109
8556
  ToolTrajectoryEvaluator,
8110
8557
  avgToolDurationMs,
8111
8558
  buildDirectoryChain,
8559
+ buildOutputSchema,
8112
8560
  buildPromptInputs,
8113
8561
  buildSearchRoots,
8562
+ clampScore,
8114
8563
  computeTraceSummary,
8115
8564
  consumeClaudeCodeLogEntries,
8116
8565
  consumeCodexLogEntries,
8117
8566
  consumePiLogEntries,
8118
8567
  createAgentKernel,
8119
8568
  createProvider,
8569
+ deepEqual,
8120
8570
  ensureVSCodeSubagents,
8571
+ executeScript,
8121
8572
  explorationRatio,
8122
- extractCodeBlocks,
8573
+ extractJsonBlob,
8123
8574
  fileExists,
8124
8575
  findGitRoot,
8576
+ freeformEvaluationSchema,
8125
8577
  generateRubrics,
8126
8578
  getHitCount,
8127
8579
  isEvaluatorKind,
8128
8580
  isGuidelineFile,
8129
8581
  isJsonObject,
8130
8582
  isJsonValue,
8583
+ isNonEmptyString,
8131
8584
  isTestMessage,
8132
8585
  isTestMessageRole,
8133
8586
  listTargetNames,
8134
8587
  loadEvalCases,
8135
8588
  mergeExecutionMetrics,
8136
8589
  normalizeLineEndings,
8137
- parseCodeJudgePayload,
8138
- readCodeJudgePayload,
8590
+ parseJsonFromText,
8591
+ parseJsonSafe,
8139
8592
  readJsonFile,
8140
8593
  readTargetDefinitions,
8141
8594
  readTestSuiteMetadata,
@@ -8145,6 +8598,7 @@ export {
8145
8598
  resolveTargetDefinition,
8146
8599
  runEvalCase,
8147
8600
  runEvaluation,
8601
+ scoreToVerdict,
8148
8602
  subscribeToClaudeCodeLogEntries,
8149
8603
  subscribeToCodexLogEntries,
8150
8604
  subscribeToPiLogEntries,