@agentv/core 0.5.3 → 0.6.1

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -36,6 +36,7 @@ __export(index_exports, {
36
36
  buildDirectoryChain: () => buildDirectoryChain,
37
37
  buildPromptInputs: () => buildPromptInputs,
38
38
  buildSearchRoots: () => buildSearchRoots,
39
+ consumeCodexLogEntries: () => consumeCodexLogEntries,
39
40
  createAgentKernel: () => createAgentKernel,
40
41
  createProvider: () => createProvider,
41
42
  ensureVSCodeSubagents: () => ensureVSCodeSubagents,
@@ -52,11 +53,13 @@ __export(index_exports, {
52
53
  listTargetNames: () => listTargetNames,
53
54
  loadEvalCases: () => loadEvalCases,
54
55
  readTargetDefinitions: () => readTargetDefinitions,
56
+ readTextFile: () => readTextFile,
55
57
  resolveAndCreateProvider: () => resolveAndCreateProvider,
56
58
  resolveFileReference: () => resolveFileReference,
57
59
  resolveTargetDefinition: () => resolveTargetDefinition,
58
60
  runEvalCase: () => runEvalCase,
59
- runEvaluation: () => runEvaluation
61
+ runEvaluation: () => runEvaluation,
62
+ subscribeToCodexLogEntries: () => subscribeToCodexLogEntries
60
63
  });
61
64
  module.exports = __toCommonJS(index_exports);
62
65
 
@@ -130,6 +133,10 @@ async function fileExists(filePath) {
130
133
  return false;
131
134
  }
132
135
  }
136
+ async function readTextFile(filePath) {
137
+ const content = await (0, import_promises.readFile)(filePath, "utf8");
138
+ return content.replace(/\r\n/g, "\n");
139
+ }
133
140
  async function findGitRoot(startPath) {
134
141
  let currentDir = import_node_path.default.dirname(import_node_path.default.resolve(startPath));
135
142
  const root = import_node_path.default.parse(currentDir).root;
@@ -308,6 +315,9 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
308
315
  throw new Error(`Invalid test file format: ${evalFilePath}`);
309
316
  }
310
317
  const suite = parsed;
318
+ const datasetNameFromSuite = asString(suite.dataset)?.trim();
319
+ const fallbackDataset = import_node_path2.default.basename(absoluteTestPath).replace(/\.ya?ml$/i, "") || "eval";
320
+ const datasetName = datasetNameFromSuite && datasetNameFromSuite.length > 0 ? datasetNameFromSuite : fallbackDataset;
311
321
  const schema = suite.$schema;
312
322
  if (schema !== SCHEMA_EVAL_V2) {
313
323
  const message = typeof schema === "string" ? `Invalid $schema value '${schema}' in ${evalFilePath}. Expected '${SCHEMA_EVAL_V2}'` : `Missing required field '$schema' in ${evalFilePath}.
@@ -455,6 +465,7 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
455
465
  ];
456
466
  const testCase = {
457
467
  id,
468
+ dataset: datasetName,
458
469
  conversation_id: conversationId,
459
470
  task: userTextPrompt,
460
471
  user_segments: userSegments,
@@ -835,6 +846,9 @@ var AzureProvider = class {
835
846
  );
836
847
  return mapResponse(ensureChatResponse(response));
837
848
  }
849
+ getAxAI() {
850
+ return this.ai;
851
+ }
838
852
  };
839
853
  var AnthropicProvider = class {
840
854
  constructor(targetName, config) {
@@ -869,6 +883,9 @@ var AnthropicProvider = class {
869
883
  );
870
884
  return mapResponse(ensureChatResponse(response));
871
885
  }
886
+ getAxAI() {
887
+ return this.ai;
888
+ }
872
889
  };
873
890
  var GeminiProvider = class {
874
891
  constructor(targetName, config) {
@@ -902,6 +919,9 @@ var GeminiProvider = class {
902
919
  );
903
920
  return mapResponse(ensureChatResponse(response));
904
921
  }
922
+ getAxAI() {
923
+ return this.ai;
924
+ }
905
925
  };
906
926
 
907
927
  // src/evaluation/providers/cli.ts
@@ -1222,6 +1242,59 @@ function pathToFileUri(filePath) {
1222
1242
  return `file://${normalizedPath}`;
1223
1243
  }
1224
1244
 
1245
+ // src/evaluation/providers/codex-log-tracker.ts
1246
+ var GLOBAL_LOGS_KEY = Symbol.for("agentv.codexLogs");
1247
+ var GLOBAL_SUBSCRIBERS_KEY = Symbol.for("agentv.codexLogSubscribers");
1248
+ function getCodexLogStore() {
1249
+ const globalObject = globalThis;
1250
+ const existing = globalObject[GLOBAL_LOGS_KEY];
1251
+ if (existing) {
1252
+ return existing;
1253
+ }
1254
+ const created = [];
1255
+ globalObject[GLOBAL_LOGS_KEY] = created;
1256
+ return created;
1257
+ }
1258
+ function getSubscriberStore() {
1259
+ const globalObject = globalThis;
1260
+ const existing = globalObject[GLOBAL_SUBSCRIBERS_KEY];
1261
+ if (existing) {
1262
+ return existing;
1263
+ }
1264
+ const created = /* @__PURE__ */ new Set();
1265
+ globalObject[GLOBAL_SUBSCRIBERS_KEY] = created;
1266
+ return created;
1267
+ }
1268
+ function notifySubscribers(entry) {
1269
+ const subscribers = Array.from(getSubscriberStore());
1270
+ for (const listener of subscribers) {
1271
+ try {
1272
+ listener(entry);
1273
+ } catch (error) {
1274
+ const message = error instanceof Error ? error.message : String(error);
1275
+ console.warn(`Codex log subscriber failed: ${message}`);
1276
+ }
1277
+ }
1278
+ }
1279
+ function recordCodexLogEntry(entry) {
1280
+ getCodexLogStore().push(entry);
1281
+ notifySubscribers(entry);
1282
+ }
1283
+ function consumeCodexLogEntries() {
1284
+ const store = getCodexLogStore();
1285
+ if (store.length === 0) {
1286
+ return [];
1287
+ }
1288
+ return store.splice(0, store.length);
1289
+ }
1290
+ function subscribeToCodexLogEntries(listener) {
1291
+ const store = getSubscriberStore();
1292
+ store.add(listener);
1293
+ return () => {
1294
+ store.delete(listener);
1295
+ };
1296
+ }
1297
+
1225
1298
  // src/evaluation/providers/codex.ts
1226
1299
  var execAsync2 = (0, import_node_util2.promisify)(import_node_child_process2.exec);
1227
1300
  var WORKSPACE_PREFIX = "agentv-codex-";
@@ -1418,7 +1491,12 @@ var CodexProvider = class {
1418
1491
  attempt: request.attempt,
1419
1492
  format: this.config.logFormat ?? "summary"
1420
1493
  });
1421
- console.log(`Streaming Codex CLI output to ${filePath}`);
1494
+ recordCodexLogEntry({
1495
+ filePath,
1496
+ targetName: this.targetName,
1497
+ evalCaseId: request.evalCaseId,
1498
+ attempt: request.attempt
1499
+ });
1422
1500
  return logger;
1423
1501
  } catch (error) {
1424
1502
  const message = error instanceof Error ? error.message : String(error);
@@ -2808,7 +2886,30 @@ function resolveAndCreateProvider(definition, env = process.env) {
2808
2886
  }
2809
2887
 
2810
2888
  // src/evaluation/evaluators.ts
2889
+ var import_ax3 = require("@ax-llm/ax");
2811
2890
  var import_node_crypto2 = require("crypto");
2891
+ var LLM_JUDGE_SIGNATURE = (0, import_ax3.f)().input(
2892
+ "evaluationContext",
2893
+ import_ax3.f.object(
2894
+ {
2895
+ expectedOutcome: import_ax3.f.string("The expected outcome for the original task"),
2896
+ request: import_ax3.f.string("The original task request"),
2897
+ referenceAnswer: import_ax3.f.string("The gold standard reference answer"),
2898
+ generatedAnswer: import_ax3.f.string("The answer to evaluate"),
2899
+ guidelines: import_ax3.f.string("Additional evaluation guidelines or instructions").optional()
2900
+ },
2901
+ "Complete evaluation context for the judge"
2902
+ )
2903
+ ).output(
2904
+ "evaluation",
2905
+ import_ax3.f.object({
2906
+ score: import_ax3.f.number("Score between 0.0 and 1.0").min(0).max(1),
2907
+ hits: import_ax3.f.string("Brief specific achievement").array(),
2908
+ misses: import_ax3.f.string("Brief specific failure or omission").array(),
2909
+ reasoning: import_ax3.f.string("Concise explanation for the score").max(500)
2910
+ })
2911
+ ).build();
2912
+ var LLM_JUDGE = (0, import_ax3.ax)(LLM_JUDGE_SIGNATURE);
2812
2913
  var LlmJudgeEvaluator = class {
2813
2914
  kind = "llm_judge";
2814
2915
  resolveJudgeProvider;
@@ -2826,6 +2927,44 @@ var LlmJudgeEvaluator = class {
2826
2927
  if (!judgeProvider) {
2827
2928
  throw new Error("No judge provider available for LLM grading");
2828
2929
  }
2930
+ if (providerSupportsAx(judgeProvider)) {
2931
+ return this.evaluateWithAx(context, judgeProvider);
2932
+ }
2933
+ return this.evaluateWithPrompt(context, judgeProvider);
2934
+ }
2935
+ async evaluateWithAx(context, judgeProvider) {
2936
+ const ai = judgeProvider.getAxAI();
2937
+ const guidelines = context.promptInputs.guidelines?.trim();
2938
+ const evaluationContext = {
2939
+ expectedOutcome: context.evalCase.outcome.trim(),
2940
+ request: context.evalCase.task.trim(),
2941
+ referenceAnswer: context.evalCase.expected_assistant_raw.trim(),
2942
+ generatedAnswer: context.candidate.trim(),
2943
+ ...guidelines ? { guidelines } : {}
2944
+ };
2945
+ const options = this.buildJudgeForwardOptions(context);
2946
+ const result = await LLM_JUDGE.forward(ai, { evaluationContext }, options);
2947
+ const evaluation = result.evaluation;
2948
+ const expectedAspectCount = Math.max(
2949
+ evaluation.hits.length + evaluation.misses.length,
2950
+ 1
2951
+ );
2952
+ return {
2953
+ score: evaluation.score,
2954
+ hits: evaluation.hits,
2955
+ misses: evaluation.misses,
2956
+ expectedAspectCount,
2957
+ reasoning: evaluation.reasoning,
2958
+ evaluatorRawRequest: {
2959
+ id: (0, import_node_crypto2.randomUUID)(),
2960
+ provider: judgeProvider.id,
2961
+ target: context.target.name,
2962
+ method: "ax-structured-output",
2963
+ signature: LLM_JUDGE_SIGNATURE.toString()
2964
+ }
2965
+ };
2966
+ }
2967
+ async evaluateWithPrompt(context, judgeProvider) {
2829
2968
  const prompt = buildQualityPrompt(context.evalCase, context.candidate);
2830
2969
  const systemPrompt = context.systemPrompt ?? this.customPrompt ?? QUALITY_SYSTEM_PROMPT;
2831
2970
  const metadata = {
@@ -2845,6 +2984,7 @@ var LlmJudgeEvaluator = class {
2845
2984
  const hits = Array.isArray(parsed.hits) ? parsed.hits.filter(isNonEmptyString).slice(0, 4) : [];
2846
2985
  const misses = Array.isArray(parsed.misses) ? parsed.misses.filter(isNonEmptyString).slice(0, 4) : [];
2847
2986
  const reasoning = parsed.reasoning ?? response.reasoning;
2987
+ const expectedAspectCount = Math.max(hits.length + misses.length, 1);
2848
2988
  const evaluatorRawRequest = {
2849
2989
  id: (0, import_node_crypto2.randomUUID)(),
2850
2990
  provider: judgeProvider.id,
@@ -2857,12 +2997,34 @@ var LlmJudgeEvaluator = class {
2857
2997
  score,
2858
2998
  hits,
2859
2999
  misses,
2860
- expectedAspectCount: hits.length + misses.length || 1,
3000
+ expectedAspectCount,
2861
3001
  reasoning,
2862
3002
  evaluatorRawRequest
2863
3003
  };
2864
3004
  }
3005
+ buildJudgeForwardOptions(context) {
3006
+ const modelConfig = this.buildJudgeModelConfig();
3007
+ if (modelConfig === void 0 && context.judgeModel === void 0) {
3008
+ return void 0;
3009
+ }
3010
+ return {
3011
+ ...context.judgeModel ? { model: context.judgeModel } : {},
3012
+ ...modelConfig ? { modelConfig } : {}
3013
+ };
3014
+ }
3015
+ buildJudgeModelConfig() {
3016
+ if (this.maxOutputTokens === void 0 && this.temperature === void 0) {
3017
+ return void 0;
3018
+ }
3019
+ return {
3020
+ ...this.maxOutputTokens !== void 0 ? { maxTokens: this.maxOutputTokens } : {},
3021
+ ...this.temperature !== void 0 ? { temperature: this.temperature } : {}
3022
+ };
3023
+ }
2865
3024
  };
3025
+ function providerSupportsAx(provider) {
3026
+ return typeof provider.getAxAI === "function";
3027
+ }
2866
3028
  var QUALITY_SYSTEM_PROMPT = [
2867
3029
  "You are an expert evaluator. Your goal is to grade the generated_answer based on how well it achieves the expected_outcome for the original task.",
2868
3030
  "",
@@ -3633,6 +3795,7 @@ async function evaluateCandidate(options) {
3633
3795
  };
3634
3796
  return {
3635
3797
  eval_id: evalCase.id,
3798
+ dataset: evalCase.dataset,
3636
3799
  conversation_id: evalCase.conversation_id,
3637
3800
  score: score.score,
3638
3801
  hits: score.hits,
@@ -3809,7 +3972,7 @@ async function runLlmJudgeEvaluator(options) {
3809
3972
  async function resolveCustomPrompt(config) {
3810
3973
  if (config.promptPath) {
3811
3974
  try {
3812
- return await (0, import_promises6.readFile)(config.promptPath, "utf8");
3975
+ return await readTextFile(config.promptPath);
3813
3976
  } catch (error) {
3814
3977
  const message = error instanceof Error ? error.message : String(error);
3815
3978
  console.warn(`Could not read custom prompt at ${config.promptPath}: ${message}`);
@@ -3897,6 +4060,7 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs)
3897
4060
  };
3898
4061
  return {
3899
4062
  eval_id: evalCase.id,
4063
+ dataset: evalCase.dataset,
3900
4064
  conversation_id: evalCase.conversation_id,
3901
4065
  score: 0,
3902
4066
  hits: [],
@@ -3947,6 +4111,7 @@ function createAgentKernel() {
3947
4111
  buildDirectoryChain,
3948
4112
  buildPromptInputs,
3949
4113
  buildSearchRoots,
4114
+ consumeCodexLogEntries,
3950
4115
  createAgentKernel,
3951
4116
  createProvider,
3952
4117
  ensureVSCodeSubagents,
@@ -3963,10 +4128,12 @@ function createAgentKernel() {
3963
4128
  listTargetNames,
3964
4129
  loadEvalCases,
3965
4130
  readTargetDefinitions,
4131
+ readTextFile,
3966
4132
  resolveAndCreateProvider,
3967
4133
  resolveFileReference,
3968
4134
  resolveTargetDefinition,
3969
4135
  runEvalCase,
3970
- runEvaluation
4136
+ runEvaluation,
4137
+ subscribeToCodexLogEntries
3971
4138
  });
3972
4139
  //# sourceMappingURL=index.cjs.map