@agentv/core 0.5.3 → 0.6.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-NL7K4CAK.js → chunk-OW3SHBIJ.js} +7 -2
- package/dist/chunk-OW3SHBIJ.js.map +1 -0
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +1 -1
- package/dist/index.cjs +172 -5
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +28 -2
- package/dist/index.d.ts +28 -2
- package/dist/index.js +167 -6
- package/dist/index.js.map +1 -1
- package/package.json +2 -2
- package/dist/chunk-NL7K4CAK.js.map +0 -1
package/dist/index.cjs
CHANGED
|
@@ -36,6 +36,7 @@ __export(index_exports, {
|
|
|
36
36
|
buildDirectoryChain: () => buildDirectoryChain,
|
|
37
37
|
buildPromptInputs: () => buildPromptInputs,
|
|
38
38
|
buildSearchRoots: () => buildSearchRoots,
|
|
39
|
+
consumeCodexLogEntries: () => consumeCodexLogEntries,
|
|
39
40
|
createAgentKernel: () => createAgentKernel,
|
|
40
41
|
createProvider: () => createProvider,
|
|
41
42
|
ensureVSCodeSubagents: () => ensureVSCodeSubagents,
|
|
@@ -52,11 +53,13 @@ __export(index_exports, {
|
|
|
52
53
|
listTargetNames: () => listTargetNames,
|
|
53
54
|
loadEvalCases: () => loadEvalCases,
|
|
54
55
|
readTargetDefinitions: () => readTargetDefinitions,
|
|
56
|
+
readTextFile: () => readTextFile,
|
|
55
57
|
resolveAndCreateProvider: () => resolveAndCreateProvider,
|
|
56
58
|
resolveFileReference: () => resolveFileReference,
|
|
57
59
|
resolveTargetDefinition: () => resolveTargetDefinition,
|
|
58
60
|
runEvalCase: () => runEvalCase,
|
|
59
|
-
runEvaluation: () => runEvaluation
|
|
61
|
+
runEvaluation: () => runEvaluation,
|
|
62
|
+
subscribeToCodexLogEntries: () => subscribeToCodexLogEntries
|
|
60
63
|
});
|
|
61
64
|
module.exports = __toCommonJS(index_exports);
|
|
62
65
|
|
|
@@ -130,6 +133,10 @@ async function fileExists(filePath) {
|
|
|
130
133
|
return false;
|
|
131
134
|
}
|
|
132
135
|
}
|
|
136
|
+
async function readTextFile(filePath) {
|
|
137
|
+
const content = await (0, import_promises.readFile)(filePath, "utf8");
|
|
138
|
+
return content.replace(/\r\n/g, "\n");
|
|
139
|
+
}
|
|
133
140
|
async function findGitRoot(startPath) {
|
|
134
141
|
let currentDir = import_node_path.default.dirname(import_node_path.default.resolve(startPath));
|
|
135
142
|
const root = import_node_path.default.parse(currentDir).root;
|
|
@@ -308,6 +315,9 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
|
308
315
|
throw new Error(`Invalid test file format: ${evalFilePath}`);
|
|
309
316
|
}
|
|
310
317
|
const suite = parsed;
|
|
318
|
+
const datasetNameFromSuite = asString(suite.dataset)?.trim();
|
|
319
|
+
const fallbackDataset = import_node_path2.default.basename(absoluteTestPath).replace(/\.ya?ml$/i, "") || "eval";
|
|
320
|
+
const datasetName = datasetNameFromSuite && datasetNameFromSuite.length > 0 ? datasetNameFromSuite : fallbackDataset;
|
|
311
321
|
const schema = suite.$schema;
|
|
312
322
|
if (schema !== SCHEMA_EVAL_V2) {
|
|
313
323
|
const message = typeof schema === "string" ? `Invalid $schema value '${schema}' in ${evalFilePath}. Expected '${SCHEMA_EVAL_V2}'` : `Missing required field '$schema' in ${evalFilePath}.
|
|
@@ -455,6 +465,7 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
455
465
|
];
|
|
456
466
|
const testCase = {
|
|
457
467
|
id,
|
|
468
|
+
dataset: datasetName,
|
|
458
469
|
conversation_id: conversationId,
|
|
459
470
|
task: userTextPrompt,
|
|
460
471
|
user_segments: userSegments,
|
|
@@ -835,6 +846,9 @@ var AzureProvider = class {
|
|
|
835
846
|
);
|
|
836
847
|
return mapResponse(ensureChatResponse(response));
|
|
837
848
|
}
|
|
849
|
+
getAxAI() {
|
|
850
|
+
return this.ai;
|
|
851
|
+
}
|
|
838
852
|
};
|
|
839
853
|
var AnthropicProvider = class {
|
|
840
854
|
constructor(targetName, config) {
|
|
@@ -869,6 +883,9 @@ var AnthropicProvider = class {
|
|
|
869
883
|
);
|
|
870
884
|
return mapResponse(ensureChatResponse(response));
|
|
871
885
|
}
|
|
886
|
+
getAxAI() {
|
|
887
|
+
return this.ai;
|
|
888
|
+
}
|
|
872
889
|
};
|
|
873
890
|
var GeminiProvider = class {
|
|
874
891
|
constructor(targetName, config) {
|
|
@@ -902,6 +919,9 @@ var GeminiProvider = class {
|
|
|
902
919
|
);
|
|
903
920
|
return mapResponse(ensureChatResponse(response));
|
|
904
921
|
}
|
|
922
|
+
getAxAI() {
|
|
923
|
+
return this.ai;
|
|
924
|
+
}
|
|
905
925
|
};
|
|
906
926
|
|
|
907
927
|
// src/evaluation/providers/cli.ts
|
|
@@ -1222,6 +1242,59 @@ function pathToFileUri(filePath) {
|
|
|
1222
1242
|
return `file://${normalizedPath}`;
|
|
1223
1243
|
}
|
|
1224
1244
|
|
|
1245
|
+
// src/evaluation/providers/codex-log-tracker.ts
|
|
1246
|
+
var GLOBAL_LOGS_KEY = Symbol.for("agentv.codexLogs");
|
|
1247
|
+
var GLOBAL_SUBSCRIBERS_KEY = Symbol.for("agentv.codexLogSubscribers");
|
|
1248
|
+
function getCodexLogStore() {
|
|
1249
|
+
const globalObject = globalThis;
|
|
1250
|
+
const existing = globalObject[GLOBAL_LOGS_KEY];
|
|
1251
|
+
if (existing) {
|
|
1252
|
+
return existing;
|
|
1253
|
+
}
|
|
1254
|
+
const created = [];
|
|
1255
|
+
globalObject[GLOBAL_LOGS_KEY] = created;
|
|
1256
|
+
return created;
|
|
1257
|
+
}
|
|
1258
|
+
function getSubscriberStore() {
|
|
1259
|
+
const globalObject = globalThis;
|
|
1260
|
+
const existing = globalObject[GLOBAL_SUBSCRIBERS_KEY];
|
|
1261
|
+
if (existing) {
|
|
1262
|
+
return existing;
|
|
1263
|
+
}
|
|
1264
|
+
const created = /* @__PURE__ */ new Set();
|
|
1265
|
+
globalObject[GLOBAL_SUBSCRIBERS_KEY] = created;
|
|
1266
|
+
return created;
|
|
1267
|
+
}
|
|
1268
|
+
function notifySubscribers(entry) {
|
|
1269
|
+
const subscribers = Array.from(getSubscriberStore());
|
|
1270
|
+
for (const listener of subscribers) {
|
|
1271
|
+
try {
|
|
1272
|
+
listener(entry);
|
|
1273
|
+
} catch (error) {
|
|
1274
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
1275
|
+
console.warn(`Codex log subscriber failed: ${message}`);
|
|
1276
|
+
}
|
|
1277
|
+
}
|
|
1278
|
+
}
|
|
1279
|
+
function recordCodexLogEntry(entry) {
|
|
1280
|
+
getCodexLogStore().push(entry);
|
|
1281
|
+
notifySubscribers(entry);
|
|
1282
|
+
}
|
|
1283
|
+
function consumeCodexLogEntries() {
|
|
1284
|
+
const store = getCodexLogStore();
|
|
1285
|
+
if (store.length === 0) {
|
|
1286
|
+
return [];
|
|
1287
|
+
}
|
|
1288
|
+
return store.splice(0, store.length);
|
|
1289
|
+
}
|
|
1290
|
+
function subscribeToCodexLogEntries(listener) {
|
|
1291
|
+
const store = getSubscriberStore();
|
|
1292
|
+
store.add(listener);
|
|
1293
|
+
return () => {
|
|
1294
|
+
store.delete(listener);
|
|
1295
|
+
};
|
|
1296
|
+
}
|
|
1297
|
+
|
|
1225
1298
|
// src/evaluation/providers/codex.ts
|
|
1226
1299
|
var execAsync2 = (0, import_node_util2.promisify)(import_node_child_process2.exec);
|
|
1227
1300
|
var WORKSPACE_PREFIX = "agentv-codex-";
|
|
@@ -1418,7 +1491,12 @@ var CodexProvider = class {
|
|
|
1418
1491
|
attempt: request.attempt,
|
|
1419
1492
|
format: this.config.logFormat ?? "summary"
|
|
1420
1493
|
});
|
|
1421
|
-
|
|
1494
|
+
recordCodexLogEntry({
|
|
1495
|
+
filePath,
|
|
1496
|
+
targetName: this.targetName,
|
|
1497
|
+
evalCaseId: request.evalCaseId,
|
|
1498
|
+
attempt: request.attempt
|
|
1499
|
+
});
|
|
1422
1500
|
return logger;
|
|
1423
1501
|
} catch (error) {
|
|
1424
1502
|
const message = error instanceof Error ? error.message : String(error);
|
|
@@ -2808,7 +2886,30 @@ function resolveAndCreateProvider(definition, env = process.env) {
|
|
|
2808
2886
|
}
|
|
2809
2887
|
|
|
2810
2888
|
// src/evaluation/evaluators.ts
|
|
2889
|
+
var import_ax3 = require("@ax-llm/ax");
|
|
2811
2890
|
var import_node_crypto2 = require("crypto");
|
|
2891
|
+
var LLM_JUDGE_SIGNATURE = (0, import_ax3.f)().input(
|
|
2892
|
+
"evaluationContext",
|
|
2893
|
+
import_ax3.f.object(
|
|
2894
|
+
{
|
|
2895
|
+
expectedOutcome: import_ax3.f.string("The expected outcome for the original task"),
|
|
2896
|
+
request: import_ax3.f.string("The original task request"),
|
|
2897
|
+
referenceAnswer: import_ax3.f.string("The gold standard reference answer"),
|
|
2898
|
+
generatedAnswer: import_ax3.f.string("The answer to evaluate"),
|
|
2899
|
+
guidelines: import_ax3.f.string("Additional evaluation guidelines or instructions").optional()
|
|
2900
|
+
},
|
|
2901
|
+
"Complete evaluation context for the judge"
|
|
2902
|
+
)
|
|
2903
|
+
).output(
|
|
2904
|
+
"evaluation",
|
|
2905
|
+
import_ax3.f.object({
|
|
2906
|
+
score: import_ax3.f.number("Score between 0.0 and 1.0").min(0).max(1),
|
|
2907
|
+
hits: import_ax3.f.string("Brief specific achievement").array(),
|
|
2908
|
+
misses: import_ax3.f.string("Brief specific failure or omission").array(),
|
|
2909
|
+
reasoning: import_ax3.f.string("Concise explanation for the score").max(500)
|
|
2910
|
+
})
|
|
2911
|
+
).build();
|
|
2912
|
+
var LLM_JUDGE = (0, import_ax3.ax)(LLM_JUDGE_SIGNATURE);
|
|
2812
2913
|
var LlmJudgeEvaluator = class {
|
|
2813
2914
|
kind = "llm_judge";
|
|
2814
2915
|
resolveJudgeProvider;
|
|
@@ -2826,6 +2927,44 @@ var LlmJudgeEvaluator = class {
|
|
|
2826
2927
|
if (!judgeProvider) {
|
|
2827
2928
|
throw new Error("No judge provider available for LLM grading");
|
|
2828
2929
|
}
|
|
2930
|
+
if (providerSupportsAx(judgeProvider)) {
|
|
2931
|
+
return this.evaluateWithAx(context, judgeProvider);
|
|
2932
|
+
}
|
|
2933
|
+
return this.evaluateWithPrompt(context, judgeProvider);
|
|
2934
|
+
}
|
|
2935
|
+
async evaluateWithAx(context, judgeProvider) {
|
|
2936
|
+
const ai = judgeProvider.getAxAI();
|
|
2937
|
+
const guidelines = context.promptInputs.guidelines?.trim();
|
|
2938
|
+
const evaluationContext = {
|
|
2939
|
+
expectedOutcome: context.evalCase.outcome.trim(),
|
|
2940
|
+
request: context.evalCase.task.trim(),
|
|
2941
|
+
referenceAnswer: context.evalCase.expected_assistant_raw.trim(),
|
|
2942
|
+
generatedAnswer: context.candidate.trim(),
|
|
2943
|
+
...guidelines ? { guidelines } : {}
|
|
2944
|
+
};
|
|
2945
|
+
const options = this.buildJudgeForwardOptions(context);
|
|
2946
|
+
const result = await LLM_JUDGE.forward(ai, { evaluationContext }, options);
|
|
2947
|
+
const evaluation = result.evaluation;
|
|
2948
|
+
const expectedAspectCount = Math.max(
|
|
2949
|
+
evaluation.hits.length + evaluation.misses.length,
|
|
2950
|
+
1
|
|
2951
|
+
);
|
|
2952
|
+
return {
|
|
2953
|
+
score: evaluation.score,
|
|
2954
|
+
hits: evaluation.hits,
|
|
2955
|
+
misses: evaluation.misses,
|
|
2956
|
+
expectedAspectCount,
|
|
2957
|
+
reasoning: evaluation.reasoning,
|
|
2958
|
+
evaluatorRawRequest: {
|
|
2959
|
+
id: (0, import_node_crypto2.randomUUID)(),
|
|
2960
|
+
provider: judgeProvider.id,
|
|
2961
|
+
target: context.target.name,
|
|
2962
|
+
method: "ax-structured-output",
|
|
2963
|
+
signature: LLM_JUDGE_SIGNATURE.toString()
|
|
2964
|
+
}
|
|
2965
|
+
};
|
|
2966
|
+
}
|
|
2967
|
+
async evaluateWithPrompt(context, judgeProvider) {
|
|
2829
2968
|
const prompt = buildQualityPrompt(context.evalCase, context.candidate);
|
|
2830
2969
|
const systemPrompt = context.systemPrompt ?? this.customPrompt ?? QUALITY_SYSTEM_PROMPT;
|
|
2831
2970
|
const metadata = {
|
|
@@ -2845,6 +2984,7 @@ var LlmJudgeEvaluator = class {
|
|
|
2845
2984
|
const hits = Array.isArray(parsed.hits) ? parsed.hits.filter(isNonEmptyString).slice(0, 4) : [];
|
|
2846
2985
|
const misses = Array.isArray(parsed.misses) ? parsed.misses.filter(isNonEmptyString).slice(0, 4) : [];
|
|
2847
2986
|
const reasoning = parsed.reasoning ?? response.reasoning;
|
|
2987
|
+
const expectedAspectCount = Math.max(hits.length + misses.length, 1);
|
|
2848
2988
|
const evaluatorRawRequest = {
|
|
2849
2989
|
id: (0, import_node_crypto2.randomUUID)(),
|
|
2850
2990
|
provider: judgeProvider.id,
|
|
@@ -2857,12 +2997,34 @@ var LlmJudgeEvaluator = class {
|
|
|
2857
2997
|
score,
|
|
2858
2998
|
hits,
|
|
2859
2999
|
misses,
|
|
2860
|
-
expectedAspectCount
|
|
3000
|
+
expectedAspectCount,
|
|
2861
3001
|
reasoning,
|
|
2862
3002
|
evaluatorRawRequest
|
|
2863
3003
|
};
|
|
2864
3004
|
}
|
|
3005
|
+
buildJudgeForwardOptions(context) {
|
|
3006
|
+
const modelConfig = this.buildJudgeModelConfig();
|
|
3007
|
+
if (modelConfig === void 0 && context.judgeModel === void 0) {
|
|
3008
|
+
return void 0;
|
|
3009
|
+
}
|
|
3010
|
+
return {
|
|
3011
|
+
...context.judgeModel ? { model: context.judgeModel } : {},
|
|
3012
|
+
...modelConfig ? { modelConfig } : {}
|
|
3013
|
+
};
|
|
3014
|
+
}
|
|
3015
|
+
buildJudgeModelConfig() {
|
|
3016
|
+
if (this.maxOutputTokens === void 0 && this.temperature === void 0) {
|
|
3017
|
+
return void 0;
|
|
3018
|
+
}
|
|
3019
|
+
return {
|
|
3020
|
+
...this.maxOutputTokens !== void 0 ? { maxTokens: this.maxOutputTokens } : {},
|
|
3021
|
+
...this.temperature !== void 0 ? { temperature: this.temperature } : {}
|
|
3022
|
+
};
|
|
3023
|
+
}
|
|
2865
3024
|
};
|
|
3025
|
+
function providerSupportsAx(provider) {
|
|
3026
|
+
return typeof provider.getAxAI === "function";
|
|
3027
|
+
}
|
|
2866
3028
|
var QUALITY_SYSTEM_PROMPT = [
|
|
2867
3029
|
"You are an expert evaluator. Your goal is to grade the generated_answer based on how well it achieves the expected_outcome for the original task.",
|
|
2868
3030
|
"",
|
|
@@ -3633,6 +3795,7 @@ async function evaluateCandidate(options) {
|
|
|
3633
3795
|
};
|
|
3634
3796
|
return {
|
|
3635
3797
|
eval_id: evalCase.id,
|
|
3798
|
+
dataset: evalCase.dataset,
|
|
3636
3799
|
conversation_id: evalCase.conversation_id,
|
|
3637
3800
|
score: score.score,
|
|
3638
3801
|
hits: score.hits,
|
|
@@ -3809,7 +3972,7 @@ async function runLlmJudgeEvaluator(options) {
|
|
|
3809
3972
|
async function resolveCustomPrompt(config) {
|
|
3810
3973
|
if (config.promptPath) {
|
|
3811
3974
|
try {
|
|
3812
|
-
return await (
|
|
3975
|
+
return await readTextFile(config.promptPath);
|
|
3813
3976
|
} catch (error) {
|
|
3814
3977
|
const message = error instanceof Error ? error.message : String(error);
|
|
3815
3978
|
console.warn(`Could not read custom prompt at ${config.promptPath}: ${message}`);
|
|
@@ -3897,6 +4060,7 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs)
|
|
|
3897
4060
|
};
|
|
3898
4061
|
return {
|
|
3899
4062
|
eval_id: evalCase.id,
|
|
4063
|
+
dataset: evalCase.dataset,
|
|
3900
4064
|
conversation_id: evalCase.conversation_id,
|
|
3901
4065
|
score: 0,
|
|
3902
4066
|
hits: [],
|
|
@@ -3947,6 +4111,7 @@ function createAgentKernel() {
|
|
|
3947
4111
|
buildDirectoryChain,
|
|
3948
4112
|
buildPromptInputs,
|
|
3949
4113
|
buildSearchRoots,
|
|
4114
|
+
consumeCodexLogEntries,
|
|
3950
4115
|
createAgentKernel,
|
|
3951
4116
|
createProvider,
|
|
3952
4117
|
ensureVSCodeSubagents,
|
|
@@ -3963,10 +4128,12 @@ function createAgentKernel() {
|
|
|
3963
4128
|
listTargetNames,
|
|
3964
4129
|
loadEvalCases,
|
|
3965
4130
|
readTargetDefinitions,
|
|
4131
|
+
readTextFile,
|
|
3966
4132
|
resolveAndCreateProvider,
|
|
3967
4133
|
resolveFileReference,
|
|
3968
4134
|
resolveTargetDefinition,
|
|
3969
4135
|
runEvalCase,
|
|
3970
|
-
runEvaluation
|
|
4136
|
+
runEvaluation,
|
|
4137
|
+
subscribeToCodexLogEntries
|
|
3971
4138
|
});
|
|
3972
4139
|
//# sourceMappingURL=index.cjs.map
|