@agentv/core 0.10.0 → 0.10.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +40 -13
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +8 -3
- package/dist/index.d.ts +8 -3
- package/dist/index.js +39 -13
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.d.cts
CHANGED
|
@@ -91,7 +91,6 @@ type LlmJudgeEvaluatorConfig = {
|
|
|
91
91
|
readonly type: "llm_judge";
|
|
92
92
|
readonly prompt?: string;
|
|
93
93
|
readonly promptPath?: string;
|
|
94
|
-
readonly model?: string;
|
|
95
94
|
};
|
|
96
95
|
type EvaluatorConfig = CodeEvaluatorConfig | LlmJudgeEvaluatorConfig;
|
|
97
96
|
/**
|
|
@@ -264,6 +263,13 @@ interface TargetDefinition {
|
|
|
264
263
|
readonly retryStatusCodes?: unknown | undefined;
|
|
265
264
|
}
|
|
266
265
|
|
|
266
|
+
/**
|
|
267
|
+
* Read metadata from a test suite file (like target name).
|
|
268
|
+
* This is a convenience function for CLI tools that need metadata without loading all eval cases.
|
|
269
|
+
*/
|
|
270
|
+
declare function readTestSuiteMetadata(testFilePath: string): Promise<{
|
|
271
|
+
target?: string;
|
|
272
|
+
}>;
|
|
267
273
|
/**
|
|
268
274
|
* Determine whether a path references guideline content (instructions or prompts).
|
|
269
275
|
*/
|
|
@@ -496,7 +502,6 @@ interface EvaluationContext {
|
|
|
496
502
|
readonly judgeProvider?: Provider;
|
|
497
503
|
readonly systemPrompt?: string;
|
|
498
504
|
readonly evaluator?: EvaluatorConfig;
|
|
499
|
-
readonly judgeModel?: string;
|
|
500
505
|
}
|
|
501
506
|
interface EvaluationScore {
|
|
502
507
|
readonly score: number;
|
|
@@ -599,4 +604,4 @@ type AgentKernel = {
|
|
|
599
604
|
};
|
|
600
605
|
declare function createAgentKernel(): AgentKernel;
|
|
601
606
|
|
|
602
|
-
export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EvalCase, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type Evaluator, type EvaluatorConfig, type EvaluatorKind, type EvaluatorResult, type GeminiResolvedConfig, type JsonObject, type JsonPrimitive, type JsonValue, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type MockResolvedConfig, type ProgressEvent, type PromptInputs, type Provider, type ProviderKind, type ProviderRequest, type ProviderResponse, type ResolvedTarget, type RunEvalCaseOptions, type RunEvaluationOptions, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetDefinition, type TestMessage, type TestMessageContent, type TestMessageRole, type ToolTestMessage, type UserTestMessage, type VSCodeResolvedConfig, buildDirectoryChain, buildPromptInputs, buildSearchRoots, consumeCodexLogEntries, createAgentKernel, createProvider, ensureVSCodeSubagents, extractCodeBlocks, fileExists, findGitRoot, getHitCount, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isTestMessage, isTestMessageRole, listTargetNames, loadEvalCases, normalizeLineEndings, readTargetDefinitions, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, runEvalCase, runEvaluation, subscribeToCodexLogEntries };
|
|
607
|
+
export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EvalCase, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type Evaluator, type EvaluatorConfig, type EvaluatorKind, type EvaluatorResult, type GeminiResolvedConfig, type JsonObject, type JsonPrimitive, type JsonValue, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type MockResolvedConfig, type ProgressEvent, type PromptInputs, type Provider, type ProviderKind, type ProviderRequest, type ProviderResponse, type ResolvedTarget, type RunEvalCaseOptions, type RunEvaluationOptions, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetDefinition, type TestMessage, type TestMessageContent, type TestMessageRole, type ToolTestMessage, type UserTestMessage, type VSCodeResolvedConfig, buildDirectoryChain, buildPromptInputs, buildSearchRoots, consumeCodexLogEntries, createAgentKernel, createProvider, ensureVSCodeSubagents, extractCodeBlocks, fileExists, findGitRoot, getHitCount, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isTestMessage, isTestMessageRole, listTargetNames, loadEvalCases, normalizeLineEndings, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, runEvalCase, runEvaluation, subscribeToCodexLogEntries };
|
package/dist/index.d.ts
CHANGED
|
@@ -91,7 +91,6 @@ type LlmJudgeEvaluatorConfig = {
|
|
|
91
91
|
readonly type: "llm_judge";
|
|
92
92
|
readonly prompt?: string;
|
|
93
93
|
readonly promptPath?: string;
|
|
94
|
-
readonly model?: string;
|
|
95
94
|
};
|
|
96
95
|
type EvaluatorConfig = CodeEvaluatorConfig | LlmJudgeEvaluatorConfig;
|
|
97
96
|
/**
|
|
@@ -264,6 +263,13 @@ interface TargetDefinition {
|
|
|
264
263
|
readonly retryStatusCodes?: unknown | undefined;
|
|
265
264
|
}
|
|
266
265
|
|
|
266
|
+
/**
|
|
267
|
+
* Read metadata from a test suite file (like target name).
|
|
268
|
+
* This is a convenience function for CLI tools that need metadata without loading all eval cases.
|
|
269
|
+
*/
|
|
270
|
+
declare function readTestSuiteMetadata(testFilePath: string): Promise<{
|
|
271
|
+
target?: string;
|
|
272
|
+
}>;
|
|
267
273
|
/**
|
|
268
274
|
* Determine whether a path references guideline content (instructions or prompts).
|
|
269
275
|
*/
|
|
@@ -496,7 +502,6 @@ interface EvaluationContext {
|
|
|
496
502
|
readonly judgeProvider?: Provider;
|
|
497
503
|
readonly systemPrompt?: string;
|
|
498
504
|
readonly evaluator?: EvaluatorConfig;
|
|
499
|
-
readonly judgeModel?: string;
|
|
500
505
|
}
|
|
501
506
|
interface EvaluationScore {
|
|
502
507
|
readonly score: number;
|
|
@@ -599,4 +604,4 @@ type AgentKernel = {
|
|
|
599
604
|
};
|
|
600
605
|
declare function createAgentKernel(): AgentKernel;
|
|
601
606
|
|
|
602
|
-
export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EvalCase, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type Evaluator, type EvaluatorConfig, type EvaluatorKind, type EvaluatorResult, type GeminiResolvedConfig, type JsonObject, type JsonPrimitive, type JsonValue, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type MockResolvedConfig, type ProgressEvent, type PromptInputs, type Provider, type ProviderKind, type ProviderRequest, type ProviderResponse, type ResolvedTarget, type RunEvalCaseOptions, type RunEvaluationOptions, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetDefinition, type TestMessage, type TestMessageContent, type TestMessageRole, type ToolTestMessage, type UserTestMessage, type VSCodeResolvedConfig, buildDirectoryChain, buildPromptInputs, buildSearchRoots, consumeCodexLogEntries, createAgentKernel, createProvider, ensureVSCodeSubagents, extractCodeBlocks, fileExists, findGitRoot, getHitCount, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isTestMessage, isTestMessageRole, listTargetNames, loadEvalCases, normalizeLineEndings, readTargetDefinitions, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, runEvalCase, runEvaluation, subscribeToCodexLogEntries };
|
|
607
|
+
export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EvalCase, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type Evaluator, type EvaluatorConfig, type EvaluatorKind, type EvaluatorResult, type GeminiResolvedConfig, type JsonObject, type JsonPrimitive, type JsonValue, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type MockResolvedConfig, type ProgressEvent, type PromptInputs, type Provider, type ProviderKind, type ProviderRequest, type ProviderResponse, type ResolvedTarget, type RunEvalCaseOptions, type RunEvaluationOptions, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetDefinition, type TestMessage, type TestMessageContent, type TestMessageRole, type ToolTestMessage, type UserTestMessage, type VSCodeResolvedConfig, buildDirectoryChain, buildPromptInputs, buildSearchRoots, consumeCodexLogEntries, createAgentKernel, createProvider, ensureVSCodeSubagents, extractCodeBlocks, fileExists, findGitRoot, getHitCount, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isTestMessage, isTestMessageRole, listTargetNames, loadEvalCases, normalizeLineEndings, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, runEvalCase, runEvaluation, subscribeToCodexLogEntries };
|
package/dist/index.js
CHANGED
|
@@ -73,6 +73,33 @@ var ANSI_YELLOW = "\x1B[33m";
|
|
|
73
73
|
var ANSI_RESET = "\x1B[0m";
|
|
74
74
|
var SCHEMA_EVAL_V2 = "agentv-eval-v2";
|
|
75
75
|
var SCHEMA_CONFIG_V2 = "agentv-config-v2";
|
|
76
|
+
async function readTestSuiteMetadata(testFilePath) {
|
|
77
|
+
try {
|
|
78
|
+
const absolutePath = path.resolve(testFilePath);
|
|
79
|
+
const content = await readFile(absolutePath, "utf8");
|
|
80
|
+
const parsed = parse(content);
|
|
81
|
+
if (!isJsonObject(parsed)) {
|
|
82
|
+
return {};
|
|
83
|
+
}
|
|
84
|
+
return { target: extractTargetFromSuite(parsed) };
|
|
85
|
+
} catch {
|
|
86
|
+
return {};
|
|
87
|
+
}
|
|
88
|
+
}
|
|
89
|
+
function extractTargetFromSuite(suite) {
|
|
90
|
+
const execution = suite.execution;
|
|
91
|
+
if (execution && typeof execution === "object" && !Array.isArray(execution)) {
|
|
92
|
+
const executionTarget = execution.target;
|
|
93
|
+
if (typeof executionTarget === "string" && executionTarget.trim().length > 0) {
|
|
94
|
+
return executionTarget.trim();
|
|
95
|
+
}
|
|
96
|
+
}
|
|
97
|
+
const targetValue = suite.target;
|
|
98
|
+
if (typeof targetValue === "string" && targetValue.trim().length > 0) {
|
|
99
|
+
return targetValue.trim();
|
|
100
|
+
}
|
|
101
|
+
return void 0;
|
|
102
|
+
}
|
|
76
103
|
async function loadConfig(evalFilePath, repoRoot) {
|
|
77
104
|
const directories = buildDirectoryChain(evalFilePath, repoRoot);
|
|
78
105
|
for (const directory of directories) {
|
|
@@ -249,6 +276,8 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
249
276
|
throw new Error(`Invalid test file format: ${evalFilePath} - missing 'evalcases' field`);
|
|
250
277
|
}
|
|
251
278
|
const globalEvaluator = coerceEvaluator(suite.evaluator, "global") ?? "llm_judge";
|
|
279
|
+
const globalExecution = isJsonObject(suite.execution) ? suite.execution : void 0;
|
|
280
|
+
const globalTarget = asString(globalExecution?.target) ?? asString(suite.target);
|
|
252
281
|
const results = [];
|
|
253
282
|
for (const rawEvalcase of rawTestcases) {
|
|
254
283
|
if (!isJsonObject(rawEvalcase)) {
|
|
@@ -303,7 +332,7 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
303
332
|
const referenceAnswer = expectedContent ? await resolveAssistantContent(expectedContent, searchRoots, verbose) : "";
|
|
304
333
|
const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
|
|
305
334
|
const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
|
|
306
|
-
const evaluators = await parseEvaluators(evalcase, searchRoots, id ?? "unknown");
|
|
335
|
+
const evaluators = await parseEvaluators(evalcase, globalExecution, searchRoots, id ?? "unknown");
|
|
307
336
|
const userFilePaths = [];
|
|
308
337
|
for (const segment of inputSegments) {
|
|
309
338
|
if (segment.type === "file" && typeof segment.resolvedPath === "string") {
|
|
@@ -670,9 +699,9 @@ async function resolveAssistantContent(content, searchRoots, verbose) {
|
|
|
670
699
|
}
|
|
671
700
|
return parts.join(" ");
|
|
672
701
|
}
|
|
673
|
-
async function parseEvaluators(rawEvalCase, searchRoots, evalId) {
|
|
702
|
+
async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId) {
|
|
674
703
|
const execution = rawEvalCase.execution;
|
|
675
|
-
const candidateEvaluators = isJsonObject(execution) ? execution.evaluators ?? rawEvalCase.evaluators : rawEvalCase.evaluators;
|
|
704
|
+
const candidateEvaluators = isJsonObject(execution) ? execution.evaluators ?? rawEvalCase.evaluators : rawEvalCase.evaluators ?? globalExecution?.evaluators;
|
|
676
705
|
if (candidateEvaluators === void 0) {
|
|
677
706
|
return void 0;
|
|
678
707
|
}
|
|
@@ -710,6 +739,8 @@ async function parseEvaluators(rawEvalCase, searchRoots, evalId) {
|
|
|
710
739
|
resolved.attempted.length > 0 ? resolved.attempted.map((attempt) => ` Tried: ${attempt}`) : void 0
|
|
711
740
|
);
|
|
712
741
|
}
|
|
742
|
+
} else {
|
|
743
|
+
resolvedCwd = searchRoots[0];
|
|
713
744
|
}
|
|
714
745
|
evaluators.push({
|
|
715
746
|
name,
|
|
@@ -738,8 +769,7 @@ async function parseEvaluators(rawEvalCase, searchRoots, evalId) {
|
|
|
738
769
|
name,
|
|
739
770
|
type: "llm_judge",
|
|
740
771
|
prompt,
|
|
741
|
-
promptPath,
|
|
742
|
-
model
|
|
772
|
+
promptPath
|
|
743
773
|
});
|
|
744
774
|
}
|
|
745
775
|
return evaluators.length > 0 ? evaluators : void 0;
|
|
@@ -2532,10 +2562,7 @@ var LlmJudgeEvaluator = class {
|
|
|
2532
2562
|
prompt = substituteVariables(systemPrompt, variables);
|
|
2533
2563
|
systemPrompt = buildSystemPrompt(hasReferenceAnswer);
|
|
2534
2564
|
}
|
|
2535
|
-
const metadata = {
|
|
2536
|
-
...systemPrompt !== void 0 ? { systemPrompt } : {},
|
|
2537
|
-
...context.judgeModel !== void 0 ? { model: context.judgeModel } : {}
|
|
2538
|
-
};
|
|
2565
|
+
const metadata = systemPrompt !== void 0 ? { systemPrompt } : {};
|
|
2539
2566
|
const response = await judgeProvider.invoke({
|
|
2540
2567
|
question: prompt,
|
|
2541
2568
|
metadata,
|
|
@@ -2555,8 +2582,7 @@ var LlmJudgeEvaluator = class {
|
|
|
2555
2582
|
provider: judgeProvider.id,
|
|
2556
2583
|
prompt,
|
|
2557
2584
|
target: context.target.name,
|
|
2558
|
-
...systemPrompt !== void 0
|
|
2559
|
-
...context.judgeModel !== void 0 ? { model: context.judgeModel } : {}
|
|
2585
|
+
...systemPrompt !== void 0 && { systemPrompt }
|
|
2560
2586
|
};
|
|
2561
2587
|
return {
|
|
2562
2588
|
score,
|
|
@@ -3550,8 +3576,7 @@ async function runLlmJudgeEvaluator(options) {
|
|
|
3550
3576
|
now,
|
|
3551
3577
|
judgeProvider,
|
|
3552
3578
|
systemPrompt: customPrompt,
|
|
3553
|
-
evaluator: config,
|
|
3554
|
-
judgeModel: config.model
|
|
3579
|
+
evaluator: config
|
|
3555
3580
|
});
|
|
3556
3581
|
}
|
|
3557
3582
|
async function resolveCustomPrompt(config) {
|
|
@@ -3736,6 +3761,7 @@ export {
|
|
|
3736
3761
|
loadEvalCases,
|
|
3737
3762
|
normalizeLineEndings,
|
|
3738
3763
|
readTargetDefinitions,
|
|
3764
|
+
readTestSuiteMetadata,
|
|
3739
3765
|
readTextFile,
|
|
3740
3766
|
resolveAndCreateProvider,
|
|
3741
3767
|
resolveFileReference,
|