@agentv/core 0.2.3 → 0.2.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-5REK5RSI.js → chunk-QVS4OL44.js} +30 -2
- package/dist/chunk-QVS4OL44.js.map +1 -0
- package/dist/evaluation/validation/index.cjs +30 -4
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +7 -5
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +73 -32
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +9 -9
- package/dist/index.d.ts +9 -9
- package/dist/index.js +70 -33
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/dist/chunk-5REK5RSI.js.map +0 -1
package/dist/index.d.cts
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import { AxChatRequest } from '@ax-llm/ax';
|
|
2
2
|
|
|
3
3
|
/**
|
|
4
|
-
* JSON primitive values appearing in
|
|
4
|
+
* JSON primitive values appearing in AgentV payloads.
|
|
5
5
|
*/
|
|
6
6
|
type JsonPrimitive = string | number | boolean | null;
|
|
7
7
|
/**
|
|
@@ -64,11 +64,11 @@ type TestMessage = SystemTestMessage | UserTestMessage | AssistantTestMessage |
|
|
|
64
64
|
*/
|
|
65
65
|
declare function isTestMessageRole(value: unknown): value is TestMessageRole;
|
|
66
66
|
/**
|
|
67
|
-
* Guard matching
|
|
67
|
+
* Guard matching AgentV JSON objects.
|
|
68
68
|
*/
|
|
69
69
|
declare function isJsonObject(value: unknown): value is JsonObject;
|
|
70
70
|
/**
|
|
71
|
-
* Guard matching
|
|
71
|
+
* Guard matching AgentV JSON values.
|
|
72
72
|
*/
|
|
73
73
|
declare function isJsonValue(value: unknown): value is JsonValue;
|
|
74
74
|
/**
|
|
@@ -89,7 +89,7 @@ type GraderKind = (typeof GRADER_KIND_VALUES)[number];
|
|
|
89
89
|
*/
|
|
90
90
|
declare function isGraderKind(value: unknown): value is GraderKind;
|
|
91
91
|
/**
|
|
92
|
-
* Test case definition sourced from
|
|
92
|
+
* Test case definition sourced from AgentV specs.
|
|
93
93
|
*/
|
|
94
94
|
interface TestCase {
|
|
95
95
|
readonly id: string;
|
|
@@ -106,7 +106,7 @@ interface TestCase {
|
|
|
106
106
|
* Evaluator scorecard for a single test case run.
|
|
107
107
|
*/
|
|
108
108
|
interface EvaluationResult {
|
|
109
|
-
readonly
|
|
109
|
+
readonly eval_id: string;
|
|
110
110
|
readonly conversation_id?: string;
|
|
111
111
|
readonly score: number;
|
|
112
112
|
readonly hits: readonly string[];
|
|
@@ -130,14 +130,14 @@ declare function getHitCount(result: Pick<EvaluationResult, "hits">): number;
|
|
|
130
130
|
*/
|
|
131
131
|
declare function isGuidelineFile(filePath: string): boolean;
|
|
132
132
|
/**
|
|
133
|
-
* Extract fenced code blocks from
|
|
133
|
+
* Extract fenced code blocks from AgentV user segments.
|
|
134
134
|
*/
|
|
135
135
|
declare function extractCodeBlocks(segments: readonly JsonObject[]): readonly string[];
|
|
136
136
|
type LoadOptions = {
|
|
137
137
|
readonly verbose?: boolean;
|
|
138
138
|
};
|
|
139
139
|
/**
|
|
140
|
-
* Load
|
|
140
|
+
* Load eval cases from an AgentV YAML specification file.
|
|
141
141
|
*/
|
|
142
142
|
declare function loadTestCases(testFilePath: string, repoRoot: URL | string, options?: LoadOptions): Promise<readonly TestCase[]>;
|
|
143
143
|
/**
|
|
@@ -369,7 +369,7 @@ interface RunTestCaseOptions {
|
|
|
369
369
|
}
|
|
370
370
|
interface ProgressEvent {
|
|
371
371
|
readonly workerId: number;
|
|
372
|
-
readonly
|
|
372
|
+
readonly evalId: string;
|
|
373
373
|
readonly status: "pending" | "running" | "completed" | "failed";
|
|
374
374
|
readonly startedAt?: number;
|
|
375
375
|
readonly completedAt?: number;
|
|
@@ -389,7 +389,7 @@ interface RunEvaluationOptions {
|
|
|
389
389
|
readonly cache?: EvaluationCache;
|
|
390
390
|
readonly useCache?: boolean;
|
|
391
391
|
readonly now?: () => Date;
|
|
392
|
-
readonly
|
|
392
|
+
readonly evalId?: string;
|
|
393
393
|
readonly verbose?: boolean;
|
|
394
394
|
readonly maxConcurrency?: number;
|
|
395
395
|
readonly onResult?: (result: EvaluationResult) => MaybePromise<void>;
|
package/dist/index.d.ts
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import { AxChatRequest } from '@ax-llm/ax';
|
|
2
2
|
|
|
3
3
|
/**
|
|
4
|
-
* JSON primitive values appearing in
|
|
4
|
+
* JSON primitive values appearing in AgentV payloads.
|
|
5
5
|
*/
|
|
6
6
|
type JsonPrimitive = string | number | boolean | null;
|
|
7
7
|
/**
|
|
@@ -64,11 +64,11 @@ type TestMessage = SystemTestMessage | UserTestMessage | AssistantTestMessage |
|
|
|
64
64
|
*/
|
|
65
65
|
declare function isTestMessageRole(value: unknown): value is TestMessageRole;
|
|
66
66
|
/**
|
|
67
|
-
* Guard matching
|
|
67
|
+
* Guard matching AgentV JSON objects.
|
|
68
68
|
*/
|
|
69
69
|
declare function isJsonObject(value: unknown): value is JsonObject;
|
|
70
70
|
/**
|
|
71
|
-
* Guard matching
|
|
71
|
+
* Guard matching AgentV JSON values.
|
|
72
72
|
*/
|
|
73
73
|
declare function isJsonValue(value: unknown): value is JsonValue;
|
|
74
74
|
/**
|
|
@@ -89,7 +89,7 @@ type GraderKind = (typeof GRADER_KIND_VALUES)[number];
|
|
|
89
89
|
*/
|
|
90
90
|
declare function isGraderKind(value: unknown): value is GraderKind;
|
|
91
91
|
/**
|
|
92
|
-
* Test case definition sourced from
|
|
92
|
+
* Test case definition sourced from AgentV specs.
|
|
93
93
|
*/
|
|
94
94
|
interface TestCase {
|
|
95
95
|
readonly id: string;
|
|
@@ -106,7 +106,7 @@ interface TestCase {
|
|
|
106
106
|
* Evaluator scorecard for a single test case run.
|
|
107
107
|
*/
|
|
108
108
|
interface EvaluationResult {
|
|
109
|
-
readonly
|
|
109
|
+
readonly eval_id: string;
|
|
110
110
|
readonly conversation_id?: string;
|
|
111
111
|
readonly score: number;
|
|
112
112
|
readonly hits: readonly string[];
|
|
@@ -130,14 +130,14 @@ declare function getHitCount(result: Pick<EvaluationResult, "hits">): number;
|
|
|
130
130
|
*/
|
|
131
131
|
declare function isGuidelineFile(filePath: string): boolean;
|
|
132
132
|
/**
|
|
133
|
-
* Extract fenced code blocks from
|
|
133
|
+
* Extract fenced code blocks from AgentV user segments.
|
|
134
134
|
*/
|
|
135
135
|
declare function extractCodeBlocks(segments: readonly JsonObject[]): readonly string[];
|
|
136
136
|
type LoadOptions = {
|
|
137
137
|
readonly verbose?: boolean;
|
|
138
138
|
};
|
|
139
139
|
/**
|
|
140
|
-
* Load
|
|
140
|
+
* Load eval cases from an AgentV YAML specification file.
|
|
141
141
|
*/
|
|
142
142
|
declare function loadTestCases(testFilePath: string, repoRoot: URL | string, options?: LoadOptions): Promise<readonly TestCase[]>;
|
|
143
143
|
/**
|
|
@@ -369,7 +369,7 @@ interface RunTestCaseOptions {
|
|
|
369
369
|
}
|
|
370
370
|
interface ProgressEvent {
|
|
371
371
|
readonly workerId: number;
|
|
372
|
-
readonly
|
|
372
|
+
readonly evalId: string;
|
|
373
373
|
readonly status: "pending" | "running" | "completed" | "failed";
|
|
374
374
|
readonly startedAt?: number;
|
|
375
375
|
readonly completedAt?: number;
|
|
@@ -389,7 +389,7 @@ interface RunEvaluationOptions {
|
|
|
389
389
|
readonly cache?: EvaluationCache;
|
|
390
390
|
readonly useCache?: boolean;
|
|
391
391
|
readonly now?: () => Date;
|
|
392
|
-
readonly
|
|
392
|
+
readonly evalId?: string;
|
|
393
393
|
readonly verbose?: boolean;
|
|
394
394
|
readonly maxConcurrency?: number;
|
|
395
395
|
readonly onResult?: (result: EvaluationResult) => MaybePromise<void>;
|
package/dist/index.js
CHANGED
|
@@ -1,7 +1,8 @@
|
|
|
1
1
|
import {
|
|
2
|
+
TARGETS_SCHEMA_V2,
|
|
2
3
|
buildSearchRoots,
|
|
3
4
|
resolveFileReference
|
|
4
|
-
} from "./chunk-
|
|
5
|
+
} from "./chunk-QVS4OL44.js";
|
|
5
6
|
|
|
6
7
|
// src/evaluation/types.ts
|
|
7
8
|
var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
|
|
@@ -205,7 +206,7 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
205
206
|
}
|
|
206
207
|
const codeSnippets = extractCodeBlocks(userSegments);
|
|
207
208
|
const assistantContent = assistantMessages[0]?.content;
|
|
208
|
-
const expectedAssistantRaw =
|
|
209
|
+
const expectedAssistantRaw = await resolveAssistantContent(assistantContent, searchRoots, verbose);
|
|
209
210
|
const userTextPrompt = userTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
|
|
210
211
|
const testCaseGrader = coerceGrader(testcase.grader) ?? globalGrader;
|
|
211
212
|
const testCase = {
|
|
@@ -321,7 +322,7 @@ function cloneJsonValue(value) {
|
|
|
321
322
|
}
|
|
322
323
|
return cloneJsonObject(value);
|
|
323
324
|
}
|
|
324
|
-
function
|
|
325
|
+
async function resolveAssistantContent(content, searchRoots, verbose) {
|
|
325
326
|
if (typeof content === "string") {
|
|
326
327
|
return content;
|
|
327
328
|
}
|
|
@@ -334,12 +335,42 @@ function normalizeAssistantContent(content) {
|
|
|
334
335
|
parts.push(entry);
|
|
335
336
|
continue;
|
|
336
337
|
}
|
|
337
|
-
|
|
338
|
+
if (!isJsonObject(entry)) {
|
|
339
|
+
continue;
|
|
340
|
+
}
|
|
341
|
+
const segmentType = asString(entry.type);
|
|
342
|
+
if (segmentType === "file") {
|
|
343
|
+
const rawValue = asString(entry.value);
|
|
344
|
+
if (!rawValue) {
|
|
345
|
+
continue;
|
|
346
|
+
}
|
|
347
|
+
const { displayPath, resolvedPath, attempted } = await resolveFileReference(
|
|
348
|
+
rawValue,
|
|
349
|
+
searchRoots
|
|
350
|
+
);
|
|
351
|
+
if (!resolvedPath) {
|
|
352
|
+
const attempts = attempted.length ? [" Tried:", ...attempted.map((candidate) => ` ${candidate}`)] : void 0;
|
|
353
|
+
logWarning(`File not found in expected_messages: ${displayPath}`, attempts);
|
|
354
|
+
continue;
|
|
355
|
+
}
|
|
356
|
+
try {
|
|
357
|
+
const fileContent = (await readFile(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
|
|
358
|
+
parts.push(fileContent);
|
|
359
|
+
if (verbose) {
|
|
360
|
+
console.log(` [Expected Assistant File] Found: ${displayPath}`);
|
|
361
|
+
console.log(` Resolved to: ${resolvedPath}`);
|
|
362
|
+
}
|
|
363
|
+
} catch (error) {
|
|
364
|
+
logWarning(`Could not read file ${resolvedPath}: ${error.message}`);
|
|
365
|
+
}
|
|
366
|
+
continue;
|
|
367
|
+
}
|
|
368
|
+
const textValue = asString(entry.text);
|
|
338
369
|
if (typeof textValue === "string") {
|
|
339
370
|
parts.push(textValue);
|
|
340
371
|
continue;
|
|
341
372
|
}
|
|
342
|
-
const valueValue = asString(entry
|
|
373
|
+
const valueValue = asString(entry.value);
|
|
343
374
|
if (typeof valueValue === "string") {
|
|
344
375
|
parts.push(valueValue);
|
|
345
376
|
continue;
|
|
@@ -844,7 +875,7 @@ import { mkdtemp, readFile as readFile2, rm, writeFile } from "node:fs/promises"
|
|
|
844
875
|
import { tmpdir } from "node:os";
|
|
845
876
|
import path2 from "node:path";
|
|
846
877
|
import { dispatchAgentSession, getSubagentRoot, provisionSubagents } from "subagent";
|
|
847
|
-
var PROMPT_FILE_PREFIX = "
|
|
878
|
+
var PROMPT_FILE_PREFIX = "agentv-vscode-";
|
|
848
879
|
var VSCodeProvider = class {
|
|
849
880
|
id;
|
|
850
881
|
kind;
|
|
@@ -911,7 +942,7 @@ function buildPromptDocument(request, attachments) {
|
|
|
911
942
|
if (instructionFiles.length > 0) {
|
|
912
943
|
parts.push(buildMandatoryPrereadBlock(instructionFiles));
|
|
913
944
|
}
|
|
914
|
-
parts.push(`#
|
|
945
|
+
parts.push(`# AgentV Request`);
|
|
915
946
|
if (request.testCaseId) {
|
|
916
947
|
parts.push(`- Test Case: ${request.testCaseId}`);
|
|
917
948
|
}
|
|
@@ -1056,18 +1087,24 @@ import { parse as parse2 } from "yaml";
|
|
|
1056
1087
|
function isRecord(value) {
|
|
1057
1088
|
return typeof value === "object" && value !== null && !Array.isArray(value);
|
|
1058
1089
|
}
|
|
1059
|
-
function
|
|
1060
|
-
const
|
|
1061
|
-
if (
|
|
1090
|
+
function checkSchema(parsed, absolutePath) {
|
|
1091
|
+
const schema = parsed.$schema;
|
|
1092
|
+
if (schema === void 0) {
|
|
1093
|
+
throw new Error(
|
|
1094
|
+
`Missing $schema field in targets.yaml at ${absolutePath}.
|
|
1095
|
+
Please add '$schema: ${TARGETS_SCHEMA_V2}' at the top of the file.`
|
|
1096
|
+
);
|
|
1097
|
+
}
|
|
1098
|
+
if (typeof schema !== "string") {
|
|
1062
1099
|
throw new Error(
|
|
1063
|
-
`
|
|
1064
|
-
|
|
1100
|
+
`Invalid $schema field in targets.yaml at ${absolutePath}.
|
|
1101
|
+
Expected a string value '${TARGETS_SCHEMA_V2}'.`
|
|
1065
1102
|
);
|
|
1066
1103
|
}
|
|
1067
|
-
if (
|
|
1104
|
+
if (schema !== TARGETS_SCHEMA_V2) {
|
|
1068
1105
|
throw new Error(
|
|
1069
|
-
`
|
|
1070
|
-
|
|
1106
|
+
`Invalid $schema '${schema}' in targets.yaml at ${absolutePath}.
|
|
1107
|
+
Expected '${TARGETS_SCHEMA_V2}'.`
|
|
1071
1108
|
);
|
|
1072
1109
|
}
|
|
1073
1110
|
}
|
|
@@ -1115,9 +1152,9 @@ async function readTargetDefinitions(filePath) {
|
|
|
1115
1152
|
const raw = await readFile3(absolutePath, "utf8");
|
|
1116
1153
|
const parsed = parse2(raw);
|
|
1117
1154
|
if (!isRecord(parsed)) {
|
|
1118
|
-
throw new Error(`targets.yaml at ${absolutePath} must be a YAML object with '
|
|
1155
|
+
throw new Error(`targets.yaml at ${absolutePath} must be a YAML object with '$schema' and 'targets' fields`);
|
|
1119
1156
|
}
|
|
1120
|
-
|
|
1157
|
+
checkSchema(parsed, absolutePath);
|
|
1121
1158
|
const targets = extractTargetsArray(parsed, absolutePath);
|
|
1122
1159
|
const definitions = targets.map((entry, index) => assertTargetDefinition(entry, index, absolutePath));
|
|
1123
1160
|
return definitions;
|
|
@@ -1678,17 +1715,17 @@ async function runEvaluation(options) {
|
|
|
1678
1715
|
cache,
|
|
1679
1716
|
useCache,
|
|
1680
1717
|
now,
|
|
1681
|
-
|
|
1718
|
+
evalId,
|
|
1682
1719
|
verbose,
|
|
1683
1720
|
onResult,
|
|
1684
1721
|
onProgress
|
|
1685
1722
|
} = options;
|
|
1686
1723
|
const load = loadTestCases;
|
|
1687
1724
|
const testCases = await load(testFilePath, repoRoot, { verbose });
|
|
1688
|
-
const filteredTestCases = filterTestCases(testCases,
|
|
1725
|
+
const filteredTestCases = filterTestCases(testCases, evalId);
|
|
1689
1726
|
if (filteredTestCases.length === 0) {
|
|
1690
|
-
if (
|
|
1691
|
-
throw new Error(`Test case with id '${
|
|
1727
|
+
if (evalId) {
|
|
1728
|
+
throw new Error(`Test case with id '${evalId}' not found in ${testFilePath}`);
|
|
1692
1729
|
}
|
|
1693
1730
|
return [];
|
|
1694
1731
|
}
|
|
@@ -1736,7 +1773,7 @@ async function runEvaluation(options) {
|
|
|
1736
1773
|
for (let i = 0; i < filteredTestCases.length; i++) {
|
|
1737
1774
|
await onProgress({
|
|
1738
1775
|
workerId: i + 1,
|
|
1739
|
-
|
|
1776
|
+
evalId: filteredTestCases[i].id,
|
|
1740
1777
|
status: "pending"
|
|
1741
1778
|
});
|
|
1742
1779
|
}
|
|
@@ -1744,15 +1781,15 @@ async function runEvaluation(options) {
|
|
|
1744
1781
|
const workers = options.maxConcurrency ?? target.workers ?? 1;
|
|
1745
1782
|
const limit = pLimit(workers);
|
|
1746
1783
|
let nextWorkerId = 1;
|
|
1747
|
-
const
|
|
1784
|
+
const workerIdByEvalId = /* @__PURE__ */ new Map();
|
|
1748
1785
|
const promises = filteredTestCases.map(
|
|
1749
1786
|
(testCase) => limit(async () => {
|
|
1750
1787
|
const workerId = nextWorkerId++;
|
|
1751
|
-
|
|
1788
|
+
workerIdByEvalId.set(testCase.id, workerId);
|
|
1752
1789
|
if (onProgress) {
|
|
1753
1790
|
await onProgress({
|
|
1754
1791
|
workerId,
|
|
1755
|
-
|
|
1792
|
+
evalId: testCase.id,
|
|
1756
1793
|
status: "running",
|
|
1757
1794
|
startedAt: Date.now()
|
|
1758
1795
|
});
|
|
@@ -1775,7 +1812,7 @@ async function runEvaluation(options) {
|
|
|
1775
1812
|
if (onProgress) {
|
|
1776
1813
|
await onProgress({
|
|
1777
1814
|
workerId,
|
|
1778
|
-
|
|
1815
|
+
evalId: testCase.id,
|
|
1779
1816
|
status: "completed",
|
|
1780
1817
|
startedAt: 0,
|
|
1781
1818
|
// Not used for completed status
|
|
@@ -1790,7 +1827,7 @@ async function runEvaluation(options) {
|
|
|
1790
1827
|
if (onProgress) {
|
|
1791
1828
|
await onProgress({
|
|
1792
1829
|
workerId,
|
|
1793
|
-
|
|
1830
|
+
evalId: testCase.id,
|
|
1794
1831
|
status: "failed",
|
|
1795
1832
|
completedAt: Date.now(),
|
|
1796
1833
|
error: error instanceof Error ? error.message : String(error)
|
|
@@ -1912,7 +1949,7 @@ async function runTestCase(options) {
|
|
|
1912
1949
|
guideline_paths: testCase.guideline_paths
|
|
1913
1950
|
};
|
|
1914
1951
|
return {
|
|
1915
|
-
|
|
1952
|
+
eval_id: testCase.id,
|
|
1916
1953
|
conversation_id: testCase.conversation_id,
|
|
1917
1954
|
score: grade.score,
|
|
1918
1955
|
hits: grade.hits,
|
|
@@ -1927,11 +1964,11 @@ async function runTestCase(options) {
|
|
|
1927
1964
|
grader_raw_request: grade.graderRawRequest
|
|
1928
1965
|
};
|
|
1929
1966
|
}
|
|
1930
|
-
function filterTestCases(testCases,
|
|
1931
|
-
if (!
|
|
1967
|
+
function filterTestCases(testCases, evalId) {
|
|
1968
|
+
if (!evalId) {
|
|
1932
1969
|
return testCases;
|
|
1933
1970
|
}
|
|
1934
|
-
return testCases.filter((testCase) => testCase.id ===
|
|
1971
|
+
return testCases.filter((testCase) => testCase.id === evalId);
|
|
1935
1972
|
}
|
|
1936
1973
|
function buildGraderRegistry(overrides, resolveJudgeProvider) {
|
|
1937
1974
|
const heuristic = overrides?.heuristic ?? new HeuristicGrader();
|
|
@@ -1955,7 +1992,7 @@ async function dumpPrompt(directory, testCase, promptInputs) {
|
|
|
1955
1992
|
const filePath = path4.resolve(directory, filename);
|
|
1956
1993
|
await mkdir(path4.dirname(filePath), { recursive: true });
|
|
1957
1994
|
const payload = {
|
|
1958
|
-
|
|
1995
|
+
eval_id: testCase.id,
|
|
1959
1996
|
request: promptInputs.request,
|
|
1960
1997
|
guidelines: promptInputs.guidelines,
|
|
1961
1998
|
guideline_paths: testCase.guideline_paths
|
|
@@ -2004,7 +2041,7 @@ function buildErrorResult(testCase, targetName, timestamp, error, promptInputs)
|
|
|
2004
2041
|
error: message
|
|
2005
2042
|
};
|
|
2006
2043
|
return {
|
|
2007
|
-
|
|
2044
|
+
eval_id: testCase.id,
|
|
2008
2045
|
conversation_id: testCase.conversation_id,
|
|
2009
2046
|
score: 0,
|
|
2010
2047
|
hits: [],
|