@agentv/core 0.2.3 → 0.2.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.cts CHANGED
@@ -1,7 +1,7 @@
1
1
  import { AxChatRequest } from '@ax-llm/ax';
2
2
 
3
3
  /**
4
- * JSON primitive values appearing in BbEval payloads.
4
+ * JSON primitive values appearing in AgentV payloads.
5
5
  */
6
6
  type JsonPrimitive = string | number | boolean | null;
7
7
  /**
@@ -64,11 +64,11 @@ type TestMessage = SystemTestMessage | UserTestMessage | AssistantTestMessage |
64
64
  */
65
65
  declare function isTestMessageRole(value: unknown): value is TestMessageRole;
66
66
  /**
67
- * Guard matching BbEval JSON objects.
67
+ * Guard matching AgentV JSON objects.
68
68
  */
69
69
  declare function isJsonObject(value: unknown): value is JsonObject;
70
70
  /**
71
- * Guard matching BbEval JSON values.
71
+ * Guard matching AgentV JSON values.
72
72
  */
73
73
  declare function isJsonValue(value: unknown): value is JsonValue;
74
74
  /**
@@ -89,7 +89,7 @@ type GraderKind = (typeof GRADER_KIND_VALUES)[number];
89
89
  */
90
90
  declare function isGraderKind(value: unknown): value is GraderKind;
91
91
  /**
92
- * Test case definition sourced from BbEval specs.
92
+ * Test case definition sourced from AgentV specs.
93
93
  */
94
94
  interface TestCase {
95
95
  readonly id: string;
@@ -106,7 +106,7 @@ interface TestCase {
106
106
  * Evaluator scorecard for a single test case run.
107
107
  */
108
108
  interface EvaluationResult {
109
- readonly test_id: string;
109
+ readonly eval_id: string;
110
110
  readonly conversation_id?: string;
111
111
  readonly score: number;
112
112
  readonly hits: readonly string[];
@@ -130,14 +130,14 @@ declare function getHitCount(result: Pick<EvaluationResult, "hits">): number;
130
130
  */
131
131
  declare function isGuidelineFile(filePath: string): boolean;
132
132
  /**
133
- * Extract fenced code blocks from BbEval user segments.
133
+ * Extract fenced code blocks from AgentV user segments.
134
134
  */
135
135
  declare function extractCodeBlocks(segments: readonly JsonObject[]): readonly string[];
136
136
  type LoadOptions = {
137
137
  readonly verbose?: boolean;
138
138
  };
139
139
  /**
140
- * Load test cases from a BbEval YAML specification file.
140
+ * Load eval cases from an AgentV YAML specification file.
141
141
  */
142
142
  declare function loadTestCases(testFilePath: string, repoRoot: URL | string, options?: LoadOptions): Promise<readonly TestCase[]>;
143
143
  /**
@@ -369,7 +369,7 @@ interface RunTestCaseOptions {
369
369
  }
370
370
  interface ProgressEvent {
371
371
  readonly workerId: number;
372
- readonly testId: string;
372
+ readonly evalId: string;
373
373
  readonly status: "pending" | "running" | "completed" | "failed";
374
374
  readonly startedAt?: number;
375
375
  readonly completedAt?: number;
@@ -389,7 +389,7 @@ interface RunEvaluationOptions {
389
389
  readonly cache?: EvaluationCache;
390
390
  readonly useCache?: boolean;
391
391
  readonly now?: () => Date;
392
- readonly testId?: string;
392
+ readonly evalId?: string;
393
393
  readonly verbose?: boolean;
394
394
  readonly maxConcurrency?: number;
395
395
  readonly onResult?: (result: EvaluationResult) => MaybePromise<void>;
package/dist/index.d.ts CHANGED
@@ -1,7 +1,7 @@
1
1
  import { AxChatRequest } from '@ax-llm/ax';
2
2
 
3
3
  /**
4
- * JSON primitive values appearing in BbEval payloads.
4
+ * JSON primitive values appearing in AgentV payloads.
5
5
  */
6
6
  type JsonPrimitive = string | number | boolean | null;
7
7
  /**
@@ -64,11 +64,11 @@ type TestMessage = SystemTestMessage | UserTestMessage | AssistantTestMessage |
64
64
  */
65
65
  declare function isTestMessageRole(value: unknown): value is TestMessageRole;
66
66
  /**
67
- * Guard matching BbEval JSON objects.
67
+ * Guard matching AgentV JSON objects.
68
68
  */
69
69
  declare function isJsonObject(value: unknown): value is JsonObject;
70
70
  /**
71
- * Guard matching BbEval JSON values.
71
+ * Guard matching AgentV JSON values.
72
72
  */
73
73
  declare function isJsonValue(value: unknown): value is JsonValue;
74
74
  /**
@@ -89,7 +89,7 @@ type GraderKind = (typeof GRADER_KIND_VALUES)[number];
89
89
  */
90
90
  declare function isGraderKind(value: unknown): value is GraderKind;
91
91
  /**
92
- * Test case definition sourced from BbEval specs.
92
+ * Test case definition sourced from AgentV specs.
93
93
  */
94
94
  interface TestCase {
95
95
  readonly id: string;
@@ -106,7 +106,7 @@ interface TestCase {
106
106
  * Evaluator scorecard for a single test case run.
107
107
  */
108
108
  interface EvaluationResult {
109
- readonly test_id: string;
109
+ readonly eval_id: string;
110
110
  readonly conversation_id?: string;
111
111
  readonly score: number;
112
112
  readonly hits: readonly string[];
@@ -130,14 +130,14 @@ declare function getHitCount(result: Pick<EvaluationResult, "hits">): number;
130
130
  */
131
131
  declare function isGuidelineFile(filePath: string): boolean;
132
132
  /**
133
- * Extract fenced code blocks from BbEval user segments.
133
+ * Extract fenced code blocks from AgentV user segments.
134
134
  */
135
135
  declare function extractCodeBlocks(segments: readonly JsonObject[]): readonly string[];
136
136
  type LoadOptions = {
137
137
  readonly verbose?: boolean;
138
138
  };
139
139
  /**
140
- * Load test cases from a BbEval YAML specification file.
140
+ * Load eval cases from an AgentV YAML specification file.
141
141
  */
142
142
  declare function loadTestCases(testFilePath: string, repoRoot: URL | string, options?: LoadOptions): Promise<readonly TestCase[]>;
143
143
  /**
@@ -369,7 +369,7 @@ interface RunTestCaseOptions {
369
369
  }
370
370
  interface ProgressEvent {
371
371
  readonly workerId: number;
372
- readonly testId: string;
372
+ readonly evalId: string;
373
373
  readonly status: "pending" | "running" | "completed" | "failed";
374
374
  readonly startedAt?: number;
375
375
  readonly completedAt?: number;
@@ -389,7 +389,7 @@ interface RunEvaluationOptions {
389
389
  readonly cache?: EvaluationCache;
390
390
  readonly useCache?: boolean;
391
391
  readonly now?: () => Date;
392
- readonly testId?: string;
392
+ readonly evalId?: string;
393
393
  readonly verbose?: boolean;
394
394
  readonly maxConcurrency?: number;
395
395
  readonly onResult?: (result: EvaluationResult) => MaybePromise<void>;
package/dist/index.js CHANGED
@@ -1,7 +1,8 @@
1
1
  import {
2
+ TARGETS_SCHEMA_V2,
2
3
  buildSearchRoots,
3
4
  resolveFileReference
4
- } from "./chunk-5REK5RSI.js";
5
+ } from "./chunk-QVS4OL44.js";
5
6
 
6
7
  // src/evaluation/types.ts
7
8
  var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
@@ -205,7 +206,7 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
205
206
  }
206
207
  const codeSnippets = extractCodeBlocks(userSegments);
207
208
  const assistantContent = assistantMessages[0]?.content;
208
- const expectedAssistantRaw = normalizeAssistantContent(assistantContent);
209
+ const expectedAssistantRaw = await resolveAssistantContent(assistantContent, searchRoots, verbose);
209
210
  const userTextPrompt = userTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
210
211
  const testCaseGrader = coerceGrader(testcase.grader) ?? globalGrader;
211
212
  const testCase = {
@@ -321,7 +322,7 @@ function cloneJsonValue(value) {
321
322
  }
322
323
  return cloneJsonObject(value);
323
324
  }
324
- function normalizeAssistantContent(content) {
325
+ async function resolveAssistantContent(content, searchRoots, verbose) {
325
326
  if (typeof content === "string") {
326
327
  return content;
327
328
  }
@@ -334,12 +335,42 @@ function normalizeAssistantContent(content) {
334
335
  parts.push(entry);
335
336
  continue;
336
337
  }
337
- const textValue = asString(entry["text"]);
338
+ if (!isJsonObject(entry)) {
339
+ continue;
340
+ }
341
+ const segmentType = asString(entry.type);
342
+ if (segmentType === "file") {
343
+ const rawValue = asString(entry.value);
344
+ if (!rawValue) {
345
+ continue;
346
+ }
347
+ const { displayPath, resolvedPath, attempted } = await resolveFileReference(
348
+ rawValue,
349
+ searchRoots
350
+ );
351
+ if (!resolvedPath) {
352
+ const attempts = attempted.length ? [" Tried:", ...attempted.map((candidate) => ` ${candidate}`)] : void 0;
353
+ logWarning(`File not found in expected_messages: ${displayPath}`, attempts);
354
+ continue;
355
+ }
356
+ try {
357
+ const fileContent = (await readFile(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
358
+ parts.push(fileContent);
359
+ if (verbose) {
360
+ console.log(` [Expected Assistant File] Found: ${displayPath}`);
361
+ console.log(` Resolved to: ${resolvedPath}`);
362
+ }
363
+ } catch (error) {
364
+ logWarning(`Could not read file ${resolvedPath}: ${error.message}`);
365
+ }
366
+ continue;
367
+ }
368
+ const textValue = asString(entry.text);
338
369
  if (typeof textValue === "string") {
339
370
  parts.push(textValue);
340
371
  continue;
341
372
  }
342
- const valueValue = asString(entry["value"]);
373
+ const valueValue = asString(entry.value);
343
374
  if (typeof valueValue === "string") {
344
375
  parts.push(valueValue);
345
376
  continue;
@@ -844,7 +875,7 @@ import { mkdtemp, readFile as readFile2, rm, writeFile } from "node:fs/promises"
844
875
  import { tmpdir } from "node:os";
845
876
  import path2 from "node:path";
846
877
  import { dispatchAgentSession, getSubagentRoot, provisionSubagents } from "subagent";
847
- var PROMPT_FILE_PREFIX = "bbeval-vscode-";
878
+ var PROMPT_FILE_PREFIX = "agentv-vscode-";
848
879
  var VSCodeProvider = class {
849
880
  id;
850
881
  kind;
@@ -911,7 +942,7 @@ function buildPromptDocument(request, attachments) {
911
942
  if (instructionFiles.length > 0) {
912
943
  parts.push(buildMandatoryPrereadBlock(instructionFiles));
913
944
  }
914
- parts.push(`# BbEval Request`);
945
+ parts.push(`# AgentV Request`);
915
946
  if (request.testCaseId) {
916
947
  parts.push(`- Test Case: ${request.testCaseId}`);
917
948
  }
@@ -1056,18 +1087,24 @@ import { parse as parse2 } from "yaml";
1056
1087
  function isRecord(value) {
1057
1088
  return typeof value === "object" && value !== null && !Array.isArray(value);
1058
1089
  }
1059
- function checkVersion(parsed, absolutePath) {
1060
- const version = typeof parsed.version === "number" ? parsed.version : typeof parsed.version === "string" ? parseFloat(parsed.version) : void 0;
1061
- if (version === void 0) {
1090
+ function checkSchema(parsed, absolutePath) {
1091
+ const schema = parsed.$schema;
1092
+ if (schema === void 0) {
1093
+ throw new Error(
1094
+ `Missing $schema field in targets.yaml at ${absolutePath}.
1095
+ Please add '$schema: ${TARGETS_SCHEMA_V2}' at the top of the file.`
1096
+ );
1097
+ }
1098
+ if (typeof schema !== "string") {
1062
1099
  throw new Error(
1063
- `Missing version field in targets.yaml at ${absolutePath}.
1064
- Please add 'version: 2.0' at the top of the file.`
1100
+ `Invalid $schema field in targets.yaml at ${absolutePath}.
1101
+ Expected a string value '${TARGETS_SCHEMA_V2}'.`
1065
1102
  );
1066
1103
  }
1067
- if (version < 2) {
1104
+ if (schema !== TARGETS_SCHEMA_V2) {
1068
1105
  throw new Error(
1069
- `Outdated targets.yaml format (version ${version}) at ${absolutePath}.
1070
- Please update to version 2.0 format with 'targets' array.`
1106
+ `Invalid $schema '${schema}' in targets.yaml at ${absolutePath}.
1107
+ Expected '${TARGETS_SCHEMA_V2}'.`
1071
1108
  );
1072
1109
  }
1073
1110
  }
@@ -1115,9 +1152,9 @@ async function readTargetDefinitions(filePath) {
1115
1152
  const raw = await readFile3(absolutePath, "utf8");
1116
1153
  const parsed = parse2(raw);
1117
1154
  if (!isRecord(parsed)) {
1118
- throw new Error(`targets.yaml at ${absolutePath} must be a YAML object with 'version' and 'targets' fields`);
1155
+ throw new Error(`targets.yaml at ${absolutePath} must be a YAML object with '$schema' and 'targets' fields`);
1119
1156
  }
1120
- checkVersion(parsed, absolutePath);
1157
+ checkSchema(parsed, absolutePath);
1121
1158
  const targets = extractTargetsArray(parsed, absolutePath);
1122
1159
  const definitions = targets.map((entry, index) => assertTargetDefinition(entry, index, absolutePath));
1123
1160
  return definitions;
@@ -1678,17 +1715,17 @@ async function runEvaluation(options) {
1678
1715
  cache,
1679
1716
  useCache,
1680
1717
  now,
1681
- testId,
1718
+ evalId,
1682
1719
  verbose,
1683
1720
  onResult,
1684
1721
  onProgress
1685
1722
  } = options;
1686
1723
  const load = loadTestCases;
1687
1724
  const testCases = await load(testFilePath, repoRoot, { verbose });
1688
- const filteredTestCases = filterTestCases(testCases, testId);
1725
+ const filteredTestCases = filterTestCases(testCases, evalId);
1689
1726
  if (filteredTestCases.length === 0) {
1690
- if (testId) {
1691
- throw new Error(`Test case with id '${testId}' not found in ${testFilePath}`);
1727
+ if (evalId) {
1728
+ throw new Error(`Test case with id '${evalId}' not found in ${testFilePath}`);
1692
1729
  }
1693
1730
  return [];
1694
1731
  }
@@ -1736,7 +1773,7 @@ async function runEvaluation(options) {
1736
1773
  for (let i = 0; i < filteredTestCases.length; i++) {
1737
1774
  await onProgress({
1738
1775
  workerId: i + 1,
1739
- testId: filteredTestCases[i].id,
1776
+ evalId: filteredTestCases[i].id,
1740
1777
  status: "pending"
1741
1778
  });
1742
1779
  }
@@ -1744,15 +1781,15 @@ async function runEvaluation(options) {
1744
1781
  const workers = options.maxConcurrency ?? target.workers ?? 1;
1745
1782
  const limit = pLimit(workers);
1746
1783
  let nextWorkerId = 1;
1747
- const workerIdByTestId = /* @__PURE__ */ new Map();
1784
+ const workerIdByEvalId = /* @__PURE__ */ new Map();
1748
1785
  const promises = filteredTestCases.map(
1749
1786
  (testCase) => limit(async () => {
1750
1787
  const workerId = nextWorkerId++;
1751
- workerIdByTestId.set(testCase.id, workerId);
1788
+ workerIdByEvalId.set(testCase.id, workerId);
1752
1789
  if (onProgress) {
1753
1790
  await onProgress({
1754
1791
  workerId,
1755
- testId: testCase.id,
1792
+ evalId: testCase.id,
1756
1793
  status: "running",
1757
1794
  startedAt: Date.now()
1758
1795
  });
@@ -1775,7 +1812,7 @@ async function runEvaluation(options) {
1775
1812
  if (onProgress) {
1776
1813
  await onProgress({
1777
1814
  workerId,
1778
- testId: testCase.id,
1815
+ evalId: testCase.id,
1779
1816
  status: "completed",
1780
1817
  startedAt: 0,
1781
1818
  // Not used for completed status
@@ -1790,7 +1827,7 @@ async function runEvaluation(options) {
1790
1827
  if (onProgress) {
1791
1828
  await onProgress({
1792
1829
  workerId,
1793
- testId: testCase.id,
1830
+ evalId: testCase.id,
1794
1831
  status: "failed",
1795
1832
  completedAt: Date.now(),
1796
1833
  error: error instanceof Error ? error.message : String(error)
@@ -1912,7 +1949,7 @@ async function runTestCase(options) {
1912
1949
  guideline_paths: testCase.guideline_paths
1913
1950
  };
1914
1951
  return {
1915
- test_id: testCase.id,
1952
+ eval_id: testCase.id,
1916
1953
  conversation_id: testCase.conversation_id,
1917
1954
  score: grade.score,
1918
1955
  hits: grade.hits,
@@ -1927,11 +1964,11 @@ async function runTestCase(options) {
1927
1964
  grader_raw_request: grade.graderRawRequest
1928
1965
  };
1929
1966
  }
1930
- function filterTestCases(testCases, testId) {
1931
- if (!testId) {
1967
+ function filterTestCases(testCases, evalId) {
1968
+ if (!evalId) {
1932
1969
  return testCases;
1933
1970
  }
1934
- return testCases.filter((testCase) => testCase.id === testId);
1971
+ return testCases.filter((testCase) => testCase.id === evalId);
1935
1972
  }
1936
1973
  function buildGraderRegistry(overrides, resolveJudgeProvider) {
1937
1974
  const heuristic = overrides?.heuristic ?? new HeuristicGrader();
@@ -1955,7 +1992,7 @@ async function dumpPrompt(directory, testCase, promptInputs) {
1955
1992
  const filePath = path4.resolve(directory, filename);
1956
1993
  await mkdir(path4.dirname(filePath), { recursive: true });
1957
1994
  const payload = {
1958
- test_id: testCase.id,
1995
+ eval_id: testCase.id,
1959
1996
  request: promptInputs.request,
1960
1997
  guidelines: promptInputs.guidelines,
1961
1998
  guideline_paths: testCase.guideline_paths
@@ -2004,7 +2041,7 @@ function buildErrorResult(testCase, targetName, timestamp, error, promptInputs)
2004
2041
  error: message
2005
2042
  };
2006
2043
  return {
2007
- test_id: testCase.id,
2044
+ eval_id: testCase.id,
2008
2045
  conversation_id: testCase.conversation_id,
2009
2046
  score: 0,
2010
2047
  hits: [],