agentv 3.3.0 → 3.4.0

This diff shows the changes between publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
package/dist/cli.js CHANGED
@@ -2,9 +2,9 @@
2
2
  import { createRequire } from 'node:module'; const require = createRequire(import.meta.url);
3
3
  import {
4
4
  runCli
5
- } from "./chunk-6LP5Z5Y4.js";
6
- import "./chunk-4ZMSAQWS.js";
7
- import "./chunk-5M3K2DMV.js";
5
+ } from "./chunk-A7ZDUB46.js";
6
+ import "./chunk-RE5I3U2S.js";
7
+ import "./chunk-GOZV2HN2.js";
8
8
  import "./chunk-C5GOHBQM.js";
9
9
  import "./chunk-JK6V4KVD.js";
10
10
  import "./chunk-HQDCIXVH.js";
@@ -142,7 +142,7 @@ import {
142
142
  transpileEvalYaml,
143
143
  transpileEvalYamlFile,
144
144
  trimBaselineResult
145
- } from "./chunk-5M3K2DMV.js";
145
+ } from "./chunk-GOZV2HN2.js";
146
146
  import {
147
147
  OtlpJsonFileExporter
148
148
  } from "./chunk-C5GOHBQM.js";
@@ -302,4 +302,4 @@ export {
302
302
  transpileEvalYamlFile,
303
303
  trimBaselineResult
304
304
  };
305
- //# sourceMappingURL=dist-OC53WD3P.js.map
305
+ //# sourceMappingURL=dist-AFDYFH6Y.js.map
package/dist/index.js CHANGED
@@ -3,9 +3,9 @@ import {
3
3
  app,
4
4
  preprocessArgv,
5
5
  runCli
6
- } from "./chunk-6LP5Z5Y4.js";
7
- import "./chunk-4ZMSAQWS.js";
8
- import "./chunk-5M3K2DMV.js";
6
+ } from "./chunk-A7ZDUB46.js";
7
+ import "./chunk-RE5I3U2S.js";
8
+ import "./chunk-GOZV2HN2.js";
9
9
  import "./chunk-C5GOHBQM.js";
10
10
  import "./chunk-JK6V4KVD.js";
11
11
  import "./chunk-HQDCIXVH.js";
@@ -4,14 +4,14 @@ import {
4
4
  fileExists,
5
5
  findRepoRoot,
6
6
  runEvalCommand
7
- } from "./chunk-4ZMSAQWS.js";
7
+ } from "./chunk-RE5I3U2S.js";
8
8
  import {
9
9
  DEFAULT_EVAL_PATTERNS,
10
10
  getAgentvHome,
11
11
  listTargetNames,
12
12
  loadConfig,
13
13
  readTargetDefinitions
14
- } from "./chunk-5M3K2DMV.js";
14
+ } from "./chunk-GOZV2HN2.js";
15
15
  import "./chunk-C5GOHBQM.js";
16
16
  import "./chunk-JK6V4KVD.js";
17
17
  import "./chunk-HQDCIXVH.js";
@@ -323,12 +323,52 @@ async function executeConfig(config) {
323
323
  cleanupWorkspaces: false,
324
324
  trace: false
325
325
  };
326
- await runEvalCommand({
326
+ const result = await runEvalCommand({
327
327
  testFiles: [...config.evalPaths],
328
328
  rawOptions
329
329
  });
330
+ if (result && result.executionErrorCount > 0 && process.stdin.isTTY) {
331
+ await promptRetryErrors(config, result.outputPath);
332
+ }
333
+ }
334
+ async function promptRetryErrors(config, outputPath) {
335
+ const shouldRetry = await confirm({
336
+ message: `Retry ${ANSI_BOLD}execution errors${ANSI_RESET} from this run?`,
337
+ default: true
338
+ });
339
+ if (!shouldRetry) {
340
+ return;
341
+ }
342
+ console.log(`
343
+ ${ANSI_DIM}Retrying execution errors...${ANSI_RESET}
344
+ `);
345
+ const rawOptions = {
346
+ target: config.target,
347
+ workers: config.workers,
348
+ dryRun: config.dryRun,
349
+ cache: config.cache,
350
+ outputFormat: "jsonl",
351
+ retryErrors: outputPath,
352
+ out: outputPath,
353
+ dryRunDelay: 0,
354
+ dryRunDelayMin: 0,
355
+ dryRunDelayMax: 0,
356
+ agentTimeout: 120,
357
+ maxRetries: 2,
358
+ verbose: false,
359
+ keepWorkspaces: false,
360
+ cleanupWorkspaces: false,
361
+ trace: false
362
+ };
363
+ const retryResult = await runEvalCommand({
364
+ testFiles: [...config.evalPaths],
365
+ rawOptions
366
+ });
367
+ if (retryResult && retryResult.executionErrorCount > 0 && process.stdin.isTTY) {
368
+ await promptRetryErrors(config, retryResult.outputPath);
369
+ }
330
370
  }
331
371
  export {
332
372
  launchInteractiveWizard
333
373
  };
334
- //# sourceMappingURL=interactive-NA6SAIAG.js.map
374
+ //# sourceMappingURL=interactive-WXXTZ7PD.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"sources":["../src/commands/eval/interactive.ts","../src/commands/eval/discover.ts","../src/commands/eval/last-config.ts"],"sourcesContent":["import path from 'node:path';\nimport { listTargetNames, readTargetDefinitions } from '@agentv/core';\nimport { checkbox, confirm, number, search, select } from '@inquirer/prompts';\n\nimport { TARGET_FILE_CANDIDATES, fileExists } from '../../utils/targets.js';\nimport {\n type DiscoveredEvalFile,\n discoverEvalFiles,\n filterByCategory,\n getCategories,\n} from './discover.js';\nimport { type LastConfig, loadLastConfig, saveLastConfig } from './last-config.js';\nimport { runEvalCommand } from './run-eval.js';\nimport { findRepoRoot } from './shared.js';\n\nconst ANSI_BOLD = '\\x1b[1m';\nconst ANSI_DIM = '\\x1b[2m';\nconst ANSI_CYAN = '\\x1b[36m';\nconst ANSI_GREEN = '\\x1b[32m';\nconst ANSI_RESET = '\\x1b[0m';\n\nexport interface InteractiveConfig {\n readonly evalPaths: readonly string[];\n readonly target: string;\n readonly workers: number;\n readonly dryRun: boolean;\n readonly cache: boolean;\n}\n\n/**\n * Launch the interactive wizard when `agentv eval` is called with no arguments.\n */\nexport async function launchInteractiveWizard(): Promise<void> {\n const cwd = process.cwd();\n\n console.log(`\\n${ANSI_BOLD}${ANSI_CYAN}AgentV Interactive Mode${ANSI_RESET}\\n`);\n\n const lastConfig = await loadLastConfig();\n const action = await promptMainMenu(lastConfig);\n\n if (action === 'exit') {\n return;\n }\n\n if (action === 'rerun' && lastConfig) {\n console.log(`\\n${ANSI_DIM}Rerunning last configuration...${ANSI_RESET}\\n`);\n await executeConfig({\n evalPaths: lastConfig.evalPaths,\n target: lastConfig.target,\n workers: lastConfig.workers,\n dryRun: lastConfig.dryRun,\n cache: lastConfig.cache,\n });\n return;\n }\n\n // Run new evaluation flow\n const config = await promptNewEvaluation(cwd);\n if (!config) {\n return;\n }\n\n // Review & confirm\n const confirmed = await promptReviewAndConfirm(config, 
cwd);\n if (!confirmed) {\n return;\n }\n\n // Save last config\n await saveLastConfig({\n timestamp: new Date().toISOString(),\n cwd,\n evalPaths: config.evalPaths,\n target: config.target,\n workers: config.workers,\n dryRun: config.dryRun,\n cache: config.cache,\n });\n\n await executeConfig(config);\n}\n\nasync function promptMainMenu(\n lastConfig: LastConfig | undefined,\n): Promise<'new' | 'rerun' | 'exit'> {\n type MenuChoice = 'new' | 'rerun' | 'exit';\n const choices: Array<{ name: string; value: MenuChoice; description?: string }> = [];\n\n if (lastConfig) {\n const evalCount = lastConfig.evalPaths.length;\n choices.push({\n name: '🔄 Rerun last config',\n value: 'rerun',\n description: `${evalCount} eval file(s), target: ${lastConfig.target}`,\n });\n }\n\n choices.push({ name: '🚀 Run new evaluation', value: 'new' }, { name: '✕ Exit', value: 'exit' });\n\n return select<MenuChoice>({\n message: 'What would you like to do?',\n choices,\n });\n}\n\nasync function promptNewEvaluation(cwd: string): Promise<InteractiveConfig | undefined> {\n // Step 1: Discover eval files\n console.log(`\\n${ANSI_DIM}Scanning for eval files...${ANSI_RESET}`);\n const allFiles = await discoverEvalFiles(cwd);\n\n if (allFiles.length === 0) {\n console.log(\n '\\n⚠ No eval files found in the current directory.\\n' +\n ' Place .yaml or .jsonl eval files in your project, or use:\\n' +\n ' agentv eval <path-to-eval.yaml>\\n',\n );\n return undefined;\n }\n\n console.log(`${ANSI_DIM}Found ${allFiles.length} eval file(s)${ANSI_RESET}\\n`);\n\n // Step 2: Select eval files (optionally filter by category first)\n const selectedFiles = await promptEvalSelection(allFiles);\n if (selectedFiles.length === 0) {\n console.log('\\nNo eval files selected.');\n return undefined;\n }\n\n // Step 3: Select target\n const target = await promptTargetSelection(cwd, selectedFiles[0].path);\n\n // Step 4: Advanced options\n const advanced = await promptAdvancedOptions();\n\n return {\n evalPaths: 
selectedFiles.map((f) => f.path),\n target,\n ...advanced,\n };\n}\n\nasync function promptEvalSelection(\n allFiles: readonly DiscoveredEvalFile[],\n): Promise<DiscoveredEvalFile[]> {\n const categories = getCategories(allFiles);\n\n // If only one category or few files, skip category selection\n let filesToSelect: readonly DiscoveredEvalFile[];\n\n if (categories.length > 1) {\n const selectedCategory = await search<string>({\n message: 'Select a category (type to search)',\n source: async (term) => {\n const filtered = term\n ? categories.filter((c) => c.toLowerCase().includes(term.toLowerCase()))\n : categories;\n return [\n { name: '(all categories)', value: '__all__' },\n ...filtered.map((c) => {\n const count = filterByCategory(allFiles, c).length;\n return { name: `${c} (${count} file${count > 1 ? 's' : ''})`, value: c };\n }),\n ];\n },\n });\n\n filesToSelect =\n selectedCategory === '__all__' ? allFiles : filterByCategory(allFiles, selectedCategory);\n } else {\n filesToSelect = allFiles;\n }\n\n return checkbox<DiscoveredEvalFile>({\n message: 'Select eval files to run (space to toggle, enter to confirm)',\n choices: filesToSelect.map((f) => ({\n name: f.relativePath,\n value: f,\n checked: filesToSelect.length <= 5, // auto-select if few files\n })),\n required: true,\n });\n}\n\nasync function promptTargetSelection(cwd: string, firstEvalPath: string): Promise<string> {\n const repoRoot = await findRepoRoot(cwd);\n\n // Try to find targets.yaml — search near the eval file first, then cwd/repoRoot\n const targetsPath = await findTargetsFile(cwd, repoRoot, firstEvalPath);\n\n if (!targetsPath) {\n console.log(`${ANSI_DIM}No targets.yaml found. 
Using default target.${ANSI_RESET}`);\n return 'default';\n }\n\n const definitions = await readTargetDefinitions(targetsPath);\n const targetNames = listTargetNames(definitions);\n\n if (targetNames.length === 0) {\n return 'default';\n }\n\n if (targetNames.length === 1) {\n console.log(`${ANSI_DIM}Using target: ${targetNames[0]}${ANSI_RESET}`);\n return targetNames[0];\n }\n\n return search<string>({\n message: 'Select a target (type to search)',\n source: async (term) => {\n const filtered = term\n ? targetNames.filter((t) => t.toLowerCase().includes(term.toLowerCase()))\n : targetNames;\n return filtered.map((t) => {\n const def = definitions.find((d) => d.name === t);\n return {\n name: t,\n value: t,\n description: def ? `provider: ${def.provider}` : undefined,\n };\n });\n },\n });\n}\n\nasync function findTargetsFile(\n cwd: string,\n repoRoot: string,\n evalFilePath?: string,\n): Promise<string | undefined> {\n // Build directory chain: eval file dir → cwd → repoRoot (mirrors discoverTargetsFile)\n const dirsToSearch: string[] = [];\n\n if (evalFilePath) {\n const evalDir = path.dirname(evalFilePath);\n if (!dirsToSearch.includes(evalDir)) {\n dirsToSearch.push(evalDir);\n }\n }\n\n if (!dirsToSearch.includes(cwd)) {\n dirsToSearch.push(cwd);\n }\n\n if (repoRoot !== cwd && !dirsToSearch.includes(repoRoot)) {\n dirsToSearch.push(repoRoot);\n }\n\n for (const dir of dirsToSearch) {\n for (const candidate of TARGET_FILE_CANDIDATES) {\n const fullPath = `${dir}/${candidate}`;\n if (await fileExists(fullPath)) {\n return fullPath;\n }\n }\n }\n\n return undefined;\n}\n\nasync function promptAdvancedOptions(): Promise<{\n workers: number;\n dryRun: boolean;\n cache: boolean;\n}> {\n const customize = await confirm({\n message: 'Configure advanced options?',\n default: false,\n });\n\n if (!customize) {\n return { workers: 3, dryRun: false, cache: false };\n }\n\n const workers =\n (await number({\n message: 'Number of parallel workers (1-50)',\n default: 3,\n 
min: 1,\n max: 50,\n })) ?? 3;\n\n const dryRun = await confirm({\n message: 'Enable dry-run mode (mock responses)?',\n default: false,\n });\n\n const cache = await confirm({\n message: 'Enable response cache?',\n default: false,\n });\n\n return { workers, dryRun, cache };\n}\n\nasync function promptReviewAndConfirm(config: InteractiveConfig, cwd: string): Promise<boolean> {\n const evalDisplay = config.evalPaths\n .map((p) => {\n const rel = p.startsWith(cwd) ? p.slice(cwd.length + 1) : p;\n return ` ${rel}`;\n })\n .join('\\n');\n\n console.log(`\\n${ANSI_BOLD}Review Configuration${ANSI_RESET}`);\n console.log(`${ANSI_DIM}${'─'.repeat(40)}${ANSI_RESET}`);\n console.log(`${ANSI_GREEN}Eval files:${ANSI_RESET}\\n${evalDisplay}`);\n console.log(`${ANSI_GREEN}Target:${ANSI_RESET} ${config.target}`);\n console.log(`${ANSI_GREEN}Workers:${ANSI_RESET} ${config.workers}`);\n console.log(`${ANSI_GREEN}Dry run:${ANSI_RESET} ${config.dryRun ? 'yes' : 'no'}`);\n console.log(`${ANSI_GREEN}Cache:${ANSI_RESET} ${config.cache ? 
'yes' : 'no'}`);\n console.log(`${ANSI_DIM}${'─'.repeat(40)}${ANSI_RESET}`);\n\n return confirm({\n message: 'Run evaluation with this configuration?',\n default: true,\n });\n}\n\nasync function executeConfig(config: InteractiveConfig): Promise<void> {\n const rawOptions: Record<string, unknown> = {\n target: config.target,\n workers: config.workers,\n dryRun: config.dryRun,\n cache: config.cache,\n outputFormat: 'jsonl',\n dryRunDelay: 0,\n dryRunDelayMin: 0,\n dryRunDelayMax: 0,\n agentTimeout: 120,\n maxRetries: 2,\n verbose: false,\n keepWorkspaces: false,\n cleanupWorkspaces: false,\n trace: false,\n };\n\n const result = await runEvalCommand({\n testFiles: [...config.evalPaths],\n rawOptions,\n });\n\n // Prompt to retry errors when execution errors were detected in a TTY\n if (result && result.executionErrorCount > 0 && process.stdin.isTTY) {\n await promptRetryErrors(config, result.outputPath);\n }\n}\n\nasync function promptRetryErrors(config: InteractiveConfig, outputPath: string): Promise<void> {\n const shouldRetry = await confirm({\n message: `Retry ${ANSI_BOLD}execution errors${ANSI_RESET} from this run?`,\n default: true,\n });\n\n if (!shouldRetry) {\n return;\n }\n\n console.log(`\\n${ANSI_DIM}Retrying execution errors...${ANSI_RESET}\\n`);\n\n const rawOptions: Record<string, unknown> = {\n target: config.target,\n workers: config.workers,\n dryRun: config.dryRun,\n cache: config.cache,\n outputFormat: 'jsonl',\n retryErrors: outputPath,\n out: outputPath,\n dryRunDelay: 0,\n dryRunDelayMin: 0,\n dryRunDelayMax: 0,\n agentTimeout: 120,\n maxRetries: 2,\n verbose: false,\n keepWorkspaces: false,\n cleanupWorkspaces: false,\n trace: false,\n };\n\n const retryResult = await runEvalCommand({\n testFiles: [...config.evalPaths],\n rawOptions,\n });\n\n // Allow chained retries if there are still errors\n if (retryResult && retryResult.executionErrorCount > 0 && process.stdin.isTTY) {\n await promptRetryErrors(config, retryResult.outputPath);\n 
}\n}\n","import path from 'node:path';\nimport { DEFAULT_EVAL_PATTERNS, loadConfig } from '@agentv/core';\nimport fg from 'fast-glob';\n\nimport { findRepoRoot } from './shared.js';\n\nexport interface DiscoveredEvalFile {\n /** Absolute path to the eval file */\n readonly path: string;\n /** Relative path from cwd for display */\n readonly relativePath: string;\n /** Category derived from parent folder structure */\n readonly category: string;\n}\n\n/**\n * Discover eval files by glob pattern matching.\n *\n * Uses `eval_patterns` from `.agentv/config.yaml` if configured,\n * otherwise falls back to default patterns that match `dataset*.yaml`\n * and `eval.yaml` files under `evals/` directories.\n */\nexport async function discoverEvalFiles(cwd: string): Promise<readonly DiscoveredEvalFile[]> {\n const repoRoot = await findRepoRoot(cwd);\n\n // Load config to check for custom eval_patterns\n // Pass a dummy file path in cwd so buildDirectoryChain starts from cwd\n const config = await loadConfig(path.join(cwd, '_'), repoRoot);\n const patterns =\n config?.eval_patterns && config.eval_patterns.length > 0\n ? config.eval_patterns\n : DEFAULT_EVAL_PATTERNS;\n\n const ignore = ['**/node_modules/**', '**/dist/**'];\n\n const matches = await fg(patterns as string[], {\n cwd,\n absolute: true,\n onlyFiles: true,\n ignore,\n followSymbolicLinks: true,\n caseSensitiveMatch: false,\n });\n\n const evalFiles: DiscoveredEvalFile[] = matches.map((absPath) => {\n const relativePath = path.relative(cwd, absPath);\n const category = deriveCategory(relativePath);\n return { path: absPath, relativePath, category };\n });\n\n evalFiles.sort((a, b) => a.relativePath.localeCompare(b.relativePath));\n return evalFiles;\n}\n\n/** Derive a human-readable category from the relative path. 
*/\nfunction deriveCategory(relativePath: string): string {\n const parts = relativePath.split(path.sep);\n // Use the first meaningful directory as category\n // e.g., \"examples/showcase/export-screening/evals/dataset.eval.yaml\" → \"showcase/export-screening\"\n // e.g., \"evals/dataset.eval.yaml\" → \"evals\"\n if (parts.length <= 1) {\n return 'root';\n }\n\n // Remove the filename and \"evals\" folder if present\n const dirs = parts.slice(0, -1).filter((d) => d !== 'evals');\n return dirs.length > 0 ? dirs.join('/') : 'root';\n}\n\n/** Get unique categories from discovered eval files. */\nexport function getCategories(files: readonly DiscoveredEvalFile[]): readonly string[] {\n const categories = new Set<string>();\n for (const file of files) {\n categories.add(file.category);\n }\n const sorted = Array.from(categories);\n sorted.sort();\n return sorted;\n}\n\n/** Filter eval files by category. */\nexport function filterByCategory(\n files: readonly DiscoveredEvalFile[],\n category: string,\n): readonly DiscoveredEvalFile[] {\n return files.filter((f) => f.category === category);\n}\n","import { mkdir, readFile, writeFile } from 'node:fs/promises';\nimport path from 'node:path';\nimport { getAgentvHome } from '@agentv/core';\n\nconst AGENTV_DIR = getAgentvHome();\nconst LAST_CONFIG_PATH = path.join(AGENTV_DIR, 'last-config.json');\n\nexport interface LastConfig {\n readonly timestamp: string;\n readonly cwd: string;\n readonly evalPaths: readonly string[];\n readonly target: string;\n readonly workers: number;\n readonly dryRun: boolean;\n readonly cache: boolean;\n}\n\nexport async function loadLastConfig(): Promise<LastConfig | undefined> {\n try {\n const content = await readFile(LAST_CONFIG_PATH, 'utf-8');\n return JSON.parse(content) as LastConfig;\n } catch {\n return undefined;\n }\n}\n\nexport async function saveLastConfig(config: LastConfig): Promise<void> {\n await mkdir(AGENTV_DIR, { recursive: true });\n await writeFile(LAST_CONFIG_PATH, 
JSON.stringify(config, null, 2), 'utf-8');\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;AAAA,OAAOA,WAAU;AAEjB,SAAS,UAAU,SAAS,QAAQ,QAAQ,cAAc;;;ACF1D,OAAO,UAAU;AAEjB,OAAO,QAAQ;AAoBf,eAAsB,kBAAkB,KAAqD;AAC3F,QAAM,WAAW,MAAM,aAAa,GAAG;AAIvC,QAAM,SAAS,MAAM,WAAW,KAAK,KAAK,KAAK,GAAG,GAAG,QAAQ;AAC7D,QAAM,WACJ,QAAQ,iBAAiB,OAAO,cAAc,SAAS,IACnD,OAAO,gBACP;AAEN,QAAM,SAAS,CAAC,sBAAsB,YAAY;AAElD,QAAM,UAAU,MAAM,GAAG,UAAsB;AAAA,IAC7C;AAAA,IACA,UAAU;AAAA,IACV,WAAW;AAAA,IACX;AAAA,IACA,qBAAqB;AAAA,IACrB,oBAAoB;AAAA,EACtB,CAAC;AAED,QAAM,YAAkC,QAAQ,IAAI,CAAC,YAAY;AAC/D,UAAM,eAAe,KAAK,SAAS,KAAK,OAAO;AAC/C,UAAM,WAAW,eAAe,YAAY;AAC5C,WAAO,EAAE,MAAM,SAAS,cAAc,SAAS;AAAA,EACjD,CAAC;AAED,YAAU,KAAK,CAAC,GAAG,MAAM,EAAE,aAAa,cAAc,EAAE,YAAY,CAAC;AACrE,SAAO;AACT;AAGA,SAAS,eAAe,cAA8B;AACpD,QAAM,QAAQ,aAAa,MAAM,KAAK,GAAG;AAIzC,MAAI,MAAM,UAAU,GAAG;AACrB,WAAO;AAAA,EACT;AAGA,QAAM,OAAO,MAAM,MAAM,GAAG,EAAE,EAAE,OAAO,CAAC,MAAM,MAAM,OAAO;AAC3D,SAAO,KAAK,SAAS,IAAI,KAAK,KAAK,GAAG,IAAI;AAC5C;AAGO,SAAS,cAAc,OAAyD;AACrF,QAAM,aAAa,oBAAI,IAAY;AACnC,aAAW,QAAQ,OAAO;AACxB,eAAW,IAAI,KAAK,QAAQ;AAAA,EAC9B;AACA,QAAM,SAAS,MAAM,KAAK,UAAU;AACpC,SAAO,KAAK;AACZ,SAAO;AACT;AAGO,SAAS,iBACd,OACA,UAC+B;AAC/B,SAAO,MAAM,OAAO,CAAC,MAAM,EAAE,aAAa,QAAQ;AACpD;;;ACtFA,SAAS,OAAO,UAAU,iBAAiB;AAC3C,OAAOC,WAAU;AAGjB,IAAM,aAAa,cAAc;AACjC,IAAM,mBAAmBC,MAAK,KAAK,YAAY,kBAAkB;AAYjE,eAAsB,iBAAkD;AACtE,MAAI;AACF,UAAM,UAAU,MAAM,SAAS,kBAAkB,OAAO;AACxD,WAAO,KAAK,MAAM,OAAO;AAAA,EAC3B,QAAQ;AACN,WAAO;AAAA,EACT;AACF;AAEA,eAAsB,eAAe,QAAmC;AACtE,QAAM,MAAM,YAAY,EAAE,WAAW,KAAK,CAAC;AAC3C,QAAM,UAAU,kBAAkB,KAAK,UAAU,QAAQ,MAAM,CAAC,GAAG,OAAO;AAC5E;;;AFdA,IAAM,YAAY;AAClB,IAAM,WAAW;AACjB,IAAM,YAAY;AAClB,IAAM,aAAa;AACnB,IAAM,aAAa;AAanB,eAAsB,0BAAyC;AAC7D,QAAM,MAAM,QAAQ,IAAI;AAExB,UAAQ,IAAI;AAAA,EAAK,SAAS,GAAG,SAAS,0BAA0B,UAAU;AAAA,CAAI;AAE9E,QAAM,aAAa,MAAM,eAAe;AACxC,QAAM,SAAS,MAAM,eAAe,UAAU;AAE9C,MAAI,WAAW,QAAQ;AACrB;AAAA,EACF;AAEA,MAAI,WAAW,WAAW,YAAY;AACpC,YAAQ,IAAI;AAAA,EAAK,QAAQ,kCAAkC,UAAU;AAAA,CAAI;AACzE,UAAM,cAAc;AAAA,MAClB,WAAW,WAAW;AAAA,MACtB,QAAQ,WAAW;AAAA,MACnB
,SAAS,WAAW;AAAA,MACpB,QAAQ,WAAW;AAAA,MACnB,OAAO,WAAW;AAAA,IACpB,CAAC;AACD;AAAA,EACF;AAGA,QAAM,SAAS,MAAM,oBAAoB,GAAG;AAC5C,MAAI,CAAC,QAAQ;AACX;AAAA,EACF;AAGA,QAAM,YAAY,MAAM,uBAAuB,QAAQ,GAAG;AAC1D,MAAI,CAAC,WAAW;AACd;AAAA,EACF;AAGA,QAAM,eAAe;AAAA,IACnB,YAAW,oBAAI,KAAK,GAAE,YAAY;AAAA,IAClC;AAAA,IACA,WAAW,OAAO;AAAA,IAClB,QAAQ,OAAO;AAAA,IACf,SAAS,OAAO;AAAA,IAChB,QAAQ,OAAO;AAAA,IACf,OAAO,OAAO;AAAA,EAChB,CAAC;AAED,QAAM,cAAc,MAAM;AAC5B;AAEA,eAAe,eACb,YACmC;AAEnC,QAAM,UAA4E,CAAC;AAEnF,MAAI,YAAY;AACd,UAAM,YAAY,WAAW,UAAU;AACvC,YAAQ,KAAK;AAAA,MACX,MAAM;AAAA,MACN,OAAO;AAAA,MACP,aAAa,GAAG,SAAS,0BAA0B,WAAW,MAAM;AAAA,IACtE,CAAC;AAAA,EACH;AAEA,UAAQ,KAAK,EAAE,MAAM,gCAAyB,OAAO,MAAM,GAAG,EAAE,MAAM,eAAU,OAAO,OAAO,CAAC;AAE/F,SAAO,OAAmB;AAAA,IACxB,SAAS;AAAA,IACT;AAAA,EACF,CAAC;AACH;AAEA,eAAe,oBAAoB,KAAqD;AAEtF,UAAQ,IAAI;AAAA,EAAK,QAAQ,6BAA6B,UAAU,EAAE;AAClE,QAAM,WAAW,MAAM,kBAAkB,GAAG;AAE5C,MAAI,SAAS,WAAW,GAAG;AACzB,YAAQ;AAAA,MACN;AAAA,IAGF;AACA,WAAO;AAAA,EACT;AAEA,UAAQ,IAAI,GAAG,QAAQ,SAAS,SAAS,MAAM,gBAAgB,UAAU;AAAA,CAAI;AAG7E,QAAM,gBAAgB,MAAM,oBAAoB,QAAQ;AACxD,MAAI,cAAc,WAAW,GAAG;AAC9B,YAAQ,IAAI,2BAA2B;AACvC,WAAO;AAAA,EACT;AAGA,QAAM,SAAS,MAAM,sBAAsB,KAAK,cAAc,CAAC,EAAE,IAAI;AAGrE,QAAM,WAAW,MAAM,sBAAsB;AAE7C,SAAO;AAAA,IACL,WAAW,cAAc,IAAI,CAAC,MAAM,EAAE,IAAI;AAAA,IAC1C;AAAA,IACA,GAAG;AAAA,EACL;AACF;AAEA,eAAe,oBACb,UAC+B;AAC/B,QAAM,aAAa,cAAc,QAAQ;AAGzC,MAAI;AAEJ,MAAI,WAAW,SAAS,GAAG;AACzB,UAAM,mBAAmB,MAAM,OAAe;AAAA,MAC5C,SAAS;AAAA,MACT,QAAQ,OAAO,SAAS;AACtB,cAAM,WAAW,OACb,WAAW,OAAO,CAAC,MAAM,EAAE,YAAY,EAAE,SAAS,KAAK,YAAY,CAAC,CAAC,IACrE;AACJ,eAAO;AAAA,UACL,EAAE,MAAM,oBAAoB,OAAO,UAAU;AAAA,UAC7C,GAAG,SAAS,IAAI,CAAC,MAAM;AACrB,kBAAM,QAAQ,iBAAiB,UAAU,CAAC,EAAE;AAC5C,mBAAO,EAAE,MAAM,GAAG,CAAC,KAAK,KAAK,QAAQ,QAAQ,IAAI,MAAM,EAAE,KAAK,OAAO,EAAE;AAAA,UACzE,CAAC;AAAA,QACH;AAAA,MACF;AAAA,IACF,CAAC;AAED,oBACE,qBAAqB,YAAY,WAAW,iBAAiB,UAAU,gBAAgB;AAAA,EAC3F,OAAO;AACL,oBAAgB;AAAA,EAClB;AAEA,SAAO,SAA6B;AAAA,IAClC,SAAS;AAAA,IACT,SAAS,cAAc,IAAI,CAAC,OAAO;AAAA,MACjC,MAAM,EAAE;AAAA,MACR,OAAO;AAAA,MACP,SAAS,cAAc,U
AAU;AAAA;AAAA,IACnC,EAAE;AAAA,IACF,UAAU;AAAA,EACZ,CAAC;AACH;AAEA,eAAe,sBAAsB,KAAa,eAAwC;AACxF,QAAM,WAAW,MAAM,aAAa,GAAG;AAGvC,QAAM,cAAc,MAAM,gBAAgB,KAAK,UAAU,aAAa;AAEtE,MAAI,CAAC,aAAa;AAChB,YAAQ,IAAI,GAAG,QAAQ,+CAA+C,UAAU,EAAE;AAClF,WAAO;AAAA,EACT;AAEA,QAAM,cAAc,MAAM,sBAAsB,WAAW;AAC3D,QAAM,cAAc,gBAAgB,WAAW;AAE/C,MAAI,YAAY,WAAW,GAAG;AAC5B,WAAO;AAAA,EACT;AAEA,MAAI,YAAY,WAAW,GAAG;AAC5B,YAAQ,IAAI,GAAG,QAAQ,iBAAiB,YAAY,CAAC,CAAC,GAAG,UAAU,EAAE;AACrE,WAAO,YAAY,CAAC;AAAA,EACtB;AAEA,SAAO,OAAe;AAAA,IACpB,SAAS;AAAA,IACT,QAAQ,OAAO,SAAS;AACtB,YAAM,WAAW,OACb,YAAY,OAAO,CAAC,MAAM,EAAE,YAAY,EAAE,SAAS,KAAK,YAAY,CAAC,CAAC,IACtE;AACJ,aAAO,SAAS,IAAI,CAAC,MAAM;AACzB,cAAM,MAAM,YAAY,KAAK,CAAC,MAAM,EAAE,SAAS,CAAC;AAChD,eAAO;AAAA,UACL,MAAM;AAAA,UACN,OAAO;AAAA,UACP,aAAa,MAAM,aAAa,IAAI,QAAQ,KAAK;AAAA,QACnD;AAAA,MACF,CAAC;AAAA,IACH;AAAA,EACF,CAAC;AACH;AAEA,eAAe,gBACb,KACA,UACA,cAC6B;AAE7B,QAAM,eAAyB,CAAC;AAEhC,MAAI,cAAc;AAChB,UAAM,UAAUC,MAAK,QAAQ,YAAY;AACzC,QAAI,CAAC,aAAa,SAAS,OAAO,GAAG;AACnC,mBAAa,KAAK,OAAO;AAAA,IAC3B;AAAA,EACF;AAEA,MAAI,CAAC,aAAa,SAAS,GAAG,GAAG;AAC/B,iBAAa,KAAK,GAAG;AAAA,EACvB;AAEA,MAAI,aAAa,OAAO,CAAC,aAAa,SAAS,QAAQ,GAAG;AACxD,iBAAa,KAAK,QAAQ;AAAA,EAC5B;AAEA,aAAW,OAAO,cAAc;AAC9B,eAAW,aAAa,wBAAwB;AAC9C,YAAM,WAAW,GAAG,GAAG,IAAI,SAAS;AACpC,UAAI,MAAM,WAAW,QAAQ,GAAG;AAC9B,eAAO;AAAA,MACT;AAAA,IACF;AAAA,EACF;AAEA,SAAO;AACT;AAEA,eAAe,wBAIZ;AACD,QAAM,YAAY,MAAM,QAAQ;AAAA,IAC9B,SAAS;AAAA,IACT,SAAS;AAAA,EACX,CAAC;AAED,MAAI,CAAC,WAAW;AACd,WAAO,EAAE,SAAS,GAAG,QAAQ,OAAO,OAAO,MAAM;AAAA,EACnD;AAEA,QAAM,UACH,MAAM,OAAO;AAAA,IACZ,SAAS;AAAA,IACT,SAAS;AAAA,IACT,KAAK;AAAA,IACL,KAAK;AAAA,EACP,CAAC,KAAM;AAET,QAAM,SAAS,MAAM,QAAQ;AAAA,IAC3B,SAAS;AAAA,IACT,SAAS;AAAA,EACX,CAAC;AAED,QAAM,QAAQ,MAAM,QAAQ;AAAA,IAC1B,SAAS;AAAA,IACT,SAAS;AAAA,EACX,CAAC;AAED,SAAO,EAAE,SAAS,QAAQ,MAAM;AAClC;AAEA,eAAe,uBAAuB,QAA2B,KAA+B;AAC9F,QAAM,cAAc,OAAO,UACxB,IAAI,CAAC,MAAM;AACV,UAAM,MAAM,EAAE,WAAW,GAAG,IAAI,EAAE,MAAM,IAAI,SAAS,CAAC,IAAI;AAC1D,WAAO,KAAK,GAAG;AAAA,EACjB,CAAC,EACA,KAAK,IAAI;AAEZ,UAAQ,IAAI;AAAA,EAAK,SAAS,uBAAuB,U
AAU,EAAE;AAC7D,UAAQ,IAAI,GAAG,QAAQ,GAAG,SAAI,OAAO,EAAE,CAAC,GAAG,UAAU,EAAE;AACvD,UAAQ,IAAI,GAAG,UAAU,cAAc,UAAU;AAAA,EAAK,WAAW,EAAE;AACnE,UAAQ,IAAI,GAAG,UAAU,UAAU,UAAU,OAAO,OAAO,MAAM,EAAE;AACnE,UAAQ,IAAI,GAAG,UAAU,WAAW,UAAU,MAAM,OAAO,OAAO,EAAE;AACpE,UAAQ,IAAI,GAAG,UAAU,WAAW,UAAU,MAAM,OAAO,SAAS,QAAQ,IAAI,EAAE;AAClF,UAAQ,IAAI,GAAG,UAAU,SAAS,UAAU,QAAQ,OAAO,QAAQ,QAAQ,IAAI,EAAE;AACjF,UAAQ,IAAI,GAAG,QAAQ,GAAG,SAAI,OAAO,EAAE,CAAC,GAAG,UAAU,EAAE;AAEvD,SAAO,QAAQ;AAAA,IACb,SAAS;AAAA,IACT,SAAS;AAAA,EACX,CAAC;AACH;AAEA,eAAe,cAAc,QAA0C;AACrE,QAAM,aAAsC;AAAA,IAC1C,QAAQ,OAAO;AAAA,IACf,SAAS,OAAO;AAAA,IAChB,QAAQ,OAAO;AAAA,IACf,OAAO,OAAO;AAAA,IACd,cAAc;AAAA,IACd,aAAa;AAAA,IACb,gBAAgB;AAAA,IAChB,gBAAgB;AAAA,IAChB,cAAc;AAAA,IACd,YAAY;AAAA,IACZ,SAAS;AAAA,IACT,gBAAgB;AAAA,IAChB,mBAAmB;AAAA,IACnB,OAAO;AAAA,EACT;AAEA,QAAM,SAAS,MAAM,eAAe;AAAA,IAClC,WAAW,CAAC,GAAG,OAAO,SAAS;AAAA,IAC/B;AAAA,EACF,CAAC;AAGD,MAAI,UAAU,OAAO,sBAAsB,KAAK,QAAQ,MAAM,OAAO;AACnE,UAAM,kBAAkB,QAAQ,OAAO,UAAU;AAAA,EACnD;AACF;AAEA,eAAe,kBAAkB,QAA2B,YAAmC;AAC7F,QAAM,cAAc,MAAM,QAAQ;AAAA,IAChC,SAAS,SAAS,SAAS,mBAAmB,UAAU;AAAA,IACxD,SAAS;AAAA,EACX,CAAC;AAED,MAAI,CAAC,aAAa;AAChB;AAAA,EACF;AAEA,UAAQ,IAAI;AAAA,EAAK,QAAQ,+BAA+B,UAAU;AAAA,CAAI;AAEtE,QAAM,aAAsC;AAAA,IAC1C,QAAQ,OAAO;AAAA,IACf,SAAS,OAAO;AAAA,IAChB,QAAQ,OAAO;AAAA,IACf,OAAO,OAAO;AAAA,IACd,cAAc;AAAA,IACd,aAAa;AAAA,IACb,KAAK;AAAA,IACL,aAAa;AAAA,IACb,gBAAgB;AAAA,IAChB,gBAAgB;AAAA,IAChB,cAAc;AAAA,IACd,YAAY;AAAA,IACZ,SAAS;AAAA,IACT,gBAAgB;AAAA,IAChB,mBAAmB;AAAA,IACnB,OAAO;AAAA,EACT;AAEA,QAAM,cAAc,MAAM,eAAe;AAAA,IACvC,WAAW,CAAC,GAAG,OAAO,SAAS;AAAA,IAC/B;AAAA,EACF,CAAC;AAGD,MAAI,eAAe,YAAY,sBAAsB,KAAK,QAAQ,MAAM,OAAO;AAC7E,UAAM,kBAAkB,QAAQ,YAAY,UAAU;AAAA,EACxD;AACF;","names":["path","path","path","path"]}
@@ -1,23 +1,25 @@
1
1
  # Copy this file to .env and fill in your credentials
2
2
 
3
+ # Eval run mode (used by agentv-bench skill)
4
+ AGENT_EVAL_MODE=agent # agent | cli
5
+
3
6
  # Azure OpenAI Configuration
4
7
  AZURE_OPENAI_ENDPOINT=https://your-endpoint.openai.azure.com/
5
8
  AZURE_OPENAI_API_KEY=your-openai-api-key-here
6
- AZURE_DEPLOYMENT_NAME=gpt-5-chat
9
+ AZURE_DEPLOYMENT_NAME=gpt-5-mini
7
10
  AZURE_OPENAI_API_VERSION=2024-12-01-preview
8
11
 
12
+ # OpenAI
13
+ OPENAI_ENDPOINT=https://your-endpoint.openai.azure.com/
14
+ OPENAI_API_KEY=your-openai-api-key-here
15
+ OPENAI_MODEL=gpt-5-mini
16
+
9
17
  # Google Gemini
10
18
  GOOGLE_GENERATIVE_AI_API_KEY=your-gemini-api-key-here
11
- GEMINI_MODEL_NAME=gemini-2.5-flash
19
+ GEMINI_MODEL_NAME=gemini-3-flash-preview
12
20
 
13
21
  # Anthropic
14
22
  ANTHROPIC_API_KEY=your-anthropic-api-key-here
15
23
 
16
- # VS Code Workspace Paths for Execution Targets
17
- # Note: Using forward slashes is recommended for paths in .env files
18
- # to avoid issues with escape characters.
19
- PROJECTX_WORKSPACE_PATH=C:/Users/your-username/OneDrive - Company Pty Ltd/sample.code-workspace
20
-
21
24
  # CLI provider sample (used by the local_cli target)
22
- CLI_EVALS_DIR=./docs/examples/simple/evals/local-cli
23
- LOCAL_AGENT_TOKEN=dummytoken
25
+ CLI_EVALS_DIR=./docs/examples/simple/evals/local-cli
@@ -8,6 +8,11 @@ guideline_patterns:
8
8
  - "**/*.prompt.md"
9
9
  - "**/SKILL.md"
10
10
 
11
+ # Execution defaults (overridden by CLI flags)
12
+ # execution:
13
+ # pool_workspaces: true # Reuse materialized workspaces across eval runs
14
+ # pool_slots: 10 # Max pool slots on disk (1-50, default: 10)
15
+
11
16
  # Notes:
12
17
  # - Patterns use standard glob syntax (via micromatch library)
13
18
  # - Paths are normalized to forward slashes for cross-platform compatibility
@@ -10,10 +10,6 @@ targets:
10
10
  model: ${{ AZURE_DEPLOYMENT_NAME }}
11
11
  # version: ${{ AZURE_OPENAI_API_VERSION }} # Optional: uncomment to override default (2024-12-01-preview)
12
12
 
13
- - name: vscode
14
- provider: vscode
15
- judge_target: azure-llm
16
-
17
13
  - name: codex
18
14
  provider: codex
19
15
  judge_target: azure-llm
@@ -43,18 +39,6 @@ targets:
43
39
  log_format: json # Optional: 'summary' (default) or 'json' for raw event logs
44
40
  # system_prompt: optional override (default instructs agent to include code in response)
45
41
 
46
- - name: vscode_projectx
47
- provider: vscode
48
- workspace_template: ${{ PROJECTX_WORKSPACE_PATH }}
49
- provider_batching: false
50
- judge_target: azure-llm
51
-
52
- - name: vscode_insiders_projectx
53
- provider: vscode-insiders
54
- workspace_template: ${{ PROJECTX_WORKSPACE_PATH }}
55
- provider_batching: false
56
- judge_target: azure-llm
57
-
58
42
  - name: azure-llm
59
43
  provider: azure
60
44
  endpoint: ${{ AZURE_OPENAI_ENDPOINT }}
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "agentv",
3
- "version": "3.3.0",
3
+ "version": "3.4.0",
4
4
  "description": "CLI entry point for AgentV",
5
5
  "type": "module",
6
6
  "repository": {
@@ -44,7 +44,7 @@
44
44
  "yaml": "^2.6.1"
45
45
  },
46
46
  "devDependencies": {
47
- "@agentv/core": "3.2.3",
47
+ "@agentv/core": "3.3.0",
48
48
  "@types/semver": "^7.7.1",
49
49
  "execa": "^9.3.0"
50
50
  }