@agentv/core 4.4.1 → 4.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -6803,6 +6803,7 @@ var CopilotLogProvider = class {
6803
6803
 
6804
6804
  // src/evaluation/providers/copilot-sdk.ts
6805
6805
  import { randomUUID as randomUUID6 } from "node:crypto";
6806
+ import { existsSync as existsSync2 } from "node:fs";
6806
6807
  import { mkdir as mkdir5 } from "node:fs/promises";
6807
6808
  import path18 from "node:path";
6808
6809
 
@@ -6918,6 +6919,7 @@ var CopilotSdkProvider = class {
6918
6919
  const cwd = this.resolveCwd(request.cwd);
6919
6920
  if (cwd) {
6920
6921
  sessionOptions.workingDirectory = cwd;
6922
+ sessionOptions.skillDirectories = resolveSkillDirectories(cwd);
6921
6923
  }
6922
6924
  const systemPrompt = this.config.systemPrompt;
6923
6925
  if (systemPrompt) {
@@ -7149,6 +7151,14 @@ var CopilotSdkProvider = class {
7149
7151
  }
7150
7152
  }
7151
7153
  };
7154
// Build the skill-directory list assigned to sessionOptions.skillDirectories
// for a Copilot SDK session. Checks three fixed locations under `cwd`
// (.claude/skills, .agents/skills, .codex/skills) and returns, in that order,
// only the paths that exist; the result may be an empty array.
// NOTE(review): existsSync2 is the node:fs existsSync alias, which is also true
// for a regular file named "skills" - confirm whether a directory check is needed.
+ function resolveSkillDirectories(cwd) {
7155
+ const candidates = [
7156
+ path18.join(cwd, ".claude", "skills"),
7157
+ path18.join(cwd, ".agents", "skills"),
7158
+ path18.join(cwd, ".codex", "skills")
7159
+ ];
7160
+ return candidates.filter((dir) => existsSync2(dir));
7161
+ }
7152
7162
  function summarizeSdkEvent(eventType, data) {
7153
7163
  if (!data || typeof data !== "object") {
7154
7164
  return eventType;
@@ -8177,6 +8187,11 @@ var PiCodingAgentProvider = class {
8177
8187
  const modelId = this.config.model ?? "gemini-2.5-flash";
8178
8188
  this.setApiKeyEnv(providerName);
8179
8189
  const model = sdk.getModel(providerName, modelId);
8190
+ if (!model) {
8191
+ throw new Error(
8192
+ `pi-coding-agent: getModel('${providerName}', '${modelId}') returned undefined. The model '${modelId}' is not registered for provider '${providerName}' in pi-ai. Check that subprovider and model are correct in your target config.`
8193
+ );
8194
+ }
8180
8195
  const tools = this.resolveTools(sdk);
8181
8196
  const { session } = await sdk.createAgentSession({
8182
8197
  cwd,
@@ -8291,6 +8306,15 @@ ${fileList}`;
8291
8306
  await session.prompt(prompt);
8292
8307
  }
8293
8308
  const agentMessages = session.agent.state.messages;
8309
+ const lastAssistant = [...agentMessages].reverse().find(
8310
+ (m) => !!m && typeof m === "object" && m.role === "assistant"
8311
+ );
8312
+ if (lastAssistant?.stopReason === "error") {
8313
+ const errorMsg = typeof lastAssistant.errorMessage === "string" ? lastAssistant.errorMessage : "unknown SDK error";
8314
+ throw new Error(
8315
+ `pi-coding-agent SDK error (provider: ${lastAssistant.provider ?? providerName}, model: ${lastAssistant.model ?? modelId}): ${errorMsg}`
8316
+ );
8317
+ }
8294
8318
  const output = [];
8295
8319
  for (const msg of agentMessages) {
8296
8320
  output.push(convertAgentMessage(msg, toolTrackers, completedToolResults));
@@ -10340,15 +10364,15 @@ async function execFileWithStdinNode(argv, stdinPayload, options) {
10340
10364
  });
10341
10365
  }
10342
10366
  async function execShellWithStdin(command, stdinPayload, options = {}) {
10343
- const { mkdir: mkdir16, readFile: readFile14, rm: rm6, writeFile: writeFile9 } = await import("node:fs/promises");
10367
+ const { mkdir: mkdir16, readFile: readFile15, rm: rm6, writeFile: writeFile9 } = await import("node:fs/promises");
10344
10368
  const { tmpdir: tmpdir3 } = await import("node:os");
10345
- const path48 = await import("node:path");
10369
+ const path49 = await import("node:path");
10346
10370
  const { randomUUID: randomUUID10 } = await import("node:crypto");
10347
- const dir = path48.join(tmpdir3(), `agentv-exec-${randomUUID10()}`);
10371
+ const dir = path49.join(tmpdir3(), `agentv-exec-${randomUUID10()}`);
10348
10372
  await mkdir16(dir, { recursive: true });
10349
- const stdinPath = path48.join(dir, "stdin.txt");
10350
- const stdoutPath = path48.join(dir, "stdout.txt");
10351
- const stderrPath = path48.join(dir, "stderr.txt");
10373
+ const stdinPath = path49.join(dir, "stdin.txt");
10374
+ const stdoutPath = path49.join(dir, "stdout.txt");
10375
+ const stderrPath = path49.join(dir, "stderr.txt");
10352
10376
  await writeFile9(stdinPath, stdinPayload, "utf8");
10353
10377
  const wrappedCommand = process.platform === "win32" ? `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}` : `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}`;
10354
10378
  const { spawn: spawn5 } = await import("node:child_process");
@@ -10378,8 +10402,8 @@ async function execShellWithStdin(command, stdinPayload, options = {}) {
10378
10402
  resolve(code ?? 0);
10379
10403
  });
10380
10404
  });
10381
- const stdout = (await readFile14(stdoutPath, "utf8")).replace(/\r\n/g, "\n");
10382
- const stderr = (await readFile14(stderrPath, "utf8")).replace(/\r\n/g, "\n");
10405
+ const stdout = (await readFile15(stdoutPath, "utf8")).replace(/\r\n/g, "\n");
10406
+ const stderr = (await readFile15(stderrPath, "utf8")).replace(/\r\n/g, "\n");
10383
10407
  return { stdout, stderr, exitCode };
10384
10408
  } finally {
10385
10409
  await rm6(dir, { recursive: true, force: true });
@@ -11855,11 +11879,11 @@ function createFilesystemTools(workspacePath) {
11855
11879
  execute: async (input) => {
11856
11880
  try {
11857
11881
  const resolved = resolveSandboxed(workspacePath, input.path);
11858
- const stat9 = await fs2.stat(resolved);
11859
- if (stat9.isDirectory()) {
11882
+ const stat10 = await fs2.stat(resolved);
11883
+ if (stat10.isDirectory()) {
11860
11884
  return { error: `'${input.path}' is a directory, not a file` };
11861
11885
  }
11862
- const buffer = Buffer.alloc(Math.min(stat9.size, MAX_FILE_SIZE));
11886
+ const buffer = Buffer.alloc(Math.min(stat10.size, MAX_FILE_SIZE));
11863
11887
  const fd = await fs2.open(resolved, "r");
11864
11888
  try {
11865
11889
  await fd.read(buffer, 0, buffer.length, 0);
@@ -11867,8 +11891,8 @@ function createFilesystemTools(workspacePath) {
11867
11891
  await fd.close();
11868
11892
  }
11869
11893
  const content = buffer.toString("utf-8");
11870
- const truncated = stat9.size > MAX_FILE_SIZE;
11871
- return { content, truncated, size: stat9.size };
11894
+ const truncated = stat10.size > MAX_FILE_SIZE;
11895
+ return { content, truncated, size: stat10.size };
11872
11896
  } catch (error) {
11873
11897
  return { error: error instanceof Error ? error.message : String(error) };
11874
11898
  }
@@ -11919,8 +11943,8 @@ async function searchDirectory(dirPath, workspacePath, regex, matches) {
11919
11943
  const ext = path35.extname(entry.name).toLowerCase();
11920
11944
  if (BINARY_EXTENSIONS.has(ext)) continue;
11921
11945
  try {
11922
- const stat9 = await fs2.stat(fullPath);
11923
- if (stat9.size > MAX_FILE_SIZE) continue;
11946
+ const stat10 = await fs2.stat(fullPath);
11947
+ if (stat10.size > MAX_FILE_SIZE) continue;
11924
11948
  const content = await fs2.readFile(fullPath, "utf-8");
11925
11949
  const lines = content.split("\n");
11926
11950
  for (let i = 0; i < lines.length; i++) {
@@ -12561,115 +12585,115 @@ var FieldAccuracyEvaluator = class {
12561
12585
  * Evaluate a single field against the expected value.
12562
12586
  */
12563
12587
  evaluateField(fieldConfig, candidateData, expectedData) {
12564
- const { path: path48, match, required = true, weight = 1 } = fieldConfig;
12565
- const candidateValue = resolvePath(candidateData, path48);
12566
- const expectedValue = resolvePath(expectedData, path48);
12588
+ const { path: path49, match, required = true, weight = 1 } = fieldConfig;
12589
+ const candidateValue = resolvePath(candidateData, path49);
12590
+ const expectedValue = resolvePath(expectedData, path49);
12567
12591
  if (expectedValue === void 0) {
12568
12592
  return {
12569
- path: path48,
12593
+ path: path49,
12570
12594
  score: 1,
12571
12595
  // No expected value means no comparison needed
12572
12596
  weight,
12573
12597
  hit: true,
12574
- message: `${path48}: no expected value`
12598
+ message: `${path49}: no expected value`
12575
12599
  };
12576
12600
  }
12577
12601
  if (candidateValue === void 0) {
12578
12602
  if (required) {
12579
12603
  return {
12580
- path: path48,
12604
+ path: path49,
12581
12605
  score: 0,
12582
12606
  weight,
12583
12607
  hit: false,
12584
- message: `${path48} (required, missing)`
12608
+ message: `${path49} (required, missing)`
12585
12609
  };
12586
12610
  }
12587
12611
  return {
12588
- path: path48,
12612
+ path: path49,
12589
12613
  score: 1,
12590
12614
  // Don't penalize missing optional fields
12591
12615
  weight: 0,
12592
12616
  // Zero weight means it won't affect the score
12593
12617
  hit: true,
12594
- message: `${path48}: optional field missing`
12618
+ message: `${path49}: optional field missing`
12595
12619
  };
12596
12620
  }
12597
12621
  switch (match) {
12598
12622
  case "exact":
12599
- return this.compareExact(path48, candidateValue, expectedValue, weight);
12623
+ return this.compareExact(path49, candidateValue, expectedValue, weight);
12600
12624
  case "numeric_tolerance":
12601
12625
  return this.compareNumericTolerance(
12602
- path48,
12626
+ path49,
12603
12627
  candidateValue,
12604
12628
  expectedValue,
12605
12629
  fieldConfig,
12606
12630
  weight
12607
12631
  );
12608
12632
  case "date":
12609
- return this.compareDate(path48, candidateValue, expectedValue, fieldConfig, weight);
12633
+ return this.compareDate(path49, candidateValue, expectedValue, fieldConfig, weight);
12610
12634
  default:
12611
12635
  return {
12612
- path: path48,
12636
+ path: path49,
12613
12637
  score: 0,
12614
12638
  weight,
12615
12639
  hit: false,
12616
- message: `${path48}: unknown match type "${match}"`
12640
+ message: `${path49}: unknown match type "${match}"`
12617
12641
  };
12618
12642
  }
12619
12643
  }
12620
12644
  /**
12621
12645
  * Exact equality comparison.
12622
12646
  */
12623
- compareExact(path48, candidateValue, expectedValue, weight) {
12647
+ compareExact(path49, candidateValue, expectedValue, weight) {
12624
12648
  if (deepEqual(candidateValue, expectedValue)) {
12625
12649
  return {
12626
- path: path48,
12650
+ path: path49,
12627
12651
  score: 1,
12628
12652
  weight,
12629
12653
  hit: true,
12630
- message: path48
12654
+ message: path49
12631
12655
  };
12632
12656
  }
12633
12657
  if (typeof candidateValue !== typeof expectedValue) {
12634
12658
  return {
12635
- path: path48,
12659
+ path: path49,
12636
12660
  score: 0,
12637
12661
  weight,
12638
12662
  hit: false,
12639
- message: `${path48} (type mismatch: got ${typeof candidateValue}, expected ${typeof expectedValue})`
12663
+ message: `${path49} (type mismatch: got ${typeof candidateValue}, expected ${typeof expectedValue})`
12640
12664
  };
12641
12665
  }
12642
12666
  return {
12643
- path: path48,
12667
+ path: path49,
12644
12668
  score: 0,
12645
12669
  weight,
12646
12670
  hit: false,
12647
- message: `${path48} (value mismatch)`
12671
+ message: `${path49} (value mismatch)`
12648
12672
  };
12649
12673
  }
12650
12674
  /**
12651
12675
  * Numeric comparison with absolute or relative tolerance.
12652
12676
  */
12653
- compareNumericTolerance(path48, candidateValue, expectedValue, fieldConfig, weight) {
12677
+ compareNumericTolerance(path49, candidateValue, expectedValue, fieldConfig, weight) {
12654
12678
  const { tolerance = 0, relative = false } = fieldConfig;
12655
12679
  const candidateNum = toNumber(candidateValue);
12656
12680
  const expectedNum = toNumber(expectedValue);
12657
12681
  if (candidateNum === null || expectedNum === null) {
12658
12682
  return {
12659
- path: path48,
12683
+ path: path49,
12660
12684
  score: 0,
12661
12685
  weight,
12662
12686
  hit: false,
12663
- message: `${path48} (non-numeric value)`
12687
+ message: `${path49} (non-numeric value)`
12664
12688
  };
12665
12689
  }
12666
12690
  if (!Number.isFinite(candidateNum) || !Number.isFinite(expectedNum)) {
12667
12691
  return {
12668
- path: path48,
12692
+ path: path49,
12669
12693
  score: 0,
12670
12694
  weight,
12671
12695
  hit: false,
12672
- message: `${path48} (invalid numeric value)`
12696
+ message: `${path49} (invalid numeric value)`
12673
12697
  };
12674
12698
  }
12675
12699
  const diff = Math.abs(candidateNum - expectedNum);
@@ -12682,61 +12706,61 @@ var FieldAccuracyEvaluator = class {
12682
12706
  }
12683
12707
  if (withinTolerance) {
12684
12708
  return {
12685
- path: path48,
12709
+ path: path49,
12686
12710
  score: 1,
12687
12711
  weight,
12688
12712
  hit: true,
12689
- message: `${path48} (within tolerance: diff=${diff.toFixed(2)})`
12713
+ message: `${path49} (within tolerance: diff=${diff.toFixed(2)})`
12690
12714
  };
12691
12715
  }
12692
12716
  return {
12693
- path: path48,
12717
+ path: path49,
12694
12718
  score: 0,
12695
12719
  weight,
12696
12720
  hit: false,
12697
- message: `${path48} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
12721
+ message: `${path49} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
12698
12722
  };
12699
12723
  }
12700
12724
  /**
12701
12725
  * Date comparison with format normalization.
12702
12726
  */
12703
- compareDate(path48, candidateValue, expectedValue, fieldConfig, weight) {
12727
+ compareDate(path49, candidateValue, expectedValue, fieldConfig, weight) {
12704
12728
  const formats = fieldConfig.formats ?? DEFAULT_DATE_FORMATS;
12705
12729
  const candidateDate = parseDate(String(candidateValue), formats);
12706
12730
  const expectedDate = parseDate(String(expectedValue), formats);
12707
12731
  if (candidateDate === null) {
12708
12732
  return {
12709
- path: path48,
12733
+ path: path49,
12710
12734
  score: 0,
12711
12735
  weight,
12712
12736
  hit: false,
12713
- message: `${path48} (unparseable candidate date)`
12737
+ message: `${path49} (unparseable candidate date)`
12714
12738
  };
12715
12739
  }
12716
12740
  if (expectedDate === null) {
12717
12741
  return {
12718
- path: path48,
12742
+ path: path49,
12719
12743
  score: 0,
12720
12744
  weight,
12721
12745
  hit: false,
12722
- message: `${path48} (unparseable expected date)`
12746
+ message: `${path49} (unparseable expected date)`
12723
12747
  };
12724
12748
  }
12725
12749
  if (candidateDate.getFullYear() === expectedDate.getFullYear() && candidateDate.getMonth() === expectedDate.getMonth() && candidateDate.getDate() === expectedDate.getDate()) {
12726
12750
  return {
12727
- path: path48,
12751
+ path: path49,
12728
12752
  score: 1,
12729
12753
  weight,
12730
12754
  hit: true,
12731
- message: path48
12755
+ message: path49
12732
12756
  };
12733
12757
  }
12734
12758
  return {
12735
- path: path48,
12759
+ path: path49,
12736
12760
  score: 0,
12737
12761
  weight,
12738
12762
  hit: false,
12739
- message: `${path48} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
12763
+ message: `${path49} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
12740
12764
  };
12741
12765
  }
12742
12766
  /**
@@ -12769,11 +12793,11 @@ var FieldAccuracyEvaluator = class {
12769
12793
  };
12770
12794
  }
12771
12795
  };
12772
- function resolvePath(obj, path48) {
12773
- if (!path48 || !obj) {
12796
+ function resolvePath(obj, path49) {
12797
+ if (!path49 || !obj) {
12774
12798
  return void 0;
12775
12799
  }
12776
- const parts = path48.split(/\.|\[|\]/).filter((p) => p.length > 0);
12800
+ const parts = path49.split(/\.|\[|\]/).filter((p) => p.length > 0);
12777
12801
  let current = obj;
12778
12802
  for (const part of parts) {
12779
12803
  if (current === null || current === void 0) {
@@ -12997,6 +13021,14 @@ var SkillTriggerEvaluator = class {
12997
13021
  evidence = `Read tool loaded skill file via tool name "${toolName}"`;
12998
13022
  break;
12999
13023
  }
13024
+ if (!triggered && toolCall.output != null) {
13025
+ const outputStr = typeof toolCall.output === "string" ? toolCall.output : JSON.stringify(toolCall.output);
13026
+ if (outputStr.includes(`skills/${skillName}/`)) {
13027
+ triggered = true;
13028
+ evidence = `Tool "${toolName}" output referenced skill file for "${skillName}"`;
13029
+ break;
13030
+ }
13031
+ }
13000
13032
  }
13001
13033
  const pass = triggered === shouldTrigger;
13002
13034
  if (pass) {
@@ -13257,8 +13289,8 @@ var TokenUsageEvaluator = class {
13257
13289
  };
13258
13290
 
13259
13291
  // src/evaluation/evaluators/tool-trajectory.ts
13260
- function getNestedValue(obj, path48) {
13261
- const parts = path48.split(".");
13292
+ function getNestedValue(obj, path49) {
13293
+ const parts = path49.split(".");
13262
13294
  let current = obj;
13263
13295
  for (const part of parts) {
13264
13296
  if (current === null || current === void 0 || typeof current !== "object") {
@@ -14793,7 +14825,7 @@ async function cleanupEvalWorkspaces(evalRunId, workspaceRoot) {
14793
14825
  // src/evaluation/workspace/pool-manager.ts
14794
14826
  import { execFile } from "node:child_process";
14795
14827
  import { createHash } from "node:crypto";
14796
- import { existsSync as existsSync2 } from "node:fs";
14828
+ import { existsSync as existsSync3 } from "node:fs";
14797
14829
  import { cp as cp2, mkdir as mkdir13, readFile as readFile12, readdir as readdir5, rm as rm5, unlink, writeFile as writeFile7 } from "node:fs/promises";
14798
14830
  import path41 from "node:path";
14799
14831
  import { promisify as promisify5 } from "node:util";
@@ -14899,7 +14931,7 @@ var WorkspacePoolManager = class {
14899
14931
  if (!locked) {
14900
14932
  continue;
14901
14933
  }
14902
- const slotExists = existsSync2(slotPath);
14934
+ const slotExists = existsSync3(slotPath);
14903
14935
  if (slotExists) {
14904
14936
  await this.resetSlot(slotPath, templatePath, repos, poolReset);
14905
14937
  return {
@@ -15005,7 +15037,7 @@ var WorkspacePoolManager = class {
15005
15037
  for (const entry of entries) {
15006
15038
  if (entry.startsWith("slot-") && !entry.endsWith(".lock")) {
15007
15039
  const lockPath = path41.join(poolDir, `${entry}.lock`);
15008
- if (existsSync2(lockPath)) {
15040
+ if (existsSync3(lockPath)) {
15009
15041
  try {
15010
15042
  const pidStr = await readFile12(lockPath, "utf-8");
15011
15043
  const pid = Number.parseInt(pidStr.trim(), 10);
@@ -15036,7 +15068,7 @@ var WorkspacePoolManager = class {
15036
15068
  async resetSlot(slotPath, templatePath, repos, poolReset = "fast") {
15037
15069
  for (const repo of repos) {
15038
15070
  const repoDir = path41.join(slotPath, repo.path);
15039
- if (!existsSync2(repoDir)) {
15071
+ if (!existsSync3(repoDir)) {
15040
15072
  continue;
15041
15073
  }
15042
15074
  if (poolReset === "none") {
@@ -15061,7 +15093,7 @@ var WorkspacePoolManager = class {
15061
15093
 
15062
15094
  // src/evaluation/workspace/repo-manager.ts
15063
15095
  import { execFile as execFile2 } from "node:child_process";
15064
- import { existsSync as existsSync3 } from "node:fs";
15096
+ import { existsSync as existsSync4 } from "node:fs";
15065
15097
  import path42 from "node:path";
15066
15098
  import { promisify as promisify6 } from "node:util";
15067
15099
  var execFileAsync2 = promisify6(execFile2);
@@ -15113,7 +15145,7 @@ var RepoManager = class {
15113
15145
  resolvedSourcePath: sourcePath ?? "",
15114
15146
  reason: "empty_path"
15115
15147
  });
15116
- } else if (!existsSync3(sourcePath)) {
15148
+ } else if (!existsSync4(sourcePath)) {
15117
15149
  errors.push({
15118
15150
  repoPath: repo.path,
15119
15151
  resolvedSourcePath: sourcePath,
@@ -15340,8 +15372,8 @@ async function executeWorkspaceScript(config, context, failureMode = "fatal") {
15340
15372
  }
15341
15373
 
15342
15374
  // src/evaluation/orchestrator.ts
15343
- function classifyQualityStatus(score) {
15344
- return score >= PASS_THRESHOLD ? "ok" : "quality_failure";
15375
// Map a numeric score to an execution status: "ok" when the score is at least
// `threshold`, otherwise "quality_failure". `threshold` falls back to
// PASS_THRESHOLD (defined outside this hunk) when the caller passes undefined,
// which keeps the previous single-argument call form working.
+ function classifyQualityStatus(score, threshold = PASS_THRESHOLD) {
15376
+ return score >= threshold ? "ok" : "quality_failure";
15345
15377
  }
15346
15378
  function buildSkippedEvaluatorError(scores) {
15347
15379
  const skippedScores = scores?.filter((score) => score.verdict === "skip") ?? [];
@@ -15417,7 +15449,8 @@ async function runEvaluation(options) {
15417
15449
  retainOnSuccess,
15418
15450
  retainOnFailure,
15419
15451
  graderTarget: cliGraderTarget,
15420
- model: cliModel
15452
+ model: cliModel,
15453
+ threshold: scoreThreshold
15421
15454
  } = options;
15422
15455
  let useCache = options.useCache;
15423
15456
  if (trials && trials.count > 1 && useCache) {
@@ -15546,7 +15579,8 @@ async function runEvaluation(options) {
15546
15579
  resolveGraderProvider,
15547
15580
  agentTimeoutMs,
15548
15581
  targetResolver,
15549
- availableTargets
15582
+ availableTargets,
15583
+ threshold: scoreThreshold
15550
15584
  });
15551
15585
  } catch (error) {
15552
15586
  if (verbose) {
@@ -15915,7 +15949,8 @@ async function runEvaluation(options) {
15915
15949
  typeRegistry,
15916
15950
  repoManager,
15917
15951
  evalDir,
15918
- verbose
15952
+ verbose,
15953
+ threshold: scoreThreshold
15919
15954
  };
15920
15955
  let result = trials && trials.count > 1 ? await runEvalCaseWithTrials(runCaseOptions, trials) : await runEvalCase(runCaseOptions);
15921
15956
  if (totalBudgetUsd !== void 0) {
@@ -16073,7 +16108,8 @@ async function runBatchEvaluation(options) {
16073
16108
  resolveGraderProvider,
16074
16109
  agentTimeoutMs,
16075
16110
  targetResolver,
16076
- availableTargets
16111
+ availableTargets,
16112
+ threshold: batchThreshold
16077
16113
  } = options;
16078
16114
  const promptInputsList = [];
16079
16115
  const formattingMode = usesFileReferencePrompt(provider) ? "agent" : "lm";
@@ -16158,7 +16194,8 @@ async function runBatchEvaluation(options) {
16158
16194
  endTime,
16159
16195
  targetResolver,
16160
16196
  availableTargets,
16161
- verbose
16197
+ verbose,
16198
+ threshold: batchThreshold
16162
16199
  });
16163
16200
  if (providerError) {
16164
16201
  result = {
@@ -16244,7 +16281,8 @@ async function runEvalCase(options) {
16244
16281
  typeRegistry: providedTypeRegistry,
16245
16282
  repoManager,
16246
16283
  evalDir,
16247
- verbose
16284
+ verbose,
16285
+ threshold: caseThreshold
16248
16286
  } = options;
16249
16287
  const setupDebug = process.env.AGENTV_SETUP_DEBUG === "1";
16250
16288
  const formattingMode = usesFileReferencePrompt(provider) ? "agent" : "lm";
@@ -16476,7 +16514,9 @@ async function runEvalCase(options) {
16476
16514
  });
16477
16515
  } catch (error) {
16478
16516
  lastError = error;
16479
- if (isTimeoutLike(error) && attempt + 1 < attemptBudget) {
16517
+ if (attempt + 1 < attemptBudget) {
16518
+ const delayMs = retryBackoffMs(attempt);
16519
+ await sleep3(delayMs, signal);
16480
16520
  attempt += 1;
16481
16521
  continue;
16482
16522
  }
@@ -16607,7 +16647,8 @@ async function runEvalCase(options) {
16607
16647
  availableTargets,
16608
16648
  fileChanges,
16609
16649
  workspacePath,
16610
- verbose
16650
+ verbose,
16651
+ threshold: caseThreshold
16611
16652
  });
16612
16653
  const totalDurationMs = Date.now() - caseStartMs;
16613
16654
  const graderTokens = aggregateEvaluatorTokenUsage(result.scores);
@@ -16622,7 +16663,7 @@ async function runEvalCase(options) {
16622
16663
  ...evalRunTokenUsage ? { tokenUsage: evalRunTokenUsage } : {}
16623
16664
  };
16624
16665
  const skippedEvaluatorError = buildSkippedEvaluatorError(result.scores);
16625
- const executionStatus = providerError || skippedEvaluatorError ? "execution_error" : classifyQualityStatus(result.score);
16666
+ const executionStatus = providerError || skippedEvaluatorError ? "execution_error" : classifyQualityStatus(result.score, caseThreshold);
16626
16667
  const finalResult = providerError ? {
16627
16668
  ...result,
16628
16669
  evalRun,
@@ -16793,7 +16834,8 @@ async function evaluateCandidate(options) {
16793
16834
  targetResolver,
16794
16835
  availableTargets,
16795
16836
  fileChanges,
16796
- workspacePath
16837
+ workspacePath,
16838
+ threshold: evalThreshold
16797
16839
  } = options;
16798
16840
  const gradeTimestamp = nowFn();
16799
16841
  const { score, scores } = await runEvaluatorsForCase({
@@ -16866,7 +16908,7 @@ async function evaluateCandidate(options) {
16866
16908
  scores,
16867
16909
  trace,
16868
16910
  fileChanges,
16869
- executionStatus: classifyQualityStatus(score.score)
16911
+ executionStatus: classifyQualityStatus(score.score, evalThreshold)
16870
16912
  };
16871
16913
  }
16872
16914
  async function runEvaluatorsForCase(options) {
@@ -17170,7 +17212,7 @@ async function invokeProvider(provider, options) {
17170
17212
  }
17171
17213
  }
17172
17214
  function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs, provider, failureStage, failureReasonCode, verbose) {
17173
- const message = error instanceof Error ? error.message : String(error);
17215
+ const message = extractErrorMessage(error);
17174
17216
  let agentRequest;
17175
17217
  let lmRequest;
17176
17218
  if (isAgentProvider(provider)) {
@@ -17287,20 +17329,45 @@ function aggregateEvaluatorTokenUsage(scores) {
17287
17329
  ...hasCached ? { cached } : {}
17288
17330
  };
17289
17331
  }
17290
- function isTimeoutLike(error) {
17291
- if (!error) {
17292
- return false;
17293
- }
17294
- if (typeof DOMException !== "undefined" && error instanceof DOMException && error.name === "AbortError") {
17295
- return true;
17296
- }
17332
/**
 * Turn any thrown or rejected value into a message string.
 *
 * - Error instances: their `message`.
 * - Other objects: a non-empty string `message` and/or a numeric `code`
 *   (rendered as "(code N)"), joined by a space; failing that, the JSON form.
 * - Everything else, and objects JSON cannot represent: `String(error)`.
 *
 * @param {unknown} error - The caught value; may be an Error, a plain object, or a primitive.
 * @returns {string} Always a string, never undefined.
 */
function extractErrorMessage(error) {
  if (error instanceof Error) {
    return error.message;
  }
  if (error !== null && typeof error === "object") {
    const parts = [];
    if (typeof error.message === "string" && error.message) {
      parts.push(error.message);
    }
    if (typeof error.code === "number") {
      parts.push(`(code ${error.code})`);
    }
    if (parts.length > 0) {
      return parts.join(" ");
    }
    try {
      const json = JSON.stringify(error);
      // JSON.stringify returns undefined (not a string) when, for example,
      // toJSON() yields undefined; only accept a real string here.
      if (typeof json === "string") {
        return json;
      }
    } catch {
      // Circular structures or BigInt values cannot be serialized;
      // deliberately fall through to the String() fallback below.
    }
  }
  return String(error);
}
17355
// Delay in milliseconds before the next retry: 1000 * 2^attempt, capped at
// 30000 (attempt 0 -> 1000, 1 -> 2000, ..., 5 and above -> 30000).
+ function retryBackoffMs(attempt) {
17356
+ return Math.min(2 ** attempt * 1e3, 3e4);
17357
+ }
17358
/**
 * Wait for `ms` milliseconds, or until `signal` aborts, whichever comes first.
 *
 * The returned promise never rejects: an abort simply ends the wait early,
 * and an already-aborted signal resolves immediately without arming a timer.
 *
 * @param {number} ms - Delay in milliseconds.
 * @param {AbortSignal} [signal] - Optional signal that cuts the wait short.
 * @returns {Promise<void>}
 */
function sleep3(ms, signal) {
  if (signal?.aborted) return Promise.resolve();
  return new Promise((resolve) => {
    const onAbort = () => {
      clearTimeout(timer);
      resolve();
    };
    const timer = setTimeout(() => {
      // Detach the abort handler once the delay has elapsed; otherwise every
      // completed sleep leaves a listener behind on a long-lived signal.
      signal?.removeEventListener("abort", onAbort);
      resolve();
    }, ms);
    signal?.addEventListener("abort", onAbort, { once: true });
  });
}
17305
17372
  function mapChildResults(children) {
17306
17373
  if (!children || children.length === 0) {
@@ -17331,7 +17398,7 @@ function computeWeightedMean(entries) {
17331
17398
  }
17332
17399
 
17333
17400
  // src/evaluation/evaluate.ts
17334
- import { existsSync as existsSync4 } from "node:fs";
17401
+ import { existsSync as existsSync5 } from "node:fs";
17335
17402
  import path45 from "node:path";
17336
17403
 
17337
17404
  // src/evaluation/providers/function-provider.ts
@@ -17487,7 +17554,7 @@ async function discoverDefaultTarget(repoRoot) {
17487
17554
  for (const dir of chain) {
17488
17555
  for (const candidate of TARGET_FILE_CANDIDATES) {
17489
17556
  const targetsPath = path45.join(dir, candidate);
17490
- if (!existsSync4(targetsPath)) continue;
17557
+ if (!existsSync5(targetsPath)) continue;
17491
17558
  try {
17492
17559
  const definitions = await readTargetDefinitions(targetsPath);
17493
17560
  const defaultTarget = definitions.find((d) => d.name === "default");
@@ -17504,7 +17571,7 @@ async function loadEnvHierarchy(repoRoot, startPath) {
17504
17571
  const envFiles = [];
17505
17572
  for (const dir of chain) {
17506
17573
  const envPath = path45.join(dir, ".env");
17507
- if (existsSync4(envPath)) envFiles.push(envPath);
17574
+ if (existsSync5(envPath)) envFiles.push(envPath);
17508
17575
  }
17509
17576
  for (let i = 0; i < envFiles.length; i++) {
17510
17577
  try {
@@ -17580,12 +17647,12 @@ var CONFIG_FILE_NAMES = [
17580
17647
  ".agentv/config.js"
17581
17648
  ];
17582
17649
  async function loadTsConfig(projectRoot) {
17583
- const { existsSync: existsSync6 } = await import("node:fs");
17650
+ const { existsSync: existsSync7 } = await import("node:fs");
17584
17651
  const { pathToFileURL } = await import("node:url");
17585
17652
  const { join: join2 } = await import("node:path");
17586
17653
  for (const fileName of CONFIG_FILE_NAMES) {
17587
17654
  const filePath = join2(projectRoot, fileName);
17588
- if (!existsSync6(filePath)) {
17655
+ if (!existsSync7(filePath)) {
17589
17656
  continue;
17590
17657
  }
17591
17658
  try {
@@ -17723,7 +17790,7 @@ function shouldSkipCacheForTemperature(targetConfig) {
17723
17790
  }
17724
17791
 
17725
17792
  // src/projects.ts
17726
- import { existsSync as existsSync5, mkdirSync, readFileSync as readFileSync2, readdirSync as readdirSync3, statSync as statSync2, writeFileSync } from "node:fs";
17793
+ import { existsSync as existsSync6, mkdirSync, readFileSync as readFileSync2, readdirSync as readdirSync3, statSync as statSync2, writeFileSync } from "node:fs";
17727
17794
  import path47 from "node:path";
17728
17795
  import { parse as parseYaml3, stringify as stringifyYaml } from "yaml";
17729
17796
  function getProjectsRegistryPath() {
@@ -17731,7 +17798,7 @@ function getProjectsRegistryPath() {
17731
17798
  }
17732
17799
  function loadProjectRegistry() {
17733
17800
  const registryPath = getProjectsRegistryPath();
17734
- if (!existsSync5(registryPath)) {
17801
+ if (!existsSync6(registryPath)) {
17735
17802
  return { projects: [] };
17736
17803
  }
17737
17804
  try {
@@ -17748,7 +17815,7 @@ function loadProjectRegistry() {
17748
17815
  function saveProjectRegistry(registry) {
17749
17816
  const registryPath = getProjectsRegistryPath();
17750
17817
  const dir = path47.dirname(registryPath);
17751
- if (!existsSync5(dir)) {
17818
+ if (!existsSync6(dir)) {
17752
17819
  mkdirSync(dir, { recursive: true });
17753
17820
  }
17754
17821
  writeFileSync(registryPath, stringifyYaml(registry), "utf-8");
@@ -17765,10 +17832,10 @@ function deriveProjectId(dirPath, existingIds) {
17765
17832
  }
17766
17833
  function addProject(projectPath) {
17767
17834
  const absPath = path47.resolve(projectPath);
17768
- if (!existsSync5(absPath)) {
17835
+ if (!existsSync6(absPath)) {
17769
17836
  throw new Error(`Directory not found: ${absPath}`);
17770
17837
  }
17771
- if (!existsSync5(path47.join(absPath, ".agentv"))) {
17838
+ if (!existsSync6(path47.join(absPath, ".agentv"))) {
17772
17839
  throw new Error(`No .agentv/ directory found in ${absPath}. Run an evaluation first.`);
17773
17840
  }
17774
17841
  const registry = loadProjectRegistry();
@@ -17812,13 +17879,13 @@ function touchProject(projectId) {
17812
17879
  }
17813
17880
  function discoverProjects(rootDir, maxDepth = 2) {
17814
17881
  const absRoot = path47.resolve(rootDir);
17815
- if (!existsSync5(absRoot) || !statSync2(absRoot).isDirectory()) {
17882
+ if (!existsSync6(absRoot) || !statSync2(absRoot).isDirectory()) {
17816
17883
  return [];
17817
17884
  }
17818
17885
  const results = [];
17819
17886
  function scan(dir, depth) {
17820
17887
  if (depth > maxDepth) return;
17821
- if (existsSync5(path47.join(dir, ".agentv"))) {
17888
+ if (existsSync6(path47.join(dir, ".agentv"))) {
17822
17889
  results.push(dir);
17823
17890
  return;
17824
17891
  }
@@ -18391,6 +18458,248 @@ function toHrTime(iso) {
18391
18458
  return new Date(iso).getTime();
18392
18459
  }
18393
18460
 
18461
+ // src/import/claude-parser.ts
18462
// Event types that never produce messages. Their timestamps still count
// toward the session's start/end because the timestamp bookkeeping runs
// before the skip check.
var SKIPPED_TYPES = /* @__PURE__ */ new Set(["progress", "system", "file-history-snapshot"]);

/**
 * Parse a Claude session transcript (JSONL, one JSON event per line) into a
 * message list plus session metadata.
 *
 * Lines that are blank, are not valid JSON, do not parse to an object, or
 * have no `type` are ignored. Sidechain events and the types listed in
 * SKIPPED_TYPES produce no messages.
 *
 * @param {string} jsonl - Raw transcript text.
 * @returns {{
 *   messages: Array<object>,
 *   source: { provider: string, sessionId: string, projectPath?: string, startedAt?: string, model?: string },
 *   tokenUsage: { input: number, output: number } | undefined,
 *   durationMs: number | undefined,
 *   costUsd: null
 * }} `tokenUsage` is undefined when no assistant event carried usage together
 *   with a requestId; `durationMs` is undefined when no event had a timestamp.
 */
function parseClaudeSession(jsonl) {
  const messages = [];
  let sessionId = "";
  let projectPath;
  let model;
  let startTimestamp;
  let endTimestamp;
  // Keyed by requestId so usage repeated across events of one request is counted once.
  const usageByRequestId = /* @__PURE__ */ new Map();
  let lastAssistantRequestId;
  // Index in `messages` of the message emitted for `lastAssistantRequestId`,
  // or -1 when that request has not produced a message yet.
  let lastAssistantIdx = -1;
  // tool_use id -> { msgIdx, toolIdx } for calls still waiting on a tool_result.
  const pendingToolCalls = /* @__PURE__ */ new Map();
  const lines = jsonl.split("\n").filter((l) => l.trim().length > 0);
  for (const line of lines) {
    let event;
    try {
      event = JSON.parse(line);
    } catch {
      continue;
    }
    // `null`, numbers and strings are valid JSON but not events; skip them
    // instead of throwing on the property access.
    if (event === null || typeof event !== "object" || !event.type) continue;
    if (event.timestamp) {
      if (!startTimestamp) startTimestamp = event.timestamp;
      endTimestamp = event.timestamp;
    }
    if (SKIPPED_TYPES.has(event.type)) continue;
    if (event.isSidechain) continue;
    if (!sessionId && event.sessionId) {
      sessionId = event.sessionId;
    }
    if (!projectPath && event.cwd) {
      projectPath = event.cwd;
    }
    switch (event.type) {
      case "user": {
        const msg = event.message;
        if (!msg) break;
        const contentArr = msg.content;
        if (Array.isArray(contentArr)) {
          for (const block of contentArr) {
            if (block?.type === "tool_result" && block.tool_use_id) {
              const pending = pendingToolCalls.get(block.tool_use_id);
              if (pending) {
                // Copy-on-write: attach the output to the matching tool call.
                const existingMsg = messages[pending.msgIdx];
                const existingCalls = [...existingMsg.toolCalls ?? []];
                existingCalls[pending.toolIdx] = {
                  ...existingCalls[pending.toolIdx],
                  output: extractToolResultContent(block.content)
                };
                messages[pending.msgIdx] = { ...existingMsg, toolCalls: existingCalls };
                pendingToolCalls.delete(block.tool_use_id);
              }
            }
          }
        }
        // Messages that carry only tool results yield no text and add no user turn.
        const text = extractTextContent2(contentArr);
        if (text !== void 0) {
          messages.push({ role: "user", content: text });
        }
        break;
      }
      case "assistant": {
        const msg = event.message;
        if (!msg) break;
        if (!model && msg.model) {
          model = msg.model;
        }
        if (msg.usage && event.requestId) {
          usageByRequestId.set(event.requestId, msg.usage);
        }
        const { text, toolCalls } = extractAssistantContent(msg.content);
        const hasContent = Boolean(text) || toolCalls.length > 0;
        const continuesLast = Boolean(event.requestId) && event.requestId === lastAssistantRequestId;
        if (!continuesLast) {
          // New request: forget the previous request's slot so that a first
          // event without content cannot make a later event of this request
          // overwrite an unrelated message.
          lastAssistantIdx = -1;
        }
        // Events without text or tool calls neither add nor erase a message.
        if (hasContent) {
          const assistantMsg = {
            role: "assistant",
            content: text || void 0,
            toolCalls: toolCalls.length > 0 ? toolCalls : void 0
          };
          if (lastAssistantIdx >= 0) {
            // NOTE(review): replacement assumes each event sharing a requestId
            // carries the full content so far. If transcripts emit one content
            // block per event, earlier blocks are dropped here -- confirm
            // against real session files.
            // Drop references into the message being replaced so a later
            // tool_result cannot attach to the wrong call.
            for (const [id, ref] of pendingToolCalls) {
              if (ref.msgIdx === lastAssistantIdx) pendingToolCalls.delete(id);
            }
            messages[lastAssistantIdx] = assistantMsg;
          } else {
            lastAssistantIdx = messages.length;
            messages.push(assistantMsg);
          }
          registerPendingToolCalls(toolCalls, lastAssistantIdx, pendingToolCalls);
        }
        lastAssistantRequestId = event.requestId;
        break;
      }
    }
  }
  let totalInputTokens = 0;
  let totalOutputTokens = 0;
  for (const usage of usageByRequestId.values()) {
    totalInputTokens += Number(usage.input_tokens ?? 0);
    totalOutputTokens += Number(usage.output_tokens ?? 0);
  }
  const hasUsage = usageByRequestId.size > 0;
  let durationMs;
  if (startTimestamp && endTimestamp) {
    durationMs = new Date(endTimestamp).getTime() - new Date(startTimestamp).getTime();
  }
  const source = {
    provider: "claude",
    sessionId,
    projectPath,
    startedAt: startTimestamp,
    model
  };
  return {
    messages,
    source,
    tokenUsage: hasUsage ? { input: totalInputTokens, output: totalOutputTokens } : void 0,
    durationMs,
    costUsd: null
  };
}
18582
/**
 * Record where each identifiable tool call lives so that a later tool result
 * can be matched back to it.
 *
 * @param {Array<{ id?: string }>} toolCalls - Tool calls of one assistant message.
 * @param {number} msgIdx - Index of that message in the message list.
 * @param {Map<string, { msgIdx: number, toolIdx: number }>} pending - Lookup
 *   table, mutated in place; calls without an id are not registered.
 */
function registerPendingToolCalls(toolCalls, msgIdx, pending) {
  for (const [toolIdx, call] of toolCalls.entries()) {
    const callId = call.id;
    if (!callId) continue;
    pending.set(callId, { msgIdx, toolIdx });
  }
}
18590
/**
 * Collect the plain text of a message `content` value.
 *
 * @param {unknown} content - A string, an array of content blocks, or nothing.
 * @returns {string | undefined} The string as-is, or the `text` of every
 *   `type: "text"` block joined without a separator. Returns undefined when
 *   there is no text, or when `content` is neither a string nor an array.
 */
function extractTextContent2(content) {
  if (content === void 0 || content === null) return void 0;
  if (typeof content === "string") return content;
  // Transcript content comes from parsed JSON of unknown shape; anything that
  // is not an array of blocks is treated as "no text" instead of throwing.
  if (!Array.isArray(content)) return void 0;
  const textParts = [];
  for (const block of content) {
    // `block?.` tolerates null entries inside the array.
    if (block?.type === "text" && block.text) {
      textParts.push(block.text);
    }
  }
  return textParts.length > 0 ? textParts.join("") : void 0;
}
18601
/**
 * Split an assistant message `content` value into its text and tool calls.
 *
 * @param {unknown} content - A string, an array of content blocks, or nothing.
 * @returns {{ text: string | undefined, toolCalls: Array<{ tool: string, input: unknown, id: unknown }> }}
 *   `text` is the string itself, or all `type: "text"` blocks joined without a
 *   separator (undefined when there are none). `toolCalls` holds one entry per
 *   named `type: "tool_use"` block, in order. Other block types are ignored.
 */
function extractAssistantContent(content) {
  if (content === void 0 || content === null) {
    return { text: void 0, toolCalls: [] };
  }
  if (typeof content === "string") {
    return { text: content, toolCalls: [] };
  }
  // Content comes from parsed JSON of unknown shape; a non-array value carries
  // nothing we can read, so report it as empty instead of throwing.
  if (!Array.isArray(content)) {
    return { text: void 0, toolCalls: [] };
  }
  const textParts = [];
  const toolCalls = [];
  for (const block of content) {
    // `block?.` tolerates null entries inside the array.
    switch (block?.type) {
      case "text":
        if (block.text) textParts.push(block.text);
        break;
      case "tool_use":
        // A tool call without a name cannot be represented; skip it.
        if (block.name) {
          toolCalls.push({
            tool: block.name,
            input: block.input,
            id: block.id
          });
        }
        break;
    }
  }
  return {
    text: textParts.length > 0 ? textParts.join("") : void 0,
    toolCalls
  };
}
18631
/**
 * Reduce a tool_result `content` value to its text.
 *
 * @param {unknown} content - A string, an array of content blocks, or nothing.
 * @returns {string | undefined} The string as-is, or the `text` of every
 *   `type: "text"` block joined without a separator. Non-text blocks are
 *   ignored. Returns undefined when there is no text, or when `content` is
 *   neither a string nor an array.
 */
function extractToolResultContent(content) {
  if (content === void 0 || content === null) return void 0;
  if (typeof content === "string") return content;
  // Tool output comes from parsed JSON of unknown shape; treat anything that
  // is not an array of blocks as "no output" instead of throwing.
  if (!Array.isArray(content)) return void 0;
  const parts = [];
  for (const block of content) {
    // `block?.` tolerates null entries inside the array.
    if (block?.type === "text" && block.text) {
      parts.push(block.text);
    }
  }
  return parts.length > 0 ? parts.join("") : void 0;
}
18642
+
18643
+ // src/import/session-discovery.ts
18644
+ import { readdir as readdir8, stat as stat9 } from "node:fs/promises";
18645
+ import { homedir as homedir3 } from "node:os";
18646
+ import path48 from "node:path";
18647
// Default location scanned for session transcripts: `<home>/.claude/projects`.
// Kept as a function so the home directory is looked up on each call.
var DEFAULT_PROJECTS_DIR = () => {
  const home = homedir3();
  return path48.join(home, ".claude", "projects");
};
18648
/**
 * Turn a project path into the flat name used for its directory under the
 * projects folder, by replacing path separators with "-".
 *
 * Besides "/", this also replaces "\" and ":". A Windows path such as
 * `C:\Users\me\proj` would otherwise keep characters that cannot appear in a
 * directory name there, so it could never match any entry.
 * NOTE(review): mapping "\" and ":" to "-" assumes Claude flattens Windows
 * paths the same way it flattens "/" -- confirm against a Windows install.
 *
 * @param {string} projectPath - Absolute project path.
 * @returns {string} The path with every "/", "\" and ":" replaced by "-".
 */
function encodeProjectPath(projectPath) {
  return projectPath.replace(/[\\/:]/g, "-");
}
18651
/**
 * List Claude session transcript files (`*.jsonl`), newest first.
 *
 * Scans every sub-directory of the projects directory. Directories that
 * cannot be read are skipped, and an unreadable projects directory yields an
 * empty list.
 *
 * @param {object} [opts]
 * @param {string} [opts.projectsDir] - Directory to scan; defaults to
 *   DEFAULT_PROJECTS_DIR().
 * @param {string} [opts.projectPath] - Only scan project directories whose
 *   name contains the encoded form of this path. Trailing path separators
 *   are ignored.
 * @param {string} [opts.sessionId] - Only return the session whose file name
 *   (without `.jsonl`) equals this id.
 * @param {boolean} [opts.latest] - Return only the most recent session.
 * @param {number} [opts.limit=10] - Maximum number of sessions to return;
 *   ignored when `latest` is set. Negative values return nothing.
 * @returns {Promise<Array<{ sessionId: string, filePath: string, projectDir: string, updatedAt: Date }>>}
 *   Sessions sorted by file modification time, newest first. A file that
 *   cannot be stat'ed is still listed, with the epoch as its time.
 */
async function discoverClaudeSessions(opts) {
  const projectsDir = opts?.projectsDir ?? DEFAULT_PROJECTS_DIR();
  // slice(0, negative) would drop entries from the END of the newest-first
  // list; clamp so a negative limit simply means "none".
  const limit = Math.max(0, opts?.latest ? 1 : (opts?.limit ?? 10));
  let projectDirs;
  try {
    projectDirs = await readdir8(projectsDir);
  } catch {
    return [];
  }
  if (opts?.projectPath) {
    // "/a/b/" and "/a/b" name the same project, but the former would encode
    // with a trailing "-" and match nothing. Keep the original if stripping
    // leaves nothing (e.g. the path is just "/").
    const trimmed = opts.projectPath.replace(/[\\/]+$/, "");
    const encoded = encodeProjectPath(trimmed || opts.projectPath);
    projectDirs = projectDirs.filter((dir) => dir === encoded || dir.includes(encoded));
  }
  const sessions = [];
  for (const projectDir of projectDirs) {
    const dirPath = path48.join(projectsDir, projectDir);
    let entries;
    try {
      entries = await readdir8(dirPath);
    } catch {
      continue;
    }
    const transcripts = entries.filter((entry) => entry.endsWith(".jsonl"));
    // The stat calls are independent, so run them together. Promise.all
    // preserves input order, keeping the result order deterministic.
    const found = await Promise.all(
      transcripts.map(async (entry) => {
        const sessionId = entry.replace(/\.jsonl$/, "");
        if (opts?.sessionId && sessionId !== opts.sessionId) return void 0;
        const filePath = path48.join(dirPath, entry);
        let updatedAt;
        try {
          const fileStat = await stat9(filePath);
          updatedAt = fileStat.mtime;
        } catch {
          // Best effort: keep the session but sort it last.
          updatedAt = /* @__PURE__ */ new Date(0);
        }
        return { sessionId, filePath, projectDir, updatedAt };
      })
    );
    for (const session of found) {
      if (session) sessions.push(session);
    }
  }
  sessions.sort((a, b) => b.updatedAt.getTime() - a.updatedAt.getTime());
  return sessions.slice(0, limit);
}
18696
+
18697
+ // src/import/types.ts
18698
+ import { readFile as readFile14 } from "node:fs/promises";
18699
/**
 * Read a transcript file from disk as UTF-8 text.
 *
 * @param {string} filePath - Path of the file to read.
 * @returns {Promise<string>} The file contents; rejects if the read fails.
 */
async function readTranscriptFile(filePath) {
  const contents = await readFile14(filePath, "utf8");
  return contents;
}
18702
+
18394
18703
  // src/index.ts
18395
18704
  function createAgentKernel() {
18396
18705
  return { status: "stub" };
@@ -18460,6 +18769,7 @@ export {
18460
18769
  deriveProjectId,
18461
18770
  detectFormat,
18462
18771
  discoverAssertions,
18772
+ discoverClaudeSessions,
18463
18773
  discoverCopilotSessions,
18464
18774
  discoverGraders,
18465
18775
  discoverGraders as discoverJudges,
@@ -18519,12 +18829,14 @@ export {
18519
18829
  negateScore,
18520
18830
  normalizeLineEndings,
18521
18831
  parseAgentSkillsEvals,
18832
+ parseClaudeSession,
18522
18833
  parseJsonFromText,
18523
18834
  parseJsonSafe,
18524
18835
  readJsonFile,
18525
18836
  readTargetDefinitions,
18526
18837
  readTestSuiteMetadata,
18527
18838
  readTextFile,
18839
+ readTranscriptFile,
18528
18840
  removeProject,
18529
18841
  resolveAndCreateProvider,
18530
18842
  resolveFileReference,