agent-gauntlet 0.15.3 → 0.15.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -299,7 +299,7 @@ import { Command } from "commander";
299
299
  // package.json
300
300
  var package_default = {
301
301
  name: "agent-gauntlet",
302
- version: "0.15.3",
302
+ version: "0.15.4",
303
303
  description: "A CLI tool for testing AI coding agents",
304
304
  license: "MIT",
305
305
  author: "Paul Caplan",
@@ -882,15 +882,15 @@ class ChangeDetector {
882
882
  }
883
883
  return this.getCommitChangedFiles(this.options.commit);
884
884
  }
885
- if (this.options.uncommitted) {
886
- return this.getUncommittedChangedFiles();
887
- }
888
885
  if (this.options.fixBase) {
889
886
  if (!isValidGitRef(this.options.fixBase)) {
890
887
  throw new Error(`Invalid fixBase ref: ${this.options.fixBase}`);
891
888
  }
892
889
  return this.getFixBaseChangedFiles(this.options.fixBase);
893
890
  }
891
+ if (this.options.uncommitted) {
892
+ return this.getUncommittedChangedFiles();
893
+ }
894
894
  const isCI = process.env.CI === "true" || process.env.GITHUB_ACTIONS === "true";
895
895
  if (isCI) {
896
896
  return this.getCIChangedFiles();
@@ -2855,7 +2855,7 @@ function classifyBlock(block) {
2855
2855
  if (block.includes("descriptor:") && block.includes("dataPointType:") && block.includes("dataPoints:")) {
2856
2856
  return "metric";
2857
2857
  }
2858
- if (block.includes("resource:") && /body:\s*'claude_code\.\w+'/.test(block)) {
2858
+ if (block.includes("resource:") && /body:\s*['"]claude_code\.\w+['"]/.test(block)) {
2859
2859
  return "log";
2860
2860
  }
2861
2861
  return "other";
@@ -2941,13 +2941,13 @@ function parseOtelMetrics(blocks) {
2941
2941
  return usage;
2942
2942
  }
2943
2943
  var OTEL_ATTR_RE = {
2944
- body: /body:\s*'([^']*)'/,
2945
- tool_result_size_bytes: /tool_result_size_bytes:\s*'([^']*)'/,
2946
- input_tokens: /input_tokens:\s*'([^']*)'/,
2947
- output_tokens: /output_tokens:\s*'([^']*)'/,
2948
- cache_read_tokens: /cache_read_tokens:\s*'([^']*)'/,
2949
- cache_creation_tokens: /cache_creation_tokens:\s*'([^']*)'/,
2950
- cost_usd: /cost_usd:\s*'([^']*)'/
2944
+ body: /body:\s*['"]([^'"]*)['"]/,
2945
+ tool_result_size_bytes: /tool_result_size_bytes:\s*['"]([^'"]*)['"]/,
2946
+ input_tokens: /input_tokens:\s*['"]([^'"]*)['"]/,
2947
+ output_tokens: /output_tokens:\s*['"]([^'"]*)['"]/,
2948
+ cache_read_tokens: /cache_read_tokens:\s*['"]([^'"]*)['"]/,
2949
+ cache_creation_tokens: /cache_creation_tokens:\s*['"]([^'"]*)['"]/,
2950
+ cost_usd: /cost_usd:\s*['"]([^'"]*)['"]/
2951
2951
  };
2952
2952
  var API_REQUEST_FIELDS = [
2953
2953
  [OTEL_ATTR_RE.input_tokens, "input"],
@@ -4491,31 +4491,7 @@ If NO violations are found:
4491
4491
  }
4492
4492
  `;
4493
4493
 
4494
- // src/gates/review-eval.ts
4495
- var log5 = getCategoryLogger("gate", "review");
4496
- function logDiffStats(diff, mainLogger) {
4497
- const diffLines = diff.split(`
4498
- `).length;
4499
- const diffChars = diff.length;
4500
- const diffEstTokens = Math.ceil(diffChars / CHARS_PER_TOKEN);
4501
- const diffFileRanges = parseDiff(diff);
4502
- const diffFiles = diffFileRanges.size;
4503
- const msg = `[diff-stats] files=${diffFiles} lines=${diffLines} chars=${diffChars} est_tokens=${diffEstTokens}`;
4504
- log5.debug(msg);
4505
- mainLogger(`${msg}
4506
- `);
4507
- }
4508
- function logInputStats(prompt, diff, adapterLogger) {
4509
- const promptChars = prompt.length;
4510
- const diffChars = diff.length;
4511
- const totalInputChars = promptChars + diffChars;
4512
- const promptEstTokens = Math.ceil(promptChars / CHARS_PER_TOKEN);
4513
- const diffEstTokens = Math.ceil(diffChars / CHARS_PER_TOKEN);
4514
- const totalEstTokens = promptEstTokens + diffEstTokens;
4515
- const msg = `[input-stats] prompt_chars=${promptChars} diff_chars=${diffChars} total_chars=${totalInputChars} prompt_est_tokens=${promptEstTokens} diff_est_tokens=${diffEstTokens} total_est_tokens=${totalEstTokens}`;
4516
- adapterLogger(`${msg}
4517
- `);
4518
- }
4494
+ // src/gates/review-prompt.ts
4519
4495
  function buildReviewPrompt(config, previousViolations = []) {
4520
4496
  const baseContent = config.promptContent || "";
4521
4497
  if (previousViolations.length > 0) {
@@ -4533,34 +4509,43 @@ function buildPreviousFailuresSection(violations) {
4533
4509
  const affectedFiles = [...new Set(violations.map((v) => v.file))];
4534
4510
  const lines = [];
4535
4511
  lines.push(buildRerunHeader());
4536
- if (toVerify.length === 0) {
4537
- lines.push(`(No violations were marked as FIXED for verification)
4512
+ lines.push(...formatVerifySection(toVerify));
4513
+ lines.push(...formatUnaddressedSection(unaddressed));
4514
+ lines.push(buildRerunInstructions(affectedFiles));
4515
+ return lines.join(`
4538
4516
  `);
4539
- } else {
4540
- for (const [i, v] of toVerify.entries()) {
4541
- lines.push(`${i + 1}. ${v.file}:${v.line} - ${v.issue}`);
4542
- if (v.fix)
4543
- lines.push(` Suggested fix: ${v.fix}`);
4544
- if (v.result)
4545
- lines.push(` Agent result: ${v.result}`);
4546
- lines.push("");
4547
- }
4517
+ }
4518
+ function formatVerifySection(toVerify) {
4519
+ if (toVerify.length === 0) {
4520
+ return [`(No violations were marked as FIXED for verification)
4521
+ `];
4548
4522
  }
4549
- if (unaddressed.length > 0) {
4550
- lines.push(buildUnaddressedHeader());
4551
- for (const [i, v] of unaddressed.entries()) {
4552
- lines.push(`${i + 1}. ${v.file}:${v.line} - ${v.issue}`);
4553
- }
4523
+ const lines = [];
4524
+ for (const [i, v] of toVerify.entries()) {
4525
+ lines.push(`${i + 1}. ${v.file}:${v.line} - ${v.issue}`);
4526
+ if (v.fix)
4527
+ lines.push(` Suggested fix: ${v.fix}`);
4528
+ if (v.result)
4529
+ lines.push(` Agent result: ${v.result}`);
4554
4530
  lines.push("");
4555
4531
  }
4556
- lines.push(buildRerunInstructions(affectedFiles));
4557
- return lines.join(`
4558
- `);
4532
+ return lines;
4559
4533
  }
4534
+ function formatUnaddressedSection(unaddressed) {
4535
+ if (unaddressed.length === 0)
4536
+ return [];
4537
+ const lines = [buildUnaddressedHeader()];
4538
+ for (const [i, v] of unaddressed.entries()) {
4539
+ lines.push(`${i + 1}. ${v.file}:${v.line} - ${v.issue}`);
4540
+ }
4541
+ lines.push("");
4542
+ return lines;
4543
+ }
4544
+ var RERUN_SEPARATOR = "━".repeat(46);
4560
4545
  function buildRerunHeader() {
4561
- return `━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
4546
+ return `${RERUN_SEPARATOR}
4562
4547
  RERUN MODE: VERIFY PREVIOUS FIXES ONLY
4563
- ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
4548
+ ${RERUN_SEPARATOR}
4564
4549
 
4565
4550
  This is a RERUN review. The agent attempted to fix some of the violations listed below.
4566
4551
  Your task is STRICTLY LIMITED to verifying the fixes for violations marked as FIXED.
@@ -4574,39 +4559,74 @@ The following violations were NOT marked as fixed or skipped and are still activ
4574
4559
  `;
4575
4560
  }
4576
4561
  function buildRerunInstructions(affectedFiles) {
4577
- return `STRICT INSTRUCTIONS FOR RERUN MODE:
4578
-
4579
- 1. VERIFY FIXES: Check if each violation marked as FIXED above has been addressed
4580
- - For violations that are fixed, confirm they no longer appear
4581
- - For violations that remain unfixed, include them in your violations array (status: "new")
4582
-
4583
- 2. UNADDRESSED VIOLATIONS: You MUST include all UNADDRESSED violations listed above in your output array if they still exist.
4584
-
4585
- 3. CHECK FOR REGRESSIONS ONLY: You may ONLY report NEW violations if they:
4586
- - Are in FILES that were modified to fix the above violations: ${affectedFiles.join(", ")}
4587
- - Are DIRECTLY caused by the fix changes (e.g., a fix introduced a new bug)
4588
- - Are in the same function/region that was modified to address a previous violation
4589
-
4590
- 4. Return status "pass" ONLY if ALL previous violations (including unaddressed ones) are now fixed AND no regressions were introduced.
4591
- Otherwise, return status "fail" and list all remaining violations.
4592
-
4593
- ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`;
4562
+ const files = affectedFiles.join(", ");
4563
+ return [
4564
+ "STRICT INSTRUCTIONS FOR RERUN MODE:",
4565
+ "",
4566
+ "1. VERIFY FIXES: Check if each violation marked as FIXED above has been addressed",
4567
+ " - For violations that are fixed, confirm they no longer appear",
4568
+ ' - For violations that remain unfixed, include them in your violations array (status: "new")',
4569
+ "",
4570
+ "2. UNADDRESSED VIOLATIONS: You MUST include all UNADDRESSED violations listed above in your output array if they still exist.",
4571
+ "",
4572
+ "3. CHECK FOR REGRESSIONS ONLY: You may ONLY report NEW violations if they:",
4573
+ ` - Are in FILES that were modified to fix the above violations: ${files}`,
4574
+ " - Are DIRECTLY caused by the fix changes (e.g., a fix introduced a new bug)",
4575
+ " - Are in the same function/region that was modified to address a previous violation",
4576
+ "",
4577
+ '4. Return status "pass" ONLY if ALL previous violations (including unaddressed ones) are now fixed AND no regressions were introduced.',
4578
+ ' Otherwise, return status "fail" and list all remaining violations.',
4579
+ "",
4580
+ RERUN_SEPARATOR
4581
+ ].join(`
4582
+ `);
4594
4583
  }
4584
+ // src/gates/review-eval.ts
4585
+ var log5 = getCategoryLogger("gate", "review");
4586
+ function logDiffStats(diff, mainLogger) {
4587
+ const diffLines = diff.split(`
4588
+ `).length;
4589
+ const diffChars = diff.length;
4590
+ const diffEstTokens = Math.ceil(diffChars / CHARS_PER_TOKEN);
4591
+ const diffFileRanges = parseDiff(diff);
4592
+ const diffFiles = diffFileRanges.size;
4593
+ const msg = `[diff-stats] files=${diffFiles} lines=${diffLines} chars=${diffChars} est_tokens=${diffEstTokens}`;
4594
+ log5.debug(msg);
4595
+ mainLogger(`${msg}
4596
+ `);
4597
+ }
4598
+ function logInputStats(prompt, diff, adapterLogger) {
4599
+ const promptChars = prompt.length;
4600
+ const diffChars = diff.length;
4601
+ const totalInputChars = promptChars + diffChars;
4602
+ const promptEstTokens = Math.ceil(promptChars / CHARS_PER_TOKEN);
4603
+ const diffEstTokens = Math.ceil(diffChars / CHARS_PER_TOKEN);
4604
+ const totalEstTokens = promptEstTokens + diffEstTokens;
4605
+ const msg = `[input-stats] prompt_chars=${promptChars} diff_chars=${diffChars} total_chars=${totalInputChars} prompt_est_tokens=${promptEstTokens} diff_est_tokens=${diffEstTokens} total_est_tokens=${totalEstTokens}`;
4606
+ adapterLogger(`${msg}
4607
+ `);
4608
+ }
4609
+ var MAX_OUTPUT_SIZE_FOR_JSON_PROBE = 1e5;
4595
4610
  function evaluateOutput(output, diff) {
4596
4611
  const diffRanges = diff ? parseDiff(diff) : undefined;
4597
4612
  try {
4598
4613
  const fromBlock = tryParseJsonBlock(output);
4599
4614
  if (fromBlock)
4600
4615
  return validateAndReturn(fromBlock, diffRanges);
4601
- const fromLast = tryParseLastJson(output);
4602
- if (fromLast)
4603
- return validateAndReturn(fromLast, diffRanges);
4604
- const fromFirst = tryParseFirstJson(output);
4605
- if (fromFirst)
4606
- return validateAndReturn(fromFirst, diffRanges);
4616
+ const fromDirect = tryParseWholeOutput(output);
4617
+ if (fromDirect)
4618
+ return validateAndReturn(fromDirect, diffRanges);
4619
+ if (output.length <= MAX_OUTPUT_SIZE_FOR_JSON_PROBE) {
4620
+ const fromLast = tryParseLastJson(output);
4621
+ if (fromLast)
4622
+ return validateAndReturn(fromLast, diffRanges);
4623
+ const fromFirst = tryParseFirstJson(output);
4624
+ if (fromFirst)
4625
+ return validateAndReturn(fromFirst, diffRanges);
4626
+ }
4607
4627
  return {
4608
4628
  status: "error",
4609
- message: "No valid JSON object found in output"
4629
+ message: output.length > MAX_OUTPUT_SIZE_FOR_JSON_PROBE ? `Output too large (${output.length} bytes) and no JSON found` : "No valid JSON object found in output"
4610
4630
  };
4611
4631
  } catch (error) {
4612
4632
  const err = error;
@@ -4626,12 +4646,26 @@ function tryParseJsonBlock(output) {
4626
4646
  return null;
4627
4647
  }
4628
4648
  }
4649
+ function tryParseWholeOutput(output) {
4650
+ const trimmed = output.trim();
4651
+ if (!(trimmed.startsWith("{") && trimmed.endsWith("}")))
4652
+ return null;
4653
+ try {
4654
+ const json = JSON.parse(trimmed);
4655
+ if (json.status)
4656
+ return json;
4657
+ } catch {}
4658
+ return null;
4659
+ }
4660
+ var MAX_JSON_PROBE_ITERATIONS = 50;
4629
4661
  function tryParseLastJson(output) {
4630
4662
  const end = output.lastIndexOf("}");
4631
4663
  if (end === -1)
4632
4664
  return null;
4633
4665
  let start = output.lastIndexOf("{", end);
4634
- while (start !== -1) {
4666
+ let iterations = 0;
4667
+ while (start !== -1 && iterations < MAX_JSON_PROBE_ITERATIONS) {
4668
+ iterations++;
4635
4669
  try {
4636
4670
  const json = JSON.parse(output.substring(start, end + 1));
4637
4671
  if (json.status)
@@ -6122,6 +6156,7 @@ ${chalk2.bold(SEPARATOR)}`);
6122
6156
  }
6123
6157
  const { overallStatus, statusColor } = computeOverallStatus(results, statusOverride);
6124
6158
  console.error(statusColor(`Status: ${overallStatus}`));
6159
+ console.log(`Status: ${overallStatus}`);
6125
6160
  console.error(chalk2.bold(`${SEPARATOR}
6126
6161
  `));
6127
6162
  }
@@ -8993,6 +9028,17 @@ async function handleNoChanges(ctx, failuresMap) {
8993
9028
  log10.info(getStatusMessage2(status));
8994
9029
  return { status, message: getStatusMessage2(status), gatesRun: 0 };
8995
9030
  }
9031
+ if (failuresMap && failuresMap.size > 0) {
9032
+ let totalViolations = 0;
9033
+ for (const adapterMap of failuresMap.values()) {
9034
+ for (const violations of adapterMap.values()) {
9035
+ totalViolations += violations.length;
9036
+ }
9037
+ }
9038
+ const message = `No changes detected — ${totalViolations} violation(s) still outstanding.`;
9039
+ log10.warn(message);
9040
+ return { status: "failed", message, gatesRun: 0 };
9041
+ }
8996
9042
  log10.info("No changes detected.");
8997
9043
  return {
8998
9044
  status: "no_changes",
@@ -9577,4 +9623,4 @@ if (process.argv.length < 3) {
9577
9623
  }
9578
9624
  program.parse(process.argv);
9579
9625
 
9580
- //# debugId=CD4FBA492EA1ADA764756E2164756E21
9626
+ //# debugId=5FCEF2943643F3B864756E2164756E21