agent-gauntlet 0.15.3 → 0.15.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.js +130 -84
- package/dist/index.js.map +9 -8
- package/package.json +1 -1
- package/skills/gauntlet-run/SKILL.md +2 -2
package/dist/index.js
CHANGED
|
@@ -299,7 +299,7 @@ import { Command } from "commander";
|
|
|
299
299
|
// package.json
|
|
300
300
|
var package_default = {
|
|
301
301
|
name: "agent-gauntlet",
|
|
302
|
-
version: "0.15.3",
|
|
302
|
+
version: "0.15.4",
|
|
303
303
|
description: "A CLI tool for testing AI coding agents",
|
|
304
304
|
license: "MIT",
|
|
305
305
|
author: "Paul Caplan",
|
|
@@ -882,15 +882,15 @@ class ChangeDetector {
|
|
|
882
882
|
}
|
|
883
883
|
return this.getCommitChangedFiles(this.options.commit);
|
|
884
884
|
}
|
|
885
|
-
if (this.options.uncommitted) {
|
|
886
|
-
return this.getUncommittedChangedFiles();
|
|
887
|
-
}
|
|
888
885
|
if (this.options.fixBase) {
|
|
889
886
|
if (!isValidGitRef(this.options.fixBase)) {
|
|
890
887
|
throw new Error(`Invalid fixBase ref: ${this.options.fixBase}`);
|
|
891
888
|
}
|
|
892
889
|
return this.getFixBaseChangedFiles(this.options.fixBase);
|
|
893
890
|
}
|
|
891
|
+
if (this.options.uncommitted) {
|
|
892
|
+
return this.getUncommittedChangedFiles();
|
|
893
|
+
}
|
|
894
894
|
const isCI = process.env.CI === "true" || process.env.GITHUB_ACTIONS === "true";
|
|
895
895
|
if (isCI) {
|
|
896
896
|
return this.getCIChangedFiles();
|
|
@@ -2855,7 +2855,7 @@ function classifyBlock(block) {
|
|
|
2855
2855
|
if (block.includes("descriptor:") && block.includes("dataPointType:") && block.includes("dataPoints:")) {
|
|
2856
2856
|
return "metric";
|
|
2857
2857
|
}
|
|
2858
|
-
if (block.includes("resource:") && /body:\s*'claude_code\.\w+'/.test(block)) {
|
|
2858
|
+
if (block.includes("resource:") && /body:\s*['"]claude_code\.\w+['"]/.test(block)) {
|
|
2859
2859
|
return "log";
|
|
2860
2860
|
}
|
|
2861
2861
|
return "other";
|
|
@@ -2941,13 +2941,13 @@ function parseOtelMetrics(blocks) {
|
|
|
2941
2941
|
return usage;
|
|
2942
2942
|
}
|
|
2943
2943
|
var OTEL_ATTR_RE = {
|
|
2944
|
-
body: /body:\s*'([^']*)'/,
|
|
2945
|
-
tool_result_size_bytes: /tool_result_size_bytes:\s*'([^']*)'/,
|
|
2946
|
-
input_tokens: /input_tokens:\s*'([^']*)'/,
|
|
2947
|
-
output_tokens: /output_tokens:\s*'([^']*)'/,
|
|
2948
|
-
cache_read_tokens: /cache_read_tokens:\s*'([^']*)'/,
|
|
2949
|
-
cache_creation_tokens: /cache_creation_tokens:\s*'([^']*)'/,
|
|
2950
|
-
cost_usd: /cost_usd:\s*'([^']*)'/
|
|
2944
|
+
body: /body:\s*['"]([^'"]*)['"]/,
|
|
2945
|
+
tool_result_size_bytes: /tool_result_size_bytes:\s*['"]([^'"]*)['"]/,
|
|
2946
|
+
input_tokens: /input_tokens:\s*['"]([^'"]*)['"]/,
|
|
2947
|
+
output_tokens: /output_tokens:\s*['"]([^'"]*)['"]/,
|
|
2948
|
+
cache_read_tokens: /cache_read_tokens:\s*['"]([^'"]*)['"]/,
|
|
2949
|
+
cache_creation_tokens: /cache_creation_tokens:\s*['"]([^'"]*)['"]/,
|
|
2950
|
+
cost_usd: /cost_usd:\s*['"]([^'"]*)['"]/
|
|
2951
2951
|
};
|
|
2952
2952
|
var API_REQUEST_FIELDS = [
|
|
2953
2953
|
[OTEL_ATTR_RE.input_tokens, "input"],
|
|
@@ -4491,31 +4491,7 @@ If NO violations are found:
|
|
|
4491
4491
|
}
|
|
4492
4492
|
`;
|
|
4493
4493
|
|
|
4494
|
-
// src/gates/review-
|
|
4495
|
-
var log5 = getCategoryLogger("gate", "review");
|
|
4496
|
-
function logDiffStats(diff, mainLogger) {
|
|
4497
|
-
const diffLines = diff.split(`
|
|
4498
|
-
`).length;
|
|
4499
|
-
const diffChars = diff.length;
|
|
4500
|
-
const diffEstTokens = Math.ceil(diffChars / CHARS_PER_TOKEN);
|
|
4501
|
-
const diffFileRanges = parseDiff(diff);
|
|
4502
|
-
const diffFiles = diffFileRanges.size;
|
|
4503
|
-
const msg = `[diff-stats] files=${diffFiles} lines=${diffLines} chars=${diffChars} est_tokens=${diffEstTokens}`;
|
|
4504
|
-
log5.debug(msg);
|
|
4505
|
-
mainLogger(`${msg}
|
|
4506
|
-
`);
|
|
4507
|
-
}
|
|
4508
|
-
function logInputStats(prompt, diff, adapterLogger) {
|
|
4509
|
-
const promptChars = prompt.length;
|
|
4510
|
-
const diffChars = diff.length;
|
|
4511
|
-
const totalInputChars = promptChars + diffChars;
|
|
4512
|
-
const promptEstTokens = Math.ceil(promptChars / CHARS_PER_TOKEN);
|
|
4513
|
-
const diffEstTokens = Math.ceil(diffChars / CHARS_PER_TOKEN);
|
|
4514
|
-
const totalEstTokens = promptEstTokens + diffEstTokens;
|
|
4515
|
-
const msg = `[input-stats] prompt_chars=${promptChars} diff_chars=${diffChars} total_chars=${totalInputChars} prompt_est_tokens=${promptEstTokens} diff_est_tokens=${diffEstTokens} total_est_tokens=${totalEstTokens}`;
|
|
4516
|
-
adapterLogger(`${msg}
|
|
4517
|
-
`);
|
|
4518
|
-
}
|
|
4494
|
+
// src/gates/review-prompt.ts
|
|
4519
4495
|
function buildReviewPrompt(config, previousViolations = []) {
|
|
4520
4496
|
const baseContent = config.promptContent || "";
|
|
4521
4497
|
if (previousViolations.length > 0) {
|
|
@@ -4533,34 +4509,43 @@ function buildPreviousFailuresSection(violations) {
|
|
|
4533
4509
|
const affectedFiles = [...new Set(violations.map((v) => v.file))];
|
|
4534
4510
|
const lines = [];
|
|
4535
4511
|
lines.push(buildRerunHeader());
|
|
4536
|
-
|
|
4537
|
-
|
|
4512
|
+
lines.push(...formatVerifySection(toVerify));
|
|
4513
|
+
lines.push(...formatUnaddressedSection(unaddressed));
|
|
4514
|
+
lines.push(buildRerunInstructions(affectedFiles));
|
|
4515
|
+
return lines.join(`
|
|
4538
4516
|
`);
|
|
4539
|
-
|
|
4540
|
-
|
|
4541
|
-
|
|
4542
|
-
|
|
4543
|
-
|
|
4544
|
-
if (v.result)
|
|
4545
|
-
lines.push(` Agent result: ${v.result}`);
|
|
4546
|
-
lines.push("");
|
|
4547
|
-
}
|
|
4517
|
+
}
|
|
4518
|
+
function formatVerifySection(toVerify) {
|
|
4519
|
+
if (toVerify.length === 0) {
|
|
4520
|
+
return [`(No violations were marked as FIXED for verification)
|
|
4521
|
+
`];
|
|
4548
4522
|
}
|
|
4549
|
-
|
|
4550
|
-
|
|
4551
|
-
|
|
4552
|
-
|
|
4553
|
-
|
|
4523
|
+
const lines = [];
|
|
4524
|
+
for (const [i, v] of toVerify.entries()) {
|
|
4525
|
+
lines.push(`${i + 1}. ${v.file}:${v.line} - ${v.issue}`);
|
|
4526
|
+
if (v.fix)
|
|
4527
|
+
lines.push(` Suggested fix: ${v.fix}`);
|
|
4528
|
+
if (v.result)
|
|
4529
|
+
lines.push(` Agent result: ${v.result}`);
|
|
4554
4530
|
lines.push("");
|
|
4555
4531
|
}
|
|
4556
|
-
lines
|
|
4557
|
-
return lines.join(`
|
|
4558
|
-
`);
|
|
4532
|
+
return lines;
|
|
4559
4533
|
}
|
|
4534
|
+
function formatUnaddressedSection(unaddressed) {
|
|
4535
|
+
if (unaddressed.length === 0)
|
|
4536
|
+
return [];
|
|
4537
|
+
const lines = [buildUnaddressedHeader()];
|
|
4538
|
+
for (const [i, v] of unaddressed.entries()) {
|
|
4539
|
+
lines.push(`${i + 1}. ${v.file}:${v.line} - ${v.issue}`);
|
|
4540
|
+
}
|
|
4541
|
+
lines.push("");
|
|
4542
|
+
return lines;
|
|
4543
|
+
}
|
|
4544
|
+
var RERUN_SEPARATOR = "━".repeat(46);
|
|
4560
4545
|
function buildRerunHeader() {
|
|
4561
|
-
return
|
|
4546
|
+
return `${RERUN_SEPARATOR}
|
|
4562
4547
|
RERUN MODE: VERIFY PREVIOUS FIXES ONLY
|
|
4563
|
-
|
|
4548
|
+
${RERUN_SEPARATOR}
|
|
4564
4549
|
|
|
4565
4550
|
This is a RERUN review. The agent attempted to fix some of the violations listed below.
|
|
4566
4551
|
Your task is STRICTLY LIMITED to verifying the fixes for violations marked as FIXED.
|
|
@@ -4574,39 +4559,74 @@ The following violations were NOT marked as fixed or skipped and are still activ
|
|
|
4574
4559
|
`;
|
|
4575
4560
|
}
|
|
4576
4561
|
function buildRerunInstructions(affectedFiles) {
|
|
4577
|
-
|
|
4578
|
-
|
|
4579
|
-
|
|
4580
|
-
|
|
4581
|
-
|
|
4582
|
-
|
|
4583
|
-
|
|
4584
|
-
|
|
4585
|
-
|
|
4586
|
-
|
|
4587
|
-
|
|
4588
|
-
- Are in
|
|
4589
|
-
|
|
4590
|
-
|
|
4591
|
-
|
|
4592
|
-
|
|
4593
|
-
|
|
4562
|
+
const files = affectedFiles.join(", ");
|
|
4563
|
+
return [
|
|
4564
|
+
"STRICT INSTRUCTIONS FOR RERUN MODE:",
|
|
4565
|
+
"",
|
|
4566
|
+
"1. VERIFY FIXES: Check if each violation marked as FIXED above has been addressed",
|
|
4567
|
+
" - For violations that are fixed, confirm they no longer appear",
|
|
4568
|
+
' - For violations that remain unfixed, include them in your violations array (status: "new")',
|
|
4569
|
+
"",
|
|
4570
|
+
"2. UNADDRESSED VIOLATIONS: You MUST include all UNADDRESSED violations listed above in your output array if they still exist.",
|
|
4571
|
+
"",
|
|
4572
|
+
"3. CHECK FOR REGRESSIONS ONLY: You may ONLY report NEW violations if they:",
|
|
4573
|
+
` - Are in FILES that were modified to fix the above violations: ${files}`,
|
|
4574
|
+
" - Are DIRECTLY caused by the fix changes (e.g., a fix introduced a new bug)",
|
|
4575
|
+
" - Are in the same function/region that was modified to address a previous violation",
|
|
4576
|
+
"",
|
|
4577
|
+
'4. Return status "pass" ONLY if ALL previous violations (including unaddressed ones) are now fixed AND no regressions were introduced.',
|
|
4578
|
+
' Otherwise, return status "fail" and list all remaining violations.',
|
|
4579
|
+
"",
|
|
4580
|
+
RERUN_SEPARATOR
|
|
4581
|
+
].join(`
|
|
4582
|
+
`);
|
|
4594
4583
|
}
|
|
4584
|
+
// src/gates/review-eval.ts
|
|
4585
|
+
var log5 = getCategoryLogger("gate", "review");
|
|
4586
|
+
function logDiffStats(diff, mainLogger) {
|
|
4587
|
+
const diffLines = diff.split(`
|
|
4588
|
+
`).length;
|
|
4589
|
+
const diffChars = diff.length;
|
|
4590
|
+
const diffEstTokens = Math.ceil(diffChars / CHARS_PER_TOKEN);
|
|
4591
|
+
const diffFileRanges = parseDiff(diff);
|
|
4592
|
+
const diffFiles = diffFileRanges.size;
|
|
4593
|
+
const msg = `[diff-stats] files=${diffFiles} lines=${diffLines} chars=${diffChars} est_tokens=${diffEstTokens}`;
|
|
4594
|
+
log5.debug(msg);
|
|
4595
|
+
mainLogger(`${msg}
|
|
4596
|
+
`);
|
|
4597
|
+
}
|
|
4598
|
+
function logInputStats(prompt, diff, adapterLogger) {
|
|
4599
|
+
const promptChars = prompt.length;
|
|
4600
|
+
const diffChars = diff.length;
|
|
4601
|
+
const totalInputChars = promptChars + diffChars;
|
|
4602
|
+
const promptEstTokens = Math.ceil(promptChars / CHARS_PER_TOKEN);
|
|
4603
|
+
const diffEstTokens = Math.ceil(diffChars / CHARS_PER_TOKEN);
|
|
4604
|
+
const totalEstTokens = promptEstTokens + diffEstTokens;
|
|
4605
|
+
const msg = `[input-stats] prompt_chars=${promptChars} diff_chars=${diffChars} total_chars=${totalInputChars} prompt_est_tokens=${promptEstTokens} diff_est_tokens=${diffEstTokens} total_est_tokens=${totalEstTokens}`;
|
|
4606
|
+
adapterLogger(`${msg}
|
|
4607
|
+
`);
|
|
4608
|
+
}
|
|
4609
|
+
var MAX_OUTPUT_SIZE_FOR_JSON_PROBE = 1e5;
|
|
4595
4610
|
function evaluateOutput(output, diff) {
|
|
4596
4611
|
const diffRanges = diff ? parseDiff(diff) : undefined;
|
|
4597
4612
|
try {
|
|
4598
4613
|
const fromBlock = tryParseJsonBlock(output);
|
|
4599
4614
|
if (fromBlock)
|
|
4600
4615
|
return validateAndReturn(fromBlock, diffRanges);
|
|
4601
|
-
const
|
|
4602
|
-
if (
|
|
4603
|
-
return validateAndReturn(
|
|
4604
|
-
|
|
4605
|
-
|
|
4606
|
-
|
|
4616
|
+
const fromDirect = tryParseWholeOutput(output);
|
|
4617
|
+
if (fromDirect)
|
|
4618
|
+
return validateAndReturn(fromDirect, diffRanges);
|
|
4619
|
+
if (output.length <= MAX_OUTPUT_SIZE_FOR_JSON_PROBE) {
|
|
4620
|
+
const fromLast = tryParseLastJson(output);
|
|
4621
|
+
if (fromLast)
|
|
4622
|
+
return validateAndReturn(fromLast, diffRanges);
|
|
4623
|
+
const fromFirst = tryParseFirstJson(output);
|
|
4624
|
+
if (fromFirst)
|
|
4625
|
+
return validateAndReturn(fromFirst, diffRanges);
|
|
4626
|
+
}
|
|
4607
4627
|
return {
|
|
4608
4628
|
status: "error",
|
|
4609
|
-
message: "No valid JSON object found in output"
|
|
4629
|
+
message: output.length > MAX_OUTPUT_SIZE_FOR_JSON_PROBE ? `Output too large (${output.length} bytes) and no JSON found` : "No valid JSON object found in output"
|
|
4610
4630
|
};
|
|
4611
4631
|
} catch (error) {
|
|
4612
4632
|
const err = error;
|
|
@@ -4626,12 +4646,26 @@ function tryParseJsonBlock(output) {
|
|
|
4626
4646
|
return null;
|
|
4627
4647
|
}
|
|
4628
4648
|
}
|
|
4649
|
+
function tryParseWholeOutput(output) {
|
|
4650
|
+
const trimmed = output.trim();
|
|
4651
|
+
if (!(trimmed.startsWith("{") && trimmed.endsWith("}")))
|
|
4652
|
+
return null;
|
|
4653
|
+
try {
|
|
4654
|
+
const json = JSON.parse(trimmed);
|
|
4655
|
+
if (json.status)
|
|
4656
|
+
return json;
|
|
4657
|
+
} catch {}
|
|
4658
|
+
return null;
|
|
4659
|
+
}
|
|
4660
|
+
var MAX_JSON_PROBE_ITERATIONS = 50;
|
|
4629
4661
|
function tryParseLastJson(output) {
|
|
4630
4662
|
const end = output.lastIndexOf("}");
|
|
4631
4663
|
if (end === -1)
|
|
4632
4664
|
return null;
|
|
4633
4665
|
let start = output.lastIndexOf("{", end);
|
|
4634
|
-
|
|
4666
|
+
let iterations = 0;
|
|
4667
|
+
while (start !== -1 && iterations < MAX_JSON_PROBE_ITERATIONS) {
|
|
4668
|
+
iterations++;
|
|
4635
4669
|
try {
|
|
4636
4670
|
const json = JSON.parse(output.substring(start, end + 1));
|
|
4637
4671
|
if (json.status)
|
|
@@ -6122,6 +6156,7 @@ ${chalk2.bold(SEPARATOR)}`);
|
|
|
6122
6156
|
}
|
|
6123
6157
|
const { overallStatus, statusColor } = computeOverallStatus(results, statusOverride);
|
|
6124
6158
|
console.error(statusColor(`Status: ${overallStatus}`));
|
|
6159
|
+
console.log(`Status: ${overallStatus}`);
|
|
6125
6160
|
console.error(chalk2.bold(`${SEPARATOR}
|
|
6126
6161
|
`));
|
|
6127
6162
|
}
|
|
@@ -8993,6 +9028,17 @@ async function handleNoChanges(ctx, failuresMap) {
|
|
|
8993
9028
|
log10.info(getStatusMessage2(status));
|
|
8994
9029
|
return { status, message: getStatusMessage2(status), gatesRun: 0 };
|
|
8995
9030
|
}
|
|
9031
|
+
if (failuresMap && failuresMap.size > 0) {
|
|
9032
|
+
let totalViolations = 0;
|
|
9033
|
+
for (const adapterMap of failuresMap.values()) {
|
|
9034
|
+
for (const violations of adapterMap.values()) {
|
|
9035
|
+
totalViolations += violations.length;
|
|
9036
|
+
}
|
|
9037
|
+
}
|
|
9038
|
+
const message = `No changes detected — ${totalViolations} violation(s) still outstanding.`;
|
|
9039
|
+
log10.warn(message);
|
|
9040
|
+
return { status: "failed", message, gatesRun: 0 };
|
|
9041
|
+
}
|
|
8996
9042
|
log10.info("No changes detected.");
|
|
8997
9043
|
return {
|
|
8998
9044
|
status: "no_changes",
|
|
@@ -9577,4 +9623,4 @@ if (process.argv.length < 3) {
|
|
|
9577
9623
|
}
|
|
9578
9624
|
program.parse(process.argv);
|
|
9579
9625
|
|
|
9580
|
-
//# debugId=
|
|
9626
|
+
//# debugId=5FCEF2943643F3B864756E2164756E21
|