@pushpalsdev/cli 1.1.5 → 1.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/pushpals-cli.js +222 -48
- package/package.json +1 -1
- package/runtime/configs/default.toml +3 -1
- package/runtime/configs/local.example.toml +3 -1
- package/runtime/sandbox/.pushpals-remotebuddy-fallback.js +13 -1
- package/runtime/sandbox/apps/workerpals/src/backends/openai_codex_backend.ts +1 -0
- package/runtime/sandbox/apps/workerpals/src/common/generic_python_executor.ts +22 -4
- package/runtime/sandbox/apps/workerpals/src/docker_executor.ts +28 -2
- package/runtime/sandbox/apps/workerpals/src/execute_job.ts +457 -172
- package/runtime/sandbox/apps/workerpals/src/merge_conflict_job.ts +21 -1
- package/runtime/sandbox/apps/workerpals/src/workerpals_main.ts +5 -1
- package/runtime/sandbox/configs/default.toml +3 -1
- package/runtime/sandbox/configs/local.example.toml +3 -1
- package/runtime/sandbox/packages/shared/src/config.ts +26 -1
|
@@ -635,6 +635,7 @@ export function tokenizeValidationCommandArgv(command: string): string[] | null
|
|
|
635
635
|
const out: string[] = [];
|
|
636
636
|
let current = "";
|
|
637
637
|
let quote: "'" | '"' | null = null;
|
|
638
|
+
let escaped = false;
|
|
638
639
|
|
|
639
640
|
const pushCurrent = () => {
|
|
640
641
|
if (!current) return;
|
|
@@ -643,7 +644,16 @@ export function tokenizeValidationCommandArgv(command: string): string[] | null
|
|
|
643
644
|
};
|
|
644
645
|
|
|
645
646
|
for (const ch of trimmed) {
|
|
647
|
+
if (escaped) {
|
|
648
|
+
current += ch;
|
|
649
|
+
escaped = false;
|
|
650
|
+
continue;
|
|
651
|
+
}
|
|
646
652
|
if (quote) {
|
|
653
|
+
if (quote === '"' && ch === "\\") {
|
|
654
|
+
escaped = true;
|
|
655
|
+
continue;
|
|
656
|
+
}
|
|
647
657
|
if (ch === quote) {
|
|
648
658
|
quote = null;
|
|
649
659
|
} else {
|
|
@@ -662,6 +672,7 @@ export function tokenizeValidationCommandArgv(command: string): string[] | null
|
|
|
662
672
|
}
|
|
663
673
|
current += ch;
|
|
664
674
|
}
|
|
675
|
+
if (escaped) current += "\\";
|
|
665
676
|
if (quote) return null;
|
|
666
677
|
pushCurrent();
|
|
667
678
|
if (out.length === 0) return null;
|
|
@@ -826,6 +837,11 @@ export async function runValidationArgv(
|
|
|
826
837
|
outputPolicy: Partial<OutputCompactionPolicy>,
|
|
827
838
|
timeoutMessage: string,
|
|
828
839
|
): Promise<ValidationExecutionResult> {
|
|
840
|
+
type ValidationWaitResult =
|
|
841
|
+
| { type: "exit"; code: number }
|
|
842
|
+
| { type: "timeout" }
|
|
843
|
+
| { type: "failure-signal" }
|
|
844
|
+
| { type: "success-signal" };
|
|
829
845
|
const startedAt = Date.now();
|
|
830
846
|
const proc = Bun.spawn(argv, {
|
|
831
847
|
cwd: repo,
|
|
@@ -846,7 +862,7 @@ export async function runValidationArgv(
|
|
|
846
862
|
let stoppedAfterSuccessSignal = false;
|
|
847
863
|
const timeout = Math.max(1_000, timeoutMs);
|
|
848
864
|
let timeoutTimer: ReturnType<typeof setTimeout> | null = null;
|
|
849
|
-
const timeoutPromise = new Promise<
|
|
865
|
+
const timeoutPromise = new Promise<ValidationWaitResult>((resolveTimeout) => {
|
|
850
866
|
timeoutTimer = setTimeout(() => {
|
|
851
867
|
timedOut = true;
|
|
852
868
|
resolveTimeout({ type: "timeout" });
|
|
@@ -855,7 +871,7 @@ export async function runValidationArgv(
|
|
|
855
871
|
|
|
856
872
|
let browserSignalTimer: ReturnType<typeof setInterval> | null = null;
|
|
857
873
|
const browserSignalPromise = isLongRunningBrowserValidationCommand(command)
|
|
858
|
-
? new Promise<
|
|
874
|
+
? new Promise<ValidationWaitResult>((resolveBrowserSignal) => {
|
|
859
875
|
const idleMs = browserValidationFailureIdleMs(env);
|
|
860
876
|
const successIdleMs = browserValidationSuccessIdleMs(env);
|
|
861
877
|
browserSignalTimer = setInterval(() => {
|
|
@@ -877,11 +893,11 @@ export async function runValidationArgv(
|
|
|
877
893
|
}
|
|
878
894
|
}, 250);
|
|
879
895
|
})
|
|
880
|
-
: new Promise<
|
|
896
|
+
: new Promise<ValidationWaitResult>(() => {
|
|
881
897
|
// Non-browser validations should only end on process exit or timeout.
|
|
882
898
|
});
|
|
883
899
|
|
|
884
|
-
const exitOrTimeout = await Promise.race([
|
|
900
|
+
const exitOrTimeout = await Promise.race<ValidationWaitResult>([
|
|
885
901
|
proc.exited.then((code) => ({ type: "exit" as const, code })),
|
|
886
902
|
timeoutPromise,
|
|
887
903
|
browserSignalPromise,
|
|
@@ -1740,9 +1756,9 @@ function classifyBrowserValidationFailureKindFromText(text: string): BrowserVali
|
|
|
1740
1756
|
|
|
1741
1757
|
function extractBrowserValidationStage(text: string): string | null {
|
|
1742
1758
|
const patterns = [
|
|
1743
|
-
/\bBrowser validation failed during\s+([^:.\r\n]+?)\s+stage\b/i,
|
|
1744
|
-
/\bfailed during\s+([^:.\r\n]+?)\s+stage\b/i,
|
|
1745
|
-
/\b(?:stage|phase)\s*[:=]\s*["'`]?([^"'`.\r\n]+)["'`]?/i,
|
|
1759
|
+
/\bBrowser validation failed during\s+([^:.\r\n|]+?)\s+stage\b/i,
|
|
1760
|
+
/\bfailed during\s+([^:.\r\n|]+?)\s+stage\b/i,
|
|
1761
|
+
/\b(?:stage|phase)\s*[:=]\s*["'`]?([^"'`.\r\n|]+)["'`]?/i,
|
|
1746
1762
|
];
|
|
1747
1763
|
for (const pattern of patterns) {
|
|
1748
1764
|
const match = text.match(pattern);
|
|
@@ -1757,6 +1773,27 @@ function extractBrowserValidationStage(text: string): string | null {
|
|
|
1757
1773
|
return null;
|
|
1758
1774
|
}
|
|
1759
1775
|
|
|
1776
|
+
/**
 * Collapses a loosely extracted browser-validation stage onto a fixed stage
 * label when any of the supplied evidence mentions a known selector or phrase.
 *
 * The stage, selector, expected-UI text and raw output are joined into one
 * string (empty/null parts dropped), passed through
 * `stripAnsiControlSequences`, lower-cased, and tested against the hint
 * patterns in order; the first hit wins.
 *
 * @param stage - Stage extracted from the output, or `null` if none was found.
 * @param selector - Failing selector, if one was extracted.
 * @param expected - Expected-UI description, if one was extracted.
 * @param text - Raw validation output used as additional evidence.
 * @returns The fixed label for the first matching hint, otherwise `stage` unchanged.
 */
function refineBrowserValidationStage(
  stage: string | null,
  selector: string | null,
  expected: string | null,
  text: string,
): string | null {
  const evidence = [stage, selector, expected, text].filter(Boolean).join(" ");
  const haystack = stripAnsiControlSequences(evidence).toLowerCase();

  // Ordered hint table: earlier entries take precedence over later ones.
  const stageHints: ReadonlyArray<readonly [RegExp, string]> = [
    [
      /\b(game-control-panel|planet control panel|selected planet panel)\b/i,
      "planet control panel",
    ],
    [/\bsettings-home-button\b|\breturn to home from settings\b/i, "settings return"],
    [/\bshop-home-button\b|\breturn to home from shop\b/i, "shop return"],
  ];

  for (const [hint, label] of stageHints) {
    if (hint.test(haystack)) return label;
  }
  return stage;
}
|
|
1796
|
+
|
|
1760
1797
|
function inferBrowserValidationFailureFocus(params: {
|
|
1761
1798
|
stage?: string | null;
|
|
1762
1799
|
selector?: string | null;
|
|
@@ -1980,13 +2017,60 @@ function summarizeBrowserValidationOutput(text: string): string {
|
|
|
1980
2017
|
.map((line) => line.trim())
|
|
1981
2018
|
.filter(Boolean)
|
|
1982
2019
|
.filter((line) =>
|
|
1983
|
-
/\b(Web end-to-end smoke test failed|Browser validation failed|Expected |locator\.|page\.|waiting for getBy|Call log:|ERR_SOCKET_BAD_PORT|EADDRINUSE|EPERM|EACCES|browserType\.launch|Executable doesn't exist|Expo exited early|freeport|net::ERR_|Validation command timed out|terminated by signal|SIGTERM|timed out after \d+ms)/i.test(
|
|
2020
|
+
/\b(Web end-to-end smoke test failed|Browser validation failed|Expected |locator\.|page\.|waiting for getBy|Call log:|Verified:|Saved screenshot|Saved trace|ERR_SOCKET_BAD_PORT|EADDRINUSE|EPERM|EACCES|browserType\.launch|Executable doesn't exist|Expo exited early|freeport|net::ERR_|Validation command timed out|terminated by signal|SIGTERM|timed out after \d+ms)/i.test(
|
|
1984
2021
|
line,
|
|
1985
2022
|
),
|
|
1986
2023
|
);
|
|
1987
2024
|
return toSingleLine(lines.slice(0, 8).join(" | "), 900);
|
|
1988
2025
|
}
|
|
1989
2026
|
|
|
2027
|
+
/**
 * Finds the last `Verified: <label>` marker in browser validation output.
 *
 * The text is passed through `stripAnsiControlSequences` first. A label runs
 * up to the next `|`, carriage return or newline; labels that are empty after
 * trimming are ignored.
 *
 * @param text - Raw validation output (may contain several `Verified:` markers).
 * @returns The final label passed through `toSingleLine(label, 80)`, or `null`
 *          when no usable marker exists.
 */
function lastBrowserVerifiedStage(text: string): string | null {
  let latest: string | null = null;
  for (const hit of stripAnsiControlSequences(text).matchAll(/\bVerified:\s+([^|\r\n]+)/gi)) {
    const label = hit[1]?.trim();
    // Later markers overwrite earlier ones so only the last one survives.
    if (label) latest = label;
  }
  return latest ? toSingleLine(latest, 80) : null;
}
|
|
2034
|
+
|
|
2035
|
+
/**
 * Builds the failure digest used when comparing a failed validation run with
 * a previous attempt.
 *
 * For every run the result starts from `extractValidationFailureDigest(run)`.
 * Only when the command is a long-running browser validation AND the failure
 * text is classified as `"assertion"` is the digest enriched with
 * `stage=`, `selector=`, `expected=`, `last verified=` and an output summary,
 * joined with `" | "` and passed through `toSingleLine(…, 900)`.
 *
 * @param run - Command plus captured output of the failed validation run.
 * @param repo - Repository path forwarded to `summarizeRecentBrowserValidationLogs`.
 * @returns The enriched digest, or the base digest when enrichment does not
 *          apply or produces an empty string.
 */
export function extractValidationFailureRetryDigest(
  run: {
    command: string;
    stdout?: string;
    stderr?: string;
    exitCode?: number;
    elapsedMs?: number;
  },
  repo?: string,
): string {
  const baseDigest = extractValidationFailureDigest(run);
  if (!isLongRunningBrowserValidationCommand(run.command)) {
    return baseDigest;
  }

  const rawOutput = [run.stderr, run.stdout].filter(Boolean).join("\n");
  const cleanedOutput = stripAnsiControlSequences(rawOutput);
  const kind = classifyBrowserValidationFailureKindFromText(`${baseDigest}\n${cleanedOutput}`);
  if (kind !== "assertion") {
    return baseDigest;
  }

  // Fold recent browser-validation logs into the evidence before extracting details.
  const logSummary = summarizeRecentBrowserValidationLogs(repo);
  const evidence = [cleanedOutput, logSummary].filter(Boolean).join("\n");

  const selector = extractBrowserValidationSelector(evidence);
  const expected = extractBrowserValidationExpectedUi(evidence);
  const rawStage = extractBrowserValidationStage(evidence);
  const stage = refineBrowserValidationStage(rawStage, selector, expected, evidence);
  const lastVerified = lastBrowserVerifiedStage(evidence);
  const output = summarizeBrowserValidationOutput(evidence);

  const segments: string[] = [];
  if (baseDigest) segments.push(baseDigest);
  if (stage) segments.push(`stage=${stage}`);
  if (selector) segments.push(`selector=${selector}`);
  if (expected) segments.push(`expected=${expected}`);
  if (lastVerified) segments.push(`last verified=${lastVerified}`);
  // Skip the output summary when it would merely repeat the base digest.
  if (output && output !== baseDigest) segments.push(output);

  const enriched = toSingleLine(segments.join(" | "), 900);
  return enriched || baseDigest;
}
|
|
2073
|
+
|
|
1990
2074
|
export function buildBrowserValidationRepairPacket(
|
|
1991
2075
|
validationRuns: ValidationExecutionResult[],
|
|
1992
2076
|
previousFailureDigests: Map<string, string> = new Map(),
|
|
@@ -1995,15 +2079,24 @@ export function buildBrowserValidationRepairPacket(
|
|
|
1995
2079
|
for (const run of validationRuns) {
|
|
1996
2080
|
if (run.ok || !isLongRunningBrowserValidationCommand(run.command)) continue;
|
|
1997
2081
|
const combined = stripAnsiControlSequences([run.stderr, run.stdout].filter(Boolean).join("\n"));
|
|
1998
|
-
const
|
|
1999
|
-
const failureKind = classifyBrowserValidationFailureKindFromText(`${
|
|
2082
|
+
const baseDigest = extractValidationFailureDigest(run);
|
|
2083
|
+
const failureKind = classifyBrowserValidationFailureKindFromText(`${baseDigest}\n${combined}`);
|
|
2000
2084
|
if (failureKind === "unknown") continue;
|
|
2085
|
+
const digest =
|
|
2086
|
+
failureKind === "assertion"
|
|
2087
|
+
? extractValidationFailureRetryDigest(run, repo) || baseDigest
|
|
2088
|
+
: baseDigest;
|
|
2001
2089
|
const previousDigest = previousFailureDigests.get(validationCommandKey(run.command)) ?? null;
|
|
2002
2090
|
const recentLogSummary = summarizeRecentBrowserValidationLogs(repo);
|
|
2003
2091
|
const enrichedBrowserContext = [combined, recentLogSummary].filter(Boolean).join("\n");
|
|
2004
|
-
const stage = extractBrowserValidationStage(enrichedBrowserContext);
|
|
2005
2092
|
const selector = extractBrowserValidationSelector(enrichedBrowserContext);
|
|
2006
2093
|
const expected = extractBrowserValidationExpectedUi(enrichedBrowserContext);
|
|
2094
|
+
const stage = refineBrowserValidationStage(
|
|
2095
|
+
extractBrowserValidationStage(enrichedBrowserContext),
|
|
2096
|
+
selector,
|
|
2097
|
+
expected,
|
|
2098
|
+
enrichedBrowserContext,
|
|
2099
|
+
);
|
|
2007
2100
|
const previousStage = previousDigest ? extractBrowserValidationStage(previousDigest) : null;
|
|
2008
2101
|
const previousSelector = previousDigest ? extractBrowserValidationSelector(previousDigest) : null;
|
|
2009
2102
|
const previousExpected = previousDigest ? extractBrowserValidationExpectedUi(previousDigest) : null;
|
|
@@ -2021,17 +2114,21 @@ export function buildBrowserValidationRepairPacket(
|
|
|
2021
2114
|
text: previousDigest,
|
|
2022
2115
|
})
|
|
2023
2116
|
: null;
|
|
2117
|
+
const sameFailureSignal =
|
|
2118
|
+
Boolean(previousDigest) &&
|
|
2119
|
+
(previousDigest === digest ||
|
|
2120
|
+
(Boolean(failureFocus) &&
|
|
2121
|
+
failureFocus === previousFailureFocus &&
|
|
2122
|
+
(!selector || !previousSelector || selector === previousSelector)));
|
|
2024
2123
|
const progress =
|
|
2025
2124
|
previousDigest == null
|
|
2026
2125
|
? "first_failure"
|
|
2027
|
-
:
|
|
2126
|
+
: sameFailureSignal
|
|
2028
2127
|
? "same_failure"
|
|
2029
2128
|
: "new_failure";
|
|
2030
2129
|
const needsDiagnosticProbe =
|
|
2031
2130
|
failureKind === "assertion" &&
|
|
2032
|
-
|
|
2033
|
-
Boolean(failureFocus) &&
|
|
2034
|
-
failureFocus === previousFailureFocus;
|
|
2131
|
+
sameFailureSignal;
|
|
2035
2132
|
return {
|
|
2036
2133
|
command: run.command,
|
|
2037
2134
|
failureKind,
|
|
@@ -2203,14 +2300,19 @@ export function inferFallbackValidationCommandsForTestTask(
|
|
|
2203
2300
|
/\b(pytest|python)\b/.test(lowerInstruction) ||
|
|
2204
2301
|
changedTestPaths.some((entry) => entry.toLowerCase().endsWith(".py"));
|
|
2205
2302
|
|
|
2303
|
+
const bunTestPath = (path: string) => formatBunTestPathArg(path);
|
|
2206
2304
|
const normalizedTarget = (targetPath ?? "").replace(/\\/g, "/").trim();
|
|
2207
2305
|
if (normalizedTarget && isLikelyTestPath(normalizedTarget)) {
|
|
2208
|
-
add(pythonSignal ? `pytest ${normalizedTarget}` : `bun test ${normalizedTarget}`);
|
|
2306
|
+
add(pythonSignal ? `pytest ${normalizedTarget}` : `bun test ${bunTestPath(normalizedTarget)}`);
|
|
2209
2307
|
}
|
|
2210
2308
|
|
|
2211
2309
|
if (changedTestPaths.length > 0) {
|
|
2212
|
-
const focused = changedTestPaths.slice(0, 4)
|
|
2213
|
-
add(
|
|
2310
|
+
const focused = changedTestPaths.slice(0, 4);
|
|
2311
|
+
add(
|
|
2312
|
+
pythonSignal
|
|
2313
|
+
? `pytest ${focused.join(" ")}`
|
|
2314
|
+
: `bun test ${focused.map((entry) => bunTestPath(entry)).join(" ")}`,
|
|
2315
|
+
);
|
|
2214
2316
|
}
|
|
2215
2317
|
|
|
2216
2318
|
const scopeHints = [
|
|
@@ -2238,6 +2340,24 @@ export function inferFallbackValidationCommandsForTestTask(
|
|
|
2238
2340
|
return candidates.slice(0, 4);
|
|
2239
2341
|
}
|
|
2240
2342
|
|
|
2343
|
+
/**
 * Formats a test file path as an argument for a `bun test` command line.
 *
 * Backslashes become forward slashes and surrounding whitespace is trimmed.
 * A path that does not already start with `./`, `../`, `/` or a drive prefix
 * such as `C:/` gets a leading `./`. The result is then passed through
 * `quoteValidationCommandArg`.
 *
 * @param path - Test path, possibly using Windows separators.
 * @returns The formatted argument, or an empty string when the input is
 *          empty after normalisation.
 */
export function formatBunTestPathArg(path: string): string {
  const cleaned = String(path ?? "").replace(/\\/g, "/").trim();
  if (cleaned.length === 0) return cleaned;

  const hasExplicitPrefix =
    ["./", "../", "/"].some((prefix) => cleaned.startsWith(prefix)) ||
    /^[A-Za-z]:\//.test(cleaned);

  return quoteValidationCommandArg(hasExplicitPrefix ? cleaned : `./${cleaned}`);
}
|
|
2355
|
+
|
|
2356
|
+
/**
 * Quotes a single argument for embedding in a validation command string.
 *
 * Arguments without whitespace, quote characters or backslashes are returned
 * untouched. Anything else is wrapped in double quotes, with embedded
 * backslashes and double quotes escaped by a preceding backslash.
 *
 * A single quote also triggers quoting: the validation command tokenizer in
 * this file declares both `'` and `"` as quote characters, so an unquoted `'`
 * (for example in `it's.test.ts`) would otherwise open a quoted section when
 * the command is tokenized again. Inside the double-quoted form the `'` is
 * emitted as-is.
 *
 * @param arg - Raw argument value.
 * @returns The argument, double-quoted and escaped when necessary.
 */
function quoteValidationCommandArg(arg: string): string {
  if (!/[\s"'\\]/.test(arg)) return arg;
  // "$&" re-inserts the matched character, so each `\` or `"` gains one leading backslash.
  return `"${arg.replace(/[\\"]/g, "\\$&")}"`;
}
|
|
2360
|
+
|
|
2241
2361
|
export function isTestFocusedTask(
|
|
2242
2362
|
instruction: string,
|
|
2243
2363
|
planning: TaskExecutePlanning,
|
|
@@ -2642,49 +2762,67 @@ async function runDeterministicQualityGate(
|
|
|
2642
2762
|
};
|
|
2643
2763
|
}
|
|
2644
2764
|
|
|
2645
|
-
|
|
2646
|
-
|
|
2647
|
-
|
|
2648
|
-
|
|
2765
|
+
/**
 * What the critic gate does when a critic review request times out.
 *
 * As used by the LLM critic review in this file: `"retry_once"` retries a
 * single time with compact critic input, `"block"` returns a blocking
 * timeout review, and `"skip"` logs the timeout and returns no review.
 */
type QualityCriticTimeoutBehavior = "skip" | "retry_once" | "block";
|
|
2766
|
+
|
|
2767
|
+
/**
 * Resolves the critic review timeout from runtime configuration.
 *
 * The configured value is coerced with `Number(...)`. Non-finite results
 * (for example `undefined` or non-numeric text) fall back to 90 000 ms;
 * finite values are floored and clamped to the range 1 000 – 7 200 000 ms.
 *
 * @param runtimeConfig - Runtime configuration; reads `workerpals.qualityCriticTimeoutMs`.
 * @returns Timeout in milliseconds.
 */
function resolveQualityCriticTimeoutMs(runtimeConfig: WorkerpalsRuntimeConfig): number {
  const configured = Number(runtimeConfig.workerpals.qualityCriticTimeoutMs);
  if (Number.isFinite(configured)) {
    const whole = Math.floor(configured);
    if (whole < 1_000) return 1_000;
    return whole > 7_200_000 ? 7_200_000 : whole;
  }
  return 90_000;
}
|
|
2772
|
+
|
|
2773
|
+
/**
 * Resolves the configured critic timeout behavior.
 *
 * The configured value is stringified, trimmed, lower-cased and has every
 * `-` replaced by `_`, so `"Retry-Once"` and `"retry_once"` are equivalent.
 * Anything other than `skip`, `retry_once` or `block` (including a missing
 * value) resolves to `"retry_once"`.
 *
 * @param runtimeConfig - Runtime configuration; reads `workerpals.qualityCriticTimeoutBehavior`.
 * @returns The normalised behavior.
 */
function resolveQualityCriticTimeoutBehavior(
  runtimeConfig: WorkerpalsRuntimeConfig,
): QualityCriticTimeoutBehavior {
  const configured = runtimeConfig.workerpals.qualityCriticTimeoutBehavior ?? "";
  const normalized = String(configured).trim().toLowerCase().replace(/-/g, "_");
  switch (normalized) {
    case "skip":
    case "block":
      return normalized;
    default:
      // Covers an explicit "retry_once" as well as every unrecognised value.
      return "retry_once";
  }
}
|
|
2655
2783
|
|
|
2656
|
-
|
|
2657
|
-
|
|
2658
|
-
|
|
2659
|
-
|
|
2660
|
-
|
|
2661
|
-
|
|
2662
|
-
|
|
2663
|
-
|
|
2664
|
-
|
|
2665
|
-
|
|
2666
|
-
|
|
2667
|
-
const
|
|
2668
|
-
|
|
2669
|
-
|
|
2670
|
-
|
|
2671
|
-
|
|
2672
|
-
const qualityCriticTimeoutMs = (() => {
|
|
2673
|
-
const value = Number(runtimeConfig.workerpals.qualityCriticTimeoutMs);
|
|
2674
|
-
if (!Number.isFinite(value)) return 45_000;
|
|
2675
|
-
return Math.max(1_000, Math.min(7_200_000, Math.floor(value)));
|
|
2676
|
-
})();
|
|
2677
|
-
diffText = compactJobOutput(diffText, outputPolicyForRuntime(runtimeConfig)).slice(
|
|
2678
|
-
0,
|
|
2679
|
-
qualityCriticMaxDiffChars,
|
|
2680
|
-
);
|
|
2784
|
+
/**
 * Picks the model name to use for the critic review.
 *
 * @param runtimeConfig - Runtime configuration; reads `workerpals.qualityCriticModel`.
 * @param fallback - Model name used when no critic model is configured.
 * @returns The trimmed configured model when it is non-empty, otherwise the
 *          trimmed fallback (which may be an empty string).
 */
function resolveQualityCriticModel(
  runtimeConfig: WorkerpalsRuntimeConfig,
  fallback = "",
): string {
  const configured = String(runtimeConfig.workerpals.qualityCriticModel ?? "").trim();
  if (configured.length > 0) return configured;
  return fallback.trim();
}
|
|
2790
|
+
|
|
2791
|
+
/**
 * Resolves how many characters of diff text may be sent to the critic.
 *
 * The configured value is coerced with `Number(...)`; non-finite results
 * default to 16 000. The value is floored and clamped to 256 – 524 288.
 * In compact mode the result is additionally capped at 6 000.
 *
 * @param runtimeConfig - Runtime configuration; reads `workerpals.qualityCriticMaxDiffChars`.
 * @param compact - When `true`, apply the smaller compact-mode cap.
 * @returns Maximum number of diff characters.
 */
function resolveQualityCriticMaxDiffChars(
  runtimeConfig: WorkerpalsRuntimeConfig,
  compact = false,
): number {
  const configured = Number(runtimeConfig.workerpals.qualityCriticMaxDiffChars);
  const requested = Math.floor(Number.isFinite(configured) ? configured : 16_000);
  const limit = Math.max(256, Math.min(524_288, requested));
  if (!compact) return limit;
  return Math.min(limit, 6_000);
}
|
|
2681
2800
|
|
|
2682
|
-
|
|
2801
|
+
/**
 * Resolves how many characters of validation output may be sent to the critic.
 *
 * The configured value is coerced with `Number(...)`; non-finite results
 * default to 8 000. The value is floored and clamped to 256 – 524 288.
 * In compact mode the result is additionally capped at 2 000.
 *
 * @param runtimeConfig - Runtime configuration; reads
 *        `workerpals.qualityCriticMaxValidationOutputChars`.
 * @param compact - When `true`, apply the smaller compact-mode cap.
 * @returns Maximum number of validation-output characters.
 */
function resolveQualityCriticMaxValidationOutputChars(
  runtimeConfig: WorkerpalsRuntimeConfig,
  compact = false,
): number {
  const configured = Number(runtimeConfig.workerpals.qualityCriticMaxValidationOutputChars);
  const requested = Math.floor(Number.isFinite(configured) ? configured : 8_000);
  const limit = Math.max(256, Math.min(524_288, requested));
  if (!compact) return limit;
  return Math.min(limit, 2_000);
}
|
|
2810
|
+
|
|
2811
|
+
function buildCriticValidationSummary(
|
|
2812
|
+
quality: DeterministicQualityResult,
|
|
2813
|
+
maxValidationOutputChars: number,
|
|
2814
|
+
): string {
|
|
2815
|
+
const allPassed =
|
|
2816
|
+
quality.validationRuns.length > 0 && quality.validationRuns.every((run) => run.ok);
|
|
2817
|
+
return quality.validationRuns
|
|
2683
2818
|
.map((run) => {
|
|
2684
|
-
const output =
|
|
2685
|
-
|
|
2686
|
-
|
|
2687
|
-
|
|
2819
|
+
const output =
|
|
2820
|
+
allPassed
|
|
2821
|
+
? ""
|
|
2822
|
+
: [run.stdout, run.stderr]
|
|
2823
|
+
.filter(Boolean)
|
|
2824
|
+
.join("\n")
|
|
2825
|
+
.slice(0, maxValidationOutputChars);
|
|
2688
2826
|
return [
|
|
2689
2827
|
`Command: ${run.command}`,
|
|
2690
2828
|
`Result: ${run.ok ? "pass" : "fail"} (exit ${run.exitCode}, ${run.elapsedMs}ms)`,
|
|
@@ -2694,6 +2832,38 @@ async function runTaskCriticReview(
|
|
|
2694
2832
|
.join("\n");
|
|
2695
2833
|
})
|
|
2696
2834
|
.join("\n\n---\n\n");
|
|
2835
|
+
}
|
|
2836
|
+
|
|
2837
|
+
/**
 * Builds the synthetic critic review returned when a critic run timed out.
 *
 * The review has score 0, one finding describing the timeout, one must-fix
 * item that points at critic configuration rather than product code, and a
 * `raw` field holding a small JSON document describing the timeout.
 *
 * @param source - Which critic timed out; used as the prefix of the finding.
 * @param timeoutMs - Configured timeout, reported in the finding.
 * @param elapsedMs - Elapsed time, reported in the finding.
 * @returns A zero-score review describing the timeout.
 */
function criticTimeoutReview(
  source: "Codex" | "LLM",
  timeoutMs: number,
  elapsedMs: number,
): CriticReview {
  const timeoutFinding = `${source} critic timed out after ${elapsedMs}ms (timeout=${timeoutMs}ms).`;
  const findings = [timeoutFinding];
  const mustFix = [
    "CriticGate timeout behavior is set to block; complete the critic review by reducing critic input, choosing a faster critic model, or increasing workerpals.quality_critic_timeout_ms.",
  ];
  const revisionGuidance =
    "Do not change product code for this finding unless product code caused the critic prompt explosion. Adjust CriticGate configuration or reduce validation/diff evidence volume.";
  // The raw payload carries its own short must_fix text, separate from `mustFix` above.
  const raw = JSON.stringify({
    score: 0,
    findings: [timeoutFinding],
    must_fix: ["CriticGate timed out"],
  });
  return { score: 0, findings, mustFix, revisionGuidance, raw };
}
|
|
2854
|
+
|
|
2855
|
+
async function runTaskCriticReview(
|
|
2856
|
+
repo: string,
|
|
2857
|
+
params: Record<string, unknown>,
|
|
2858
|
+
quality: DeterministicQualityResult,
|
|
2859
|
+
runtimeConfig: WorkerpalsRuntimeConfig,
|
|
2860
|
+
onLog?: (stream: "stdout" | "stderr", line: string) => void,
|
|
2861
|
+
): Promise<CriticReview | null> {
|
|
2862
|
+
const endpoint = normalizeChatCompletionsEndpoint(runtimeConfig.workerpals.llm.endpoint);
|
|
2863
|
+
const model = resolveQualityCriticModel(runtimeConfig, runtimeConfig.workerpals.llm.model.trim());
|
|
2864
|
+
if (!endpoint || !model) return null;
|
|
2865
|
+
const qualityCriticTimeoutMs = resolveQualityCriticTimeoutMs(runtimeConfig);
|
|
2866
|
+
const timeoutBehavior = resolveQualityCriticTimeoutBehavior(runtimeConfig);
|
|
2697
2867
|
|
|
2698
2868
|
const planning = params.planning as TaskExecutePlanning;
|
|
2699
2869
|
const instruction = String(params.instruction ?? "").trim();
|
|
@@ -2711,33 +2881,65 @@ async function runTaskCriticReview(
|
|
|
2711
2881
|
const changedPathsText =
|
|
2712
2882
|
quality.changedPaths.map((entry) => `- ${entry}`).join("\n") || "- (none)";
|
|
2713
2883
|
const criticSystem = loadPromptTemplate("workerpals/task_quality_critic_system_prompt.md").trim();
|
|
2714
|
-
const criticUser = loadPromptTemplate("workerpals/task_quality_critic_user_prompt.md", {
|
|
2715
|
-
instruction,
|
|
2716
|
-
acceptance_criteria: acceptanceCriteriaText,
|
|
2717
|
-
validation_steps: validationStepsText,
|
|
2718
|
-
changed_paths: changedPathsText,
|
|
2719
|
-
diff_excerpt: diffText || "(empty diff excerpt)",
|
|
2720
|
-
validation_evidence: validationSummary || "(no validation output)",
|
|
2721
|
-
});
|
|
2722
2884
|
|
|
2723
2885
|
const apiKey = runtimeConfig.workerpals.llm.apiKey.trim() || "local";
|
|
2724
2886
|
const headers: Record<string, string> = {
|
|
2725
2887
|
"Content-Type": "application/json",
|
|
2726
2888
|
};
|
|
2727
2889
|
if (apiKey) headers.Authorization = `Bearer ${apiKey}`;
|
|
2728
|
-
|
|
2729
|
-
|
|
2730
|
-
|
|
2731
|
-
|
|
2732
|
-
|
|
2733
|
-
|
|
2734
|
-
|
|
2735
|
-
|
|
2890
|
+
|
|
2891
|
+
const buildAttemptPayload = async (compact: boolean) => {
|
|
2892
|
+
const changedForDiff = quality.changedPaths.slice(0, compact ? 4 : 8);
|
|
2893
|
+
let diffText = "";
|
|
2894
|
+
if (changedForDiff.length > 0) {
|
|
2895
|
+
const diffResult = await git(repo, ["diff", "--", ...changedForDiff]);
|
|
2896
|
+
diffText = diffResult.ok ? diffResult.stdout : diffResult.stderr;
|
|
2897
|
+
}
|
|
2898
|
+
diffText = compactJobOutput(diffText, outputPolicyForRuntime(runtimeConfig)).slice(
|
|
2899
|
+
0,
|
|
2900
|
+
resolveQualityCriticMaxDiffChars(runtimeConfig, compact),
|
|
2901
|
+
);
|
|
2902
|
+
const validationSummary = buildCriticValidationSummary(
|
|
2903
|
+
quality,
|
|
2904
|
+
resolveQualityCriticMaxValidationOutputChars(runtimeConfig, compact),
|
|
2905
|
+
);
|
|
2906
|
+
const criticUser = loadPromptTemplate("workerpals/task_quality_critic_user_prompt.md", {
|
|
2907
|
+
instruction,
|
|
2908
|
+
acceptance_criteria: acceptanceCriteriaText,
|
|
2909
|
+
validation_steps: validationStepsText,
|
|
2910
|
+
changed_paths: changedPathsText,
|
|
2911
|
+
diff_excerpt: diffText || "(empty diff excerpt)",
|
|
2912
|
+
validation_evidence: validationSummary || "(no validation output)",
|
|
2913
|
+
});
|
|
2914
|
+
const promptChars = criticSystem.length + criticUser.length;
|
|
2915
|
+
const promptBytes = new TextEncoder().encode(`${criticSystem}\n${criticUser}`).length;
|
|
2916
|
+
return {
|
|
2917
|
+
bodyBase: {
|
|
2918
|
+
model,
|
|
2919
|
+
messages: [
|
|
2920
|
+
{ role: "system", content: criticSystem },
|
|
2921
|
+
{ role: "user", content: criticUser },
|
|
2922
|
+
],
|
|
2923
|
+
temperature: 0,
|
|
2924
|
+
max_tokens: compact ? 500 : 700,
|
|
2925
|
+
},
|
|
2926
|
+
promptChars,
|
|
2927
|
+
promptBytes,
|
|
2928
|
+
diffChars: diffText.length,
|
|
2929
|
+
validationChars: validationSummary.length,
|
|
2930
|
+
};
|
|
2736
2931
|
};
|
|
2737
2932
|
|
|
2738
|
-
const runCriticRequest = async (
|
|
2933
|
+
const runCriticRequest = async (
|
|
2934
|
+
bodyBase: Record<string, unknown>,
|
|
2935
|
+
responseFormat: Record<string, unknown> | null,
|
|
2936
|
+
) => {
|
|
2739
2937
|
const controller = new AbortController();
|
|
2740
|
-
|
|
2938
|
+
let timedOut = false;
|
|
2939
|
+
const timer = setTimeout(() => {
|
|
2940
|
+
timedOut = true;
|
|
2941
|
+
controller.abort();
|
|
2942
|
+
}, qualityCriticTimeoutMs);
|
|
2741
2943
|
try {
|
|
2742
2944
|
const response = await fetch(endpoint, {
|
|
2743
2945
|
method: "POST",
|
|
@@ -2748,14 +2950,29 @@ async function runTaskCriticReview(
|
|
|
2748
2950
|
signal: controller.signal,
|
|
2749
2951
|
});
|
|
2750
2952
|
const text = await response.text();
|
|
2751
|
-
return { response, text };
|
|
2953
|
+
return { timedOut: false as const, response, text };
|
|
2954
|
+
} catch (err) {
|
|
2955
|
+
if (!timedOut && String((err as { name?: unknown })?.name ?? "") !== "AbortError") {
|
|
2956
|
+
throw err;
|
|
2957
|
+
}
|
|
2958
|
+
return { timedOut: true as const, err };
|
|
2752
2959
|
} finally {
|
|
2753
2960
|
clearTimeout(timer);
|
|
2754
2961
|
}
|
|
2755
2962
|
};
|
|
2756
2963
|
|
|
2757
|
-
|
|
2758
|
-
|
|
2964
|
+
const runAttempt = async (
|
|
2965
|
+
attempt: number,
|
|
2966
|
+
compact: boolean,
|
|
2967
|
+
): Promise<{ status: "timeout" } | { status: "done"; review: CriticReview | null }> => {
|
|
2968
|
+
const payload = await buildAttemptPayload(compact);
|
|
2969
|
+
const startedAt = Date.now();
|
|
2970
|
+
onLog?.(
|
|
2971
|
+
"stdout",
|
|
2972
|
+
`[CriticGate] LLM review attempt ${attempt}${compact ? " (compact)" : ""}: model=${model} timeout_ms=${qualityCriticTimeoutMs} behavior=${timeoutBehavior} prompt_chars=${payload.promptChars} prompt_bytes=${payload.promptBytes} diff_chars=${payload.diffChars} validation_chars=${payload.validationChars}`,
|
|
2973
|
+
);
|
|
2974
|
+
let request = await runCriticRequest(payload.bodyBase, { type: "json_object" });
|
|
2975
|
+
if (request.timedOut) return { status: "timeout" };
|
|
2759
2976
|
if (!request.response.ok && request.response.status === 400) {
|
|
2760
2977
|
const lowered = request.text.toLowerCase();
|
|
2761
2978
|
if (lowered.includes("response_format")) {
|
|
@@ -2763,7 +2980,8 @@ async function runTaskCriticReview(
|
|
|
2763
2980
|
"stdout",
|
|
2764
2981
|
"[CriticGate] fallback: response_format json_object unsupported; retrying without strict response_format.",
|
|
2765
2982
|
);
|
|
2766
|
-
request = await runCriticRequest(null);
|
|
2983
|
+
request = await runCriticRequest(payload.bodyBase, null);
|
|
2984
|
+
if (request.timedOut) return { status: "timeout" };
|
|
2767
2985
|
}
|
|
2768
2986
|
}
|
|
2769
2987
|
if (!request.response.ok) {
|
|
@@ -2771,12 +2989,12 @@ async function runTaskCriticReview(
|
|
|
2771
2989
|
"stderr",
|
|
2772
2990
|
`[CriticGate] review request failed (${request.response.status}): ${toSingleLine(request.text, 240)}`,
|
|
2773
2991
|
);
|
|
2774
|
-
return null;
|
|
2992
|
+
return { status: "done", review: null };
|
|
2775
2993
|
}
|
|
2776
2994
|
|
|
2777
|
-
const
|
|
2778
|
-
const choices = Array.isArray((
|
|
2779
|
-
? ((
|
|
2995
|
+
const responsePayload = parseJsonObjectLoose(request.text) ?? JSON.parse(request.text);
|
|
2996
|
+
const choices = Array.isArray((responsePayload as Record<string, unknown>).choices)
|
|
2997
|
+
? ((responsePayload as Record<string, unknown>).choices as Array<Record<string, unknown>>)
|
|
2780
2998
|
: [];
|
|
2781
2999
|
const content = String(
|
|
2782
3000
|
(choices[0]?.message as Record<string, unknown> | undefined)?.content ?? "",
|
|
@@ -2790,7 +3008,7 @@ async function runTaskCriticReview(
|
|
|
2790
3008
|
220,
|
|
2791
3009
|
)}`,
|
|
2792
3010
|
);
|
|
2793
|
-
return null;
|
|
3011
|
+
return { status: "done", review: null };
|
|
2794
3012
|
}
|
|
2795
3013
|
|
|
2796
3014
|
const scoreRaw = Number(reviewObj.score);
|
|
@@ -2804,13 +3022,43 @@ async function runTaskCriticReview(
|
|
|
2804
3022
|
.trim()
|
|
2805
3023
|
.slice(0, 2000);
|
|
2806
3024
|
const score = Number.isFinite(scoreRaw) ? Math.max(0, Math.min(10, scoreRaw)) : 0;
|
|
3025
|
+
onLog?.(
|
|
3026
|
+
"stdout",
|
|
3027
|
+
`[CriticGate] LLM review completed in ${Date.now() - startedAt}ms (attempt ${attempt}).`,
|
|
3028
|
+
);
|
|
2807
3029
|
return {
|
|
2808
|
-
|
|
2809
|
-
|
|
2810
|
-
|
|
2811
|
-
|
|
2812
|
-
|
|
3030
|
+
status: "done",
|
|
3031
|
+
review: {
|
|
3032
|
+
score,
|
|
3033
|
+
findings,
|
|
3034
|
+
mustFix,
|
|
3035
|
+
revisionGuidance,
|
|
3036
|
+
raw: compactJobOutput(content, outputPolicyForRuntime(runtimeConfig)),
|
|
3037
|
+
},
|
|
2813
3038
|
};
|
|
3039
|
+
};
|
|
3040
|
+
|
|
3041
|
+
try {
|
|
3042
|
+
let attempt = await runAttempt(1, false);
|
|
3043
|
+
if (attempt.status === "timeout" && timeoutBehavior === "retry_once") {
|
|
3044
|
+
onLog?.(
|
|
3045
|
+
"stderr",
|
|
3046
|
+
`[CriticGate] LLM review timed out after ${qualityCriticTimeoutMs}ms; retrying once with compact critic input.`,
|
|
3047
|
+
);
|
|
3048
|
+
attempt = await runAttempt(2, true);
|
|
3049
|
+
}
|
|
3050
|
+
if (attempt.status === "timeout") {
|
|
3051
|
+
if (timeoutBehavior === "block") {
|
|
3052
|
+
onLog?.(
|
|
3053
|
+
"stderr",
|
|
3054
|
+
`[CriticGate] LLM review timed out after ${qualityCriticTimeoutMs}ms; blocking because quality_critic_timeout_behavior=block.`,
|
|
3055
|
+
);
|
|
3056
|
+
return criticTimeoutReview("LLM", qualityCriticTimeoutMs, qualityCriticTimeoutMs);
|
|
3057
|
+
}
|
|
3058
|
+
onLog?.("stderr", `[CriticGate] LLM timed out after ${qualityCriticTimeoutMs}ms; skipping.`);
|
|
3059
|
+
return null;
|
|
3060
|
+
}
|
|
3061
|
+
return attempt.review;
|
|
2814
3062
|
} catch (err) {
|
|
2815
3063
|
onLog?.(
|
|
2816
3064
|
"stderr",
|
|
@@ -2905,7 +3153,10 @@ export function buildQualityRevisionHint(
|
|
|
2905
3153
|
"- Convergence mode: diagnostic-first repair. This same browser focus failed in the previous revision, so do not guess another selector or rewrite a different stage.",
|
|
2906
3154
|
);
|
|
2907
3155
|
lines.push(
|
|
2908
|
-
"- Diagnostic requirement: before editing again, inspect or add a tiny temporary diagnostic around the failing stage that records locator counts, visible textContent, role/ARIA attributes, data-testid values, and a nearby DOM snippet for the candidate nodes.",
|
|
3156
|
+
"- Diagnostic requirement: before editing again, inspect or add a tiny temporary diagnostic around the failing stage that records locator counts, visible textContent, role/ARIA attributes, data-testid values, bounding boxes, and a nearby DOM snippet for the candidate nodes.",
|
|
3157
|
+
);
|
|
3158
|
+
lines.push(
|
|
3159
|
+
"- Artifact freshness rule: only trust screenshots/logs captured after the failing action in the current revision. If the screenshot is stale or stops before the failing locator, capture or print the DOM state instead of reasoning from that image.",
|
|
2909
3160
|
);
|
|
2910
3161
|
lines.push(
|
|
2911
3162
|
"- React Native Web note: screenshots can show the intended state while Playwright reads a duplicate or stale rendered node. Prefer one unique selected-state test id or a semantic checked attribute on the stable pressable, then assert locator count and visibility.",
|
|
@@ -2947,7 +3198,7 @@ export function buildQualityRevisionHint(
|
|
|
2947
3198
|
);
|
|
2948
3199
|
if (browserRepairPacket.needsDiagnosticProbe) {
|
|
2949
3200
|
lines.push(
|
|
2950
|
-
`Validation rerun rule: PushPals ValidationGate will rerun "${browserRepairPacket.command}" after the patch, but this is now a repeated browser assertion. If a quick local startup probe shows the browser server can run in this executor, run one targeted "${browserRepairPacket.command}" confirmation after the DOM-backed fix. Do not hand off another unverified selector guess.`,
|
|
3201
|
+
`Validation rerun rule: PushPals ValidationGate will rerun "${browserRepairPacket.command}" after the patch, but this is now a repeated browser assertion. If a quick local startup probe shows the browser server can run in this executor, run exactly one targeted "${browserRepairPacket.command}" confirmation after the DOM-backed fix. Do not stop after fast checks only. Do not hand off another unverified selector guess.`,
|
|
2951
3202
|
);
|
|
2952
3203
|
} else {
|
|
2953
3204
|
lines.push(
|
|
@@ -5462,86 +5713,92 @@ async function runCodexCriticReview(
|
|
|
5462
5713
|
|
|
5463
5714
|
const instruction = String(params.instruction ?? "").trim();
|
|
5464
5715
|
const planning = params.planning as TaskExecutePlanning;
|
|
5465
|
-
|
|
5466
|
-
const
|
|
5467
|
-
|
|
5468
|
-
|
|
5469
|
-
|
|
5470
|
-
|
|
5471
|
-
|
|
5472
|
-
|
|
5473
|
-
|
|
5474
|
-
|
|
5475
|
-
|
|
5476
|
-
|
|
5477
|
-
})();
|
|
5478
|
-
const qualityCriticTimeoutMs = (() => {
|
|
5479
|
-
const value = Number(runtimeConfig.workerpals.qualityCriticTimeoutMs);
|
|
5480
|
-
if (!Number.isFinite(value)) return 45_000;
|
|
5481
|
-
return Math.max(1_000, Math.min(7_200_000, Math.floor(value)));
|
|
5482
|
-
})();
|
|
5483
|
-
if (changedForDiff.length > 0) {
|
|
5484
|
-
const diffResult = await git(repo, ["diff", "--", ...changedForDiff]);
|
|
5485
|
-
diffText = (diffResult.ok ? diffResult.stdout : diffResult.stderr).slice(
|
|
5716
|
+
const qualityCriticTimeoutMs = resolveQualityCriticTimeoutMs(runtimeConfig);
|
|
5717
|
+
const timeoutBehavior = resolveQualityCriticTimeoutBehavior(runtimeConfig);
|
|
5718
|
+
const criticModel = resolveQualityCriticModel(runtimeConfig);
|
|
5719
|
+
|
|
5720
|
+
const buildCriticInstruction = async (compact: boolean) => {
|
|
5721
|
+
const changedForDiff = quality.changedPaths.slice(0, compact ? 4 : 8);
|
|
5722
|
+
let diffText = "";
|
|
5723
|
+
if (changedForDiff.length > 0) {
|
|
5724
|
+
const diffResult = await git(repo, ["diff", "--", ...changedForDiff]);
|
|
5725
|
+
diffText = diffResult.ok ? diffResult.stdout : diffResult.stderr;
|
|
5726
|
+
}
|
|
5727
|
+
diffText = compactJobOutput(diffText, outputPolicyForRuntime(runtimeConfig)).slice(
|
|
5486
5728
|
0,
|
|
5487
|
-
|
|
5729
|
+
resolveQualityCriticMaxDiffChars(runtimeConfig, compact),
|
|
5488
5730
|
);
|
|
5489
|
-
|
|
5490
|
-
|
|
5491
|
-
|
|
5492
|
-
|
|
5493
|
-
|
|
5494
|
-
|
|
5495
|
-
|
|
5496
|
-
|
|
5497
|
-
|
|
5498
|
-
|
|
5499
|
-
|
|
5500
|
-
|
|
5501
|
-
|
|
5502
|
-
|
|
5503
|
-
|
|
5504
|
-
|
|
5505
|
-
|
|
5506
|
-
|
|
5507
|
-
|
|
5508
|
-
|
|
5509
|
-
|
|
5510
|
-
|
|
5511
|
-
|
|
5512
|
-
|
|
5513
|
-
|
|
5514
|
-
diff_section: diffText ? `Diff:\n${diffText}` : "Diff: (empty - no changes detected)",
|
|
5515
|
-
validation_section: validationSummary
|
|
5516
|
-
? `Validation:\n${validationSummary}`
|
|
5517
|
-
: "Validation: (none)",
|
|
5518
|
-
},
|
|
5519
|
-
);
|
|
5731
|
+
const validationSummary = buildCriticValidationSummary(
|
|
5732
|
+
quality,
|
|
5733
|
+
resolveQualityCriticMaxValidationOutputChars(runtimeConfig, compact),
|
|
5734
|
+
);
|
|
5735
|
+
const criticInstruction = loadPromptTemplate(
|
|
5736
|
+
"workerpals/codex_quality_critic_instruction_prompt.md",
|
|
5737
|
+
{
|
|
5738
|
+
instruction,
|
|
5739
|
+
acceptance_criteria:
|
|
5740
|
+
planning.acceptanceCriteria.map((c) => `- ${c}`).join("\n") || "- (none)",
|
|
5741
|
+
changed_paths: quality.changedPaths.join(", ") || "(none)",
|
|
5742
|
+
diff_section: diffText ? `Diff:\n${diffText}` : "Diff: (empty - no changes detected)",
|
|
5743
|
+
validation_section: validationSummary
|
|
5744
|
+
? `Validation:\n${validationSummary}`
|
|
5745
|
+
: "Validation: (none)",
|
|
5746
|
+
},
|
|
5747
|
+
);
|
|
5748
|
+
return {
|
|
5749
|
+
criticInstruction,
|
|
5750
|
+
promptChars: criticInstruction.length,
|
|
5751
|
+
promptBytes: new TextEncoder().encode(criticInstruction).length,
|
|
5752
|
+
diffChars: diffText.length,
|
|
5753
|
+
validationChars: validationSummary.length,
|
|
5754
|
+
};
|
|
5755
|
+
};
|
|
5520
5756
|
|
|
5521
5757
|
const tmpOutputPath = `/tmp/pushpals-critic-${Date.now()}-${Math.random().toString(36).slice(2, 8)}.txt`;
|
|
5522
|
-
const
|
|
5523
|
-
|
|
5524
|
-
|
|
5525
|
-
|
|
5526
|
-
|
|
5527
|
-
|
|
5528
|
-
|
|
5529
|
-
|
|
5530
|
-
|
|
5531
|
-
|
|
5532
|
-
|
|
5533
|
-
|
|
5534
|
-
|
|
5758
|
+
const buildCmd = () => {
|
|
5759
|
+
const cmd = [
|
|
5760
|
+
...codexPrefix,
|
|
5761
|
+
"-c",
|
|
5762
|
+
'model_reasoning_effort="low"',
|
|
5763
|
+
"-a",
|
|
5764
|
+
"never",
|
|
5765
|
+
"exec",
|
|
5766
|
+
"-s",
|
|
5767
|
+
"read-only",
|
|
5768
|
+
"--color",
|
|
5769
|
+
"never",
|
|
5770
|
+
"--output-last-message",
|
|
5771
|
+
tmpOutputPath,
|
|
5772
|
+
];
|
|
5773
|
+
if (criticModel) cmd.push("-m", criticModel);
|
|
5774
|
+
cmd.push("-");
|
|
5775
|
+
return cmd;
|
|
5776
|
+
};
|
|
5535
5777
|
|
|
5536
5778
|
const env = buildWorkerSandboxWritableEnv(repo);
|
|
5537
5779
|
const codexMask = maskRepoLocalCodexFilesForCodexCli(repo, env);
|
|
5538
|
-
|
|
5539
|
-
|
|
5780
|
+
|
|
5781
|
+
const runAttempt = async (
|
|
5782
|
+
attempt: number,
|
|
5783
|
+
compact: boolean,
|
|
5784
|
+
): Promise<{ status: "timeout" } | { status: "done"; review: CriticReview | null }> => {
|
|
5785
|
+
try {
|
|
5786
|
+
unlinkSync(tmpOutputPath);
|
|
5787
|
+
} catch {
|
|
5788
|
+
/* ignore stale/missing critic output */
|
|
5789
|
+
}
|
|
5790
|
+
const payload = await buildCriticInstruction(compact);
|
|
5791
|
+
const startedAt = Date.now();
|
|
5792
|
+
onLog?.(
|
|
5793
|
+
"stdout",
|
|
5794
|
+
`[CriticGate] Codex review attempt ${attempt}${compact ? " (compact)" : ""}: model=${criticModel || "(codex default)"} timeout_ms=${qualityCriticTimeoutMs} behavior=${timeoutBehavior} prompt_chars=${payload.promptChars} prompt_bytes=${payload.promptBytes} diff_chars=${payload.diffChars} validation_chars=${payload.validationChars}`,
|
|
5795
|
+
);
|
|
5796
|
+
const proc = Bun.spawn(buildCmd(), {
|
|
5540
5797
|
cwd: repo,
|
|
5541
5798
|
env,
|
|
5542
5799
|
stdout: "pipe",
|
|
5543
5800
|
stderr: "pipe",
|
|
5544
|
-
stdin: new Blob([criticInstruction]),
|
|
5801
|
+
stdin: new Blob([payload.criticInstruction]),
|
|
5545
5802
|
});
|
|
5546
5803
|
|
|
5547
5804
|
let timedOut = false;
|
|
@@ -5558,8 +5815,7 @@ async function runCodexCriticReview(
|
|
|
5558
5815
|
clearTimeout(timer);
|
|
5559
5816
|
|
|
5560
5817
|
if (timedOut) {
|
|
5561
|
-
|
|
5562
|
-
return null;
|
|
5818
|
+
return { status: "timeout" };
|
|
5563
5819
|
}
|
|
5564
5820
|
if (exitCode !== 0) {
|
|
5565
5821
|
const stderrText = await new Response(proc.stderr).text();
|
|
@@ -5567,7 +5823,7 @@ async function runCodexCriticReview(
|
|
|
5567
5823
|
"stderr",
|
|
5568
5824
|
`[CriticGate] Codex exited ${exitCode}: ${toSingleLine(stderrText, 220)}`,
|
|
5569
5825
|
);
|
|
5570
|
-
return null;
|
|
5826
|
+
return { status: "done", review: null };
|
|
5571
5827
|
}
|
|
5572
5828
|
|
|
5573
5829
|
let lastMessage = "";
|
|
@@ -5584,7 +5840,7 @@ async function runCodexCriticReview(
|
|
|
5584
5840
|
|
|
5585
5841
|
if (!lastMessage) {
|
|
5586
5842
|
onLog?.("stderr", "[CriticGate] Codex: no output message captured; skipping.");
|
|
5587
|
-
return null;
|
|
5843
|
+
return { status: "done", review: null };
|
|
5588
5844
|
}
|
|
5589
5845
|
|
|
5590
5846
|
const reviewObj = parseJsonObjectLoose(lastMessage);
|
|
@@ -5593,7 +5849,7 @@ async function runCodexCriticReview(
|
|
|
5593
5849
|
"stderr",
|
|
5594
5850
|
`[CriticGate] Codex returned non-JSON: ${toSingleLine(lastMessage, 220)}`,
|
|
5595
5851
|
);
|
|
5596
|
-
return null;
|
|
5852
|
+
return { status: "done", review: null };
|
|
5597
5853
|
}
|
|
5598
5854
|
|
|
5599
5855
|
const scoreRaw = Number(reviewObj.score);
|
|
@@ -5607,14 +5863,43 @@ async function runCodexCriticReview(
|
|
|
5607
5863
|
const revisionGuidance = String(reviewObj.revision_guidance ?? "")
|
|
5608
5864
|
.trim()
|
|
5609
5865
|
.slice(0, 2000);
|
|
5610
|
-
onLog?.(
|
|
5866
|
+
onLog?.(
|
|
5867
|
+
"stdout",
|
|
5868
|
+
`[CriticGate] Codex score: ${score}/10 (${Date.now() - startedAt}ms, attempt ${attempt})`,
|
|
5869
|
+
);
|
|
5611
5870
|
return {
|
|
5612
|
-
|
|
5613
|
-
|
|
5614
|
-
|
|
5615
|
-
|
|
5616
|
-
|
|
5871
|
+
status: "done",
|
|
5872
|
+
review: {
|
|
5873
|
+
score,
|
|
5874
|
+
findings,
|
|
5875
|
+
mustFix,
|
|
5876
|
+
revisionGuidance,
|
|
5877
|
+
raw: compactJobOutput(lastMessage, outputPolicyForRuntime(runtimeConfig)),
|
|
5878
|
+
},
|
|
5617
5879
|
};
|
|
5880
|
+
};
|
|
5881
|
+
|
|
5882
|
+
try {
|
|
5883
|
+
let attempt = await runAttempt(1, false);
|
|
5884
|
+
if (attempt.status === "timeout" && timeoutBehavior === "retry_once") {
|
|
5885
|
+
onLog?.(
|
|
5886
|
+
"stderr",
|
|
5887
|
+
`[CriticGate] Codex timed out after ${qualityCriticTimeoutMs}ms; retrying once with compact critic input.`,
|
|
5888
|
+
);
|
|
5889
|
+
attempt = await runAttempt(2, true);
|
|
5890
|
+
}
|
|
5891
|
+
if (attempt.status === "timeout") {
|
|
5892
|
+
if (timeoutBehavior === "block") {
|
|
5893
|
+
onLog?.(
|
|
5894
|
+
"stderr",
|
|
5895
|
+
`[CriticGate] Codex timed out after ${qualityCriticTimeoutMs}ms; blocking because quality_critic_timeout_behavior=block.`,
|
|
5896
|
+
);
|
|
5897
|
+
return criticTimeoutReview("Codex", qualityCriticTimeoutMs, qualityCriticTimeoutMs);
|
|
5898
|
+
}
|
|
5899
|
+
onLog?.("stderr", `[CriticGate] Codex timed out after ${qualityCriticTimeoutMs}ms; skipping.`);
|
|
5900
|
+
return null;
|
|
5901
|
+
}
|
|
5902
|
+
return attempt.review;
|
|
5618
5903
|
} catch (err) {
|
|
5619
5904
|
onLog?.("stderr", `[CriticGate] Codex error: ${toSingleLine(err, 220)} (skipping).`);
|
|
5620
5905
|
return null;
|
|
@@ -5857,7 +6142,7 @@ export async function executeJob(
|
|
|
5857
6142
|
);
|
|
5858
6143
|
for (const run of quality.validationRuns) {
|
|
5859
6144
|
if (run.ok) continue;
|
|
5860
|
-
const digest =
|
|
6145
|
+
const digest = extractValidationFailureRetryDigest(run, repo);
|
|
5861
6146
|
if (digest) previousValidationFailureDigests.set(validationCommandKey(run.command), digest);
|
|
5862
6147
|
}
|
|
5863
6148
|
const validationOutsideTaskScope =
|