@pushpalsdev/cli 1.1.5 → 1.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -635,6 +635,7 @@ export function tokenizeValidationCommandArgv(command: string): string[] | null
635
635
  const out: string[] = [];
636
636
  let current = "";
637
637
  let quote: "'" | '"' | null = null;
638
+ let escaped = false;
638
639
 
639
640
  const pushCurrent = () => {
640
641
  if (!current) return;
@@ -643,7 +644,16 @@ export function tokenizeValidationCommandArgv(command: string): string[] | null
643
644
  };
644
645
 
645
646
  for (const ch of trimmed) {
647
+ if (escaped) {
648
+ current += ch;
649
+ escaped = false;
650
+ continue;
651
+ }
646
652
  if (quote) {
653
+ if (quote === '"' && ch === "\\") {
654
+ escaped = true;
655
+ continue;
656
+ }
647
657
  if (ch === quote) {
648
658
  quote = null;
649
659
  } else {
@@ -662,6 +672,7 @@ export function tokenizeValidationCommandArgv(command: string): string[] | null
662
672
  }
663
673
  current += ch;
664
674
  }
675
+ if (escaped) current += "\\";
665
676
  if (quote) return null;
666
677
  pushCurrent();
667
678
  if (out.length === 0) return null;
@@ -826,6 +837,11 @@ export async function runValidationArgv(
826
837
  outputPolicy: Partial<OutputCompactionPolicy>,
827
838
  timeoutMessage: string,
828
839
  ): Promise<ValidationExecutionResult> {
840
+ type ValidationWaitResult =
841
+ | { type: "exit"; code: number }
842
+ | { type: "timeout" }
843
+ | { type: "failure-signal" }
844
+ | { type: "success-signal" };
829
845
  const startedAt = Date.now();
830
846
  const proc = Bun.spawn(argv, {
831
847
  cwd: repo,
@@ -846,7 +862,7 @@ export async function runValidationArgv(
846
862
  let stoppedAfterSuccessSignal = false;
847
863
  const timeout = Math.max(1_000, timeoutMs);
848
864
  let timeoutTimer: ReturnType<typeof setTimeout> | null = null;
849
- const timeoutPromise = new Promise<{ type: "timeout" }>((resolveTimeout) => {
865
+ const timeoutPromise = new Promise<ValidationWaitResult>((resolveTimeout) => {
850
866
  timeoutTimer = setTimeout(() => {
851
867
  timedOut = true;
852
868
  resolveTimeout({ type: "timeout" });
@@ -855,7 +871,7 @@ export async function runValidationArgv(
855
871
 
856
872
  let browserSignalTimer: ReturnType<typeof setInterval> | null = null;
857
873
  const browserSignalPromise = isLongRunningBrowserValidationCommand(command)
858
- ? new Promise<{ type: "failure-signal" | "success-signal" }>((resolveBrowserSignal) => {
874
+ ? new Promise<ValidationWaitResult>((resolveBrowserSignal) => {
859
875
  const idleMs = browserValidationFailureIdleMs(env);
860
876
  const successIdleMs = browserValidationSuccessIdleMs(env);
861
877
  browserSignalTimer = setInterval(() => {
@@ -877,11 +893,11 @@ export async function runValidationArgv(
877
893
  }
878
894
  }, 250);
879
895
  })
880
- : new Promise<never>(() => {
896
+ : new Promise<ValidationWaitResult>(() => {
881
897
  // Non-browser validations should only end on process exit or timeout.
882
898
  });
883
899
 
884
- const exitOrTimeout = await Promise.race([
900
+ const exitOrTimeout = await Promise.race<ValidationWaitResult>([
885
901
  proc.exited.then((code) => ({ type: "exit" as const, code })),
886
902
  timeoutPromise,
887
903
  browserSignalPromise,
@@ -1740,9 +1756,9 @@ function classifyBrowserValidationFailureKindFromText(text: string): BrowserVali
1740
1756
 
1741
1757
  function extractBrowserValidationStage(text: string): string | null {
1742
1758
  const patterns = [
1743
- /\bBrowser validation failed during\s+([^:.\r\n]+?)\s+stage\b/i,
1744
- /\bfailed during\s+([^:.\r\n]+?)\s+stage\b/i,
1745
- /\b(?:stage|phase)\s*[:=]\s*["'`]?([^"'`.\r\n]+)["'`]?/i,
1759
+ /\bBrowser validation failed during\s+([^:.\r\n|]+?)\s+stage\b/i,
1760
+ /\bfailed during\s+([^:.\r\n|]+?)\s+stage\b/i,
1761
+ /\b(?:stage|phase)\s*[:=]\s*["'`]?([^"'`.\r\n|]+)["'`]?/i,
1746
1762
  ];
1747
1763
  for (const pattern of patterns) {
1748
1764
  const match = text.match(pattern);
@@ -1757,6 +1773,27 @@ function extractBrowserValidationStage(text: string): string | null {
1757
1773
  return null;
1758
1774
  }
1759
1775
 
1776
+ function refineBrowserValidationStage(
1777
+ stage: string | null,
1778
+ selector: string | null,
1779
+ expected: string | null,
1780
+ text: string,
1781
+ ): string | null {
1782
+ const combined = stripAnsiControlSequences(
1783
+ [stage, selector, expected, text].filter(Boolean).join(" "),
1784
+ ).toLowerCase();
1785
+ if (/\b(game-control-panel|planet control panel|selected planet panel)\b/i.test(combined)) {
1786
+ return "planet control panel";
1787
+ }
1788
+ if (/\bsettings-home-button\b|\breturn to home from settings\b/i.test(combined)) {
1789
+ return "settings return";
1790
+ }
1791
+ if (/\bshop-home-button\b|\breturn to home from shop\b/i.test(combined)) {
1792
+ return "shop return";
1793
+ }
1794
+ return stage;
1795
+ }
1796
+
1760
1797
  function inferBrowserValidationFailureFocus(params: {
1761
1798
  stage?: string | null;
1762
1799
  selector?: string | null;
@@ -1980,13 +2017,60 @@ function summarizeBrowserValidationOutput(text: string): string {
1980
2017
  .map((line) => line.trim())
1981
2018
  .filter(Boolean)
1982
2019
  .filter((line) =>
1983
- /\b(Web end-to-end smoke test failed|Browser validation failed|Expected |locator\.|page\.|waiting for getBy|Call log:|ERR_SOCKET_BAD_PORT|EADDRINUSE|EPERM|EACCES|browserType\.launch|Executable doesn't exist|Expo exited early|freeport|net::ERR_|Validation command timed out|terminated by signal|SIGTERM|timed out after \d+ms)/i.test(
2020
+ /\b(Web end-to-end smoke test failed|Browser validation failed|Expected |locator\.|page\.|waiting for getBy|Call log:|Verified:|Saved screenshot|Saved trace|ERR_SOCKET_BAD_PORT|EADDRINUSE|EPERM|EACCES|browserType\.launch|Executable doesn't exist|Expo exited early|freeport|net::ERR_|Validation command timed out|terminated by signal|SIGTERM|timed out after \d+ms)/i.test(
1984
2021
  line,
1985
2022
  ),
1986
2023
  );
1987
2024
  return toSingleLine(lines.slice(0, 8).join(" | "), 900);
1988
2025
  }
1989
2026
 
2027
+ function lastBrowserVerifiedStage(text: string): string | null {
2028
+ const verifiedStages = [...stripAnsiControlSequences(text).matchAll(/\bVerified:\s+([^|\r\n]+)/gi)]
2029
+ .map((match) => match[1]?.trim())
2030
+ .filter((entry): entry is string => Boolean(entry));
2031
+ const lastVerified = verifiedStages.at(-1);
2032
+ return lastVerified ? toSingleLine(lastVerified, 80) : null;
2033
+ }
2034
+
2035
+ export function extractValidationFailureRetryDigest(
2036
+ run: {
2037
+ command: string;
2038
+ stdout?: string;
2039
+ stderr?: string;
2040
+ exitCode?: number;
2041
+ elapsedMs?: number;
2042
+ },
2043
+ repo?: string,
2044
+ ): string {
2045
+ const baseDigest = extractValidationFailureDigest(run);
2046
+ if (!isLongRunningBrowserValidationCommand(run.command)) return baseDigest;
2047
+ const combined = stripAnsiControlSequences([run.stderr, run.stdout].filter(Boolean).join("\n"));
2048
+ const failureKind = classifyBrowserValidationFailureKindFromText(`${baseDigest}\n${combined}`);
2049
+ if (failureKind !== "assertion") return baseDigest;
2050
+
2051
+ const recentLogSummary = summarizeRecentBrowserValidationLogs(repo);
2052
+ const enrichedBrowserContext = [combined, recentLogSummary].filter(Boolean).join("\n");
2053
+ const selector = extractBrowserValidationSelector(enrichedBrowserContext);
2054
+ const expected = extractBrowserValidationExpectedUi(enrichedBrowserContext);
2055
+ const stage = refineBrowserValidationStage(
2056
+ extractBrowserValidationStage(enrichedBrowserContext),
2057
+ selector,
2058
+ expected,
2059
+ enrichedBrowserContext,
2060
+ );
2061
+ const lastVerified = lastBrowserVerifiedStage(enrichedBrowserContext);
2062
+ const output = summarizeBrowserValidationOutput(enrichedBrowserContext);
2063
+ const parts = [
2064
+ baseDigest,
2065
+ stage ? `stage=${stage}` : "",
2066
+ selector ? `selector=${selector}` : "",
2067
+ expected ? `expected=${expected}` : "",
2068
+ lastVerified ? `last verified=${lastVerified}` : "",
2069
+ output && output !== baseDigest ? output : "",
2070
+ ].filter(Boolean);
2071
+ return toSingleLine(parts.join(" | "), 900) || baseDigest;
2072
+ }
2073
+
1990
2074
  export function buildBrowserValidationRepairPacket(
1991
2075
  validationRuns: ValidationExecutionResult[],
1992
2076
  previousFailureDigests: Map<string, string> = new Map(),
@@ -1995,15 +2079,24 @@ export function buildBrowserValidationRepairPacket(
1995
2079
  for (const run of validationRuns) {
1996
2080
  if (run.ok || !isLongRunningBrowserValidationCommand(run.command)) continue;
1997
2081
  const combined = stripAnsiControlSequences([run.stderr, run.stdout].filter(Boolean).join("\n"));
1998
- const digest = extractValidationFailureDigest(run);
1999
- const failureKind = classifyBrowserValidationFailureKindFromText(`${digest}\n${combined}`);
2082
+ const baseDigest = extractValidationFailureDigest(run);
2083
+ const failureKind = classifyBrowserValidationFailureKindFromText(`${baseDigest}\n${combined}`);
2000
2084
  if (failureKind === "unknown") continue;
2085
+ const digest =
2086
+ failureKind === "assertion"
2087
+ ? extractValidationFailureRetryDigest(run, repo) || baseDigest
2088
+ : baseDigest;
2001
2089
  const previousDigest = previousFailureDigests.get(validationCommandKey(run.command)) ?? null;
2002
2090
  const recentLogSummary = summarizeRecentBrowserValidationLogs(repo);
2003
2091
  const enrichedBrowserContext = [combined, recentLogSummary].filter(Boolean).join("\n");
2004
- const stage = extractBrowserValidationStage(enrichedBrowserContext);
2005
2092
  const selector = extractBrowserValidationSelector(enrichedBrowserContext);
2006
2093
  const expected = extractBrowserValidationExpectedUi(enrichedBrowserContext);
2094
+ const stage = refineBrowserValidationStage(
2095
+ extractBrowserValidationStage(enrichedBrowserContext),
2096
+ selector,
2097
+ expected,
2098
+ enrichedBrowserContext,
2099
+ );
2007
2100
  const previousStage = previousDigest ? extractBrowserValidationStage(previousDigest) : null;
2008
2101
  const previousSelector = previousDigest ? extractBrowserValidationSelector(previousDigest) : null;
2009
2102
  const previousExpected = previousDigest ? extractBrowserValidationExpectedUi(previousDigest) : null;
@@ -2021,17 +2114,21 @@ export function buildBrowserValidationRepairPacket(
2021
2114
  text: previousDigest,
2022
2115
  })
2023
2116
  : null;
2117
+ const sameFailureSignal =
2118
+ Boolean(previousDigest) &&
2119
+ (previousDigest === digest ||
2120
+ (Boolean(failureFocus) &&
2121
+ failureFocus === previousFailureFocus &&
2122
+ (!selector || !previousSelector || selector === previousSelector)));
2024
2123
  const progress =
2025
2124
  previousDigest == null
2026
2125
  ? "first_failure"
2027
- : previousDigest === digest
2126
+ : sameFailureSignal
2028
2127
  ? "same_failure"
2029
2128
  : "new_failure";
2030
2129
  const needsDiagnosticProbe =
2031
2130
  failureKind === "assertion" &&
2032
- Boolean(previousDigest) &&
2033
- Boolean(failureFocus) &&
2034
- failureFocus === previousFailureFocus;
2131
+ sameFailureSignal;
2035
2132
  return {
2036
2133
  command: run.command,
2037
2134
  failureKind,
@@ -2203,14 +2300,19 @@ export function inferFallbackValidationCommandsForTestTask(
2203
2300
  /\b(pytest|python)\b/.test(lowerInstruction) ||
2204
2301
  changedTestPaths.some((entry) => entry.toLowerCase().endsWith(".py"));
2205
2302
 
2303
+ const bunTestPath = (path: string) => formatBunTestPathArg(path);
2206
2304
  const normalizedTarget = (targetPath ?? "").replace(/\\/g, "/").trim();
2207
2305
  if (normalizedTarget && isLikelyTestPath(normalizedTarget)) {
2208
- add(pythonSignal ? `pytest ${normalizedTarget}` : `bun test ${normalizedTarget}`);
2306
+ add(pythonSignal ? `pytest ${normalizedTarget}` : `bun test ${bunTestPath(normalizedTarget)}`);
2209
2307
  }
2210
2308
 
2211
2309
  if (changedTestPaths.length > 0) {
2212
- const focused = changedTestPaths.slice(0, 4).join(" ");
2213
- add(pythonSignal ? `pytest ${focused}` : `bun test ${focused}`);
2310
+ const focused = changedTestPaths.slice(0, 4);
2311
+ add(
2312
+ pythonSignal
2313
+ ? `pytest ${focused.join(" ")}`
2314
+ : `bun test ${focused.map((entry) => bunTestPath(entry)).join(" ")}`,
2315
+ );
2214
2316
  }
2215
2317
 
2216
2318
  const scopeHints = [
@@ -2238,6 +2340,24 @@ export function inferFallbackValidationCommandsForTestTask(
2238
2340
  return candidates.slice(0, 4);
2239
2341
  }
2240
2342
 
2343
+ export function formatBunTestPathArg(path: string): string {
2344
+ const normalized = String(path ?? "").replace(/\\/g, "/").trim();
2345
+ if (!normalized) return normalized;
2346
+ const pathArg =
2347
+ normalized.startsWith("./") ||
2348
+ normalized.startsWith("../") ||
2349
+ normalized.startsWith("/") ||
2350
+ /^[A-Za-z]:\//.test(normalized)
2351
+ ? normalized
2352
+ : `./${normalized}`;
2353
+ return quoteValidationCommandArg(pathArg);
2354
+ }
2355
+
2356
+ function quoteValidationCommandArg(arg: string): string {
2357
+ if (!/[\s"\\]/.test(arg)) return arg;
2358
+ return `"${arg.replace(/\\/g, "\\\\").replace(/"/g, '\\"')}"`;
2359
+ }
2360
+
2241
2361
  export function isTestFocusedTask(
2242
2362
  instruction: string,
2243
2363
  planning: TaskExecutePlanning,
@@ -2642,49 +2762,67 @@ async function runDeterministicQualityGate(
2642
2762
  };
2643
2763
  }
2644
2764
 
2645
- async function runTaskCriticReview(
2646
- repo: string,
2647
- params: Record<string, unknown>,
2648
- quality: DeterministicQualityResult,
2765
+ type QualityCriticTimeoutBehavior = "skip" | "retry_once" | "block";
2766
+
2767
+ function resolveQualityCriticTimeoutMs(runtimeConfig: WorkerpalsRuntimeConfig): number {
2768
+ const value = Number(runtimeConfig.workerpals.qualityCriticTimeoutMs);
2769
+ if (!Number.isFinite(value)) return 90_000;
2770
+ return Math.max(1_000, Math.min(7_200_000, Math.floor(value)));
2771
+ }
2772
+
2773
+ function resolveQualityCriticTimeoutBehavior(
2649
2774
  runtimeConfig: WorkerpalsRuntimeConfig,
2650
- onLog?: (stream: "stdout" | "stderr", line: string) => void,
2651
- ): Promise<CriticReview | null> {
2652
- const endpoint = normalizeChatCompletionsEndpoint(runtimeConfig.workerpals.llm.endpoint);
2653
- const model = runtimeConfig.workerpals.llm.model.trim();
2654
- if (!endpoint || !model) return null;
2775
+ ): QualityCriticTimeoutBehavior {
2776
+ const value = String(runtimeConfig.workerpals.qualityCriticTimeoutBehavior ?? "")
2777
+ .trim()
2778
+ .toLowerCase()
2779
+ .replace(/-/g, "_");
2780
+ if (value === "skip" || value === "retry_once" || value === "block") return value;
2781
+ return "retry_once";
2782
+ }
2655
2783
 
2656
- const changedForDiff = quality.changedPaths.slice(0, 8);
2657
- let diffText = "";
2658
- if (changedForDiff.length > 0) {
2659
- const diffResult = await git(repo, ["diff", "--", ...changedForDiff]);
2660
- diffText = diffResult.ok ? diffResult.stdout : diffResult.stderr;
2661
- }
2662
- const qualityCriticMaxDiffChars = (() => {
2663
- const value = Number(runtimeConfig.workerpals.qualityCriticMaxDiffChars);
2664
- if (!Number.isFinite(value)) return 16_000;
2665
- return Math.max(256, Math.min(524_288, Math.floor(value)));
2666
- })();
2667
- const qualityCriticMaxValidationOutputChars = (() => {
2668
- const value = Number(runtimeConfig.workerpals.qualityCriticMaxValidationOutputChars);
2669
- if (!Number.isFinite(value)) return 8_000;
2670
- return Math.max(256, Math.min(524_288, Math.floor(value)));
2671
- })();
2672
- const qualityCriticTimeoutMs = (() => {
2673
- const value = Number(runtimeConfig.workerpals.qualityCriticTimeoutMs);
2674
- if (!Number.isFinite(value)) return 45_000;
2675
- return Math.max(1_000, Math.min(7_200_000, Math.floor(value)));
2676
- })();
2677
- diffText = compactJobOutput(diffText, outputPolicyForRuntime(runtimeConfig)).slice(
2678
- 0,
2679
- qualityCriticMaxDiffChars,
2680
- );
2784
+ function resolveQualityCriticModel(
2785
+ runtimeConfig: WorkerpalsRuntimeConfig,
2786
+ fallback = "",
2787
+ ): string {
2788
+ return String(runtimeConfig.workerpals.qualityCriticModel ?? "").trim() || fallback.trim();
2789
+ }
2790
+
2791
+ function resolveQualityCriticMaxDiffChars(
2792
+ runtimeConfig: WorkerpalsRuntimeConfig,
2793
+ compact = false,
2794
+ ): number {
2795
+ const value = Number(runtimeConfig.workerpals.qualityCriticMaxDiffChars);
2796
+ const max = Number.isFinite(value) ? value : 16_000;
2797
+ const bounded = Math.max(256, Math.min(524_288, Math.floor(max)));
2798
+ return compact ? Math.min(bounded, 6_000) : bounded;
2799
+ }
2681
2800
 
2682
- const validationSummary = quality.validationRuns
2801
+ function resolveQualityCriticMaxValidationOutputChars(
2802
+ runtimeConfig: WorkerpalsRuntimeConfig,
2803
+ compact = false,
2804
+ ): number {
2805
+ const value = Number(runtimeConfig.workerpals.qualityCriticMaxValidationOutputChars);
2806
+ const max = Number.isFinite(value) ? value : 8_000;
2807
+ const bounded = Math.max(256, Math.min(524_288, Math.floor(max)));
2808
+ return compact ? Math.min(bounded, 2_000) : bounded;
2809
+ }
2810
+
2811
+ function buildCriticValidationSummary(
2812
+ quality: DeterministicQualityResult,
2813
+ maxValidationOutputChars: number,
2814
+ ): string {
2815
+ const allPassed =
2816
+ quality.validationRuns.length > 0 && quality.validationRuns.every((run) => run.ok);
2817
+ return quality.validationRuns
2683
2818
  .map((run) => {
2684
- const output = [run.stdout, run.stderr]
2685
- .filter(Boolean)
2686
- .join("\n")
2687
- .slice(0, qualityCriticMaxValidationOutputChars);
2819
+ const output =
2820
+ allPassed
2821
+ ? ""
2822
+ : [run.stdout, run.stderr]
2823
+ .filter(Boolean)
2824
+ .join("\n")
2825
+ .slice(0, maxValidationOutputChars);
2688
2826
  return [
2689
2827
  `Command: ${run.command}`,
2690
2828
  `Result: ${run.ok ? "pass" : "fail"} (exit ${run.exitCode}, ${run.elapsedMs}ms)`,
@@ -2694,6 +2832,38 @@ async function runTaskCriticReview(
2694
2832
  .join("\n");
2695
2833
  })
2696
2834
  .join("\n\n---\n\n");
2835
+ }
2836
+
2837
+ function criticTimeoutReview(
2838
+ source: "Codex" | "LLM",
2839
+ timeoutMs: number,
2840
+ elapsedMs: number,
2841
+ ): CriticReview {
2842
+ const summary = `${source} critic timed out after ${elapsedMs}ms (timeout=${timeoutMs}ms).`;
2843
+ return {
2844
+ score: 0,
2845
+ findings: [summary],
2846
+ mustFix: [
2847
+ "CriticGate timeout behavior is set to block; complete the critic review by reducing critic input, choosing a faster critic model, or increasing workerpals.quality_critic_timeout_ms.",
2848
+ ],
2849
+ revisionGuidance:
2850
+ "Do not change product code for this finding unless product code caused the critic prompt explosion. Adjust CriticGate configuration or reduce validation/diff evidence volume.",
2851
+ raw: JSON.stringify({ score: 0, findings: [summary], must_fix: ["CriticGate timed out"] }),
2852
+ };
2853
+ }
2854
+
2855
+ async function runTaskCriticReview(
2856
+ repo: string,
2857
+ params: Record<string, unknown>,
2858
+ quality: DeterministicQualityResult,
2859
+ runtimeConfig: WorkerpalsRuntimeConfig,
2860
+ onLog?: (stream: "stdout" | "stderr", line: string) => void,
2861
+ ): Promise<CriticReview | null> {
2862
+ const endpoint = normalizeChatCompletionsEndpoint(runtimeConfig.workerpals.llm.endpoint);
2863
+ const model = resolveQualityCriticModel(runtimeConfig, runtimeConfig.workerpals.llm.model.trim());
2864
+ if (!endpoint || !model) return null;
2865
+ const qualityCriticTimeoutMs = resolveQualityCriticTimeoutMs(runtimeConfig);
2866
+ const timeoutBehavior = resolveQualityCriticTimeoutBehavior(runtimeConfig);
2697
2867
 
2698
2868
  const planning = params.planning as TaskExecutePlanning;
2699
2869
  const instruction = String(params.instruction ?? "").trim();
@@ -2711,33 +2881,65 @@ async function runTaskCriticReview(
2711
2881
  const changedPathsText =
2712
2882
  quality.changedPaths.map((entry) => `- ${entry}`).join("\n") || "- (none)";
2713
2883
  const criticSystem = loadPromptTemplate("workerpals/task_quality_critic_system_prompt.md").trim();
2714
- const criticUser = loadPromptTemplate("workerpals/task_quality_critic_user_prompt.md", {
2715
- instruction,
2716
- acceptance_criteria: acceptanceCriteriaText,
2717
- validation_steps: validationStepsText,
2718
- changed_paths: changedPathsText,
2719
- diff_excerpt: diffText || "(empty diff excerpt)",
2720
- validation_evidence: validationSummary || "(no validation output)",
2721
- });
2722
2884
 
2723
2885
  const apiKey = runtimeConfig.workerpals.llm.apiKey.trim() || "local";
2724
2886
  const headers: Record<string, string> = {
2725
2887
  "Content-Type": "application/json",
2726
2888
  };
2727
2889
  if (apiKey) headers.Authorization = `Bearer ${apiKey}`;
2728
- const bodyBase = {
2729
- model,
2730
- messages: [
2731
- { role: "system", content: criticSystem },
2732
- { role: "user", content: criticUser },
2733
- ],
2734
- temperature: 0,
2735
- max_tokens: 700,
2890
+
2891
+ const buildAttemptPayload = async (compact: boolean) => {
2892
+ const changedForDiff = quality.changedPaths.slice(0, compact ? 4 : 8);
2893
+ let diffText = "";
2894
+ if (changedForDiff.length > 0) {
2895
+ const diffResult = await git(repo, ["diff", "--", ...changedForDiff]);
2896
+ diffText = diffResult.ok ? diffResult.stdout : diffResult.stderr;
2897
+ }
2898
+ diffText = compactJobOutput(diffText, outputPolicyForRuntime(runtimeConfig)).slice(
2899
+ 0,
2900
+ resolveQualityCriticMaxDiffChars(runtimeConfig, compact),
2901
+ );
2902
+ const validationSummary = buildCriticValidationSummary(
2903
+ quality,
2904
+ resolveQualityCriticMaxValidationOutputChars(runtimeConfig, compact),
2905
+ );
2906
+ const criticUser = loadPromptTemplate("workerpals/task_quality_critic_user_prompt.md", {
2907
+ instruction,
2908
+ acceptance_criteria: acceptanceCriteriaText,
2909
+ validation_steps: validationStepsText,
2910
+ changed_paths: changedPathsText,
2911
+ diff_excerpt: diffText || "(empty diff excerpt)",
2912
+ validation_evidence: validationSummary || "(no validation output)",
2913
+ });
2914
+ const promptChars = criticSystem.length + criticUser.length;
2915
+ const promptBytes = new TextEncoder().encode(`${criticSystem}\n${criticUser}`).length;
2916
+ return {
2917
+ bodyBase: {
2918
+ model,
2919
+ messages: [
2920
+ { role: "system", content: criticSystem },
2921
+ { role: "user", content: criticUser },
2922
+ ],
2923
+ temperature: 0,
2924
+ max_tokens: compact ? 500 : 700,
2925
+ },
2926
+ promptChars,
2927
+ promptBytes,
2928
+ diffChars: diffText.length,
2929
+ validationChars: validationSummary.length,
2930
+ };
2736
2931
  };
2737
2932
 
2738
- const runCriticRequest = async (responseFormat: Record<string, unknown> | null) => {
2933
+ const runCriticRequest = async (
2934
+ bodyBase: Record<string, unknown>,
2935
+ responseFormat: Record<string, unknown> | null,
2936
+ ) => {
2739
2937
  const controller = new AbortController();
2740
- const timer = setTimeout(() => controller.abort(), qualityCriticTimeoutMs);
2938
+ let timedOut = false;
2939
+ const timer = setTimeout(() => {
2940
+ timedOut = true;
2941
+ controller.abort();
2942
+ }, qualityCriticTimeoutMs);
2741
2943
  try {
2742
2944
  const response = await fetch(endpoint, {
2743
2945
  method: "POST",
@@ -2748,14 +2950,29 @@ async function runTaskCriticReview(
2748
2950
  signal: controller.signal,
2749
2951
  });
2750
2952
  const text = await response.text();
2751
- return { response, text };
2953
+ return { timedOut: false as const, response, text };
2954
+ } catch (err) {
2955
+ if (!timedOut && String((err as { name?: unknown })?.name ?? "") !== "AbortError") {
2956
+ throw err;
2957
+ }
2958
+ return { timedOut: true as const, err };
2752
2959
  } finally {
2753
2960
  clearTimeout(timer);
2754
2961
  }
2755
2962
  };
2756
2963
 
2757
- try {
2758
- let request = await runCriticRequest({ type: "json_object" });
2964
+ const runAttempt = async (
2965
+ attempt: number,
2966
+ compact: boolean,
2967
+ ): Promise<{ status: "timeout" } | { status: "done"; review: CriticReview | null }> => {
2968
+ const payload = await buildAttemptPayload(compact);
2969
+ const startedAt = Date.now();
2970
+ onLog?.(
2971
+ "stdout",
2972
+ `[CriticGate] LLM review attempt ${attempt}${compact ? " (compact)" : ""}: model=${model} timeout_ms=${qualityCriticTimeoutMs} behavior=${timeoutBehavior} prompt_chars=${payload.promptChars} prompt_bytes=${payload.promptBytes} diff_chars=${payload.diffChars} validation_chars=${payload.validationChars}`,
2973
+ );
2974
+ let request = await runCriticRequest(payload.bodyBase, { type: "json_object" });
2975
+ if (request.timedOut) return { status: "timeout" };
2759
2976
  if (!request.response.ok && request.response.status === 400) {
2760
2977
  const lowered = request.text.toLowerCase();
2761
2978
  if (lowered.includes("response_format")) {
@@ -2763,7 +2980,8 @@ async function runTaskCriticReview(
2763
2980
  "stdout",
2764
2981
  "[CriticGate] fallback: response_format json_object unsupported; retrying without strict response_format.",
2765
2982
  );
2766
- request = await runCriticRequest(null);
2983
+ request = await runCriticRequest(payload.bodyBase, null);
2984
+ if (request.timedOut) return { status: "timeout" };
2767
2985
  }
2768
2986
  }
2769
2987
  if (!request.response.ok) {
@@ -2771,12 +2989,12 @@ async function runTaskCriticReview(
2771
2989
  "stderr",
2772
2990
  `[CriticGate] review request failed (${request.response.status}): ${toSingleLine(request.text, 240)}`,
2773
2991
  );
2774
- return null;
2992
+ return { status: "done", review: null };
2775
2993
  }
2776
2994
 
2777
- const payload = parseJsonObjectLoose(request.text) ?? JSON.parse(request.text);
2778
- const choices = Array.isArray((payload as Record<string, unknown>).choices)
2779
- ? ((payload as Record<string, unknown>).choices as Array<Record<string, unknown>>)
2995
+ const responsePayload = parseJsonObjectLoose(request.text) ?? JSON.parse(request.text);
2996
+ const choices = Array.isArray((responsePayload as Record<string, unknown>).choices)
2997
+ ? ((responsePayload as Record<string, unknown>).choices as Array<Record<string, unknown>>)
2780
2998
  : [];
2781
2999
  const content = String(
2782
3000
  (choices[0]?.message as Record<string, unknown> | undefined)?.content ?? "",
@@ -2790,7 +3008,7 @@ async function runTaskCriticReview(
2790
3008
  220,
2791
3009
  )}`,
2792
3010
  );
2793
- return null;
3011
+ return { status: "done", review: null };
2794
3012
  }
2795
3013
 
2796
3014
  const scoreRaw = Number(reviewObj.score);
@@ -2804,13 +3022,43 @@ async function runTaskCriticReview(
2804
3022
  .trim()
2805
3023
  .slice(0, 2000);
2806
3024
  const score = Number.isFinite(scoreRaw) ? Math.max(0, Math.min(10, scoreRaw)) : 0;
3025
+ onLog?.(
3026
+ "stdout",
3027
+ `[CriticGate] LLM review completed in ${Date.now() - startedAt}ms (attempt ${attempt}).`,
3028
+ );
2807
3029
  return {
2808
- score,
2809
- findings,
2810
- mustFix,
2811
- revisionGuidance,
2812
- raw: compactJobOutput(content, outputPolicyForRuntime(runtimeConfig)),
3030
+ status: "done",
3031
+ review: {
3032
+ score,
3033
+ findings,
3034
+ mustFix,
3035
+ revisionGuidance,
3036
+ raw: compactJobOutput(content, outputPolicyForRuntime(runtimeConfig)),
3037
+ },
2813
3038
  };
3039
+ };
3040
+
3041
+ try {
3042
+ let attempt = await runAttempt(1, false);
3043
+ if (attempt.status === "timeout" && timeoutBehavior === "retry_once") {
3044
+ onLog?.(
3045
+ "stderr",
3046
+ `[CriticGate] LLM review timed out after ${qualityCriticTimeoutMs}ms; retrying once with compact critic input.`,
3047
+ );
3048
+ attempt = await runAttempt(2, true);
3049
+ }
3050
+ if (attempt.status === "timeout") {
3051
+ if (timeoutBehavior === "block") {
3052
+ onLog?.(
3053
+ "stderr",
3054
+ `[CriticGate] LLM review timed out after ${qualityCriticTimeoutMs}ms; blocking because quality_critic_timeout_behavior=block.`,
3055
+ );
3056
+ return criticTimeoutReview("LLM", qualityCriticTimeoutMs, qualityCriticTimeoutMs);
3057
+ }
3058
+ onLog?.("stderr", `[CriticGate] LLM timed out after ${qualityCriticTimeoutMs}ms; skipping.`);
3059
+ return null;
3060
+ }
3061
+ return attempt.review;
2814
3062
  } catch (err) {
2815
3063
  onLog?.(
2816
3064
  "stderr",
@@ -2905,7 +3153,10 @@ export function buildQualityRevisionHint(
2905
3153
  "- Convergence mode: diagnostic-first repair. This same browser focus failed in the previous revision, so do not guess another selector or rewrite a different stage.",
2906
3154
  );
2907
3155
  lines.push(
2908
- "- Diagnostic requirement: before editing again, inspect or add a tiny temporary diagnostic around the failing stage that records locator counts, visible textContent, role/ARIA attributes, data-testid values, and a nearby DOM snippet for the candidate nodes.",
3156
+ "- Diagnostic requirement: before editing again, inspect or add a tiny temporary diagnostic around the failing stage that records locator counts, visible textContent, role/ARIA attributes, data-testid values, bounding boxes, and a nearby DOM snippet for the candidate nodes.",
3157
+ );
3158
+ lines.push(
3159
+ "- Artifact freshness rule: only trust screenshots/logs captured after the failing action in the current revision. If the screenshot is stale or stops before the failing locator, capture or print the DOM state instead of reasoning from that image.",
2909
3160
  );
2910
3161
  lines.push(
2911
3162
  "- React Native Web note: screenshots can show the intended state while Playwright reads a duplicate or stale rendered node. Prefer one unique selected-state test id or a semantic checked attribute on the stable pressable, then assert locator count and visibility.",
@@ -2947,7 +3198,7 @@ export function buildQualityRevisionHint(
2947
3198
  );
2948
3199
  if (browserRepairPacket.needsDiagnosticProbe) {
2949
3200
  lines.push(
2950
- `Validation rerun rule: PushPals ValidationGate will rerun "${browserRepairPacket.command}" after the patch, but this is now a repeated browser assertion. If a quick local startup probe shows the browser server can run in this executor, run one targeted "${browserRepairPacket.command}" confirmation after the DOM-backed fix. Do not hand off another unverified selector guess.`,
3201
+ `Validation rerun rule: PushPals ValidationGate will rerun "${browserRepairPacket.command}" after the patch, but this is now a repeated browser assertion. If a quick local startup probe shows the browser server can run in this executor, run exactly one targeted "${browserRepairPacket.command}" confirmation after the DOM-backed fix. Do not stop after fast checks only. Do not hand off another unverified selector guess.`,
2951
3202
  );
2952
3203
  } else {
2953
3204
  lines.push(
@@ -5462,86 +5713,92 @@ async function runCodexCriticReview(
5462
5713
 
5463
5714
  const instruction = String(params.instruction ?? "").trim();
5464
5715
  const planning = params.planning as TaskExecutePlanning;
5465
-
5466
- const changedForDiff = quality.changedPaths.slice(0, 8);
5467
- let diffText = "";
5468
- const qualityCriticMaxDiffChars = (() => {
5469
- const value = Number(runtimeConfig.workerpals.qualityCriticMaxDiffChars);
5470
- if (!Number.isFinite(value)) return 16_000;
5471
- return Math.max(256, Math.min(524_288, Math.floor(value)));
5472
- })();
5473
- const qualityCriticMaxValidationOutputChars = (() => {
5474
- const value = Number(runtimeConfig.workerpals.qualityCriticMaxValidationOutputChars);
5475
- if (!Number.isFinite(value)) return 8_000;
5476
- return Math.max(256, Math.min(524_288, Math.floor(value)));
5477
- })();
5478
- const qualityCriticTimeoutMs = (() => {
5479
- const value = Number(runtimeConfig.workerpals.qualityCriticTimeoutMs);
5480
- if (!Number.isFinite(value)) return 45_000;
5481
- return Math.max(1_000, Math.min(7_200_000, Math.floor(value)));
5482
- })();
5483
- if (changedForDiff.length > 0) {
5484
- const diffResult = await git(repo, ["diff", "--", ...changedForDiff]);
5485
- diffText = (diffResult.ok ? diffResult.stdout : diffResult.stderr).slice(
5716
+ const qualityCriticTimeoutMs = resolveQualityCriticTimeoutMs(runtimeConfig);
5717
+ const timeoutBehavior = resolveQualityCriticTimeoutBehavior(runtimeConfig);
5718
+ const criticModel = resolveQualityCriticModel(runtimeConfig);
5719
+
5720
+ const buildCriticInstruction = async (compact: boolean) => {
5721
+ const changedForDiff = quality.changedPaths.slice(0, compact ? 4 : 8);
5722
+ let diffText = "";
5723
+ if (changedForDiff.length > 0) {
5724
+ const diffResult = await git(repo, ["diff", "--", ...changedForDiff]);
5725
+ diffText = diffResult.ok ? diffResult.stdout : diffResult.stderr;
5726
+ }
5727
+ diffText = compactJobOutput(diffText, outputPolicyForRuntime(runtimeConfig)).slice(
5486
5728
  0,
5487
- qualityCriticMaxDiffChars,
5729
+ resolveQualityCriticMaxDiffChars(runtimeConfig, compact),
5488
5730
  );
5489
- }
5490
-
5491
- const validationSummary = quality.validationRuns
5492
- .map((run) => {
5493
- const output = [run.stdout, run.stderr]
5494
- .filter(Boolean)
5495
- .join("\n")
5496
- .slice(0, qualityCriticMaxValidationOutputChars);
5497
- return [
5498
- `Command: ${run.command}`,
5499
- `Result: ${run.ok ? "pass" : "fail"} (exit ${run.exitCode})`,
5500
- output,
5501
- ]
5502
- .filter(Boolean)
5503
- .join("\n");
5504
- })
5505
- .join("\n---\n");
5506
-
5507
- const criticInstruction = loadPromptTemplate(
5508
- "workerpals/codex_quality_critic_instruction_prompt.md",
5509
- {
5510
- instruction,
5511
- acceptance_criteria:
5512
- planning.acceptanceCriteria.map((c) => `- ${c}`).join("\n") || "- (none)",
5513
- changed_paths: quality.changedPaths.join(", ") || "(none)",
5514
- diff_section: diffText ? `Diff:\n${diffText}` : "Diff: (empty - no changes detected)",
5515
- validation_section: validationSummary
5516
- ? `Validation:\n${validationSummary}`
5517
- : "Validation: (none)",
5518
- },
5519
- );
5731
+ const validationSummary = buildCriticValidationSummary(
5732
+ quality,
5733
+ resolveQualityCriticMaxValidationOutputChars(runtimeConfig, compact),
5734
+ );
5735
+ const criticInstruction = loadPromptTemplate(
5736
+ "workerpals/codex_quality_critic_instruction_prompt.md",
5737
+ {
5738
+ instruction,
5739
+ acceptance_criteria:
5740
+ planning.acceptanceCriteria.map((c) => `- ${c}`).join("\n") || "- (none)",
5741
+ changed_paths: quality.changedPaths.join(", ") || "(none)",
5742
+ diff_section: diffText ? `Diff:\n${diffText}` : "Diff: (empty - no changes detected)",
5743
+ validation_section: validationSummary
5744
+ ? `Validation:\n${validationSummary}`
5745
+ : "Validation: (none)",
5746
+ },
5747
+ );
5748
+ return {
5749
+ criticInstruction,
5750
+ promptChars: criticInstruction.length,
5751
+ promptBytes: new TextEncoder().encode(criticInstruction).length,
5752
+ diffChars: diffText.length,
5753
+ validationChars: validationSummary.length,
5754
+ };
5755
+ };
5520
5756
 
5521
5757
  const tmpOutputPath = `/tmp/pushpals-critic-${Date.now()}-${Math.random().toString(36).slice(2, 8)}.txt`;
5522
- const cmd = [
5523
- ...codexPrefix,
5524
- "-c",
5525
- 'model_reasoning_effort="low"',
5526
- "-a",
5527
- "never",
5528
- "exec",
5529
- "-s",
5530
- "read-only",
5531
- "--output-last-message",
5532
- tmpOutputPath,
5533
- "-",
5534
- ];
5758
+ const buildCmd = () => {
5759
+ const cmd = [
5760
+ ...codexPrefix,
5761
+ "-c",
5762
+ 'model_reasoning_effort="low"',
5763
+ "-a",
5764
+ "never",
5765
+ "exec",
5766
+ "-s",
5767
+ "read-only",
5768
+ "--color",
5769
+ "never",
5770
+ "--output-last-message",
5771
+ tmpOutputPath,
5772
+ ];
5773
+ if (criticModel) cmd.push("-m", criticModel);
5774
+ cmd.push("-");
5775
+ return cmd;
5776
+ };
5535
5777
 
5536
5778
  const env = buildWorkerSandboxWritableEnv(repo);
5537
5779
  const codexMask = maskRepoLocalCodexFilesForCodexCli(repo, env);
5538
- try {
5539
- const proc = Bun.spawn(cmd, {
5780
+
5781
+ const runAttempt = async (
5782
+ attempt: number,
5783
+ compact: boolean,
5784
+ ): Promise<{ status: "timeout" } | { status: "done"; review: CriticReview | null }> => {
5785
+ try {
5786
+ unlinkSync(tmpOutputPath);
5787
+ } catch {
5788
+ /* ignore stale/missing critic output */
5789
+ }
5790
+ const payload = await buildCriticInstruction(compact);
5791
+ const startedAt = Date.now();
5792
+ onLog?.(
5793
+ "stdout",
5794
+ `[CriticGate] Codex review attempt ${attempt}${compact ? " (compact)" : ""}: model=${criticModel || "(codex default)"} timeout_ms=${qualityCriticTimeoutMs} behavior=${timeoutBehavior} prompt_chars=${payload.promptChars} prompt_bytes=${payload.promptBytes} diff_chars=${payload.diffChars} validation_chars=${payload.validationChars}`,
5795
+ );
5796
+ const proc = Bun.spawn(buildCmd(), {
5540
5797
  cwd: repo,
5541
5798
  env,
5542
5799
  stdout: "pipe",
5543
5800
  stderr: "pipe",
5544
- stdin: new Blob([criticInstruction]),
5801
+ stdin: new Blob([payload.criticInstruction]),
5545
5802
  });
5546
5803
 
5547
5804
  let timedOut = false;
@@ -5558,8 +5815,7 @@ async function runCodexCriticReview(
5558
5815
  clearTimeout(timer);
5559
5816
 
5560
5817
  if (timedOut) {
5561
- onLog?.("stderr", "[CriticGate] Codex timed out; skipping.");
5562
- return null;
5818
+ return { status: "timeout" };
5563
5819
  }
5564
5820
  if (exitCode !== 0) {
5565
5821
  const stderrText = await new Response(proc.stderr).text();
@@ -5567,7 +5823,7 @@ async function runCodexCriticReview(
5567
5823
  "stderr",
5568
5824
  `[CriticGate] Codex exited ${exitCode}: ${toSingleLine(stderrText, 220)}`,
5569
5825
  );
5570
- return null;
5826
+ return { status: "done", review: null };
5571
5827
  }
5572
5828
 
5573
5829
  let lastMessage = "";
@@ -5584,7 +5840,7 @@ async function runCodexCriticReview(
5584
5840
 
5585
5841
  if (!lastMessage) {
5586
5842
  onLog?.("stderr", "[CriticGate] Codex: no output message captured; skipping.");
5587
- return null;
5843
+ return { status: "done", review: null };
5588
5844
  }
5589
5845
 
5590
5846
  const reviewObj = parseJsonObjectLoose(lastMessage);
@@ -5593,7 +5849,7 @@ async function runCodexCriticReview(
5593
5849
  "stderr",
5594
5850
  `[CriticGate] Codex returned non-JSON: ${toSingleLine(lastMessage, 220)}`,
5595
5851
  );
5596
- return null;
5852
+ return { status: "done", review: null };
5597
5853
  }
5598
5854
 
5599
5855
  const scoreRaw = Number(reviewObj.score);
@@ -5607,14 +5863,43 @@ async function runCodexCriticReview(
5607
5863
  const revisionGuidance = String(reviewObj.revision_guidance ?? "")
5608
5864
  .trim()
5609
5865
  .slice(0, 2000);
5610
- onLog?.("stdout", `[CriticGate] Codex score: ${score}/10`);
5866
+ onLog?.(
5867
+ "stdout",
5868
+ `[CriticGate] Codex score: ${score}/10 (${Date.now() - startedAt}ms, attempt ${attempt})`,
5869
+ );
5611
5870
  return {
5612
- score,
5613
- findings,
5614
- mustFix,
5615
- revisionGuidance,
5616
- raw: compactJobOutput(lastMessage, outputPolicyForRuntime(runtimeConfig)),
5871
+ status: "done",
5872
+ review: {
5873
+ score,
5874
+ findings,
5875
+ mustFix,
5876
+ revisionGuidance,
5877
+ raw: compactJobOutput(lastMessage, outputPolicyForRuntime(runtimeConfig)),
5878
+ },
5617
5879
  };
5880
+ };
5881
+
5882
+ try {
5883
+ let attempt = await runAttempt(1, false);
5884
+ if (attempt.status === "timeout" && timeoutBehavior === "retry_once") {
5885
+ onLog?.(
5886
+ "stderr",
5887
+ `[CriticGate] Codex timed out after ${qualityCriticTimeoutMs}ms; retrying once with compact critic input.`,
5888
+ );
5889
+ attempt = await runAttempt(2, true);
5890
+ }
5891
+ if (attempt.status === "timeout") {
5892
+ if (timeoutBehavior === "block") {
5893
+ onLog?.(
5894
+ "stderr",
5895
+ `[CriticGate] Codex timed out after ${qualityCriticTimeoutMs}ms; blocking because quality_critic_timeout_behavior=block.`,
5896
+ );
5897
+ return criticTimeoutReview("Codex", qualityCriticTimeoutMs, qualityCriticTimeoutMs);
5898
+ }
5899
+ onLog?.("stderr", `[CriticGate] Codex timed out after ${qualityCriticTimeoutMs}ms; skipping.`);
5900
+ return null;
5901
+ }
5902
+ return attempt.review;
5618
5903
  } catch (err) {
5619
5904
  onLog?.("stderr", `[CriticGate] Codex error: ${toSingleLine(err, 220)} (skipping).`);
5620
5905
  return null;
@@ -5857,7 +6142,7 @@ export async function executeJob(
5857
6142
  );
5858
6143
  for (const run of quality.validationRuns) {
5859
6144
  if (run.ok) continue;
5860
- const digest = extractValidationFailureDigest(run);
6145
+ const digest = extractValidationFailureRetryDigest(run, repo);
5861
6146
  if (digest) previousValidationFailureDigests.set(validationCommandKey(run.command), digest);
5862
6147
  }
5863
6148
  const validationOutsideTaskScope =