@pushpalsdev/cli 1.1.4 → 1.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -826,6 +826,11 @@ export async function runValidationArgv(
826
826
  outputPolicy: Partial<OutputCompactionPolicy>,
827
827
  timeoutMessage: string,
828
828
  ): Promise<ValidationExecutionResult> {
829
+ type ValidationWaitResult =
830
+ | { type: "exit"; code: number }
831
+ | { type: "timeout" }
832
+ | { type: "failure-signal" }
833
+ | { type: "success-signal" };
829
834
  const startedAt = Date.now();
830
835
  const proc = Bun.spawn(argv, {
831
836
  cwd: repo,
@@ -846,7 +851,7 @@ export async function runValidationArgv(
846
851
  let stoppedAfterSuccessSignal = false;
847
852
  const timeout = Math.max(1_000, timeoutMs);
848
853
  let timeoutTimer: ReturnType<typeof setTimeout> | null = null;
849
- const timeoutPromise = new Promise<{ type: "timeout" }>((resolveTimeout) => {
854
+ const timeoutPromise = new Promise<ValidationWaitResult>((resolveTimeout) => {
850
855
  timeoutTimer = setTimeout(() => {
851
856
  timedOut = true;
852
857
  resolveTimeout({ type: "timeout" });
@@ -855,7 +860,7 @@ export async function runValidationArgv(
855
860
 
856
861
  let browserSignalTimer: ReturnType<typeof setInterval> | null = null;
857
862
  const browserSignalPromise = isLongRunningBrowserValidationCommand(command)
858
- ? new Promise<{ type: "failure-signal" | "success-signal" }>((resolveBrowserSignal) => {
863
+ ? new Promise<ValidationWaitResult>((resolveBrowserSignal) => {
859
864
  const idleMs = browserValidationFailureIdleMs(env);
860
865
  const successIdleMs = browserValidationSuccessIdleMs(env);
861
866
  browserSignalTimer = setInterval(() => {
@@ -877,11 +882,11 @@ export async function runValidationArgv(
877
882
  }
878
883
  }, 250);
879
884
  })
880
- : new Promise<never>(() => {
885
+ : new Promise<ValidationWaitResult>(() => {
881
886
  // Non-browser validations should only end on process exit or timeout.
882
887
  });
883
888
 
884
- const exitOrTimeout = await Promise.race([
889
+ const exitOrTimeout = await Promise.race<ValidationWaitResult>([
885
890
  proc.exited.then((code) => ({ type: "exit" as const, code })),
886
891
  timeoutPromise,
887
892
  browserSignalPromise,
@@ -1740,9 +1745,9 @@ function classifyBrowserValidationFailureKindFromText(text: string): BrowserVali
1740
1745
 
1741
1746
  function extractBrowserValidationStage(text: string): string | null {
1742
1747
  const patterns = [
1743
- /\bBrowser validation failed during\s+([^:.\r\n]+?)\s+stage\b/i,
1744
- /\bfailed during\s+([^:.\r\n]+?)\s+stage\b/i,
1745
- /\b(?:stage|phase)\s*[:=]\s*["'`]?([^"'`.\r\n]+)["'`]?/i,
1748
+ /\bBrowser validation failed during\s+([^:.\r\n|]+?)\s+stage\b/i,
1749
+ /\bfailed during\s+([^:.\r\n|]+?)\s+stage\b/i,
1750
+ /\b(?:stage|phase)\s*[:=]\s*["'`]?([^"'`.\r\n|]+)["'`]?/i,
1746
1751
  ];
1747
1752
  for (const pattern of patterns) {
1748
1753
  const match = text.match(pattern);
@@ -1757,6 +1762,27 @@ function extractBrowserValidationStage(text: string): string | null {
1757
1762
  return null;
1758
1763
  }
1759
1764
 
1765
+ function refineBrowserValidationStage(
1766
+ stage: string | null,
1767
+ selector: string | null,
1768
+ expected: string | null,
1769
+ text: string,
1770
+ ): string | null {
1771
+ const combined = stripAnsiControlSequences(
1772
+ [stage, selector, expected, text].filter(Boolean).join(" "),
1773
+ ).toLowerCase();
1774
+ if (/\b(game-control-panel|planet control panel|selected planet panel)\b/i.test(combined)) {
1775
+ return "planet control panel";
1776
+ }
1777
+ if (/\bsettings-home-button\b|\breturn to home from settings\b/i.test(combined)) {
1778
+ return "settings return";
1779
+ }
1780
+ if (/\bshop-home-button\b|\breturn to home from shop\b/i.test(combined)) {
1781
+ return "shop return";
1782
+ }
1783
+ return stage;
1784
+ }
1785
+
1760
1786
  function inferBrowserValidationFailureFocus(params: {
1761
1787
  stage?: string | null;
1762
1788
  selector?: string | null;
@@ -1980,13 +2006,60 @@ function summarizeBrowserValidationOutput(text: string): string {
1980
2006
  .map((line) => line.trim())
1981
2007
  .filter(Boolean)
1982
2008
  .filter((line) =>
1983
- /\b(Web end-to-end smoke test failed|Browser validation failed|Expected |locator\.|page\.|waiting for getBy|Call log:|ERR_SOCKET_BAD_PORT|EADDRINUSE|EPERM|EACCES|browserType\.launch|Executable doesn't exist|Expo exited early|freeport|net::ERR_|Validation command timed out|terminated by signal|SIGTERM|timed out after \d+ms)/i.test(
2009
+ /\b(Web end-to-end smoke test failed|Browser validation failed|Expected |locator\.|page\.|waiting for getBy|Call log:|Verified:|Saved screenshot|Saved trace|ERR_SOCKET_BAD_PORT|EADDRINUSE|EPERM|EACCES|browserType\.launch|Executable doesn't exist|Expo exited early|freeport|net::ERR_|Validation command timed out|terminated by signal|SIGTERM|timed out after \d+ms)/i.test(
1984
2010
  line,
1985
2011
  ),
1986
2012
  );
1987
2013
  return toSingleLine(lines.slice(0, 8).join(" | "), 900);
1988
2014
  }
1989
2015
 
2016
+ function lastBrowserVerifiedStage(text: string): string | null {
2017
+ const verifiedStages = [...stripAnsiControlSequences(text).matchAll(/\bVerified:\s+([^|\r\n]+)/gi)]
2018
+ .map((match) => match[1]?.trim())
2019
+ .filter((entry): entry is string => Boolean(entry));
2020
+ const lastVerified = verifiedStages.at(-1);
2021
+ return lastVerified ? toSingleLine(lastVerified, 80) : null;
2022
+ }
2023
+
2024
+ export function extractValidationFailureRetryDigest(
2025
+ run: {
2026
+ command: string;
2027
+ stdout?: string;
2028
+ stderr?: string;
2029
+ exitCode?: number;
2030
+ elapsedMs?: number;
2031
+ },
2032
+ repo?: string,
2033
+ ): string {
2034
+ const baseDigest = extractValidationFailureDigest(run);
2035
+ if (!isLongRunningBrowserValidationCommand(run.command)) return baseDigest;
2036
+ const combined = stripAnsiControlSequences([run.stderr, run.stdout].filter(Boolean).join("\n"));
2037
+ const failureKind = classifyBrowserValidationFailureKindFromText(`${baseDigest}\n${combined}`);
2038
+ if (failureKind !== "assertion") return baseDigest;
2039
+
2040
+ const recentLogSummary = summarizeRecentBrowserValidationLogs(repo);
2041
+ const enrichedBrowserContext = [combined, recentLogSummary].filter(Boolean).join("\n");
2042
+ const selector = extractBrowserValidationSelector(enrichedBrowserContext);
2043
+ const expected = extractBrowserValidationExpectedUi(enrichedBrowserContext);
2044
+ const stage = refineBrowserValidationStage(
2045
+ extractBrowserValidationStage(enrichedBrowserContext),
2046
+ selector,
2047
+ expected,
2048
+ enrichedBrowserContext,
2049
+ );
2050
+ const lastVerified = lastBrowserVerifiedStage(enrichedBrowserContext);
2051
+ const output = summarizeBrowserValidationOutput(enrichedBrowserContext);
2052
+ const parts = [
2053
+ baseDigest,
2054
+ stage ? `stage=${stage}` : "",
2055
+ selector ? `selector=${selector}` : "",
2056
+ expected ? `expected=${expected}` : "",
2057
+ lastVerified ? `last verified=${lastVerified}` : "",
2058
+ output && output !== baseDigest ? output : "",
2059
+ ].filter(Boolean);
2060
+ return toSingleLine(parts.join(" | "), 900) || baseDigest;
2061
+ }
2062
+
1990
2063
  export function buildBrowserValidationRepairPacket(
1991
2064
  validationRuns: ValidationExecutionResult[],
1992
2065
  previousFailureDigests: Map<string, string> = new Map(),
@@ -1995,15 +2068,24 @@ export function buildBrowserValidationRepairPacket(
1995
2068
  for (const run of validationRuns) {
1996
2069
  if (run.ok || !isLongRunningBrowserValidationCommand(run.command)) continue;
1997
2070
  const combined = stripAnsiControlSequences([run.stderr, run.stdout].filter(Boolean).join("\n"));
1998
- const digest = extractValidationFailureDigest(run);
1999
- const failureKind = classifyBrowserValidationFailureKindFromText(`${digest}\n${combined}`);
2071
+ const baseDigest = extractValidationFailureDigest(run);
2072
+ const failureKind = classifyBrowserValidationFailureKindFromText(`${baseDigest}\n${combined}`);
2000
2073
  if (failureKind === "unknown") continue;
2074
+ const digest =
2075
+ failureKind === "assertion"
2076
+ ? extractValidationFailureRetryDigest(run, repo) || baseDigest
2077
+ : baseDigest;
2001
2078
  const previousDigest = previousFailureDigests.get(validationCommandKey(run.command)) ?? null;
2002
2079
  const recentLogSummary = summarizeRecentBrowserValidationLogs(repo);
2003
2080
  const enrichedBrowserContext = [combined, recentLogSummary].filter(Boolean).join("\n");
2004
- const stage = extractBrowserValidationStage(enrichedBrowserContext);
2005
2081
  const selector = extractBrowserValidationSelector(enrichedBrowserContext);
2006
2082
  const expected = extractBrowserValidationExpectedUi(enrichedBrowserContext);
2083
+ const stage = refineBrowserValidationStage(
2084
+ extractBrowserValidationStage(enrichedBrowserContext),
2085
+ selector,
2086
+ expected,
2087
+ enrichedBrowserContext,
2088
+ );
2007
2089
  const previousStage = previousDigest ? extractBrowserValidationStage(previousDigest) : null;
2008
2090
  const previousSelector = previousDigest ? extractBrowserValidationSelector(previousDigest) : null;
2009
2091
  const previousExpected = previousDigest ? extractBrowserValidationExpectedUi(previousDigest) : null;
@@ -2021,17 +2103,21 @@ export function buildBrowserValidationRepairPacket(
2021
2103
  text: previousDigest,
2022
2104
  })
2023
2105
  : null;
2106
+ const sameFailureSignal =
2107
+ Boolean(previousDigest) &&
2108
+ (previousDigest === digest ||
2109
+ (Boolean(failureFocus) &&
2110
+ failureFocus === previousFailureFocus &&
2111
+ (!selector || !previousSelector || selector === previousSelector)));
2024
2112
  const progress =
2025
2113
  previousDigest == null
2026
2114
  ? "first_failure"
2027
- : previousDigest === digest
2115
+ : sameFailureSignal
2028
2116
  ? "same_failure"
2029
2117
  : "new_failure";
2030
2118
  const needsDiagnosticProbe =
2031
2119
  failureKind === "assertion" &&
2032
- Boolean(previousDigest) &&
2033
- Boolean(failureFocus) &&
2034
- failureFocus === previousFailureFocus;
2120
+ sameFailureSignal;
2035
2121
  return {
2036
2122
  command: run.command,
2037
2123
  failureKind,
@@ -2642,49 +2728,67 @@ async function runDeterministicQualityGate(
2642
2728
  };
2643
2729
  }
2644
2730
 
2645
- async function runTaskCriticReview(
2646
- repo: string,
2647
- params: Record<string, unknown>,
2648
- quality: DeterministicQualityResult,
2731
+ type QualityCriticTimeoutBehavior = "skip" | "retry_once" | "block";
2732
+
2733
+ function resolveQualityCriticTimeoutMs(runtimeConfig: WorkerpalsRuntimeConfig): number {
2734
+ const value = Number(runtimeConfig.workerpals.qualityCriticTimeoutMs);
2735
+ if (!Number.isFinite(value)) return 90_000;
2736
+ return Math.max(1_000, Math.min(7_200_000, Math.floor(value)));
2737
+ }
2738
+
2739
+ function resolveQualityCriticTimeoutBehavior(
2649
2740
  runtimeConfig: WorkerpalsRuntimeConfig,
2650
- onLog?: (stream: "stdout" | "stderr", line: string) => void,
2651
- ): Promise<CriticReview | null> {
2652
- const endpoint = normalizeChatCompletionsEndpoint(runtimeConfig.workerpals.llm.endpoint);
2653
- const model = runtimeConfig.workerpals.llm.model.trim();
2654
- if (!endpoint || !model) return null;
2741
+ ): QualityCriticTimeoutBehavior {
2742
+ const value = String(runtimeConfig.workerpals.qualityCriticTimeoutBehavior ?? "")
2743
+ .trim()
2744
+ .toLowerCase()
2745
+ .replace(/-/g, "_");
2746
+ if (value === "skip" || value === "retry_once" || value === "block") return value;
2747
+ return "retry_once";
2748
+ }
2655
2749
 
2656
- const changedForDiff = quality.changedPaths.slice(0, 8);
2657
- let diffText = "";
2658
- if (changedForDiff.length > 0) {
2659
- const diffResult = await git(repo, ["diff", "--", ...changedForDiff]);
2660
- diffText = diffResult.ok ? diffResult.stdout : diffResult.stderr;
2661
- }
2662
- const qualityCriticMaxDiffChars = (() => {
2663
- const value = Number(runtimeConfig.workerpals.qualityCriticMaxDiffChars);
2664
- if (!Number.isFinite(value)) return 16_000;
2665
- return Math.max(256, Math.min(524_288, Math.floor(value)));
2666
- })();
2667
- const qualityCriticMaxValidationOutputChars = (() => {
2668
- const value = Number(runtimeConfig.workerpals.qualityCriticMaxValidationOutputChars);
2669
- if (!Number.isFinite(value)) return 8_000;
2670
- return Math.max(256, Math.min(524_288, Math.floor(value)));
2671
- })();
2672
- const qualityCriticTimeoutMs = (() => {
2673
- const value = Number(runtimeConfig.workerpals.qualityCriticTimeoutMs);
2674
- if (!Number.isFinite(value)) return 45_000;
2675
- return Math.max(1_000, Math.min(7_200_000, Math.floor(value)));
2676
- })();
2677
- diffText = compactJobOutput(diffText, outputPolicyForRuntime(runtimeConfig)).slice(
2678
- 0,
2679
- qualityCriticMaxDiffChars,
2680
- );
2750
+ function resolveQualityCriticModel(
2751
+ runtimeConfig: WorkerpalsRuntimeConfig,
2752
+ fallback = "",
2753
+ ): string {
2754
+ return String(runtimeConfig.workerpals.qualityCriticModel ?? "").trim() || fallback.trim();
2755
+ }
2681
2756
 
2682
- const validationSummary = quality.validationRuns
2757
+ function resolveQualityCriticMaxDiffChars(
2758
+ runtimeConfig: WorkerpalsRuntimeConfig,
2759
+ compact = false,
2760
+ ): number {
2761
+ const value = Number(runtimeConfig.workerpals.qualityCriticMaxDiffChars);
2762
+ const max = Number.isFinite(value) ? value : 16_000;
2763
+ const bounded = Math.max(256, Math.min(524_288, Math.floor(max)));
2764
+ return compact ? Math.min(bounded, 6_000) : bounded;
2765
+ }
2766
+
2767
+ function resolveQualityCriticMaxValidationOutputChars(
2768
+ runtimeConfig: WorkerpalsRuntimeConfig,
2769
+ compact = false,
2770
+ ): number {
2771
+ const value = Number(runtimeConfig.workerpals.qualityCriticMaxValidationOutputChars);
2772
+ const max = Number.isFinite(value) ? value : 8_000;
2773
+ const bounded = Math.max(256, Math.min(524_288, Math.floor(max)));
2774
+ return compact ? Math.min(bounded, 2_000) : bounded;
2775
+ }
2776
+
2777
+ function buildCriticValidationSummary(
2778
+ quality: DeterministicQualityResult,
2779
+ maxValidationOutputChars: number,
2780
+ ): string {
2781
+ const allPassed =
2782
+ quality.validationRuns.length > 0 && quality.validationRuns.every((run) => run.ok);
2783
+ return quality.validationRuns
2683
2784
  .map((run) => {
2684
- const output = [run.stdout, run.stderr]
2685
- .filter(Boolean)
2686
- .join("\n")
2687
- .slice(0, qualityCriticMaxValidationOutputChars);
2785
+ const output =
2786
+ allPassed
2787
+ ? ""
2788
+ : [run.stdout, run.stderr]
2789
+ .filter(Boolean)
2790
+ .join("\n")
2791
+ .slice(0, maxValidationOutputChars);
2688
2792
  return [
2689
2793
  `Command: ${run.command}`,
2690
2794
  `Result: ${run.ok ? "pass" : "fail"} (exit ${run.exitCode}, ${run.elapsedMs}ms)`,
@@ -2694,6 +2798,38 @@ async function runTaskCriticReview(
2694
2798
  .join("\n");
2695
2799
  })
2696
2800
  .join("\n\n---\n\n");
2801
+ }
2802
+
2803
+ function criticTimeoutReview(
2804
+ source: "Codex" | "LLM",
2805
+ timeoutMs: number,
2806
+ elapsedMs: number,
2807
+ ): CriticReview {
2808
+ const summary = `${source} critic timed out after ${elapsedMs}ms (timeout=${timeoutMs}ms).`;
2809
+ return {
2810
+ score: 0,
2811
+ findings: [summary],
2812
+ mustFix: [
2813
+ "CriticGate timeout behavior is set to block; complete the critic review by reducing critic input, choosing a faster critic model, or increasing workerpals.quality_critic_timeout_ms.",
2814
+ ],
2815
+ revisionGuidance:
2816
+ "Do not change product code for this finding unless product code caused the critic prompt explosion. Adjust CriticGate configuration or reduce validation/diff evidence volume.",
2817
+ raw: JSON.stringify({ score: 0, findings: [summary], must_fix: ["CriticGate timed out"] }),
2818
+ };
2819
+ }
2820
+
2821
+ async function runTaskCriticReview(
2822
+ repo: string,
2823
+ params: Record<string, unknown>,
2824
+ quality: DeterministicQualityResult,
2825
+ runtimeConfig: WorkerpalsRuntimeConfig,
2826
+ onLog?: (stream: "stdout" | "stderr", line: string) => void,
2827
+ ): Promise<CriticReview | null> {
2828
+ const endpoint = normalizeChatCompletionsEndpoint(runtimeConfig.workerpals.llm.endpoint);
2829
+ const model = resolveQualityCriticModel(runtimeConfig, runtimeConfig.workerpals.llm.model.trim());
2830
+ if (!endpoint || !model) return null;
2831
+ const qualityCriticTimeoutMs = resolveQualityCriticTimeoutMs(runtimeConfig);
2832
+ const timeoutBehavior = resolveQualityCriticTimeoutBehavior(runtimeConfig);
2697
2833
 
2698
2834
  const planning = params.planning as TaskExecutePlanning;
2699
2835
  const instruction = String(params.instruction ?? "").trim();
@@ -2711,33 +2847,65 @@ async function runTaskCriticReview(
2711
2847
  const changedPathsText =
2712
2848
  quality.changedPaths.map((entry) => `- ${entry}`).join("\n") || "- (none)";
2713
2849
  const criticSystem = loadPromptTemplate("workerpals/task_quality_critic_system_prompt.md").trim();
2714
- const criticUser = loadPromptTemplate("workerpals/task_quality_critic_user_prompt.md", {
2715
- instruction,
2716
- acceptance_criteria: acceptanceCriteriaText,
2717
- validation_steps: validationStepsText,
2718
- changed_paths: changedPathsText,
2719
- diff_excerpt: diffText || "(empty diff excerpt)",
2720
- validation_evidence: validationSummary || "(no validation output)",
2721
- });
2722
2850
 
2723
2851
  const apiKey = runtimeConfig.workerpals.llm.apiKey.trim() || "local";
2724
2852
  const headers: Record<string, string> = {
2725
2853
  "Content-Type": "application/json",
2726
2854
  };
2727
2855
  if (apiKey) headers.Authorization = `Bearer ${apiKey}`;
2728
- const bodyBase = {
2729
- model,
2730
- messages: [
2731
- { role: "system", content: criticSystem },
2732
- { role: "user", content: criticUser },
2733
- ],
2734
- temperature: 0,
2735
- max_tokens: 700,
2856
+
2857
+ const buildAttemptPayload = async (compact: boolean) => {
2858
+ const changedForDiff = quality.changedPaths.slice(0, compact ? 4 : 8);
2859
+ let diffText = "";
2860
+ if (changedForDiff.length > 0) {
2861
+ const diffResult = await git(repo, ["diff", "--", ...changedForDiff]);
2862
+ diffText = diffResult.ok ? diffResult.stdout : diffResult.stderr;
2863
+ }
2864
+ diffText = compactJobOutput(diffText, outputPolicyForRuntime(runtimeConfig)).slice(
2865
+ 0,
2866
+ resolveQualityCriticMaxDiffChars(runtimeConfig, compact),
2867
+ );
2868
+ const validationSummary = buildCriticValidationSummary(
2869
+ quality,
2870
+ resolveQualityCriticMaxValidationOutputChars(runtimeConfig, compact),
2871
+ );
2872
+ const criticUser = loadPromptTemplate("workerpals/task_quality_critic_user_prompt.md", {
2873
+ instruction,
2874
+ acceptance_criteria: acceptanceCriteriaText,
2875
+ validation_steps: validationStepsText,
2876
+ changed_paths: changedPathsText,
2877
+ diff_excerpt: diffText || "(empty diff excerpt)",
2878
+ validation_evidence: validationSummary || "(no validation output)",
2879
+ });
2880
+ const promptChars = criticSystem.length + criticUser.length;
2881
+ const promptBytes = new TextEncoder().encode(`${criticSystem}\n${criticUser}`).length;
2882
+ return {
2883
+ bodyBase: {
2884
+ model,
2885
+ messages: [
2886
+ { role: "system", content: criticSystem },
2887
+ { role: "user", content: criticUser },
2888
+ ],
2889
+ temperature: 0,
2890
+ max_tokens: compact ? 500 : 700,
2891
+ },
2892
+ promptChars,
2893
+ promptBytes,
2894
+ diffChars: diffText.length,
2895
+ validationChars: validationSummary.length,
2896
+ };
2736
2897
  };
2737
2898
 
2738
- const runCriticRequest = async (responseFormat: Record<string, unknown> | null) => {
2899
+ const runCriticRequest = async (
2900
+ bodyBase: Record<string, unknown>,
2901
+ responseFormat: Record<string, unknown> | null,
2902
+ ) => {
2739
2903
  const controller = new AbortController();
2740
- const timer = setTimeout(() => controller.abort(), qualityCriticTimeoutMs);
2904
+ let timedOut = false;
2905
+ const timer = setTimeout(() => {
2906
+ timedOut = true;
2907
+ controller.abort();
2908
+ }, qualityCriticTimeoutMs);
2741
2909
  try {
2742
2910
  const response = await fetch(endpoint, {
2743
2911
  method: "POST",
@@ -2748,14 +2916,29 @@ async function runTaskCriticReview(
2748
2916
  signal: controller.signal,
2749
2917
  });
2750
2918
  const text = await response.text();
2751
- return { response, text };
2919
+ return { timedOut: false as const, response, text };
2920
+ } catch (err) {
2921
+ if (!timedOut && String((err as { name?: unknown })?.name ?? "") !== "AbortError") {
2922
+ throw err;
2923
+ }
2924
+ return { timedOut: true as const, err };
2752
2925
  } finally {
2753
2926
  clearTimeout(timer);
2754
2927
  }
2755
2928
  };
2756
2929
 
2757
- try {
2758
- let request = await runCriticRequest({ type: "json_object" });
2930
+ const runAttempt = async (
2931
+ attempt: number,
2932
+ compact: boolean,
2933
+ ): Promise<{ status: "timeout" } | { status: "done"; review: CriticReview | null }> => {
2934
+ const payload = await buildAttemptPayload(compact);
2935
+ const startedAt = Date.now();
2936
+ onLog?.(
2937
+ "stdout",
2938
+ `[CriticGate] LLM review attempt ${attempt}${compact ? " (compact)" : ""}: model=${model} timeout_ms=${qualityCriticTimeoutMs} behavior=${timeoutBehavior} prompt_chars=${payload.promptChars} prompt_bytes=${payload.promptBytes} diff_chars=${payload.diffChars} validation_chars=${payload.validationChars}`,
2939
+ );
2940
+ let request = await runCriticRequest(payload.bodyBase, { type: "json_object" });
2941
+ if (request.timedOut) return { status: "timeout" };
2759
2942
  if (!request.response.ok && request.response.status === 400) {
2760
2943
  const lowered = request.text.toLowerCase();
2761
2944
  if (lowered.includes("response_format")) {
@@ -2763,7 +2946,8 @@ async function runTaskCriticReview(
2763
2946
  "stdout",
2764
2947
  "[CriticGate] fallback: response_format json_object unsupported; retrying without strict response_format.",
2765
2948
  );
2766
- request = await runCriticRequest(null);
2949
+ request = await runCriticRequest(payload.bodyBase, null);
2950
+ if (request.timedOut) return { status: "timeout" };
2767
2951
  }
2768
2952
  }
2769
2953
  if (!request.response.ok) {
@@ -2771,12 +2955,12 @@ async function runTaskCriticReview(
2771
2955
  "stderr",
2772
2956
  `[CriticGate] review request failed (${request.response.status}): ${toSingleLine(request.text, 240)}`,
2773
2957
  );
2774
- return null;
2958
+ return { status: "done", review: null };
2775
2959
  }
2776
2960
 
2777
- const payload = parseJsonObjectLoose(request.text) ?? JSON.parse(request.text);
2778
- const choices = Array.isArray((payload as Record<string, unknown>).choices)
2779
- ? ((payload as Record<string, unknown>).choices as Array<Record<string, unknown>>)
2961
+ const responsePayload = parseJsonObjectLoose(request.text) ?? JSON.parse(request.text);
2962
+ const choices = Array.isArray((responsePayload as Record<string, unknown>).choices)
2963
+ ? ((responsePayload as Record<string, unknown>).choices as Array<Record<string, unknown>>)
2780
2964
  : [];
2781
2965
  const content = String(
2782
2966
  (choices[0]?.message as Record<string, unknown> | undefined)?.content ?? "",
@@ -2790,7 +2974,7 @@ async function runTaskCriticReview(
2790
2974
  220,
2791
2975
  )}`,
2792
2976
  );
2793
- return null;
2977
+ return { status: "done", review: null };
2794
2978
  }
2795
2979
 
2796
2980
  const scoreRaw = Number(reviewObj.score);
@@ -2804,13 +2988,43 @@ async function runTaskCriticReview(
2804
2988
  .trim()
2805
2989
  .slice(0, 2000);
2806
2990
  const score = Number.isFinite(scoreRaw) ? Math.max(0, Math.min(10, scoreRaw)) : 0;
2991
+ onLog?.(
2992
+ "stdout",
2993
+ `[CriticGate] LLM review completed in ${Date.now() - startedAt}ms (attempt ${attempt}).`,
2994
+ );
2807
2995
  return {
2808
- score,
2809
- findings,
2810
- mustFix,
2811
- revisionGuidance,
2812
- raw: compactJobOutput(content, outputPolicyForRuntime(runtimeConfig)),
2996
+ status: "done",
2997
+ review: {
2998
+ score,
2999
+ findings,
3000
+ mustFix,
3001
+ revisionGuidance,
3002
+ raw: compactJobOutput(content, outputPolicyForRuntime(runtimeConfig)),
3003
+ },
2813
3004
  };
3005
+ };
3006
+
3007
+ try {
3008
+ let attempt = await runAttempt(1, false);
3009
+ if (attempt.status === "timeout" && timeoutBehavior === "retry_once") {
3010
+ onLog?.(
3011
+ "stderr",
3012
+ `[CriticGate] LLM review timed out after ${qualityCriticTimeoutMs}ms; retrying once with compact critic input.`,
3013
+ );
3014
+ attempt = await runAttempt(2, true);
3015
+ }
3016
+ if (attempt.status === "timeout") {
3017
+ if (timeoutBehavior === "block") {
3018
+ onLog?.(
3019
+ "stderr",
3020
+ `[CriticGate] LLM review timed out after ${qualityCriticTimeoutMs}ms; blocking because quality_critic_timeout_behavior=block.`,
3021
+ );
3022
+ return criticTimeoutReview("LLM", qualityCriticTimeoutMs, qualityCriticTimeoutMs);
3023
+ }
3024
+ onLog?.("stderr", `[CriticGate] LLM timed out after ${qualityCriticTimeoutMs}ms; skipping.`);
3025
+ return null;
3026
+ }
3027
+ return attempt.review;
2814
3028
  } catch (err) {
2815
3029
  onLog?.(
2816
3030
  "stderr",
@@ -2905,7 +3119,10 @@ export function buildQualityRevisionHint(
2905
3119
  "- Convergence mode: diagnostic-first repair. This same browser focus failed in the previous revision, so do not guess another selector or rewrite a different stage.",
2906
3120
  );
2907
3121
  lines.push(
2908
- "- Diagnostic requirement: before editing again, inspect or add a tiny temporary diagnostic around the failing stage that records locator counts, visible textContent, role/ARIA attributes, data-testid values, and a nearby DOM snippet for the candidate nodes.",
3122
+ "- Diagnostic requirement: before editing again, inspect or add a tiny temporary diagnostic around the failing stage that records locator counts, visible textContent, role/ARIA attributes, data-testid values, bounding boxes, and a nearby DOM snippet for the candidate nodes.",
3123
+ );
3124
+ lines.push(
3125
+ "- Artifact freshness rule: only trust screenshots/logs captured after the failing action in the current revision. If the screenshot is stale or stops before the failing locator, capture or print the DOM state instead of reasoning from that image.",
2909
3126
  );
2910
3127
  lines.push(
2911
3128
  "- React Native Web note: screenshots can show the intended state while Playwright reads a duplicate or stale rendered node. Prefer one unique selected-state test id or a semantic checked attribute on the stable pressable, then assert locator count and visibility.",
@@ -2947,7 +3164,7 @@ export function buildQualityRevisionHint(
2947
3164
  );
2948
3165
  if (browserRepairPacket.needsDiagnosticProbe) {
2949
3166
  lines.push(
2950
- `Validation rerun rule: PushPals ValidationGate will rerun "${browserRepairPacket.command}" after the patch, but this is now a repeated browser assertion. If a quick local startup probe shows the browser server can run in this executor, run one targeted "${browserRepairPacket.command}" confirmation after the DOM-backed fix. Do not hand off another unverified selector guess.`,
3167
+ `Validation rerun rule: PushPals ValidationGate will rerun "${browserRepairPacket.command}" after the patch, but this is now a repeated browser assertion. If a quick local startup probe shows the browser server can run in this executor, run exactly one targeted "${browserRepairPacket.command}" confirmation after the DOM-backed fix. Do not stop after fast checks only. Do not hand off another unverified selector guess.`,
2951
3168
  );
2952
3169
  } else {
2953
3170
  lines.push(
@@ -5462,86 +5679,92 @@ async function runCodexCriticReview(
5462
5679
 
5463
5680
  const instruction = String(params.instruction ?? "").trim();
5464
5681
  const planning = params.planning as TaskExecutePlanning;
5465
-
5466
- const changedForDiff = quality.changedPaths.slice(0, 8);
5467
- let diffText = "";
5468
- const qualityCriticMaxDiffChars = (() => {
5469
- const value = Number(runtimeConfig.workerpals.qualityCriticMaxDiffChars);
5470
- if (!Number.isFinite(value)) return 16_000;
5471
- return Math.max(256, Math.min(524_288, Math.floor(value)));
5472
- })();
5473
- const qualityCriticMaxValidationOutputChars = (() => {
5474
- const value = Number(runtimeConfig.workerpals.qualityCriticMaxValidationOutputChars);
5475
- if (!Number.isFinite(value)) return 8_000;
5476
- return Math.max(256, Math.min(524_288, Math.floor(value)));
5477
- })();
5478
- const qualityCriticTimeoutMs = (() => {
5479
- const value = Number(runtimeConfig.workerpals.qualityCriticTimeoutMs);
5480
- if (!Number.isFinite(value)) return 45_000;
5481
- return Math.max(1_000, Math.min(7_200_000, Math.floor(value)));
5482
- })();
5483
- if (changedForDiff.length > 0) {
5484
- const diffResult = await git(repo, ["diff", "--", ...changedForDiff]);
5485
- diffText = (diffResult.ok ? diffResult.stdout : diffResult.stderr).slice(
5682
+ const qualityCriticTimeoutMs = resolveQualityCriticTimeoutMs(runtimeConfig);
5683
+ const timeoutBehavior = resolveQualityCriticTimeoutBehavior(runtimeConfig);
5684
+ const criticModel = resolveQualityCriticModel(runtimeConfig);
5685
+
5686
+ const buildCriticInstruction = async (compact: boolean) => {
5687
+ const changedForDiff = quality.changedPaths.slice(0, compact ? 4 : 8);
5688
+ let diffText = "";
5689
+ if (changedForDiff.length > 0) {
5690
+ const diffResult = await git(repo, ["diff", "--", ...changedForDiff]);
5691
+ diffText = diffResult.ok ? diffResult.stdout : diffResult.stderr;
5692
+ }
5693
+ diffText = compactJobOutput(diffText, outputPolicyForRuntime(runtimeConfig)).slice(
5486
5694
  0,
5487
- qualityCriticMaxDiffChars,
5695
+ resolveQualityCriticMaxDiffChars(runtimeConfig, compact),
5488
5696
  );
5489
- }
5490
-
5491
- const validationSummary = quality.validationRuns
5492
- .map((run) => {
5493
- const output = [run.stdout, run.stderr]
5494
- .filter(Boolean)
5495
- .join("\n")
5496
- .slice(0, qualityCriticMaxValidationOutputChars);
5497
- return [
5498
- `Command: ${run.command}`,
5499
- `Result: ${run.ok ? "pass" : "fail"} (exit ${run.exitCode})`,
5500
- output,
5501
- ]
5502
- .filter(Boolean)
5503
- .join("\n");
5504
- })
5505
- .join("\n---\n");
5506
-
5507
- const criticInstruction = loadPromptTemplate(
5508
- "workerpals/codex_quality_critic_instruction_prompt.md",
5509
- {
5510
- instruction,
5511
- acceptance_criteria:
5512
- planning.acceptanceCriteria.map((c) => `- ${c}`).join("\n") || "- (none)",
5513
- changed_paths: quality.changedPaths.join(", ") || "(none)",
5514
- diff_section: diffText ? `Diff:\n${diffText}` : "Diff: (empty - no changes detected)",
5515
- validation_section: validationSummary
5516
- ? `Validation:\n${validationSummary}`
5517
- : "Validation: (none)",
5518
- },
5519
- );
5697
+ const validationSummary = buildCriticValidationSummary(
5698
+ quality,
5699
+ resolveQualityCriticMaxValidationOutputChars(runtimeConfig, compact),
5700
+ );
5701
+ const criticInstruction = loadPromptTemplate(
5702
+ "workerpals/codex_quality_critic_instruction_prompt.md",
5703
+ {
5704
+ instruction,
5705
+ acceptance_criteria:
5706
+ planning.acceptanceCriteria.map((c) => `- ${c}`).join("\n") || "- (none)",
5707
+ changed_paths: quality.changedPaths.join(", ") || "(none)",
5708
+ diff_section: diffText ? `Diff:\n${diffText}` : "Diff: (empty - no changes detected)",
5709
+ validation_section: validationSummary
5710
+ ? `Validation:\n${validationSummary}`
5711
+ : "Validation: (none)",
5712
+ },
5713
+ );
5714
+ return {
5715
+ criticInstruction,
5716
+ promptChars: criticInstruction.length,
5717
+ promptBytes: new TextEncoder().encode(criticInstruction).length,
5718
+ diffChars: diffText.length,
5719
+ validationChars: validationSummary.length,
5720
+ };
5721
+ };
5520
5722
 
5521
5723
  const tmpOutputPath = `/tmp/pushpals-critic-${Date.now()}-${Math.random().toString(36).slice(2, 8)}.txt`;
5522
- const cmd = [
5523
- ...codexPrefix,
5524
- "-c",
5525
- 'model_reasoning_effort="low"',
5526
- "-a",
5527
- "never",
5528
- "exec",
5529
- "-s",
5530
- "read-only",
5531
- "--output-last-message",
5532
- tmpOutputPath,
5533
- "-",
5534
- ];
5724
+ const buildCmd = () => {
5725
+ const cmd = [
5726
+ ...codexPrefix,
5727
+ "-c",
5728
+ 'model_reasoning_effort="low"',
5729
+ "-a",
5730
+ "never",
5731
+ "exec",
5732
+ "-s",
5733
+ "read-only",
5734
+ "--color",
5735
+ "never",
5736
+ "--output-last-message",
5737
+ tmpOutputPath,
5738
+ ];
5739
+ if (criticModel) cmd.push("-m", criticModel);
5740
+ cmd.push("-");
5741
+ return cmd;
5742
+ };
5535
5743
 
5536
5744
  const env = buildWorkerSandboxWritableEnv(repo);
5537
5745
  const codexMask = maskRepoLocalCodexFilesForCodexCli(repo, env);
5538
- try {
5539
- const proc = Bun.spawn(cmd, {
5746
+
5747
+ const runAttempt = async (
5748
+ attempt: number,
5749
+ compact: boolean,
5750
+ ): Promise<{ status: "timeout" } | { status: "done"; review: CriticReview | null }> => {
5751
+ try {
5752
+ unlinkSync(tmpOutputPath);
5753
+ } catch {
5754
+ /* ignore stale/missing critic output */
5755
+ }
5756
+ const payload = await buildCriticInstruction(compact);
5757
+ const startedAt = Date.now();
5758
+ onLog?.(
5759
+ "stdout",
5760
+ `[CriticGate] Codex review attempt ${attempt}${compact ? " (compact)" : ""}: model=${criticModel || "(codex default)"} timeout_ms=${qualityCriticTimeoutMs} behavior=${timeoutBehavior} prompt_chars=${payload.promptChars} prompt_bytes=${payload.promptBytes} diff_chars=${payload.diffChars} validation_chars=${payload.validationChars}`,
5761
+ );
5762
+ const proc = Bun.spawn(buildCmd(), {
5540
5763
  cwd: repo,
5541
5764
  env,
5542
5765
  stdout: "pipe",
5543
5766
  stderr: "pipe",
5544
- stdin: new Blob([criticInstruction]),
5767
+ stdin: new Blob([payload.criticInstruction]),
5545
5768
  });
5546
5769
 
5547
5770
  let timedOut = false;
@@ -5558,8 +5781,7 @@ async function runCodexCriticReview(
5558
5781
  clearTimeout(timer);
5559
5782
 
5560
5783
  if (timedOut) {
5561
- onLog?.("stderr", "[CriticGate] Codex timed out; skipping.");
5562
- return null;
5784
+ return { status: "timeout" };
5563
5785
  }
5564
5786
  if (exitCode !== 0) {
5565
5787
  const stderrText = await new Response(proc.stderr).text();
@@ -5567,7 +5789,7 @@ async function runCodexCriticReview(
5567
5789
  "stderr",
5568
5790
  `[CriticGate] Codex exited ${exitCode}: ${toSingleLine(stderrText, 220)}`,
5569
5791
  );
5570
- return null;
5792
+ return { status: "done", review: null };
5571
5793
  }
5572
5794
 
5573
5795
  let lastMessage = "";
@@ -5584,7 +5806,7 @@ async function runCodexCriticReview(
5584
5806
 
5585
5807
  if (!lastMessage) {
5586
5808
  onLog?.("stderr", "[CriticGate] Codex: no output message captured; skipping.");
5587
- return null;
5809
+ return { status: "done", review: null };
5588
5810
  }
5589
5811
 
5590
5812
  const reviewObj = parseJsonObjectLoose(lastMessage);
@@ -5593,7 +5815,7 @@ async function runCodexCriticReview(
5593
5815
  "stderr",
5594
5816
  `[CriticGate] Codex returned non-JSON: ${toSingleLine(lastMessage, 220)}`,
5595
5817
  );
5596
- return null;
5818
+ return { status: "done", review: null };
5597
5819
  }
5598
5820
 
5599
5821
  const scoreRaw = Number(reviewObj.score);
@@ -5607,14 +5829,43 @@ async function runCodexCriticReview(
5607
5829
  const revisionGuidance = String(reviewObj.revision_guidance ?? "")
5608
5830
  .trim()
5609
5831
  .slice(0, 2000);
5610
- onLog?.("stdout", `[CriticGate] Codex score: ${score}/10`);
5832
+ onLog?.(
5833
+ "stdout",
5834
+ `[CriticGate] Codex score: ${score}/10 (${Date.now() - startedAt}ms, attempt ${attempt})`,
5835
+ );
5611
5836
  return {
5612
- score,
5613
- findings,
5614
- mustFix,
5615
- revisionGuidance,
5616
- raw: compactJobOutput(lastMessage, outputPolicyForRuntime(runtimeConfig)),
5837
+ status: "done",
5838
+ review: {
5839
+ score,
5840
+ findings,
5841
+ mustFix,
5842
+ revisionGuidance,
5843
+ raw: compactJobOutput(lastMessage, outputPolicyForRuntime(runtimeConfig)),
5844
+ },
5617
5845
  };
5846
+ };
5847
+
5848
+ try {
5849
+ let attempt = await runAttempt(1, false);
5850
+ if (attempt.status === "timeout" && timeoutBehavior === "retry_once") {
5851
+ onLog?.(
5852
+ "stderr",
5853
+ `[CriticGate] Codex timed out after ${qualityCriticTimeoutMs}ms; retrying once with compact critic input.`,
5854
+ );
5855
+ attempt = await runAttempt(2, true);
5856
+ }
5857
+ if (attempt.status === "timeout") {
5858
+ if (timeoutBehavior === "block") {
5859
+ onLog?.(
5860
+ "stderr",
5861
+ `[CriticGate] Codex timed out after ${qualityCriticTimeoutMs}ms; blocking because quality_critic_timeout_behavior=block.`,
5862
+ );
5863
+ return criticTimeoutReview("Codex", qualityCriticTimeoutMs, qualityCriticTimeoutMs);
5864
+ }
5865
+ onLog?.("stderr", `[CriticGate] Codex timed out after ${qualityCriticTimeoutMs}ms; skipping.`);
5866
+ return null;
5867
+ }
5868
+ return attempt.review;
5618
5869
  } catch (err) {
5619
5870
  onLog?.("stderr", `[CriticGate] Codex error: ${toSingleLine(err, 220)} (skipping).`);
5620
5871
  return null;
@@ -5857,7 +6108,7 @@ export async function executeJob(
5857
6108
  );
5858
6109
  for (const run of quality.validationRuns) {
5859
6110
  if (run.ok) continue;
5860
- const digest = extractValidationFailureDigest(run);
6111
+ const digest = extractValidationFailureRetryDigest(run, repo);
5861
6112
  if (digest) previousValidationFailureDigests.set(validationCommandKey(run.command), digest);
5862
6113
  }
5863
6114
  const validationOutsideTaskScope =