npm - @pushpalsdev/cli - Versions diffs - 1.1.0 → 1.1.2 - Mend

@pushpalsdev/cli 1.1.0 → 1.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@pushpalsdev/cli",
-  "version": "1.1.0",
+  "version": "1.1.2",
   "description": "PushPals terminal CLI for LocalBuddy -> RemoteBuddy orchestration",
   "license": "MIT",
   "repository": {

package/runtime/prompts/workerpals/openai_codex_task_execute_system_prompt.md CHANGED Viewed

@@ -16,7 +16,7 @@ Execution rules:
 - If the hinted file is a thin wrapper or the behavior lives elsewhere, edit the behavior-owning file(s) needed to solve the task and explain the scope expansion in your final response.
 - Avoid irrelevant sprawl; the review agent will judge whether changed files are necessary for the requested outcome.
 - Read relevant files before editing, then run focused validation.
-- PushPals runs the deterministic ValidationGate after your edit, including any repo-required `vision.md` commands. During the editing turn, prefer focused/fast validation. Do not spend the main Codex execution budget repeatedly running long browser/e2e smoke commands such as `bun run web:e2e`; run them only when the task is specifically about the browser harness or when you need a final targeted confirmation and can stop promptly on a clear failure.
+- PushPals runs the deterministic ValidationGate after your edit, including any repo-required `vision.md` commands. During the editing turn, prefer focused/fast validation. Do not run long browser/e2e smoke commands such as `bun run web:e2e` by default from the Codex executor; ValidationGate is the authoritative browser runner and has the provisioned browser/runtime environment. For browser-harness tasks, inspect existing artifacts, run fast non-browser checks, and only run the full browser command once when a quick local startup probe shows it can run here and you need one targeted confirmation.
 - Use direct commands without shell wrappers. Prefer plain commands like `git diff -- path`, `git add <path>`, `git status --porcelain`, and `pwd`.
 - Do not wrap commands in `/bin/bash -lc`, `sh -lc`, `cmd /c`, or `powershell -Command`, and avoid pipelines, `awk`, heredocs, or multi-command shell snippets unless they are truly unavoidable.
 - If the command router rejects a command, simplify it to a single direct command instead of retrying more shell wrappers.

package/runtime/sandbox/apps/workerpals/src/backends/openai_codex/test_openai_codex_runtime_config.py CHANGED Viewed

@@ -295,6 +295,8 @@ class OpenAICodexRuntimeConfigTests(unittest.TestCase):
         template = _load_prompt_template("workerpals/openai_codex_task_execute_system_prompt.md")
         self.assertIn("Codex CLI is required infrastructure", template)
         self.assertIn("Use direct commands without shell wrappers", template)
+        self.assertIn("ValidationGate is the authoritative browser runner", template)
+        self.assertIn("Do not run long browser/e2e smoke commands", template)
     def test_extracts_usage_counts_from_nested_json_event(self) -> None:
         usage = _extract_usage_counts(

package/runtime/sandbox/apps/workerpals/src/docker_executor.ts CHANGED Viewed

@@ -43,6 +43,10 @@ const WORKERPAL_SANDBOX_COMPONENT_LABEL = "pushpals.component=workerpals-sandbox
 const DOCKER_IMAGE_INSPECT_TIMEOUT_MS = 15_000;
 const DOCKER_IMAGE_BUILD_TIMEOUT_MS = 10 * 60_000;
 const DOCKER_IMAGE_PULL_TIMEOUT_MS = 10 * 60_000;
+/** Repair passes budgeted on top of the initial attempt when sizing a browser-validation job timeout. */
+const BROWSER_VALIDATION_JOB_REPAIR_ATTEMPTS = 8;
+/** Flat per-attempt allowance added to the execution and finalization budgets (15 minutes). */
+const BROWSER_VALIDATION_JOB_OVERHEAD_MS = 900_000;
+/** Lower bound applied to the estimated browser-validation job timeout (4 hours). */
+const BROWSER_VALIDATION_JOB_MIN_TIMEOUT_MS = 14_400_000;
+/** Upper bound applied to the estimated browser-validation job timeout (8 hours). */
+const BROWSER_VALIDATION_JOB_MAX_TIMEOUT_MS = 28_800_000;
 function parseClampedInt(value: unknown, defaultValue: number, min: number, max: number): number {
   const parsed =
@@ -237,6 +241,75 @@ export interface Job {
   sessionId: string;
 }
+/**
+ * Coerces a loosely typed value into a positive whole number.
+ *
+ * Numbers are taken as-is and strings are parsed as base-10 integers; any other
+ * type is rejected. The value is rounded down BEFORE the positivity check so a
+ * fraction in (0, 1) yields `null` instead of leaking `0` to callers that rely
+ * on the "positive" contract (e.g. to fall back to a default budget).
+ *
+ * @param value - Candidate value, typically read from untyped job params.
+ * @returns The floored positive integer, or `null` when the value is not a
+ *   number/string, is not finite, or is smaller than 1 after rounding down.
+ */
+function readPositiveNumber(value: unknown): number | null {
+  let parsed = Number.NaN;
+  if (typeof value === "number") {
+    parsed = value;
+  } else if (typeof value === "string") {
+    parsed = Number.parseInt(value, 10);
+  }
+  if (!Number.isFinite(parsed)) return null;
+  const whole = Math.floor(parsed);
+  return whole > 0 ? whole : null;
+}
+/**
+ * Narrows an unknown value to a key/value record.
+ *
+ * @param value - Any value.
+ * @returns The same reference typed as a record when it is a truthy,
+ *   non-array object; otherwise `null`.
+ */
+function maybeRecord(value: unknown): Record<string, unknown> | null {
+  if (!value) return null;
+  if (typeof value !== "object" || Array.isArray(value)) return null;
+  return value as Record<string, unknown>;
+}
+/**
+ * Gathers every string in the job params that may mention a validation command.
+ *
+ * Sources are read in a fixed order: `instruction`, `plannerWorkerInstruction`,
+ * `validationSteps`, `requiredValidationSteps`, then the same two step lists
+ * under `planning`. A string source contributes itself; an array source
+ * contributes its direct string entries (nested arrays are not descended into).
+ *
+ * @param params - Untyped job parameters.
+ * @returns The collected strings, in source order.
+ */
+function collectValidationCommandHints(params: Record<string, unknown>): string[] {
+  const planning = maybeRecord(params.planning);
+  const sources: unknown[] = [
+    params.instruction,
+    params.plannerWorkerInstruction,
+    params.validationSteps,
+    params.requiredValidationSteps,
+    planning?.validationSteps,
+    planning?.requiredValidationSteps,
+  ];
+  return sources.flatMap((source): string[] => {
+    if (typeof source === "string") return [source];
+    if (!Array.isArray(source)) return [];
+    return source.filter((entry): entry is string => typeof entry === "string");
+  });
+}
+/**
+ * Reports whether a `task.execute` job mentions a browser/e2e validation command.
+ *
+ * @param job - The job's `kind` and `params`.
+ * @returns `true` when any collected command hint contains one of the known
+ *   browser-validation tokens (case-insensitive, word-bounded); `false` for
+ *   every other job kind or when nothing matches.
+ */
+function hasBrowserValidationCommand(job: Pick<Job, "kind" | "params">): boolean {
+  if (job.kind !== "task.execute") return false;
+  // Non-global pattern, so reusing it across hints carries no lastIndex state.
+  const browserCommandPattern =
+    /\b(web:e2e|e2e:web|browser:e2e|smoke:web|web:smoke|browser:smoke|playwright|cypress)\b/i;
+  for (const hint of collectValidationCommandHints(job.params)) {
+    if (browserCommandPattern.test(hint)) return true;
+  }
+  return false;
+}
+/**
+ * Picks the timeout to enforce for a job run by the Docker executor.
+ *
+ * The configured timeout is floored and raised to at least 10 seconds. Jobs
+ * without a browser-validation command use that value unchanged. For
+ * browser-validation jobs the timeout is estimated as
+ * `(repair attempts + 1) * (execution budget + finalization budget + overhead)`,
+ * clamped to the min/max browser-validation bounds, and never set lower than
+ * the configured base.
+ *
+ * @param configuredTimeoutMs - Timeout from executor options, in milliseconds.
+ * @param job - The job's `kind` and `params`; budgets are read from `params.planning`.
+ * @returns The timeout to enforce, in milliseconds.
+ */
+export function resolveDockerJobTimeoutMs(
+  configuredTimeoutMs: number,
+  job: Pick<Job, "kind" | "params">,
+): number {
+  const baseTimeoutMs = Math.max(10_000, Math.floor(configuredTimeoutMs));
+  if (!hasBrowserValidationCommand(job)) return baseTimeoutMs;
+  const planning = maybeRecord(job.params.planning);
+  // Defaults: 30 minutes of execution and 2 minutes of finalization per attempt.
+  const executionBudgetMs = readPositiveNumber(planning?.executionBudgetMs) ?? 1_800_000;
+  const finalizationBudgetMs = readPositiveNumber(planning?.finalizationBudgetMs) ?? 120_000;
+  const perAttemptMs =
+    executionBudgetMs + finalizationBudgetMs + BROWSER_VALIDATION_JOB_OVERHEAD_MS;
+  // One initial attempt plus the budgeted repairs.
+  const attemptCount = 1 + BROWSER_VALIDATION_JOB_REPAIR_ATTEMPTS;
+  const estimatedTimeoutMs = attemptCount * perAttemptMs;
+  const flooredEstimateMs = Math.max(BROWSER_VALIDATION_JOB_MIN_TIMEOUT_MS, estimatedTimeoutMs);
+  const boundedTimeoutMs = Math.min(BROWSER_VALIDATION_JOB_MAX_TIMEOUT_MS, flooredEstimateMs);
+  return Math.max(baseTimeoutMs, boundedTimeoutMs);
+}
 export class DockerExecutor {
   private options: Required<Omit<DockerExecutorOptions, "config">>;
   private worktreeDir: string;
@@ -1120,6 +1193,7 @@ export class DockerExecutor {
       worktreePath,
       onLog,
     );
+    await this.ensureWorktreeDependencyArtifacts(containerWorktreePath, onLog);
     const args: string[] = [
       "exec",
@@ -1140,9 +1214,15 @@ export class DockerExecutor {
       stdout: "pipe",
       stderr: "pipe",
     });
+    const timeoutMs = resolveDockerJobTimeoutMs(this.options.timeoutMs, job);
+    if (timeoutMs !== this.options.timeoutMs) {
+      const note = `[DockerExecutor] Extended job timeout for browser validation convergence: ${timeoutMs}ms (configured ${this.options.timeoutMs}ms).`;
+      console.log(note);
+      onLog?.("stdout", note);
+    }
     const { leadMs: warningLeadMs, delayMs: warningDelayMs } = computeTimeoutWarningWindow(
-      this.options.timeoutMs,
+      timeoutMs,
     );
     const warningTimer = setTimeout(() => {
       const warning = `[DockerExecutor] Job nearing timeout in warm container (${Math.round(
@@ -1171,7 +1251,7 @@ export class DockerExecutor {
       } catch {
         // Ignore kill errors
       }
-    }, this.options.timeoutMs);
+    }, timeoutMs);
     // Process streams
     const stdoutLines: string[] = [];
@@ -1191,11 +1271,56 @@ export class DockerExecutor {
     const result = this.parseResult(stdoutLines, stderrLines, exitCode, {
       timedOutByDocker,
       elapsedMs,
+      timeoutMs,
     });
     return result;
   }
+  /**
+   * Links shared dependency directories into the job worktree.
+   *
+   * Runs a shell script through `runWarmShell` that, for `node_modules`, creates
+   * a symlink at `<containerWorktreePath>/node_modules` pointing at
+   * `/repo/node_modules` when the source exists (or is itself a symlink) and the
+   * destination does not exist yet. A non-ok shell result is logged as a warning
+   * and otherwise ignored; successful links are logged as a note.
+   *
+   * @param containerWorktreePath - Worktree path used as the symlink destination prefix.
+   * @param onLog - Optional sink that receives the same note/warning lines.
+   */
+  private async ensureWorktreeDependencyArtifacts(
+    containerWorktreePath: string,
+    onLog?: (stream: "stdout" | "stderr", line: string) => void,
+  ): Promise<void> {
+    const quotedWorktreePrefix = shellSingleQuote(`${containerWorktreePath}/`);
+    const scriptLines = [
+      "set -eu",
+      "linked=\"\"",
+      "for name in node_modules; do",
+      "  src=\"/repo/$name\"",
+      `  dest=${quotedWorktreePrefix}$name`,
+      "  if { [ -e \"$src\" ] || [ -L \"$src\" ]; } && [ ! -e \"$dest\" ] && [ ! -L \"$dest\" ]; then",
+      "    ln -s \"$src\" \"$dest\"",
+      "    linked=\"$linked $name\"",
+      "  fi",
+      "done",
+      "printf '%s' \"$linked\"",
+    ];
+    const shellResult = await this.runWarmShell(scriptLines.join("\n"));
+    if (!shellResult.ok) {
+      const detail = [shellResult.stderr, shellResult.stdout].filter(Boolean).join("\n").trim();
+      const reason = detail || `exit ${shellResult.exitCode}`;
+      const warning = `[DockerExecutor] Worktree dependency artifact linking skipped: ${reason}`;
+      console.warn(warning);
+      onLog?.("stderr", warning);
+      return;
+    }
+    // The script prints the linked names separated by spaces.
+    const linkedNames = shellResult.stdout
+      .trim()
+      .split(/\s+/g)
+      .map((name) => name.trim())
+      .filter(Boolean);
+    if (linkedNames.length === 0) return;
+    const joinedNames = linkedNames.join(", ");
+    const note = `[DockerExecutor] Linked worktree dependency artifact(s): ${joinedNames}`;
+    console.log(note);
+    onLog?.("stdout", note);
+  }
   private async waitForWorktreePathInWarmContainer(
     containerWorktreePath: string,
     timeoutMs = 5_000,
@@ -1400,7 +1525,7 @@ export class DockerExecutor {
     stdoutLines: string[],
     stderrLines: string[],
     exitCode: number,
-    context: { timedOutByDocker: boolean; elapsedMs: number },
+    context: { timedOutByDocker: boolean; elapsedMs: number; timeoutMs: number },
   ): DockerJobResult {
     let sawSentinel = false;
     let sentinelParseError = "";
@@ -1442,7 +1567,7 @@ export class DockerExecutor {
     if (context.timedOutByDocker) {
       return {
         ok: false,
-        summary: `Job timed out in Docker executor after ${context.elapsedMs}ms (limit ${this.options.timeoutMs}ms; terminated before structured result).`,
+        summary: `Job timed out in Docker executor after ${context.elapsedMs}ms (limit ${context.timeoutMs}ms; terminated before structured result).`,
         stdout,
         stderr,
         exitCode,

package/runtime/sandbox/apps/workerpals/src/execute_job.ts CHANGED Viewed

@@ -3,7 +3,15 @@
  * Used by both the host Worker (direct mode) and the Docker job runner.
  */
-import { existsSync, lstatSync, readFileSync, renameSync, rmSync, unlinkSync } from "fs";
+import {
+  existsSync,
+  lstatSync,
+  readdirSync,
+  readFileSync,
+  renameSync,
+  rmSync,
+  unlinkSync,
+} from "fs";
 import { resolve } from "path";
 import {
   buildGitCommitArgs as buildSourceControlGitCommitArgs,
@@ -76,6 +84,24 @@ export interface ValidationBlocker {
   detail: string;
 }
+/** Coarse category assigned to a failed browser validation run. */
+type BrowserValidationFailureKind =
+  | "assertion"
+  | "startup"
+  | "runtime"
+  | "network"
+  | "unknown";
+/** Structured description of one failed browser validation command, used to build a focused repair hint. */
+export interface BrowserValidationRepairPacket {
+  /** The validation command that failed. */
+  command: string;
+  /** Category of the failure. */
+  failureKind: BrowserValidationFailureKind;
+  /** Stage name parsed from the failure text, when one was found. */
+  stage: string | null;
+  /** Locator/selector call parsed from the failure text, when one was found. */
+  selector: string | null;
+  /** "Expected ..." fragment parsed from the failure text, when one was found. */
+  expected: string | null;
+  /** Failure digest for the current run. */
+  digest: string;
+  /** Digest recorded earlier for the same command, or `null` if there was none. */
+  previousDigest: string | null;
+  /** Stage parsed from the previous digest. */
+  previousStage: string | null;
+  /** Selector parsed from the previous digest. */
+  previousSelector: string | null;
+  /** "Expected ..." fragment parsed from the previous digest. */
+  previousExpected: string | null;
+  /** How the current digest relates to the previous one. */
+  progress: "first_failure" | "same_failure" | "new_failure";
+  /** Paths of failure artifacts (screenshots, traces, logs, ...) to inspect. */
+  artifacts: string[];
+  /** Condensed relevant output for the failure. */
+  output: string;
+}
 interface DeterministicQualityResult {
   ok: boolean;
   skipped: boolean;
@@ -120,6 +146,42 @@ export interface QualityGatePolicy {
   criticMinScore: number;
 }
+/** Minimum number of automatic revisions allowed when browser validation is involved. */
+const BROWSER_VALIDATION_MAX_AUTO_REVISIONS = 8;
+/**
+ * Computes the largest revision count any quality-gate failure type may use.
+ *
+ * @param policy - The general and validation-specific auto-revision limits.
+ * @param opts - Set `browserValidation` to also consider the browser-validation limit.
+ * @returns The maximum of the applicable limits.
+ */
+export function qualityRevisionLoopUpperBound(
+  policy: { maxAutoRevisions: number; validationMaxAutoRevisions: number },
+  opts: { browserValidation?: boolean } = {},
+): number {
+  const browserLimit = opts.browserValidation ? BROWSER_VALIDATION_MAX_AUTO_REVISIONS : 0;
+  return Math.max(policy.maxAutoRevisions, policy.validationMaxAutoRevisions, browserLimit);
+}
+/**
+ * Reports whether the task parameters mention a long-running browser validation command.
+ *
+ * Strings are gathered (descending into nested arrays) from the planning and
+ * top-level step lists and from `instruction`, then each is checked with
+ * `isLongRunningBrowserValidationCommand`.
+ *
+ * @param params - Untyped task parameters.
+ * @returns `true` when at least one gathered string matches.
+ */
+function taskRequestsBrowserValidation(params: Record<string, unknown>): boolean {
+  const rawPlanning = params.planning;
+  const planning: Record<string, unknown> =
+    rawPlanning && typeof rawPlanning === "object"
+      ? (rawPlanning as Record<string, unknown>)
+      : {};
+  // Depth-first flatten that keeps only strings, preserving encounter order.
+  const flattenStrings = (value: unknown): string[] => {
+    if (typeof value === "string") return [value];
+    if (!Array.isArray(value)) return [];
+    return value.flatMap((item) => flattenStrings(item));
+  };
+  const candidates = [
+    planning.requiredValidationSteps,
+    planning.validationSteps,
+    params.requiredValidationSteps,
+    params.validationSteps,
+    params.instruction,
+  ].flatMap((source) => flattenStrings(source));
+  return candidates.some((candidate) => isLongRunningBrowserValidationCommand(candidate));
+}
 function shouldSoftPassValidationBlocker(
   policy: QualityGatePolicy,
   blocker: ValidationBlocker | null,
@@ -148,14 +210,17 @@ export function revisionLimitForQualityGateFailures(opts: {
   qualityIssues: string[];
   requiredValidationFailures: string[];
   blocker: ValidationBlocker | null;
+  browserRepairPacket?: BrowserValidationRepairPacket | null;
 }): number {
   const hasValidationGateFailure =
     opts.requiredValidationFailures.length > 0 ||
     opts.blocker !== null ||
     opts.qualityIssues.some((issue) => issue.startsWith("ValidationGate:"));
-  return hasValidationGateFailure
-    ? opts.policy.validationMaxAutoRevisions
-    : opts.policy.maxAutoRevisions;
+  if (!hasValidationGateFailure) return opts.policy.maxAutoRevisions;
+  if (opts.browserRepairPacket) {
+    return Math.max(opts.policy.validationMaxAutoRevisions, BROWSER_VALIDATION_MAX_AUTO_REVISIONS);
+  }
+  return opts.policy.validationMaxAutoRevisions;
 }
 // ─── Utilities ───────────────────────────────────────────────────────────────
@@ -1135,8 +1200,15 @@ export function prepareValidationCommandArgv(
   return [...argv, "--", "--port", port];
 }
-function isBrowserValidationInfrastructureDigest(digest: string): boolean {
-  return /\b(ERR_SOCKET_BAD_PORT|EADDRINUSE|ECONNREFUSED|ECONNRESET|ETIMEDOUT|timed out|timeout|port|browser runtime|playwright install|executable doesn't exist)\b/i.test(
+/**
+ * Reports whether a failure digest looks like a browser test assertion failure
+ * (smoke-test failure banner, locator/page timeouts, `waiting for getBy...(`,
+ * "Expected ... to be ... within Nms", `AssertionError`, or `Error: expect(`).
+ *
+ * @param digest - Failure text to inspect.
+ * @returns `true` when any assertion signature matches (case-insensitive).
+ */
+function isBrowserAssertionDigest(digest: string): boolean {
+  const assertionSignature =
+    /\b(Web end-to-end smoke test failed|locator\.[a-z0-9_]+:\s+Timeout\s+\d+ms\s+exceeded|page\.[a-z0-9_]+:\s+Timeout\s+\d+ms\s+exceeded|waiting for getBy(?:TestId|Role|Text|Label|Placeholder|Title)\(|Expected .+ to be .+ within \d+ms|AssertionError|Error:\s+expect\()/i;
+  return assertionSignature.test(digest);
+}
+export function isBrowserValidationInfrastructureDigest(digest: string): boolean {
+  if (isBrowserAssertionDigest(digest)) return false;
+  return /\b(browserType\.launch|ERR_SOCKET_BAD_PORT|EADDRINUSE|ECONNREFUSED|ECONNRESET|ETIMEDOUT|listen\s+EPERM|EPERM|EACCES|freeport|port selection|browser runtime|playwright install|executable doesn't exist|Expo exited early|local port bind|Validation command timed out|terminated by signal)\b/i.test(
     digest,
   );
 }
@@ -1466,7 +1538,7 @@ function parseChangedPathsFromStatus(statusOutput: string): string[] {
   return out;
 }
-function isLikelyTestPath(path: string): boolean {
+export function isAssertionCoverageTestPath(path: string): boolean {
   const normalized = path.replace(/\\/g, "/").toLowerCase();
   return (
     normalized.includes("/tests/") ||
@@ -1477,6 +1549,21 @@ function isLikelyTestPath(path: string): boolean {
   );
 }
+/**
+ * Reports whether a path looks like a browser smoke/e2e harness file.
+ *
+ * After normalizing separators to `/` and lower-casing, a path matches when it is:
+ * - a `scripts/test-*` script,
+ * - a `scripts/*` script whose name contains `e2e`, `smoke`, `playwright`, or `browser`, or
+ * - a `playwright.config.*` / `cypress.config.*` file.
+ *
+ * Accepted extensions are the JS/TS module variants `.js`, `.cjs`, `.mjs`,
+ * `.ts`, `.cts`, `.mts`, plus `.jsx`/`.tsx`, so harness files written as
+ * `.mts`/`.cts` (e.g. `playwright.config.mts`) are recognized as well.
+ *
+ * @param path - File path using either `/` or `\` separators.
+ * @returns `true` when the path matches one of the harness patterns.
+ */
+export function isBrowserSmokeHarnessPath(path: string): boolean {
+  const normalized = path.replace(/\\/g, "/").toLowerCase();
+  return (
+    /(^|\/)scripts\/test-[^/]*\.(?:[cm]?[jt]s|[jt]sx)$/.test(normalized) ||
+    /(^|\/)scripts\/[^/]*(?:e2e|smoke|playwright|browser)[^/]*\.(?:[cm]?[jt]s|[jt]sx)$/.test(
+      normalized,
+    ) ||
+    /(^|\/)(?:playwright|cypress)\.config\.(?:[cm]?[jt]s|[jt]sx)$/.test(normalized)
+  );
+}
+/**
+ * Reports whether a path is either an assertion-coverage test file or a
+ * browser smoke harness file.
+ *
+ * @param path - File path to classify.
+ * @returns `true` when either classifier accepts the path.
+ */
+export function isLikelyTestPath(path: string): boolean {
+  if (isAssertionCoverageTestPath(path)) return true;
+  return isBrowserSmokeHarnessPath(path);
+}
 function extractRunnableValidationCommand(step: string): string | null {
   const trimmed = step.trim();
   if (!trimmed) return null;
@@ -1582,6 +1669,288 @@ export function extractValidationFailureDigest(run: {
   return "";
 }
+/**
+ * Classifies browser validation failure text into a failure kind.
+ *
+ * ANSI control sequences are stripped first. Rules are checked in priority
+ * order — runtime, startup, network — and the first match wins; text matching
+ * none of them is "assertion" when it looks like an assertion digest and
+ * "unknown" otherwise.
+ *
+ * @param text - Raw failure text (digest and/or command output).
+ * @returns The detected failure kind.
+ */
+function classifyBrowserValidationFailureKindFromText(text: string): BrowserValidationFailureKind {
+  const cleaned = stripAnsiControlSequences(text);
+  const orderedRules: Array<[BrowserValidationFailureKind, RegExp]> = [
+    [
+      "runtime",
+      /\b(browserType\.launch|Executable doesn't exist|playwright install|Browser runtime preflight failed|Please run the following command to download new browsers|Validation command timed out|terminated by signal|SIGTERM|timed out after \d+ms)\b/i,
+    ],
+    [
+      "startup",
+      /\b(ERR_SOCKET_BAD_PORT|EADDRINUSE|listen\s+EPERM|EPERM|EACCES|freeport|port selection|Expo exited early|local port bind|cannot bind|operation not permitted)\b/i,
+    ],
+    [
+      "network",
+      /\b(page\.[a-z0-9_]+:\s+net::ERR_[A-Z0-9_]+|ECONNREFUSED|ECONNRESET|ETIMEDOUT)\b/i,
+    ],
+  ];
+  for (const [kind, pattern] of orderedRules) {
+    if (pattern.test(cleaned)) return kind;
+  }
+  return isBrowserAssertionDigest(cleaned) ? "assertion" : "unknown";
+}
+/**
+ * Pulls a stage name out of browser validation failure text.
+ *
+ * Tries, in order: "Browser validation failed during X stage",
+ * "failed during X stage", then a `stage:`/`phase=` style label.
+ *
+ * @param text - Failure text to search.
+ * @returns The first non-empty capture passed through `toSingleLine(…, 80)`,
+ *   or `null` when no pattern yields one.
+ */
+function extractBrowserValidationStage(text: string): string | null {
+  const stagePatterns = [
+    /\bBrowser validation failed during\s+([^:.\r\n]+?)\s+stage\b/i,
+    /\bfailed during\s+([^:.\r\n]+?)\s+stage\b/i,
+    /\b(?:stage|phase)\s*[:=]\s*["'`]?([^"'`.\r\n]+)["'`]?/i,
+  ];
+  for (const stagePattern of stagePatterns) {
+    const captured = stagePattern.exec(text)?.[1]?.trim();
+    if (captured) return toSingleLine(captured, 80);
+  }
+  return null;
+}
+/**
+ * Finds the first `getBy…(`, `locator.x(`, or `page.x(` call in the text whose
+ * parentheses balance, and returns the full call expression.
+ *
+ * Each candidate is scanned character by character from its start: quoted
+ * sections (', ", `) are skipped with backslash escapes honored, parentheses
+ * are counted outside quotes, and the scan of a candidate is abandoned when
+ * whitespace is met while no parenthesis is open.
+ *
+ * @param text - Failure text to search.
+ * @returns The balanced call passed through `toSingleLine(…, 120)`, or `null`
+ *   when no candidate closes.
+ */
+function extractBalancedLocatorCall(text: string): string | null {
+  const callPattern = /\b(?:getBy(?:TestId|Role|Text|Label|Placeholder|Title)|locator\.[a-z0-9_]+|page\.[a-z0-9_]+)\(/gi;
+  for (const call of text.matchAll(callPattern)) {
+    const start = call.index ?? 0;
+    let openParens = 0;
+    let activeQuote = "";
+    let skipNext = false;
+    for (let cursor = start; cursor < text.length; cursor += 1) {
+      const ch = text.charAt(cursor);
+      if (activeQuote !== "") {
+        // Inside a quoted section: only look for escapes and the closing quote.
+        if (skipNext) {
+          skipNext = false;
+        } else if (ch === "\\") {
+          skipNext = true;
+        } else if (ch === activeQuote) {
+          activeQuote = "";
+        }
+        continue;
+      }
+      if (ch === "'" || ch === '"' || ch === "`") {
+        activeQuote = ch;
+        continue;
+      }
+      if (ch === "(") {
+        openParens += 1;
+        continue;
+      }
+      if (ch === ")") {
+        openParens -= 1;
+        if (openParens === 0) return toSingleLine(text.slice(start, cursor + 1), 120);
+        continue;
+      }
+      // Whitespace outside any parenthesis means this candidate never opened a balanced call.
+      if (openParens <= 0 && cursor > start && /\s/.test(ch)) break;
+    }
+  }
+  return null;
+}
+/**
+ * Extracts the selector/locator call mentioned in browser validation failure text.
+ *
+ * A balanced call found by `extractBalancedLocatorCall` wins. Otherwise
+ * single-line regex fallbacks are tried in order: `waiting for getBy…(…)`,
+ * `locator.x(…)`, `page.x(…)`, then a bare `getBy…(…)`.
+ *
+ * @param text - Failure text to search.
+ * @returns The selector text, or `null` when nothing matches.
+ */
+function extractBrowserValidationSelector(text: string): string | null {
+  const balancedCall = extractBalancedLocatorCall(text);
+  if (balancedCall) return balancedCall;
+  const fallbackPatterns = [
+    /\bwaiting for\s+(getBy(?:TestId|Role|Text|Label|Placeholder|Title)\([^)\r\n]+\))/i,
+    /\b(locator\.[a-z0-9_]+\([^)\r\n]*\))/i,
+    /\b(page\.[a-z0-9_]+\([^)\r\n]*\))/i,
+    /\b(getBy(?:TestId|Role|Text|Label|Placeholder|Title)\([^)\r\n]+\))/i,
+  ];
+  for (const fallbackPattern of fallbackPatterns) {
+    const captured = fallbackPattern.exec(text)?.[1]?.trim();
+    if (captured) return toSingleLine(captured, 120);
+  }
+  return null;
+}
+/**
+ * Extracts the "Expected …" fragment from browser validation failure text.
+ *
+ * Tries, in order: "Expected X within Nms", "Expected X" terminated by `:`,
+ * `.`, or a line break, then "Expected X" running to the end of the text.
+ *
+ * @param text - Failure text to search.
+ * @returns The first non-empty capture passed through `toSingleLine(…, 140)`,
+ *   or `null` when no pattern yields one.
+ */
+function extractBrowserValidationExpectedUi(text: string): string | null {
+  const expectationPatterns = [
+    /\bExpected\s+([^:.\r\n]+?)\s+within\s+\d+ms\b/i,
+    /\bExpected\s+([^:.\r\n]+?)(?:[:.]|\r?\n)/i,
+    /\bExpected\s+([^:.\r\n]+?)$/i,
+  ];
+  for (const expectationPattern of expectationPatterns) {
+    const captured = expectationPattern.exec(text)?.[1]?.trim();
+    if (captured) return toSingleLine(captured, 140);
+  }
+  return null;
+}
+/**
+ * Collects up to four artifact paths (images, archives, JSON/text, video)
+ * mentioned in browser validation output.
+ *
+ * ANSI sequences are stripped, then two patterns are applied in order:
+ * labelled paths ("screenshot: …", "saved …: …") and bare paths under
+ * `outputs`, `test-results`, or `playwright-report`. Each hit is trimmed,
+ * stripped of trailing `),.;:` punctuation, and de-duplicated.
+ *
+ * @param text - Raw command output.
+ * @returns Artifact paths in discovery order, each passed through `toSingleLine(…, 220)`.
+ */
+function extractBrowserValidationArtifacts(text: string): string[] {
+  const cleaned = stripAnsiControlSequences(text);
+  const collected: string[] = [];
+  const rawSeen = new Set<string>();
+  const artifactPatterns = [
+    /\b(?:screenshot|snapshot|trace|video|artifact|output|saved|wrote)[^:\r\n]*:\s*(["'`]?)([^"'`\s]+(?:outputs|test-results|playwright-report)[^\s"'`]+(?:\.png|\.jpg|\.jpeg|\.webp|\.zip|\.json|\.txt|\.webm))\1/gi,
+    /((?:\/repo|\/workspace|[A-Za-z]:[\\/])?[^\s"'`]*?(?:outputs|test-results|playwright-report)[\\/][^\s"'`]+(?:\.png|\.jpg|\.jpeg|\.webp|\.zip|\.json|\.txt|\.webm))/gi,
+  ];
+  for (const artifactPattern of artifactPatterns) {
+    for (const hit of cleaned.matchAll(artifactPattern)) {
+      // The labelled pattern captures the path in group 2; the bare pattern uses group 1.
+      const candidate = String(hit[2] ?? hit[1] ?? "")
+        .trim()
+        .replace(/[),.;:]+$/, "");
+      if (candidate && !rawSeen.has(candidate)) {
+        rawSeen.add(candidate);
+        collected.push(toSingleLine(candidate, 220));
+      }
+      if (collected.length >= 4) return collected;
+    }
+  }
+  return collected;
+}
+/**
+ * Lists the most recently modified files under the repo's browser validation
+ * output directories (`outputs/web-e2e`, `test-results`, `playwright-report`).
+ *
+ * Directories are walked recursively; a directory is not entered once the
+ * depth exceeds 4 or more than 2,000 files have been gathered. Unreadable
+ * directories and files that vanish before they can be stat'ed are skipped.
+ *
+ * @param repo - Repository root; when falsy the result is empty.
+ * @param extensions - Pattern tested against each file name.
+ * @param limit - Maximum number of paths to return (default 8).
+ * @returns Absolute file paths, newest modification time first.
+ */
+function collectRecentBrowserValidationFiles(
+  repo: string | undefined,
+  extensions: RegExp,
+  limit = 8,
+): string[] {
+  if (!repo) return [];
+  const existingRoots = ["outputs/web-e2e", "test-results", "playwright-report"]
+    .map((relativeRoot) => resolve(repo, relativeRoot))
+    .filter((root) => existsSync(root));
+  const found: Array<{ path: string; mtimeMs: number }> = [];
+  const walk = (dir: string, depth: number): void => {
+    if (depth > 4 || found.length > 2_000) return;
+    let listing: Array<{ name: unknown; isDirectory(): boolean; isFile(): boolean }>;
+    try {
+      listing = readdirSync(dir, { withFileTypes: true });
+    } catch {
+      return;
+    }
+    for (const item of listing) {
+      const itemName = String(item.name);
+      const itemPath = resolve(dir, itemName);
+      if (item.isDirectory()) {
+        walk(itemPath, depth + 1);
+      } else if (item.isFile() && extensions.test(itemName)) {
+        try {
+          found.push({ path: itemPath, mtimeMs: lstatSync(itemPath).mtimeMs });
+        } catch {
+          // Ignore files that disappear while a validation command is cleaning up.
+        }
+      }
+    }
+  };
+  existingRoots.forEach((root) => walk(root, 0));
+  found.sort((a, b) => b.mtimeMs - a.mtimeMs);
+  return found.slice(0, limit).map((file) => file.path);
+}
+/**
+ * Lists up to six recent browser validation artifact files (images, archives,
+ * JSON, text/log, video) for inclusion in a repair packet.
+ *
+ * @param repo - Repository root; when falsy the result is empty.
+ * @returns Recent artifact paths, each passed through `toSingleLine(…, 220)`.
+ */
+function collectRecentBrowserValidationArtifacts(repo: string | undefined): string[] {
+  const artifactExtensions = /\.(?:png|jpe?g|webp|zip|json|txt|log|webm)$/i;
+  const recentFiles = collectRecentBrowserValidationFiles(repo, artifactExtensions, 6);
+  return recentFiles.map((filePath) => toSingleLine(filePath, 220));
+}
+/**
+ * Builds a one-line summary of the relevant lines in the three most recent
+ * browser validation `.log`/`.txt` files.
+ *
+ * For every readable file, non-empty trimmed lines that match a known failure
+ * or progress marker are kept and the last 18 are joined as
+ * `<file>: line | line | …`. Unreadable files and files without matching
+ * lines are skipped.
+ *
+ * @param repo - Repository root; when falsy no files are read.
+ * @returns The joined summaries passed through `toSingleLine(…, 1_400)`.
+ */
+function summarizeRecentBrowserValidationLogs(repo: string | undefined): string {
+  const relevantLine =
+    /\b(Web end-to-end smoke test failed|Browser validation failed|Expected |locator\.|page\.|waiting for |Call log:|Verified:|Saved screenshot|Saved trace|ERR_SOCKET_BAD_PORT|EADDRINUSE|EPERM|EACCES|browserType\.launch|Expo exited early|freeport|net::ERR_|Validation command timed out|terminated by signal|SIGTERM|timed out after \d+ms)/i;
+  const perFileSummaries: string[] = [];
+  for (const logPath of collectRecentBrowserValidationFiles(repo, /\.(?:log|txt)$/i, 3)) {
+    let raw = "";
+    try {
+      raw = readFileSync(logPath, "utf8");
+    } catch {
+      continue;
+    }
+    const keptLines: string[] = [];
+    for (const rawLine of stripAnsiControlSequences(raw).split(/\r?\n/)) {
+      const line = rawLine.trim();
+      if (line && relevantLine.test(line)) keptLines.push(line);
+    }
+    if (keptLines.length === 0) continue;
+    perFileSummaries.push(`${logPath}: ${keptLines.slice(-18).join(" | ")}`);
+  }
+  return toSingleLine(perFileSummaries.join(" | "), 1_400);
+}
+/**
+ * Merges several artifact lists into one de-duplicated list of at most eight entries.
+ *
+ * Entries are normalized with `toSingleLine(…, 220)`; empty results and
+ * repeats are dropped, and first-seen order is kept.
+ *
+ * @param sources - Artifact lists in priority order; `undefined` lists are ignored.
+ * @returns The merged artifact list.
+ */
+function mergeBrowserValidationArtifacts(...sources: Array<string[] | undefined>): string[] {
+  // A Set keeps insertion order, which doubles as the first-seen ordering.
+  const unique = new Set<string>();
+  for (const source of sources) {
+    if (!source) continue;
+    for (const artifact of source) {
+      const clean = toSingleLine(artifact, 220);
+      if (!clean) continue;
+      unique.add(clean);
+      if (unique.size >= 8) return [...unique];
+    }
+  }
+  return [...unique];
+}
+/**
+ * Condenses browser validation command output to its most relevant lines.
+ *
+ * ANSI sequences are stripped; non-empty trimmed lines matching a known
+ * failure marker are kept, and the first eight are joined with " | ".
+ *
+ * @param text - Raw command output.
+ * @returns The joined lines passed through `toSingleLine(…, 900)`.
+ */
+function summarizeBrowserValidationOutput(text: string): string {
+  const relevantLine =
+    /\b(Web end-to-end smoke test failed|Browser validation failed|Expected |locator\.|page\.|waiting for getBy|Call log:|ERR_SOCKET_BAD_PORT|EADDRINUSE|EPERM|EACCES|browserType\.launch|Executable doesn't exist|Expo exited early|freeport|net::ERR_|Validation command timed out|terminated by signal|SIGTERM|timed out after \d+ms)/i;
+  const keptLines: string[] = [];
+  for (const rawLine of stripAnsiControlSequences(text).split(/\r?\n/)) {
+    const line = rawLine.trim();
+    if (line && relevantLine.test(line)) keptLines.push(line);
+  }
+  return toSingleLine(keptLines.slice(0, 8).join(" | "), 900);
+}
+/**
+ * Builds a repair packet for the first failed long-running browser validation
+ * run whose failure kind can be classified.
+ *
+ * Runs that passed, are not long-running browser commands, or classify as
+ * "unknown" are skipped. For the selected run, stage/selector/expected are
+ * parsed from the command output enriched with a summary of recent log files,
+ * while the `previous*` fields are parsed from the previously recorded digest
+ * for the same command.
+ *
+ * @param validationRuns - Validation results to scan, in order.
+ * @param previousFailureDigests - Earlier digests keyed by `validationCommandKey(command)`.
+ * @param repo - Repository root used to locate recent logs/artifacts on disk.
+ * @returns The packet for the first qualifying run, or `null` when none qualifies.
+ */
+export function buildBrowserValidationRepairPacket(
+  validationRuns: ValidationExecutionResult[],
+  previousFailureDigests: Map<string, string> = new Map(),
+  repo?: string,
+): BrowserValidationRepairPacket | null {
+  for (const run of validationRuns) {
+    if (run.ok) continue;
+    if (!isLongRunningBrowserValidationCommand(run.command)) continue;
+    const rawOutput = [run.stderr, run.stdout].filter(Boolean).join("\n");
+    const combined = stripAnsiControlSequences(rawOutput);
+    const digest = extractValidationFailureDigest(run);
+    const failureKind = classifyBrowserValidationFailureKindFromText(`${digest}\n${combined}`);
+    if (failureKind === "unknown") continue;
+    const previousDigest = previousFailureDigests.get(validationCommandKey(run.command)) ?? null;
+    const recentLogSummary = summarizeRecentBrowserValidationLogs(repo);
+    const enrichedBrowserContext = [combined, recentLogSummary].filter(Boolean).join("\n");
+    let progress: BrowserValidationRepairPacket["progress"];
+    if (previousDigest == null) {
+      progress = "first_failure";
+    } else if (previousDigest === digest) {
+      progress = "same_failure";
+    } else {
+      progress = "new_failure";
+    }
+    const stage = extractBrowserValidationStage(enrichedBrowserContext);
+    const selector = extractBrowserValidationSelector(enrichedBrowserContext);
+    const expected = extractBrowserValidationExpectedUi(enrichedBrowserContext);
+    // An empty previous digest is treated like a missing one for detail extraction.
+    const previousStage = previousDigest ? extractBrowserValidationStage(previousDigest) : null;
+    const previousSelector = previousDigest
+      ? extractBrowserValidationSelector(previousDigest)
+      : null;
+    const previousExpected = previousDigest
+      ? extractBrowserValidationExpectedUi(previousDigest)
+      : null;
+    const artifacts = mergeBrowserValidationArtifacts(
+      extractBrowserValidationArtifacts(combined),
+      collectRecentBrowserValidationArtifacts(repo),
+    );
+    const outputParts = [summarizeBrowserValidationOutput(combined) || digest, recentLogSummary];
+    return {
+      command: run.command,
+      failureKind,
+      stage,
+      selector,
+      expected,
+      digest,
+      previousDigest,
+      previousStage,
+      previousSelector,
+      previousExpected,
+      progress,
+      artifacts,
+      output: outputParts.filter(Boolean).join(" | "),
+    };
+  }
+  return null;
+}
 export function collectRequiredValidationFailures(
   requiredCommands: string[],
   validationRuns: Array<{ command: string; ok: boolean; exitCode?: number }>,
@@ -1866,6 +2235,9 @@ async function runDeterministicQualityGate(
       [...changedPaths, ...preparedMergeConflictPaths].filter((path) => isLikelyTestPath(path)),
     ),
   );
+  const changedAssertionCoverageTestPaths = changedTestPaths.filter((path) =>
+    isAssertionCoverageTestPath(path),
+  );
   const issues: string[] = [];
   const scopeIssues: string[] = [];
   const validationIssues: string[] = [];
@@ -1890,8 +2262,8 @@ async function runDeterministicQualityGate(
     }
     if (
       isTestTask &&
-      changedTestPaths.length > 0 &&
-      !hasBalancedPositiveNegativeAssertions(changedTestPaths, repo)
+      changedAssertionCoverageTestPaths.length > 0 &&
+      !hasBalancedPositiveNegativeAssertions(changedAssertionCoverageTestPaths, repo)
     ) {
       addScopeIssue(
         "found changed test files without both positive and negative assertion coverage (expected both).",
@@ -2344,9 +2716,101 @@ export function buildQualityRevisionHint(
   reviewFixContext?: ReviewFixContext | null,
   validationRuns: ValidationExecutionResult[] = [],
   validationBlocker: ValidationBlocker | null = null,
+  browserRepairPacket: BrowserValidationRepairPacket | null = null,
 ): string {
   const lines: string[] = [];
   lines.push("Quality revision required before completion.");
+  const focusedBrowserRepair = Boolean(browserRepairPacket);
+  if (browserRepairPacket) {
+    lines.push("Primary ValidationGate repair objective:");
+    lines.push(`- Command: ${browserRepairPacket.command}`);
+    lines.push(`- Failure type: browser ${browserRepairPacket.failureKind}`);
+    lines.push(
+      "- First action: inspect the captured browser output/artifacts and actual rendered UI before editing; do not guess from component names or intended copy.",
+    );
+    if (browserRepairPacket.stage) lines.push(`- Stage: ${browserRepairPacket.stage}`);
+    if (browserRepairPacket.expected) {
+      lines.push(`- Expected UI: ${browserRepairPacket.expected}`);
+    }
+    if (browserRepairPacket.selector) {
+      lines.push(`- Selector/wait: ${browserRepairPacket.selector}`);
+    }
+    if (browserRepairPacket.artifacts.length > 0) {
+      lines.push("Failure artifacts to inspect:");
+      for (const artifact of browserRepairPacket.artifacts) {
+        lines.push(`- ${artifact}`);
+      }
+    } else {
+      lines.push(
+        "- Failure artifacts: none were captured in command output; if this repo writes screenshots/traces, inspect the latest browser failure artifact before changing selectors.",
+      );
+    }
+    if (browserRepairPacket.digest) {
+      lines.push(`- Current failure: ${browserRepairPacket.digest}`);
+    }
+    if (browserRepairPacket.previousDigest) {
+      const breadcrumb =
+        browserRepairPacket.progress === "same_failure"
+          ? "same failure repeated for this command"
+          : "new failure for this command after the previous revision";
+      lines.push(`- Breadcrumb: ${breadcrumb}; previous failure was ${browserRepairPacket.previousDigest}`);
+      if (
+        browserRepairPacket.previousStage ||
+        browserRepairPacket.previousExpected ||
+        browserRepairPacket.previousSelector
+      ) {
+        lines.push("Previous browser failure detail:");
+        if (browserRepairPacket.previousStage) {
+          lines.push(`- Previous stage: ${browserRepairPacket.previousStage}`);
+        }
+        if (browserRepairPacket.previousExpected) {
+          lines.push(`- Previous expected UI: ${browserRepairPacket.previousExpected}`);
+        }
+        if (browserRepairPacket.previousSelector) {
+          lines.push(`- Previous selector/wait: ${browserRepairPacket.previousSelector}`);
+        }
+      }
+    } else {
+      lines.push("- Breadcrumb: first captured failure for this command in this revision loop");
+    }
+    if (browserRepairPacket.output) {
+      lines.push(`- Relevant output: ${browserRepairPacket.output}`);
+    }
+    if (browserRepairPacket.failureKind === "assertion") {
+      lines.push(
+        "Repair direction: fix this exact visible UI assertion or the app state that should make it true. If the expected text/role/test id is not present in the screenshot, update the smoke assertion to the visible product UI that proves the same stage, or add accessibility metadata to an existing control. Do not add optional navigation or broaden the smoke path. Do not change browser startup, port selection, Playwright installation, or unrelated e2e harness behavior unless the captured failure is reclassified as startup/setup.",
+      );
+      lines.push(
+        "Selector stability rule: prefer existing data-testid/accessibility labels/roles and stage containers over guessed title/body text. If a stage already passed with a stable container such as a home/shell/test-id locator, reuse that signal instead of replacing it with copy checks.",
+      );
+      lines.push(
+        "Text assertion rule: rendered titles may be split across sibling nodes. Do not invent a combined phrase for split text; either assert the individual visible fragments within the stage container or add/reuse a stable test id/accessibility label.",
+      );
+      if (
+        browserRepairPacket.progress === "same_failure" ||
+        (browserRepairPacket.stage &&
+          browserRepairPacket.previousStage &&
+          browserRepairPacket.stage === browserRepairPacket.previousStage)
+      ) {
+        lines.push(
+          "Repeated-stage rule: this browser stage has failed before in the current revision loop, so treat the previous selector/copy assumption as suspect and switch to the most stable rendered locator for that same stage.",
+        );
+      }
+    } else {
+      lines.push(
+        "Repair direction: this is a browser startup/runtime/network failure. Fix only startup/runtime provisioning for this command and do not rewrite app UI assertions unless a later ValidationGate run reaches an assertion stage.",
+      );
+    }
+    lines.push(
+      "Convergence rule: preserve stages that already passed, repair only the current failing browser stage, and stop after one targeted browser confirmation so the next ValidationGate run gets a clean signal.",
+    );
+    lines.push(
+      "Executor sandbox rule: if the full browser command cannot run inside this edit turn because local server binding is denied or Expo/Playwright reports ERR_SOCKET_BAD_PORT, listen EPERM, EACCES, or a local port bind/freeport failure before reaching the app, treat that as a Codex executor verification limitation. Do not change app startup, ports, or browser provisioning for that local-only signal unless the ValidationGate failure above is also a startup/setup failure. Use the captured artifacts plus fast checks, then let ValidationGate perform the authoritative browser run.",
+    );
+    lines.push(
+      `Validation rerun rule: PushPals ValidationGate will rerun "${browserRepairPacket.command}" after the patch. During a focused browser repair turn, run fast non-browser checks and inspect captured artifacts first; do not run the full browser command from the Codex executor by default. Only run the full browser command for one targeted confirmation if artifacts are missing and a quick local bind/startup probe shows the browser server can actually run in this executor. Otherwise stop after fast checks so ValidationGate gets the clean authoritative signal.`,
+    );
+  }
   if (reviewFixContext) {
     lines.push("Rejected PR retry requirements:");
     if (reviewFixContext.previousReviewScore != null) {
@@ -2373,8 +2837,28 @@ export function buildQualityRevisionHint(
     lines.push("Raise the score above the approval threshold without reopening already accepted behavior.");
   }
   if (issues.length > 0) {
-    lines.push("Deterministic quality issues:");
-    for (const issue of issues) lines.push(`- ${issue}`);
+    const displayedIssues = focusedBrowserRepair
+      ? issues.filter(
+          (issue) =>
+            issue.startsWith("ValidationGate:") ||
+            issue.includes("Required vision.md validation") ||
+            issue.includes("Validation blocker"),
+        )
+      : issues;
+    if (displayedIssues.length > 0) {
+      lines.push(
+        focusedBrowserRepair
+          ? "Deterministic quality issues relevant to this validation repair:"
+          : "Deterministic quality issues:",
+      );
+      for (const issue of displayedIssues) lines.push(`- ${issue}`);
+    }
+    const suppressedCount = issues.length - displayedIssues.length;
+    if (focusedBrowserRepair && suppressedCount > 0) {
+      lines.push(
+        `Suppressed ${suppressedCount} lower-priority ScopeGate/CriticGate note(s) until the browser validation repair passes.`,
+      );
+    }
   }
   if (validationBlocker) {
     lines.push(
@@ -2387,7 +2871,10 @@ export function buildQualityRevisionHint(
   const failedValidationRuns = validationRuns.filter((run) => !run.ok);
   if (failedValidationRuns.length > 0) {
     lines.push("Validation failure diagnostics:");
-    for (const run of failedValidationRuns.slice(0, 5)) {
+    const runsToShow = browserRepairPacket
+      ? failedValidationRuns.filter((run) => run.command === browserRepairPacket.command).slice(0, 1)
+      : failedValidationRuns.slice(0, 5);
+    for (const run of runsToShow) {
       lines.push(`- ${run.command} failed with exit ${run.exitCode} after ${run.elapsedMs}ms.`);
       const output = toSingleLine(
         stripAnsiControlSequences([run.stderr, run.stdout].filter(Boolean).join("\n")),
@@ -2397,14 +2884,40 @@ export function buildQualityRevisionHint(
     }
   }
   if (critic) {
-    lines.push(`Critic score: ${critic.score.toFixed(1)} / 10`);
-    if (critic.mustFix.length > 0) {
+    const deferCriticForBrowserAssertion =
+      focusedBrowserRepair && browserRepairPacket?.failureKind === "assertion";
+    const criticIsSevere =
+      critic.score <= 4 ||
+      [...critic.mustFix, ...critic.findings, critic.revisionGuidance].some((entry) =>
+        /\b(browser|e2e|validation|web smoke|playwright)\b/i.test(entry),
+      );
+    if (deferCriticForBrowserAssertion) {
+      lines.push(
+        `CriticGate notes deferred while repairing the primary browser assertion failure (score ${critic.score.toFixed(1)} / 10).`,
+      );
+    } else if (!focusedBrowserRepair || criticIsSevere) {
+      lines.push(`Critic score: ${critic.score.toFixed(1)} / 10`);
+    }
+    if (
+      !deferCriticForBrowserAssertion &&
+      (!focusedBrowserRepair || criticIsSevere) &&
+      critic.mustFix.length > 0
+    ) {
       lines.push("Critic must-fix findings:");
       for (const issue of critic.mustFix) lines.push(`- ${issue}`);
     }
-    if (critic.revisionGuidance) {
+    if (
+      !deferCriticForBrowserAssertion &&
+      (!focusedBrowserRepair || criticIsSevere) &&
+      critic.revisionGuidance
+    ) {
       lines.push(`Critic revision guidance: ${critic.revisionGuidance}`);
     }
+    if (focusedBrowserRepair && !criticIsSevere && !deferCriticForBrowserAssertion) {
+      lines.push(
+        `CriticGate notes deferred while repairing the primary browser validation failure (score ${critic.score.toFixed(1)} / 10).`,
+      );
+    }
   }
   if (planning.acceptanceCriteria.length > 0) {
     lines.push("Required acceptance criteria:");
@@ -2661,10 +3174,14 @@ export type WorkerGitCommitIdentity = SourceControlCommitIdentity;
 export const explicitWorkerCommitIdentityFromEnv = explicitSourceControlCommitIdentityFromEnv;
+export function buildSandboxArtifactUnstageCommand(): string[] {
+  return ["reset", "-q", "--", ...SANDBOX_STAGE_ARTIFACT_PATHS];
+}
 async function unstageSandboxArtifactPaths(
   repo: string,
 ): Promise<{ ok: boolean; stdout: string; stderr: string }> {
-  return git(repo, ["reset", "-q", "--", ...SANDBOX_STAGE_ARTIFACT_PATHS]);
+  return git(repo, buildSandboxArtifactUnstageCommand());
 }
 async function resolveGitConfigValue(repo: string, key: string): Promise<string> {
@@ -4499,7 +5016,7 @@ function hasInvalidRepoPathHint(values: string[]): boolean {
   return values.some((entry) => normalizeStagePath(entry) === null);
 }
-const SANDBOX_STAGE_ARTIFACT_PATHS = ["workspace", "outputs", ".codex"];
+export const SANDBOX_STAGE_ARTIFACT_PATHS = ["workspace", "outputs", ".codex", "node_modules"];
 function taskExecuteOrigin(params: Record<string, unknown>): "autonomy" | "user" {
   const explicit = String(params.origin ?? "")
@@ -5049,10 +5566,9 @@ export async function executeJob(
   const qualityGatePolicy = deriveQualityGatePolicy(normalizedParams, runtimeConfig);
   const qualityMaxAutoRevisions = qualityGatePolicy.maxAutoRevisions;
   const qualityValidationMaxAutoRevisions = qualityGatePolicy.validationMaxAutoRevisions;
-  const qualityRevisionLoopMax = Math.max(
-    qualityMaxAutoRevisions,
-    qualityValidationMaxAutoRevisions,
-  );
+  const qualityRevisionLoopMax = qualityRevisionLoopUpperBound(qualityGatePolicy, {
+    browserValidation: taskRequestsBrowserValidation(normalizedParams),
+  });
   const qualitySoftPassOnExhausted = qualityGatePolicy.softPassOnExhausted;
   const qualityCriticMinScore = qualityGatePolicy.criticMinScore;
@@ -5190,6 +5706,11 @@ export async function executeJob(
         revisionAttempt,
       },
     );
+    const browserRepairPacket = buildBrowserValidationRepairPacket(
+      quality.validationRuns,
+      previousValidationFailureDigests,
+      repo,
+    );
     for (const run of quality.validationRuns) {
       if (run.ok) continue;
       const digest = extractValidationFailureDigest(run);
@@ -5324,8 +5845,15 @@ export async function executeJob(
         ? []
         : quality.requiredValidationFailures,
       blocker: validationOutsideTaskScope ? null : quality.blocker,
+      browserRepairPacket: validationOutsideTaskScope ? null : browserRepairPacket,
     });
-    const issueSummary = issues.map((entry) => toSingleLine(entry, 180)).join(" | ");
+    const issueSummary =
+      browserRepairPacket && !validationOutsideTaskScope
+        ? `ValidationGate browser ${browserRepairPacket.failureKind} repair for ${browserRepairPacket.command}: ${toSingleLine(
+            browserRepairPacket.digest,
+            180,
+          )}`
+        : issues.map((entry) => toSingleLine(entry, 180)).join(" | ");
     if (quality.blocker && !validationOutsideTaskScope) {
       const blockerSummary = `Quality gate blocked by ${quality.blocker.category} issue: ${quality.blocker.detail}`;
       const blockerDiagnostics = truncate(
@@ -5339,7 +5867,7 @@ export async function executeJob(
         requiredValidationFailures: quality.requiredValidationFailures,
         blocker: quality.blocker,
         revisionAttempt,
-        maxAutoRevisions: qualityValidationMaxAutoRevisions,
+        maxAutoRevisions: activeMaxAutoRevisions,
         outsideTaskScope: validationOutsideTaskScope,
       });
       if (requiredValidationCanRevise) {
@@ -5456,6 +5984,7 @@ export async function executeJob(
       reviewFixContext,
       validationOutsideTaskScope ? [] : quality.validationRuns,
       validationOutsideTaskScope ? null : quality.blocker,
+      validationOutsideTaskScope ? null : browserRepairPacket,
     );
     onLog?.(
       "stderr",

package/runtime/sandbox/package.json CHANGED Viewed

@@ -10,6 +10,7 @@
     "cli:integration": "bun run scripts/cli-integration.ts",
     "cli:bundle": "bun run --cwd packages/cli build",
     "cli:monitor:export": "bun run scripts/sync-cli-monitor-ui.ts",
+    "replay:worker-job": "bun run scripts/replay-worker-job.ts",
     "protocol:build": "bun --cwd packages/protocol build",
     "protocol:typecheck": "bun --cwd packages/protocol typecheck",
     "server:only": "bun --cwd apps/server --env-file ../../.env dev",

package/runtime/sandbox/prompts/workerpals/openai_codex_task_execute_system_prompt.md CHANGED Viewed

@@ -16,7 +16,7 @@ Execution rules:
 - If the hinted file is a thin wrapper or the behavior lives elsewhere, edit the behavior-owning file(s) needed to solve the task and explain the scope expansion in your final response.
 - Avoid irrelevant sprawl; the review agent will judge whether changed files are necessary for the requested outcome.
 - Read relevant files before editing, then run focused validation.
-- PushPals runs the deterministic ValidationGate after your edit, including any repo-required `vision.md` commands. During the editing turn, prefer focused/fast validation. Do not spend the main Codex execution budget repeatedly running long browser/e2e smoke commands such as `bun run web:e2e`; run them only when the task is specifically about the browser harness or when you need a final targeted confirmation and can stop promptly on a clear failure.
+- PushPals runs the deterministic ValidationGate after your edit, including any repo-required `vision.md` commands. During the editing turn, prefer focused/fast validation. Do not run long browser/e2e smoke commands such as `bun run web:e2e` by default from the Codex executor; ValidationGate is the authoritative browser runner and has the provisioned browser/runtime environment. For browser-harness tasks, inspect existing artifacts, run fast non-browser checks, and only run the full browser command once when a quick local startup probe shows it can run here and you need one targeted confirmation.
 - Use direct commands without shell wrappers. Prefer plain commands like `git diff -- path`, `git add <path>`, `git status --porcelain`, and `pwd`.
 - Do not wrap commands in `/bin/bash -lc`, `sh -lc`, `cmd /c`, or `powershell -Command`, and avoid pipelines, `awk`, heredocs, or multi-command shell snippets unless they are truly unavoidable.
 - If the command router rejects a command, simplify it to a single direct command instead of retrying more shell wrappers.