@pushpalsdev/cli 1.1.1 → 1.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/runtime/prompts/workerpals/openai_codex_task_execute_system_prompt.md +1 -1
- package/runtime/sandbox/apps/workerpals/src/backends/openai_codex/test_openai_codex_runtime_config.py +2 -0
- package/runtime/sandbox/apps/workerpals/src/docker_executor.ts +84 -4
- package/runtime/sandbox/apps/workerpals/src/execute_job.ts +252 -17
- package/runtime/sandbox/prompts/workerpals/openai_codex_task_execute_system_prompt.md +1 -1
package/package.json
CHANGED
|
@@ -16,7 +16,7 @@ Execution rules:
|
|
|
16
16
|
- If the hinted file is a thin wrapper or the behavior lives elsewhere, edit the behavior-owning file(s) needed to solve the task and explain the scope expansion in your final response.
|
|
17
17
|
- Avoid irrelevant sprawl; the review agent will judge whether changed files are necessary for the requested outcome.
|
|
18
18
|
- Read relevant files before editing, then run focused validation.
|
|
19
|
-
- PushPals runs the deterministic ValidationGate after your edit, including any repo-required `vision.md` commands. During the editing turn, prefer focused/fast validation. Do not
|
|
19
|
+
- PushPals runs the deterministic ValidationGate after your edit, including any repo-required `vision.md` commands. During the editing turn, prefer focused/fast validation. Do not run long browser/e2e smoke commands such as `bun run web:e2e` by default from the Codex executor; ValidationGate is the authoritative browser runner and has the provisioned browser/runtime environment. For browser-harness tasks, inspect existing artifacts, run fast non-browser checks, and only run the full browser command once when a quick local startup probe shows it can run here and you need one targeted confirmation.
|
|
20
20
|
- Use direct commands without shell wrappers. Prefer plain commands like `git diff -- path`, `git add <path>`, `git status --porcelain`, and `pwd`.
|
|
21
21
|
- Do not wrap commands in `/bin/bash -lc`, `sh -lc`, `cmd /c`, or `powershell -Command`, and avoid pipelines, `awk`, heredocs, or multi-command shell snippets unless they are truly unavoidable.
|
|
22
22
|
- If the command router rejects a command, simplify it to a single direct command instead of retrying more shell wrappers.
|
|
@@ -295,6 +295,8 @@ class OpenAICodexRuntimeConfigTests(unittest.TestCase):
|
|
|
295
295
|
template = _load_prompt_template("workerpals/openai_codex_task_execute_system_prompt.md")
|
|
296
296
|
self.assertIn("Codex CLI is required infrastructure", template)
|
|
297
297
|
self.assertIn("Use direct commands without shell wrappers", template)
|
|
298
|
+
self.assertIn("ValidationGate is the authoritative browser runner", template)
|
|
299
|
+
self.assertIn("Do not run long browser/e2e smoke commands", template)
|
|
298
300
|
|
|
299
301
|
def test_extracts_usage_counts_from_nested_json_event(self) -> None:
|
|
300
302
|
usage = _extract_usage_counts(
|
|
@@ -43,6 +43,10 @@ const WORKERPAL_SANDBOX_COMPONENT_LABEL = "pushpals.component=workerpals-sandbox
|
|
|
43
43
|
const DOCKER_IMAGE_INSPECT_TIMEOUT_MS = 15_000;
|
|
44
44
|
const DOCKER_IMAGE_BUILD_TIMEOUT_MS = 10 * 60_000;
|
|
45
45
|
const DOCKER_IMAGE_PULL_TIMEOUT_MS = 10 * 60_000;
|
|
46
|
+
const BROWSER_VALIDATION_JOB_REPAIR_ATTEMPTS = 8;
|
|
47
|
+
const BROWSER_VALIDATION_JOB_OVERHEAD_MS = 15 * 60_000;
|
|
48
|
+
const BROWSER_VALIDATION_JOB_MIN_TIMEOUT_MS = 4 * 60 * 60_000;
|
|
49
|
+
const BROWSER_VALIDATION_JOB_MAX_TIMEOUT_MS = 8 * 60 * 60_000;
|
|
46
50
|
|
|
47
51
|
function parseClampedInt(value: unknown, defaultValue: number, min: number, max: number): number {
|
|
48
52
|
const parsed =
|
|
@@ -237,6 +241,75 @@ export interface Job {
|
|
|
237
241
|
sessionId: string;
|
|
238
242
|
}
|
|
239
243
|
|
|
244
|
+
/**
 * Coerces a loosely-typed value into a positive whole number.
 *
 * Numbers are taken as-is and strings are parsed as base-10 integers via
 * `Number.parseInt`; every other type is rejected.
 *
 * @param value - Candidate value of unknown type.
 * @returns The value floored to a whole number when that whole number is
 *   finite and greater than zero, otherwise `null`.
 */
function readPositiveNumber(value: unknown): number | null {
  const parsed =
    typeof value === "number"
      ? value
      : typeof value === "string"
        ? Number.parseInt(value, 10)
        : Number.NaN;
  if (!Number.isFinite(parsed)) return null;
  // Floor BEFORE the positivity check: a fraction such as 0.5 floors to 0, which is
  // not a positive number and must yield null so callers' `?? default` fallbacks apply.
  const whole = Math.floor(parsed);
  return whole > 0 ? whole : null;
}
|
|
254
|
+
|
|
255
|
+
/**
 * Narrows an unknown value to a plain key/value record.
 *
 * @param value - Any value.
 * @returns The same reference typed as a record when it is a truthy,
 *   non-array object; `null` for everything else.
 */
function maybeRecord(value: unknown): Record<string, unknown> | null {
  if (!value) return null;
  if (typeof value !== "object") return null;
  if (Array.isArray(value)) return null;
  return value as Record<string, unknown>;
}
|
|
260
|
+
|
|
261
|
+
/**
 * Gathers every string that may name a validation command for a job.
 *
 * Reads `instruction`, `plannerWorkerInstruction`, `validationSteps` and
 * `requiredValidationSteps` from `params`, plus `validationSteps` and
 * `requiredValidationSteps` from `params.planning` when that is a record.
 * A string source contributes itself; an array source contributes its string
 * entries (non-string entries are dropped); any other source is ignored.
 *
 * @param params - Raw job parameters.
 * @returns The collected strings, in source order.
 */
function collectValidationCommandHints(params: Record<string, unknown>): string[] {
  const planning = maybeRecord(params.planning);
  const sources: unknown[] = [
    params.instruction,
    params.plannerWorkerInstruction,
    params.validationSteps,
    params.requiredValidationSteps,
    planning?.validationSteps,
    planning?.requiredValidationSteps,
  ];
  return sources.flatMap((source): string[] => {
    if (typeof source === "string") return [source];
    if (!Array.isArray(source)) return [];
    return source.filter((entry): entry is string => typeof entry === "string");
  });
}
|
|
283
|
+
|
|
284
|
+
/**
 * Reports whether a job's command hints mention a browser/e2e validation command.
 *
 * Only `task.execute` jobs qualify. A hint matches when it contains, as a whole
 * word and case-insensitively, one of the script or tool names listed in the
 * pattern below.
 *
 * @param job - The job's `kind` and `params`.
 * @returns `true` when at least one collected hint matches.
 */
function hasBrowserValidationCommand(job: Pick<Job, "kind" | "params">): boolean {
  if (job.kind !== "task.execute") return false;
  // No `g` flag, so the pattern carries no state between `test` calls.
  const browserCommandPattern =
    /\b(web:e2e|e2e:web|browser:e2e|smoke:web|web:smoke|browser:smoke|playwright|cypress)\b/i;
  for (const hint of collectValidationCommandHints(job.params)) {
    if (browserCommandPattern.test(hint)) return true;
  }
  return false;
}
|
|
292
|
+
|
|
293
|
+
/**
 * Chooses the wall-clock timeout for a Docker job.
 *
 * Jobs without a browser validation command get the configured timeout, floored
 * to a whole number and raised to at least 10 seconds. Jobs with one get a
 * budget sized for the initial attempt plus every repair attempt, clamped to the
 * browser-validation minimum/maximum, and never less than the configured value.
 *
 * @param configuredTimeoutMs - Timeout from the executor options, in milliseconds.
 * @param job - The job's `kind` and `params`; `params.planning` may carry
 *   `executionBudgetMs` and `finalizationBudgetMs` overrides.
 * @returns The timeout to apply, in milliseconds.
 */
export function resolveDockerJobTimeoutMs(
  configuredTimeoutMs: number,
  job: Pick<Job, "kind" | "params">,
): number {
  const configuredFloorMs = Math.max(10_000, Math.floor(configuredTimeoutMs));
  if (!hasBrowserValidationCommand(job)) {
    return configuredFloorMs;
  }

  const planning = maybeRecord(job.params.planning);
  // Defaults: 30 minutes of execution and 2 minutes of finalization per attempt.
  const executionBudgetMs = readPositiveNumber(planning?.executionBudgetMs) ?? 1_800_000;
  const finalizationBudgetMs = readPositiveNumber(planning?.finalizationBudgetMs) ?? 120_000;
  const perAttemptMs =
    executionBudgetMs + finalizationBudgetMs + BROWSER_VALIDATION_JOB_OVERHEAD_MS;
  // One initial attempt plus the allowed number of repairs.
  const attemptCount = BROWSER_VALIDATION_JOB_REPAIR_ATTEMPTS + 1;
  const estimatedMs = attemptCount * perAttemptMs;

  const atLeastMinimumMs = Math.max(BROWSER_VALIDATION_JOB_MIN_TIMEOUT_MS, estimatedMs);
  const clampedMs = Math.min(BROWSER_VALIDATION_JOB_MAX_TIMEOUT_MS, atLeastMinimumMs);
  // An explicitly larger configured timeout always wins over the clamp.
  return Math.max(configuredFloorMs, clampedMs);
}
|
|
312
|
+
|
|
240
313
|
export class DockerExecutor {
|
|
241
314
|
private options: Required<Omit<DockerExecutorOptions, "config">>;
|
|
242
315
|
private worktreeDir: string;
|
|
@@ -1141,9 +1214,15 @@ export class DockerExecutor {
|
|
|
1141
1214
|
stdout: "pipe",
|
|
1142
1215
|
stderr: "pipe",
|
|
1143
1216
|
});
|
|
1217
|
+
const timeoutMs = resolveDockerJobTimeoutMs(this.options.timeoutMs, job);
|
|
1218
|
+
if (timeoutMs !== this.options.timeoutMs) {
|
|
1219
|
+
const note = `[DockerExecutor] Extended job timeout for browser validation convergence: ${timeoutMs}ms (configured ${this.options.timeoutMs}ms).`;
|
|
1220
|
+
console.log(note);
|
|
1221
|
+
onLog?.("stdout", note);
|
|
1222
|
+
}
|
|
1144
1223
|
|
|
1145
1224
|
const { leadMs: warningLeadMs, delayMs: warningDelayMs } = computeTimeoutWarningWindow(
|
|
1146
|
-
|
|
1225
|
+
timeoutMs,
|
|
1147
1226
|
);
|
|
1148
1227
|
const warningTimer = setTimeout(() => {
|
|
1149
1228
|
const warning = `[DockerExecutor] Job nearing timeout in warm container (${Math.round(
|
|
@@ -1172,7 +1251,7 @@ export class DockerExecutor {
|
|
|
1172
1251
|
} catch {
|
|
1173
1252
|
// Ignore kill errors
|
|
1174
1253
|
}
|
|
1175
|
-
},
|
|
1254
|
+
}, timeoutMs);
|
|
1176
1255
|
|
|
1177
1256
|
// Process streams
|
|
1178
1257
|
const stdoutLines: string[] = [];
|
|
@@ -1192,6 +1271,7 @@ export class DockerExecutor {
|
|
|
1192
1271
|
const result = this.parseResult(stdoutLines, stderrLines, exitCode, {
|
|
1193
1272
|
timedOutByDocker,
|
|
1194
1273
|
elapsedMs,
|
|
1274
|
+
timeoutMs,
|
|
1195
1275
|
});
|
|
1196
1276
|
|
|
1197
1277
|
return result;
|
|
@@ -1445,7 +1525,7 @@ export class DockerExecutor {
|
|
|
1445
1525
|
stdoutLines: string[],
|
|
1446
1526
|
stderrLines: string[],
|
|
1447
1527
|
exitCode: number,
|
|
1448
|
-
context: { timedOutByDocker: boolean; elapsedMs: number },
|
|
1528
|
+
context: { timedOutByDocker: boolean; elapsedMs: number; timeoutMs: number },
|
|
1449
1529
|
): DockerJobResult {
|
|
1450
1530
|
let sawSentinel = false;
|
|
1451
1531
|
let sentinelParseError = "";
|
|
@@ -1487,7 +1567,7 @@ export class DockerExecutor {
|
|
|
1487
1567
|
if (context.timedOutByDocker) {
|
|
1488
1568
|
return {
|
|
1489
1569
|
ok: false,
|
|
1490
|
-
summary: `Job timed out in Docker executor after ${context.elapsedMs}ms (limit ${
|
|
1570
|
+
summary: `Job timed out in Docker executor after ${context.elapsedMs}ms (limit ${context.timeoutMs}ms; terminated before structured result).`,
|
|
1491
1571
|
stdout,
|
|
1492
1572
|
stderr,
|
|
1493
1573
|
exitCode,
|
|
@@ -3,7 +3,15 @@
|
|
|
3
3
|
* Used by both the host Worker (direct mode) and the Docker job runner.
|
|
4
4
|
*/
|
|
5
5
|
|
|
6
|
-
import {
|
|
6
|
+
import {
|
|
7
|
+
existsSync,
|
|
8
|
+
lstatSync,
|
|
9
|
+
readdirSync,
|
|
10
|
+
readFileSync,
|
|
11
|
+
renameSync,
|
|
12
|
+
rmSync,
|
|
13
|
+
unlinkSync,
|
|
14
|
+
} from "fs";
|
|
7
15
|
import { resolve } from "path";
|
|
8
16
|
import {
|
|
9
17
|
buildGitCommitArgs as buildSourceControlGitCommitArgs,
|
|
@@ -84,12 +92,15 @@ export interface BrowserValidationRepairPacket {
|
|
|
84
92
|
stage: string | null;
|
|
85
93
|
selector: string | null;
|
|
86
94
|
expected: string | null;
|
|
95
|
+
failureFocus: string | null;
|
|
87
96
|
digest: string;
|
|
88
97
|
previousDigest: string | null;
|
|
89
98
|
previousStage: string | null;
|
|
90
99
|
previousSelector: string | null;
|
|
91
100
|
previousExpected: string | null;
|
|
101
|
+
previousFailureFocus: string | null;
|
|
92
102
|
progress: "first_failure" | "same_failure" | "new_failure";
|
|
103
|
+
needsDiagnosticProbe: boolean;
|
|
93
104
|
artifacts: string[];
|
|
94
105
|
output: string;
|
|
95
106
|
}
|
|
@@ -138,7 +149,41 @@ export interface QualityGatePolicy {
|
|
|
138
149
|
criticMinScore: number;
|
|
139
150
|
}
|
|
140
151
|
|
|
141
|
-
const BROWSER_VALIDATION_MAX_AUTO_REVISIONS =
|
|
152
|
+
/** Revision ceiling applied when a task involves browser validation. */
const BROWSER_VALIDATION_MAX_AUTO_REVISIONS = 8;

/**
 * Computes the upper bound on quality-revision loop iterations.
 *
 * @param policy - The general and validation-specific auto-revision limits.
 * @param opts - Set `browserValidation` to also consider the browser-validation ceiling.
 * @returns The largest of the two policy limits and the browser ceiling (or `0`
 *   when browser validation is not requested), so the result is never negative
 *   for finite inputs.
 */
export function qualityRevisionLoopUpperBound(
  policy: {
    maxAutoRevisions: number;
    validationMaxAutoRevisions: number;
  },
  opts: {
    browserValidation?: boolean;
  } = {},
): number {
  let browserCeiling = 0;
  if (opts.browserValidation) {
    browserCeiling = BROWSER_VALIDATION_MAX_AUTO_REVISIONS;
  }
  const policyCeiling = Math.max(policy.maxAutoRevisions, policy.validationMaxAutoRevisions);
  return Math.max(policyCeiling, browserCeiling);
}
|
|
166
|
+
|
|
167
|
+
/**
 * Reports whether a task's parameters reference a long-running browser
 * validation command.
 *
 * Strings are gathered (recursing through nested arrays) from the planning
 * object's `requiredValidationSteps` and `validationSteps`, then from the same
 * two fields on `params`, then from `params.instruction`. Each string is checked
 * with `isLongRunningBrowserValidationCommand`.
 *
 * @param params - Normalized job parameters.
 * @returns `true` when any gathered string is classified as a browser validation command.
 */
function taskRequestsBrowserValidation(params: Record<string, unknown>): boolean {
  // Flattens a string / arbitrarily nested array of strings; other values yield nothing.
  const flattenStrings = (value: unknown): string[] => {
    if (typeof value === "string") return [value];
    if (!Array.isArray(value)) return [];
    const collected: string[] = [];
    for (const item of value) collected.push(...flattenStrings(item));
    return collected;
  };

  let planning: Record<string, unknown> = {};
  if (params.planning && typeof params.planning === "object") {
    planning = params.planning as Record<string, unknown>;
  }

  const candidates: string[] = [
    ...flattenStrings(planning.requiredValidationSteps),
    ...flattenStrings(planning.validationSteps),
    ...flattenStrings(params.requiredValidationSteps),
    ...flattenStrings(params.validationSteps),
    ...flattenStrings(params.instruction),
  ];
  for (const candidate of candidates) {
    if (isLongRunningBrowserValidationCommand(candidate)) return true;
  }
  return false;
}
|
|
142
187
|
|
|
143
188
|
function shouldSoftPassValidationBlocker(
|
|
144
189
|
policy: QualityGatePolicy,
|
|
@@ -1663,9 +1708,48 @@ function extractBrowserValidationStage(text: string): string | null {
|
|
|
1663
1708
|
const value = match?.[1]?.trim();
|
|
1664
1709
|
if (value) return toSingleLine(value, 80);
|
|
1665
1710
|
}
|
|
1711
|
+
const verifiedStages = [...text.matchAll(/\bVerified:\s+([^|\r\n]+)/gi)]
|
|
1712
|
+
.map((match) => match[1]?.trim())
|
|
1713
|
+
.filter((entry): entry is string => Boolean(entry));
|
|
1714
|
+
const lastVerifiedStage = verifiedStages.at(-1);
|
|
1715
|
+
if (lastVerifiedStage) return toSingleLine(lastVerifiedStage, 80);
|
|
1666
1716
|
return null;
|
|
1667
1717
|
}
|
|
1668
1718
|
|
|
1719
|
+
/**
 * Derives a short label for the UI area a browser validation failure concerns.
 *
 * The stage, selector, expected text and raw text are joined, stripped of ANSI
 * sequences and lower-cased. The label is then chosen by the first of:
 *   1. a keyword rule (checked in the listed order),
 *   2. a test-id style locator, reported as `test id <id>`,
 *   3. the first five alphanumeric words of the text.
 *
 * @param params - Optional failure details; falsy entries are ignored.
 * @returns The focus label, or `null` when there is no usable text.
 */
function inferBrowserValidationFailureFocus(params: {
  stage?: string | null;
  selector?: string | null;
  expected?: string | null;
  text?: string | null;
}): string | null {
  const fragments = [params.stage, params.selector, params.expected, params.text].filter(Boolean);
  const haystack = stripAnsiControlSequences(fragments.join(" ")).toLowerCase();
  if (!haystack.trim()) return null;

  // Ordered: the first rule whose pattern matches anywhere in the text wins.
  const focusRules: Array<[RegExp, string]> = [
    [/\b(settings|ui[-\s]?size|scale(?:\s+option)?|settings-ui-|large ui option|medium|compact)\b/i, "settings UI size"],
    [/\b(shop|skin|ship-option|projectile-option)\b/i, "shop navigation"],
    [/\b(home|shell|home-screen|home-play|play button|landing)\b/i, "home shell"],
    [/\b(match[-\s]?entry|start match|game-screen|countdown)\b/i, "match entry"],
    [/\b(in[-\s]?game|game-control|help-menu|planet|deploy|allocation|resource|decoy|attack|defense|tank)\b/i, "in-game UI"],
  ];
  const matchedRule = focusRules.find(([pattern]) => pattern.test(haystack));
  if (matchedRule) return matchedRule[1];

  const testIdMatch = haystack.match(/\b(?:getbytestid|data-testid|testid)\(?['"`]?([a-z0-9_-]+)/i);
  if (testIdMatch?.[1]) return `test id ${testIdMatch[1]}`;

  // Fallback: the first five alphanumeric words of the failure text.
  const words = haystack
    .replace(/[^a-z0-9]+/g, " ")
    .trim()
    .split(/\s+/);
  const leadingWords = words.slice(0, 5).join(" ");
  if (!leadingWords) return null;
  return toSingleLine(leadingWords, 80);
}
|
|
1752
|
+
|
|
1669
1753
|
function extractBalancedLocatorCall(text: string): string | null {
|
|
1670
1754
|
const callPattern = /\b(?:getBy(?:TestId|Role|Text|Label|Placeholder|Title)|locator\.[a-z0-9_]+|page\.[a-z0-9_]+)\(/gi;
|
|
1671
1755
|
let match: RegExpExecArray | null;
|
|
@@ -1760,13 +1844,102 @@ function extractBrowserValidationArtifacts(text: string): string[] {
|
|
|
1760
1844
|
return out;
|
|
1761
1845
|
}
|
|
1762
1846
|
|
|
1847
|
+
/**
 * Lists the most recently modified browser-validation files under a repo.
 *
 * Walks `outputs/web-e2e`, `test-results` and `playwright-report` (those that
 * exist) depth-first, descending at most four levels below each root and not
 * entering further directories once more than 2,000 files have been recorded.
 * Unreadable directories and files that vanish mid-walk are skipped.
 *
 * @param repo - Repository root; when falsy nothing is scanned.
 * @param extensions - Pattern tested against each regular file's name.
 * @param limit - Maximum number of paths to return (default 8).
 * @returns Absolute paths ordered newest-first by modification time.
 */
function collectRecentBrowserValidationFiles(
  repo: string | undefined,
  extensions: RegExp,
  limit = 8,
): string[] {
  if (!repo) return [];

  const roots: string[] = [];
  for (const relativeRoot of ["outputs/web-e2e", "test-results", "playwright-report"]) {
    const absoluteRoot = resolve(repo, relativeRoot);
    if (existsSync(absoluteRoot)) roots.push(absoluteRoot);
  }

  const found: Array<{ path: string; mtimeMs: number }> = [];
  const walk = (dir: string, depth: number): void => {
    if (depth > 4 || found.length > 2_000) return;
    let children: Array<{ name: unknown; isDirectory(): boolean; isFile(): boolean }>;
    try {
      children = readdirSync(dir, { withFileTypes: true });
    } catch {
      // Unreadable or missing directory: nothing to collect here.
      return;
    }
    for (const child of children) {
      const childName = String(child.name);
      const childPath = resolve(dir, childName);
      if (child.isDirectory()) {
        walk(childPath, depth + 1);
      } else if (child.isFile() && extensions.test(childName)) {
        try {
          const info = lstatSync(childPath);
          found.push({ path: childPath, mtimeMs: info.mtimeMs });
        } catch {
          // Ignore files that disappear while a validation command is cleaning up.
        }
      }
    }
  };
  roots.forEach((root) => walk(root, 0));

  found.sort((left, right) => right.mtimeMs - left.mtimeMs);
  return found.slice(0, limit).map((file) => file.path);
}
|
|
1887
|
+
|
|
1888
|
+
/**
 * Lists up to six recently modified artifact files (images, archives, JSON,
 * text/log files and WebM videos) from the repo's browser-validation output
 * directories, each passed through `toSingleLine` with a limit of 220.
 *
 * @param repo - Repository root; when undefined the result is empty.
 * @returns Artifact paths, newest first.
 */
function collectRecentBrowserValidationArtifacts(repo: string | undefined): string[] {
  const recentPaths = collectRecentBrowserValidationFiles(
    repo,
    /\.(?:png|jpe?g|webp|zip|json|txt|log|webm)$/i,
    6,
  );
  const artifacts: string[] = [];
  for (const artifactPath of recentPaths) {
    artifacts.push(toSingleLine(artifactPath, 220));
  }
  return artifacts;
}
|
|
1895
|
+
|
|
1896
|
+
/**
 * Builds a one-line digest of the failure-relevant lines in recent validation logs.
 *
 * Reads up to three of the most recently modified `.log`/`.txt` files, keeps the
 * trimmed non-empty lines that match the signal pattern below, and contributes
 * `<file>: <last 18 matching lines joined by " | ">` per file. Unreadable files
 * and files without matching lines are skipped.
 *
 * @param repo - Repository root; when undefined no files are read.
 * @returns The per-file digests joined by " | " and passed through
 *   `toSingleLine` with a limit of 1,400.
 */
function summarizeRecentBrowserValidationLogs(repo: string | undefined): string {
  // No `g` flag, so reusing the pattern across lines carries no state.
  const signalPattern =
    /\b(Web end-to-end smoke test failed|Browser validation failed|Expected |locator\.|page\.|waiting for |Call log:|Verified:|Saved screenshot|Saved trace|ERR_SOCKET_BAD_PORT|EADDRINUSE|EPERM|EACCES|browserType\.launch|Expo exited early|freeport|net::ERR_|Validation command timed out|terminated by signal|SIGTERM|timed out after \d+ms)/i;
  const digests: string[] = [];
  for (const logPath of collectRecentBrowserValidationFiles(repo, /\.(?:log|txt)$/i, 3)) {
    let raw: string;
    try {
      raw = readFileSync(logPath, "utf8");
    } catch {
      // The file may have been removed since it was listed; move on.
      continue;
    }
    const signalLines: string[] = [];
    for (const rawLine of stripAnsiControlSequences(raw).split(/\r?\n/)) {
      const line = rawLine.trim();
      if (line && signalPattern.test(line)) signalLines.push(line);
    }
    if (signalLines.length > 0) {
      digests.push(`${logPath}: ${signalLines.slice(-18).join(" | ")}`);
    }
  }
  return toSingleLine(digests.join(" | "), 1_400);
}
|
|
1920
|
+
|
|
1921
|
+
/**
 * Merges artifact lists into one de-duplicated list of at most eight entries.
 *
 * Each artifact is passed through `toSingleLine` (limit 220); empty results and
 * repeats are dropped. Order follows first appearance across the sources, and
 * merging stops as soon as eight entries have been collected.
 *
 * @param sources - Artifact lists in priority order; `undefined` lists are skipped.
 * @returns The merged artifact list.
 */
function mergeBrowserValidationArtifacts(...sources: Array<string[] | undefined>): string[] {
  // A Set iterates in insertion order, so it doubles as the ordered result.
  const merged = new Set<string>();
  for (const source of sources) {
    if (!source) continue;
    for (const artifact of source) {
      const clean = toSingleLine(artifact, 220);
      if (!clean || merged.has(clean)) continue;
      merged.add(clean);
      if (merged.size >= 8) return Array.from(merged);
    }
  }
  return Array.from(merged);
}
|
|
1935
|
+
|
|
1763
1936
|
function summarizeBrowserValidationOutput(text: string): string {
|
|
1764
1937
|
const lines = stripAnsiControlSequences(text)
|
|
1765
1938
|
.split(/\r?\n/)
|
|
1766
1939
|
.map((line) => line.trim())
|
|
1767
1940
|
.filter(Boolean)
|
|
1768
1941
|
.filter((line) =>
|
|
1769
|
-
/\b(Web end-to-end smoke test failed|Browser validation failed|Expected |locator\.|page\.|waiting for getBy|Call log:|ERR_SOCKET_BAD_PORT|EADDRINUSE|EPERM|EACCES|browserType\.launch|Executable doesn't exist|Expo exited early|freeport|net::ERR_|Validation command timed out|terminated by signal|SIGTERM|timed out after \d+ms)
|
|
1942
|
+
/\b(Web end-to-end smoke test failed|Browser validation failed|Expected |locator\.|page\.|waiting for getBy|Call log:|ERR_SOCKET_BAD_PORT|EADDRINUSE|EPERM|EACCES|browserType\.launch|Executable doesn't exist|Expo exited early|freeport|net::ERR_|Validation command timed out|terminated by signal|SIGTERM|timed out after \d+ms)/i.test(
|
|
1770
1943
|
line,
|
|
1771
1944
|
),
|
|
1772
1945
|
);
|
|
@@ -1776,6 +1949,7 @@ function summarizeBrowserValidationOutput(text: string): string {
|
|
|
1776
1949
|
export function buildBrowserValidationRepairPacket(
|
|
1777
1950
|
validationRuns: ValidationExecutionResult[],
|
|
1778
1951
|
previousFailureDigests: Map<string, string> = new Map(),
|
|
1952
|
+
repo?: string,
|
|
1779
1953
|
): BrowserValidationRepairPacket | null {
|
|
1780
1954
|
for (const run of validationRuns) {
|
|
1781
1955
|
if (run.ok || !isLongRunningBrowserValidationCommand(run.command)) continue;
|
|
@@ -1784,26 +1958,64 @@ export function buildBrowserValidationRepairPacket(
|
|
|
1784
1958
|
const failureKind = classifyBrowserValidationFailureKindFromText(`${digest}\n${combined}`);
|
|
1785
1959
|
if (failureKind === "unknown") continue;
|
|
1786
1960
|
const previousDigest = previousFailureDigests.get(validationCommandKey(run.command)) ?? null;
|
|
1961
|
+
const recentLogSummary = summarizeRecentBrowserValidationLogs(repo);
|
|
1962
|
+
const enrichedBrowserContext = [combined, recentLogSummary].filter(Boolean).join("\n");
|
|
1963
|
+
const stage = extractBrowserValidationStage(enrichedBrowserContext);
|
|
1964
|
+
const selector = extractBrowserValidationSelector(enrichedBrowserContext);
|
|
1965
|
+
const expected = extractBrowserValidationExpectedUi(enrichedBrowserContext);
|
|
1966
|
+
const previousStage = previousDigest ? extractBrowserValidationStage(previousDigest) : null;
|
|
1967
|
+
const previousSelector = previousDigest ? extractBrowserValidationSelector(previousDigest) : null;
|
|
1968
|
+
const previousExpected = previousDigest ? extractBrowserValidationExpectedUi(previousDigest) : null;
|
|
1969
|
+
const failureFocus = inferBrowserValidationFailureFocus({
|
|
1970
|
+
stage,
|
|
1971
|
+
selector,
|
|
1972
|
+
expected,
|
|
1973
|
+
text: enrichedBrowserContext,
|
|
1974
|
+
});
|
|
1975
|
+
const previousFailureFocus = previousDigest
|
|
1976
|
+
? inferBrowserValidationFailureFocus({
|
|
1977
|
+
stage: previousStage,
|
|
1978
|
+
selector: previousSelector,
|
|
1979
|
+
expected: previousExpected,
|
|
1980
|
+
text: previousDigest,
|
|
1981
|
+
})
|
|
1982
|
+
: null;
|
|
1787
1983
|
const progress =
|
|
1788
1984
|
previousDigest == null
|
|
1789
1985
|
? "first_failure"
|
|
1790
1986
|
: previousDigest === digest
|
|
1791
1987
|
? "same_failure"
|
|
1792
1988
|
: "new_failure";
|
|
1989
|
+
const needsDiagnosticProbe =
|
|
1990
|
+
failureKind === "assertion" &&
|
|
1991
|
+
Boolean(previousDigest) &&
|
|
1992
|
+
Boolean(failureFocus) &&
|
|
1993
|
+
failureFocus === previousFailureFocus;
|
|
1793
1994
|
return {
|
|
1794
1995
|
command: run.command,
|
|
1795
1996
|
failureKind,
|
|
1796
|
-
stage
|
|
1797
|
-
selector
|
|
1798
|
-
expected
|
|
1997
|
+
stage,
|
|
1998
|
+
selector,
|
|
1999
|
+
expected,
|
|
2000
|
+
failureFocus,
|
|
1799
2001
|
digest,
|
|
1800
2002
|
previousDigest,
|
|
1801
|
-
previousStage
|
|
1802
|
-
previousSelector
|
|
1803
|
-
previousExpected
|
|
2003
|
+
previousStage,
|
|
2004
|
+
previousSelector,
|
|
2005
|
+
previousExpected,
|
|
2006
|
+
previousFailureFocus,
|
|
1804
2007
|
progress,
|
|
1805
|
-
|
|
1806
|
-
|
|
2008
|
+
needsDiagnosticProbe,
|
|
2009
|
+
artifacts: mergeBrowserValidationArtifacts(
|
|
2010
|
+
extractBrowserValidationArtifacts(combined),
|
|
2011
|
+
collectRecentBrowserValidationArtifacts(repo),
|
|
2012
|
+
),
|
|
2013
|
+
output: [
|
|
2014
|
+
summarizeBrowserValidationOutput(combined) || digest,
|
|
2015
|
+
recentLogSummary,
|
|
2016
|
+
]
|
|
2017
|
+
.filter(Boolean)
|
|
2018
|
+
.join(" | "),
|
|
1807
2019
|
};
|
|
1808
2020
|
}
|
|
1809
2021
|
return null;
|
|
@@ -2587,6 +2799,9 @@ export function buildQualityRevisionHint(
|
|
|
2587
2799
|
"- First action: inspect the captured browser output/artifacts and actual rendered UI before editing; do not guess from component names or intended copy.",
|
|
2588
2800
|
);
|
|
2589
2801
|
if (browserRepairPacket.stage) lines.push(`- Stage: ${browserRepairPacket.stage}`);
|
|
2802
|
+
if (browserRepairPacket.failureFocus) {
|
|
2803
|
+
lines.push(`- Failure focus: ${browserRepairPacket.failureFocus}`);
|
|
2804
|
+
}
|
|
2590
2805
|
if (browserRepairPacket.expected) {
|
|
2591
2806
|
lines.push(`- Expected UI: ${browserRepairPacket.expected}`);
|
|
2592
2807
|
}
|
|
@@ -2631,6 +2846,17 @@ export function buildQualityRevisionHint(
|
|
|
2631
2846
|
} else {
|
|
2632
2847
|
lines.push("- Breadcrumb: first captured failure for this command in this revision loop");
|
|
2633
2848
|
}
|
|
2849
|
+
if (browserRepairPacket.needsDiagnosticProbe) {
|
|
2850
|
+
lines.push(
|
|
2851
|
+
"- Convergence mode: diagnostic-first repair. This same browser focus failed in the previous revision, so do not guess another selector or rewrite a different stage.",
|
|
2852
|
+
);
|
|
2853
|
+
lines.push(
|
|
2854
|
+
"- Diagnostic requirement: before editing again, inspect or add a tiny temporary diagnostic around the failing stage that records locator counts, visible textContent, role/ARIA attributes, data-testid values, and a nearby DOM snippet for the candidate nodes.",
|
|
2855
|
+
);
|
|
2856
|
+
lines.push(
|
|
2857
|
+
"- React Native Web note: screenshots can show the intended state while Playwright reads a duplicate or stale rendered node. Prefer one unique selected-state test id or a semantic checked attribute on the stable pressable, then assert locator count and visibility.",
|
|
2858
|
+
);
|
|
2859
|
+
}
|
|
2634
2860
|
if (browserRepairPacket.output) {
|
|
2635
2861
|
lines.push(`- Relevant output: ${browserRepairPacket.output}`);
|
|
2636
2862
|
}
|
|
@@ -2663,8 +2889,17 @@ export function buildQualityRevisionHint(
|
|
|
2663
2889
|
"Convergence rule: preserve stages that already passed, repair only the current failing browser stage, and stop after one targeted browser confirmation so the next ValidationGate run gets a clean signal.",
|
|
2664
2890
|
);
|
|
2665
2891
|
lines.push(
|
|
2666
|
-
|
|
2892
|
+
"Executor sandbox rule: if the full browser command cannot run inside this edit turn because local server binding is denied or Expo/Playwright reports ERR_SOCKET_BAD_PORT, listen EPERM, EACCES, or a local port bind/freeport failure before reaching the app, treat that as a Codex executor verification limitation. Do not change app startup, ports, or browser provisioning for that local-only signal unless the ValidationGate failure above is also a startup/setup failure. Use the captured artifacts plus fast checks, then let ValidationGate perform the authoritative browser run.",
|
|
2667
2893
|
);
|
|
2894
|
+
if (browserRepairPacket.needsDiagnosticProbe) {
|
|
2895
|
+
lines.push(
|
|
2896
|
+
`Validation rerun rule: PushPals ValidationGate will rerun "${browserRepairPacket.command}" after the patch, but this is now a repeated browser assertion. If a quick local startup probe shows the browser server can run in this executor, run one targeted "${browserRepairPacket.command}" confirmation after the DOM-backed fix. Do not hand off another unverified selector guess.`,
|
|
2897
|
+
);
|
|
2898
|
+
} else {
|
|
2899
|
+
lines.push(
|
|
2900
|
+
`Validation rerun rule: PushPals ValidationGate will rerun "${browserRepairPacket.command}" after the patch. During a focused browser repair turn, run fast non-browser checks and inspect captured artifacts first; do not run the full browser command from the Codex executor by default. Only run the full browser command for one targeted confirmation if artifacts are missing and a quick local bind/startup probe shows the browser server can actually run in this executor. Otherwise stop after fast checks so ValidationGate gets the clean authoritative signal.`,
|
|
2901
|
+
);
|
|
2902
|
+
}
|
|
2668
2903
|
}
|
|
2669
2904
|
if (reviewFixContext) {
|
|
2670
2905
|
lines.push("Rejected PR retry requirements:");
|
|
@@ -5421,10 +5656,9 @@ export async function executeJob(
|
|
|
5421
5656
|
const qualityGatePolicy = deriveQualityGatePolicy(normalizedParams, runtimeConfig);
|
|
5422
5657
|
const qualityMaxAutoRevisions = qualityGatePolicy.maxAutoRevisions;
|
|
5423
5658
|
const qualityValidationMaxAutoRevisions = qualityGatePolicy.validationMaxAutoRevisions;
|
|
5424
|
-
const qualityRevisionLoopMax =
|
|
5425
|
-
|
|
5426
|
-
|
|
5427
|
-
);
|
|
5659
|
+
const qualityRevisionLoopMax = qualityRevisionLoopUpperBound(qualityGatePolicy, {
|
|
5660
|
+
browserValidation: taskRequestsBrowserValidation(normalizedParams),
|
|
5661
|
+
});
|
|
5428
5662
|
const qualitySoftPassOnExhausted = qualityGatePolicy.softPassOnExhausted;
|
|
5429
5663
|
const qualityCriticMinScore = qualityGatePolicy.criticMinScore;
|
|
5430
5664
|
|
|
@@ -5565,6 +5799,7 @@ export async function executeJob(
|
|
|
5565
5799
|
const browserRepairPacket = buildBrowserValidationRepairPacket(
|
|
5566
5800
|
quality.validationRuns,
|
|
5567
5801
|
previousValidationFailureDigests,
|
|
5802
|
+
repo,
|
|
5568
5803
|
);
|
|
5569
5804
|
for (const run of quality.validationRuns) {
|
|
5570
5805
|
if (run.ok) continue;
|
|
@@ -5722,7 +5957,7 @@ export async function executeJob(
|
|
|
5722
5957
|
requiredValidationFailures: quality.requiredValidationFailures,
|
|
5723
5958
|
blocker: quality.blocker,
|
|
5724
5959
|
revisionAttempt,
|
|
5725
|
-
maxAutoRevisions:
|
|
5960
|
+
maxAutoRevisions: activeMaxAutoRevisions,
|
|
5726
5961
|
outsideTaskScope: validationOutsideTaskScope,
|
|
5727
5962
|
});
|
|
5728
5963
|
if (requiredValidationCanRevise) {
|
|
@@ -16,7 +16,7 @@ Execution rules:
|
|
|
16
16
|
- If the hinted file is a thin wrapper or the behavior lives elsewhere, edit the behavior-owning file(s) needed to solve the task and explain the scope expansion in your final response.
|
|
17
17
|
- Avoid irrelevant sprawl; the review agent will judge whether changed files are necessary for the requested outcome.
|
|
18
18
|
- Read relevant files before editing, then run focused validation.
|
|
19
|
-
- PushPals runs the deterministic ValidationGate after your edit, including any repo-required `vision.md` commands. During the editing turn, prefer focused/fast validation. Do not
|
|
19
|
+
- PushPals runs the deterministic ValidationGate after your edit, including any repo-required `vision.md` commands. During the editing turn, prefer focused/fast validation. Do not run long browser/e2e smoke commands such as `bun run web:e2e` by default from the Codex executor; ValidationGate is the authoritative browser runner and has the provisioned browser/runtime environment. For browser-harness tasks, inspect existing artifacts, run fast non-browser checks, and only run the full browser command once when a quick local startup probe shows it can run here and you need one targeted confirmation.
|
|
20
20
|
- Use direct commands without shell wrappers. Prefer plain commands like `git diff -- path`, `git add <path>`, `git status --porcelain`, and `pwd`.
|
|
21
21
|
- Do not wrap commands in `/bin/bash -lc`, `sh -lc`, `cmd /c`, or `powershell -Command`, and avoid pipelines, `awk`, heredocs, or multi-command shell snippets unless they are truly unavoidable.
|
|
22
22
|
- If the command router rejects a command, simplify it to a single direct command instead of retrying more shell wrappers.
|