@debugg-ai/debugg-ai-mcp 2.4.1 → 2.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -34,7 +34,7 @@ docker run -i --rm --init -e DEBUGGAI_API_KEY=your_api_key quinnosha/debugg-ai-m
34
34
 
35
35
  ## Tools
36
36
 
37
- The server exposes **11** tools grouped into Browser (2), Search (3), Projects (3), and Environments (3). The headline tool is `check_app_in_browser`; the rest manage projects, environments + their credentials, and execution history through a uniform `search_*` + CRUD pattern.
37
+ The server exposes **12** tools grouped into Browser (3), Search (3), Projects (3), and Environments (3). The headline tools are `check_app_in_browser` (full AI agent) and `probe_page` (lightweight no-LLM page probe); the rest manage projects, environments + their credentials, and execution history through a uniform `search_*` + CRUD pattern.
38
38
 
39
39
  ### Browser
40
40
 
@@ -75,6 +75,26 @@ URLs are short-lived presigned S3 — refetch the parent execution via `search_e
75
75
 
76
76
  Fires a server-side browser-agent crawl to populate the project's knowledge graph. Localhost URLs tunnel automatically. Returns `{executionId, status, targetUrl, durationMs, outcome?, crawlSummary?, knowledgeGraph?, browserSession?}` with `knowledgeGraph.imported === true` on successful ingestion. The `browserSession` block (HAR + console-log URLs, same shape as above) is also present on completed crawls.
77
77
 
78
+ #### `probe_page`
79
+
80
+ **Lightweight no-LLM batch page probe.** Pass 1-20 URLs; for each one the probe navigates to it, waits for load, and returns the rendered state — a screenshot, page metadata, structured console errors, and a network summary. No agent loop, no LLM cost, no scenario assertions. Use it for "did I just break /settings?" checks, multi-route smoke tests after a refactor, per-PR CI sweeps, and quick is-it-up checks where `check_app_in_browser`'s 60-150s agent loop is overkill.
81
+
82
+ | Parameter | Type | Description |
83
+ |-----------|------|-------------|
84
+ | `targets` | array **required** | 1-20 entries: `[{url, waitForSelector?, waitForLoadState?, timeoutMs?}]` |
85
+ | `targets[].url` | string **required** | Public URL or localhost (auto-tunneled) |
86
+ | `targets[].waitForLoadState` | enum | `'load'` (default) / `'domcontentloaded'` / `'networkidle'` |
87
+ | `targets[].waitForSelector` | string | Optional CSS selector to wait for after navigation |
88
+ | `targets[].timeoutMs` | number | Per-URL timeout, 1000-30000 (default 10000) |
89
+ | `includeHtml` | boolean | Return raw HTML in each result (default false) |
90
+ | `captureScreenshots` | boolean | Return one PNG per target (default true) |
91
+
92
+ The whole batch shares a single backend execution, browser session, and tunnel, so probing 5 URLs in one call is dramatically faster than making 5 parallel single-URL calls. Each result carries its own `error` field, which keeps the batch resilient: a single failed target doesn't fail the others.
93
+
94
+ **`networkSummary` aggregation key is `origin + pathname`** — refetch loops (`?n=0..4` repeatedly hitting the same endpoint) collapse into a single entry with the count, so `/api/poll` showing up with `count: 47` is the actionable "infinite refetch loop" signal users originally asked for.
95
+
96
+ Performance budget: <10s for 1 URL, <25s for 20 URLs. A dead localhost port returns `LocalServerUnreachable` in <2s without burning a workflow execution.
97
+
78
98
  ### Search (dual-mode: uuid detail OR filtered list)
79
99
 
80
100
  Each `search_*` tool has two modes. Pass `{uuid}` for a single-record detail response. Pass filter params for a paginated summary list. 404 from the backend surfaces as `isError: true` with `{error: 'NotFound', message, uuid}`.
@@ -1,5 +1,6 @@
1
1
  export * from './testPageChangesHandler.js';
2
2
  export * from './triggerCrawlHandler.js';
3
+ export * from './probePageHandler.js';
3
4
  export * from './searchProjectsHandler.js';
4
5
  export * from './searchEnvironmentsHandler.js';
5
6
  export * from './searchExecutionsHandler.js';
@@ -0,0 +1,275 @@
1
+ /**
2
+ * probePageHandler — lightweight no-LLM batch page probe.
3
+ *
4
+ * Mirrors triggerCrawlHandler's 4-step pattern (find template → execute →
5
+ * poll → format response) but: (a) takes a list of targets and produces a
6
+ * list of results, (b) does no agent steps (zero LLM in critical path),
7
+ * (c) MCP-side aggregates per-target HAR slices into NetworkSummary[].
8
+ *
9
+ * The backend "Page Probe" workflow template runs:
10
+ * browser.setup → loop[targets](page.navigate → page.capture) → done
11
+ *
12
+ * Each page.capture node emits per-iteration outputData with consoleSlice
13
+ * + harSlice windowed to that URL's load span — that's what makes per-URL
14
+ * networkSummary attribution accurate.
15
+ */
16
+ import { config } from '../config/index.js';
17
+ import { Logger } from '../utils/logger.js';
18
+ import { handleExternalServiceError } from '../utils/errors.js';
19
+ import { imageContentBlock } from '../utils/imageUtils.js';
20
+ import { DebuggAIServerClient } from '../services/index.js';
21
+ import { TunnelProvisionError } from '../services/tunnels.js';
22
+ import { tunnelManager } from '../services/ngrok/tunnelManager.js';
23
+ import { probeLocalPort, probeTunnelHealth } from '../utils/localReachability.js';
24
+ import { extractLocalhostPort } from '../utils/urlParser.js';
25
+ import { buildContext, findExistingTunnel, ensureTunnel, sanitizeResponseUrls, touchTunnelById, } from '../utils/tunnelContext.js';
26
+ import { getCachedTemplateUuid, invalidateTemplateCache } from '../utils/handlerCaches.js';
27
+ import { summarizeHar, summarizeConsole } from '../utils/harSummarizer.js';
28
+ const logger = new Logger({ module: 'probePageHandler' });
29
+ const TEMPLATE_KEYWORD = 'page probe';
30
+ export async function probePageHandler(input, context, rawProgressCallback) {
31
+ const startTime = Date.now();
32
+ logger.toolStart('probe_page', input);
33
+ // Bead 0bq: progress circuit-breaker — see testPageChangesHandler for rationale.
34
+ let progressDisabled = false;
35
+ const progressCallback = rawProgressCallback
36
+ ? async (update) => {
37
+ if (progressDisabled)
38
+ return;
39
+ try {
40
+ await rawProgressCallback(update);
41
+ }
42
+ catch (err) {
43
+ progressDisabled = true;
44
+ logger.warn('Progress emission failed; disabling further emissions for this request', {
45
+ error: err instanceof Error ? err.message : String(err),
46
+ });
47
+ }
48
+ }
49
+ : undefined;
50
+ const client = new DebuggAIServerClient(config.api.key);
51
+ await client.init();
52
+ const abortController = new AbortController();
53
+ const onStdinClose = () => {
54
+ abortController.abort();
55
+ progressDisabled = true;
56
+ };
57
+ process.stdin.once('close', onStdinClose);
58
+ // Per-target tunnel contexts. Index aligns with input.targets[].
59
+ const targetContexts = [];
60
+ // Tunnel keys we provisioned this call (for cleanup if creation fails after key acquired).
61
+ const acquiredKeyIds = [];
62
+ // Progress budget: 1 pre-flight + 1 template + 1 execute + N per-target captures + 1 done
63
+ const TOTAL_STEPS = 3 + input.targets.length + 1;
64
+ let progressStep = 0;
65
+ try {
66
+ if (progressCallback) {
67
+ await progressCallback({ progress: ++progressStep, total: TOTAL_STEPS, message: `Pre-flight + tunnel setup (${input.targets.length} target${input.targets.length === 1 ? '' : 's'})...` });
68
+ }
69
+ // ── Per-target pre-flight + tunnel resolution ──────────────────────────
70
+ for (const target of input.targets) {
71
+ const ctx = buildContext(target.url);
72
+ if (ctx.isLocalhost) {
73
+ // Pre-flight TCP probe: fail fast if dev server isn't listening.
74
+ const port = extractLocalhostPort(ctx.originalUrl);
75
+ if (typeof port === 'number') {
76
+ const probe = await probeLocalPort(port);
77
+ if (!probe.reachable) {
78
+ const payload = {
79
+ error: 'LocalServerUnreachable',
80
+ message: `No server listening on 127.0.0.1:${port}. Start your dev server on that port before running probe_page. Probe result: ${probe.code} (${probe.detail ?? 'no detail'}).`,
81
+ detail: {
82
+ port,
83
+ probeCode: probe.code,
84
+ probeDetail: probe.detail,
85
+ elapsedMs: probe.elapsedMs,
86
+ },
87
+ };
88
+ logger.warn(`Pre-flight port probe failed for ${ctx.originalUrl}: ${probe.code} in ${probe.elapsedMs}ms`);
89
+ return { content: [{ type: 'text', text: JSON.stringify(payload, null, 2) }], isError: true };
90
+ }
91
+ }
92
+ // Reuse existing tunnel for this port if any; otherwise provision.
93
+ const reused = findExistingTunnel(ctx);
94
+ if (reused) {
95
+ targetContexts.push(reused);
96
+ }
97
+ else {
98
+ let tunnel;
99
+ try {
100
+ tunnel = await client.tunnels.provisionWithRetry();
101
+ }
102
+ catch (provisionError) {
103
+ const msg = provisionError instanceof Error ? provisionError.message : String(provisionError);
104
+ const diag = provisionError instanceof TunnelProvisionError ? ` ${provisionError.diagnosticSuffix()}` : '';
105
+ throw new Error(`Failed to provision tunnel for ${ctx.originalUrl}. ` +
106
+ `(Detail: ${msg})${diag}`);
107
+ }
108
+ acquiredKeyIds.push(tunnel.keyId);
109
+ let tunneled;
110
+ try {
111
+ tunneled = await ensureTunnel(ctx, tunnel.tunnelKey, tunnel.tunnelId, tunnel.keyId, () => client.revokeNgrokKey(tunnel.keyId));
112
+ }
113
+ catch (tunnelError) {
114
+ const msg = tunnelError instanceof Error ? tunnelError.message : String(tunnelError);
115
+ throw new Error(`Tunnel creation failed for ${ctx.originalUrl}. (Detail: ${msg})`);
116
+ }
117
+ // Tunnel health probe: catch the IPv4/IPv6 bind / dead-server case
118
+ // before committing to a full backend execution.
119
+ if (tunneled.targetUrl) {
120
+ const health = await probeTunnelHealth(tunneled.targetUrl);
121
+ if (!health.healthy) {
122
+ const payload = {
123
+ error: 'TunnelTrafficBlocked',
124
+ message: `Tunnel established but traffic isn't reaching the dev server. ${health.detail ?? ''}`,
125
+ detail: {
126
+ code: health.code,
127
+ status: health.status,
128
+ ngrokErrorCode: health.ngrokErrorCode,
129
+ elapsedMs: health.elapsedMs,
130
+ },
131
+ };
132
+ if (tunneled.tunnelId) {
133
+ tunnelManager.stopTunnel(tunneled.tunnelId).catch((err) => logger.warn(`Failed to stop broken tunnel ${tunneled.tunnelId}: ${err}`));
134
+ }
135
+ return { content: [{ type: 'text', text: JSON.stringify(payload, null, 2) }], isError: true };
136
+ }
137
+ }
138
+ targetContexts.push(tunneled);
139
+ }
140
+ }
141
+ else {
142
+ // Public URL — no tunnel needed.
143
+ targetContexts.push(ctx);
144
+ }
145
+ }
146
+ // ── Locate workflow template ───────────────────────────────────────────
147
+ if (progressCallback) {
148
+ await progressCallback({ progress: ++progressStep, total: TOTAL_STEPS, message: 'Locating page-probe workflow template...' });
149
+ }
150
+ const templateUuid = await getCachedTemplateUuid(TEMPLATE_KEYWORD, async (name) => {
151
+ return client.workflows.findTemplateByName(name);
152
+ });
153
+ if (!templateUuid) {
154
+ throw new Error(`Page Probe Workflow Template not found. ` +
155
+ `Ensure the backend has a template matching "${TEMPLATE_KEYWORD}" seeded and accessible.`);
156
+ }
157
+ // ── Build contextData (camelCase; axiosTransport snake_cases on the wire) ──
158
+ const contextData = {
159
+ targets: input.targets.map((t, i) => ({
160
+ url: targetContexts[i].targetUrl ?? t.url,
161
+ waitForSelector: t.waitForSelector,
162
+ waitForLoadState: t.waitForLoadState,
163
+ timeoutMs: t.timeoutMs,
164
+ })),
165
+ includeHtml: input.includeHtml,
166
+ captureScreenshots: input.captureScreenshots,
167
+ };
168
+ // ── Execute ────────────────────────────────────────────────────────────
169
+ if (progressCallback) {
170
+ await progressCallback({ progress: ++progressStep, total: TOTAL_STEPS, message: 'Queuing workflow execution...' });
171
+ }
172
+ const executeResponse = await client.workflows.executeWorkflow(templateUuid, contextData);
173
+ const executionUuid = executeResponse.executionUuid;
174
+ logger.info(`Probe execution queued: ${executionUuid}`);
175
+ // ── Poll ───────────────────────────────────────────────────────────────
176
+ let lastCompleted = -1;
177
+ const finalExecution = await client.workflows.pollExecution(executionUuid, async (exec) => {
178
+ // Keep all active tunnels alive during polling.
179
+ for (const tc of targetContexts) {
180
+ if (tc.tunnelId)
181
+ touchTunnelById(tc.tunnelId);
182
+ }
183
+ if (!progressCallback)
184
+ return;
185
+ const completedNodes = (exec.nodeExecutions ?? []).filter(n => n.nodeType === 'page.capture' && n.status === 'success').length;
186
+ if (completedNodes !== lastCompleted) {
187
+ lastCompleted = completedNodes;
188
+ await progressCallback({
189
+ progress: Math.min(progressStep + completedNodes, TOTAL_STEPS - 1),
190
+ total: TOTAL_STEPS,
191
+ message: `Probed ${completedNodes}/${input.targets.length} target${input.targets.length === 1 ? '' : 's'}...`,
192
+ });
193
+ }
194
+ }, abortController.signal);
195
+ // ── Format response ────────────────────────────────────────────────────
196
+ const duration = Date.now() - startTime;
197
+ const captureNodes = (finalExecution.nodeExecutions ?? [])
198
+ .filter(n => n.nodeType === 'page.capture')
199
+ .sort((a, b) => a.executionOrder - b.executionOrder);
200
+ const results = [];
201
+ const screenshotBlocks = [];
202
+ for (let i = 0; i < input.targets.length; i++) {
203
+ const target = input.targets[i];
204
+ const node = captureNodes[i];
205
+ const data = node?.outputData ?? {};
206
+ const result = {
207
+ url: target.url, // ORIGINAL caller URL — not the tunneled rewrite
208
+ finalUrl: typeof data.finalUrl === 'string' ? data.finalUrl : (typeof data.url === 'string' ? data.url : target.url),
209
+ statusCode: typeof data.statusCode === 'number' ? data.statusCode : 0,
210
+ title: typeof data.title === 'string' ? data.title : null,
211
+ loadTimeMs: typeof data.loadTimeMs === 'number' ? data.loadTimeMs : 0,
212
+ consoleErrors: summarizeConsole(Array.isArray(data.consoleSlice) ? data.consoleSlice : []),
213
+ networkSummary: summarizeHar(Array.isArray(data.harSlice) ? data.harSlice : []),
214
+ };
215
+ if (input.includeHtml && typeof data.html === 'string') {
216
+ result.html = data.html;
217
+ }
218
+ if (typeof data.error === 'string' && data.error) {
219
+ result.error = data.error;
220
+ }
221
+ results.push(result);
222
+ if (input.captureScreenshots && typeof data.screenshotB64 === 'string' && data.screenshotB64) {
223
+ screenshotBlocks.push(imageContentBlock(data.screenshotB64, 'image/png'));
224
+ }
225
+ }
226
+ const responsePayload = {
227
+ executionId: executionUuid,
228
+ durationMs: typeof finalExecution.durationMs === 'number' ? finalExecution.durationMs : duration,
229
+ results,
230
+ };
231
+ if (finalExecution.browserSession) {
232
+ responsePayload.browserSession = finalExecution.browserSession;
233
+ }
234
+ // Sanitize ngrok URLs from the entire payload — page-derived strings in
235
+ // node outputData (titles, HTML, console messages from the page itself)
236
+ // can occasionally contain the tunnel URL; rewrite to the original
237
+ // localhost origin per tunnel context. For multi-localhost batches we
238
+ // run sanitize once per localhost target since each may have its own
239
+ // tunnel↔origin mapping.
240
+ let sanitizedPayload = responsePayload;
241
+ for (const tc of targetContexts) {
242
+ if (tc.isLocalhost) {
243
+ sanitizedPayload = sanitizeResponseUrls(sanitizedPayload, tc);
244
+ }
245
+ }
246
+ logger.toolComplete('probe_page', duration);
247
+ return {
248
+ content: [
249
+ { type: 'text', text: JSON.stringify(sanitizedPayload, null, 2) },
250
+ ...screenshotBlocks,
251
+ ],
252
+ };
253
+ }
254
+ catch (error) {
255
+ const duration = Date.now() - startTime;
256
+ logger.toolError('probe_page', error, duration);
257
+ if (error instanceof Error && (error.message.includes('not found') || error.message.includes('401'))) {
258
+ invalidateTemplateCache();
259
+ }
260
+ throw handleExternalServiceError(error, 'DebuggAI', 'probe_page execution');
261
+ }
262
+ finally {
263
+ process.stdin.removeListener('close', onStdinClose);
264
+ // Tunnels intentionally NOT torn down — reuse pattern (bead vwd) +
265
+ // 55-min idle auto-shutoff. Revoke only orphaned keys (we acquired the
266
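+ // key but tunnel creation failed before ensureTunnel completed). NOTE(review): acquiredKeyIds only gets entries for newly provisioned localhost targets, so index i is not guaranteed to line up with targetContexts[i] — verify the pairing below.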
267
+ for (let i = 0; i < acquiredKeyIds.length; i++) {
268
+ const keyId = acquiredKeyIds[i];
269
+ const tc = targetContexts[i];
270
+ if (tc && !tc.tunnelId && keyId) {
271
+ client.revokeNgrokKey(keyId).catch(err => logger.warn(`Failed to revoke unused ngrok key ${keyId}: ${err}`));
272
+ }
273
+ }
274
+ }
275
+ }
@@ -61,6 +61,11 @@ export async function searchEnvironmentsHandler(input, _context) {
61
61
  const client = new DebuggAIServerClient(config.api.key);
62
62
  await client.init();
63
63
  // ── Resolve projectUuid ──
64
+ // Bead gb4n: when projectUuid is provided directly (caller skips git
65
+ // auto-resolution), `name` and `repoName` are unknown. OMIT those fields
66
+ // rather than emitting nulls — null fields surprised callers and
67
+ // muddied the contract. If a caller needs them, they fetch via
68
+ // search_projects.
64
69
  let projectUuid = input.projectUuid;
65
70
  let project = null;
66
71
  if (!projectUuid) {
@@ -73,10 +78,15 @@ export async function searchEnvironmentsHandler(input, _context) {
73
78
  return noProjectResolved(pagination, `No DebuggAI project found for repo "${repoName}". Pass projectUuid explicitly.`);
74
79
  }
75
80
  projectUuid = resolved.uuid;
76
- project = { uuid: resolved.uuid, name: resolved.name, repoName: resolved.repo?.name ?? repoName };
81
+ project = { uuid: resolved.uuid };
82
+ if (resolved.name)
83
+ project.name = resolved.name;
84
+ const rn = resolved.repo?.name ?? repoName;
85
+ if (rn)
86
+ project.repoName = rn;
77
87
  }
78
88
  else {
79
- project = { uuid: projectUuid, name: null, repoName: null };
89
+ project = { uuid: projectUuid };
80
90
  }
81
91
  // ── uuid mode ──
82
92
  if (input.uuid) {
@@ -15,8 +15,23 @@ import { tunnelManager } from '../services/ngrok/tunnelManager.js';
15
15
  import { probeLocalPort, probeTunnelHealth } from '../utils/localReachability.js';
16
16
  import { extractLocalhostPort } from '../utils/urlParser.js';
17
17
  import { getCachedTemplateUuid, getCachedProjectUuid, invalidateTemplateCache, invalidateProjectCache, } from '../utils/handlerCaches.js';
18
+ import { isTransientWorkflowError, transientReasonTag } from '../utils/transientErrors.js';
19
+ import { Telemetry, TelemetryEvents } from '../utils/telemetry.js';
18
20
  const logger = new Logger({ module: 'testPageChangesHandler' });
19
21
  const TEMPLATE_NAME = 'app evaluation';
22
+ // Bead kbxy: bounded retry on known transient backend signatures (Pydantic
23
+ // JSON parse errors, 502s, ECONNRESETs). Default 1 retry; overridable via DEBUGGAI_TRANSIENT_RETRIES
24
+ // up to 3 to balance reliability vs quota cost. Conservative: only retries
25
+ // on documented transient patterns (utils/transientErrors.ts).
26
+ function getMaxTransientRetries() {
27
+ const raw = process.env.DEBUGGAI_TRANSIENT_RETRIES;
28
+ if (raw === undefined || raw === '')
29
+ return 1;
30
+ const n = parseInt(raw, 10);
31
+ if (!Number.isFinite(n) || n < 0)
32
+ return 1;
33
+ return Math.min(n, 3);
34
+ }
20
35
  // Concurrency control — max 2 simultaneous browser checks.
21
36
  // Additional requests queue and run when a slot opens.
22
37
  const MAX_CONCURRENT = 2;
@@ -229,88 +244,126 @@ async function testPageChangesHandlerInner(input, context, rawProgressCallback)
229
244
  if (progressCallback) {
230
245
  await progressCallback({ progress: 3, total: TOTAL_STEPS, message: 'Queuing workflow execution...' });
231
246
  }
232
- const executeResponse = await client.workflows.executeWorkflow(templateUuid, contextData, Object.keys(env).length > 0 ? env : undefined);
233
- const executionUuid = executeResponse.executionUuid;
234
- logger.info(`Execution queued: ${executionUuid}`);
235
- // --- Poll ---
236
- // Progress phases:
247
+ // --- Execute + Poll (with bounded retry on transient errors, bead kbxy) ---
248
+ // Progress phases (per attempt):
237
249
  // 1-3: MCP setup (tunnel, template, queue) — already sent above
238
250
  // 4-6: Backend setup (trigger, browser.setup, subworkflow starting)
239
251
  // 7-27: Agent steps (mapped from state.stepsTaken)
240
252
  // 28: Complete
241
253
  const BACKEND_SETUP_END = 6;
242
- let lastStepsTaken = 0;
243
- let observedMaxSteps = MAX_EXEC_STEPS;
244
254
  const TERMINAL_STATUSES = new Set(['completed', 'failed', 'cancelled']);
245
- const finalExecution = await client.workflows.pollExecution(executionUuid, async (exec) => {
246
- // Keep the tunnel alive while the workflow is actively running
247
- if (ctx.tunnelId)
248
- touchTunnelById(ctx.tunnelId);
249
- const nodes = exec.nodeExecutions ?? [];
250
- const stepsTaken = Math.max(nodes.filter(n => n.nodeType === 'brain.step').length, exec.state?.stepsTaken ?? 0);
251
- if (stepsTaken !== lastStepsTaken) {
252
- lastStepsTaken = stepsTaken;
253
- logger.info(`Execution status: ${exec.status}, nodes: ${nodes.length}, steps: ${stepsTaken}`);
254
- }
255
- if (!progressCallback)
256
- return;
257
- // Bead 0bq: emit the final "Complete:" progress INSIDE this callback
258
- // when terminal status is detected. pollExecution will return on the
259
- // next line (line 183 in services/workflows.ts), so there's no
260
- // post-pollExecution progress emission that could race the response.
261
- if (TERMINAL_STATUSES.has(exec.status)) {
262
- const terminalOutcome = exec.state?.outcome ?? exec.status;
263
- await progressCallback({
264
- progress: TOTAL_STEPS,
265
- total: TOTAL_STEPS,
266
- message: `Complete: ${terminalOutcome}`,
255
+ const MAX_RETRIES = getMaxTransientRetries();
256
+ let executeResponse;
257
+ let executionUuid = '';
258
+ let finalExecution;
259
+ let attempt = 0;
260
+ while (true) {
261
+ attempt++;
262
+ if (attempt > 1) {
263
+ // Retry path: emit telemetry + progress notification, then back off briefly.
264
+ Telemetry.capture(TelemetryEvents.WORKFLOW_TRANSIENT_RETRY, {
265
+ tool: 'check_app_in_browser',
266
+ attempt,
267
+ reason: transientReasonTag(finalExecution),
268
+ previousExecutionId: executionUuid,
269
+ previousErrorMessage: finalExecution?.errorMessage?.slice(0, 200),
270
+ previousStateError: finalExecution?.state?.error?.slice(0, 200),
267
271
  });
268
- return;
269
- }
270
- // --- Compute progress number ---
271
- let execProgress;
272
- let message;
273
- if (stepsTaken > 0) {
274
- // Agent is actively stepping — map into slots 7..27
275
- if (stepsTaken > observedMaxSteps)
276
- observedMaxSteps = stepsTaken + 5;
277
- const stepSlots = TOTAL_STEPS - BACKEND_SETUP_END - 1; // 21 slots
278
- execProgress = BACKEND_SETUP_END + Math.max(1, Math.round((stepsTaken / observedMaxSteps) * stepSlots));
279
- execProgress = Math.min(execProgress, TOTAL_STEPS - 1);
280
- // Use state.currentAction for the message (backend sends intent + actionType)
281
- const ca = exec.state?.currentAction;
282
- if (ca?.intent) {
283
- const action = ca.actionType ?? ca.action_type ?? 'working';
284
- message = `Step ${stepsTaken}: [${action}] ${ca.intent}`;
285
- }
286
- else {
287
- message = `Agent evaluating... (step ${stepsTaken})`;
272
+ if (progressCallback) {
273
+ await progressCallback({
274
+ progress: SETUP_STEPS,
275
+ total: TOTAL_STEPS,
276
+ message: `Transient backend error — retrying (attempt ${attempt}/${MAX_RETRIES + 1})...`,
277
+ });
288
278
  }
279
+ await new Promise(r => setTimeout(r, 1000 * (attempt - 1)));
289
280
  }
290
- else {
291
- // No agent steps yet — show backend setup progress from node transitions
292
- const hasSubworkflow = nodes.some(n => n.nodeType === 'subworkflow.run');
293
- const hasBrowserSetup = nodes.some(n => n.nodeType === 'browser.setup');
294
- const browserReady = nodes.some(n => n.nodeType === 'browser.setup' && n.status === 'success');
295
- if (browserReady || hasSubworkflow) {
296
- execProgress = BACKEND_SETUP_END;
297
- message = 'Browser ready, agent starting...';
281
+ executeResponse = await client.workflows.executeWorkflow(templateUuid, contextData, Object.keys(env).length > 0 ? env : undefined);
282
+ executionUuid = executeResponse.executionUuid;
283
+ logger.info(`Execution queued: ${executionUuid}${attempt > 1 ? ` (retry ${attempt - 1}/${MAX_RETRIES})` : ''}`);
284
+ // Closure state reset PER ATTEMPT so progress numbers don't double-count
285
+ // across retries.
286
+ let lastStepsTaken = 0;
287
+ let observedMaxSteps = MAX_EXEC_STEPS;
288
+ finalExecution = await client.workflows.pollExecution(executionUuid, async (exec) => {
289
+ // Keep the tunnel alive while the workflow is actively running
290
+ if (ctx.tunnelId)
291
+ touchTunnelById(ctx.tunnelId);
292
+ const nodes = exec.nodeExecutions ?? [];
293
+ const stepsTaken = Math.max(nodes.filter(n => n.nodeType === 'brain.step').length, exec.state?.stepsTaken ?? 0);
294
+ if (stepsTaken !== lastStepsTaken) {
295
+ lastStepsTaken = stepsTaken;
296
+ logger.info(`Execution status: ${exec.status}, nodes: ${nodes.length}, steps: ${stepsTaken}`);
298
297
  }
299
- else if (hasBrowserSetup) {
300
- execProgress = SETUP_STEPS + 2;
301
- message = 'Launching browser...';
298
+ if (!progressCallback)
299
+ return;
300
+ // Bead 0bq: emit the final "Complete:" progress INSIDE this callback
301
+ // when terminal status is detected. pollExecution will return on the
302
+ // next line (line 183 in services/workflows.ts), so there's no
303
+ // post-pollExecution progress emission that could race the response.
304
+ if (TERMINAL_STATUSES.has(exec.status)) {
305
+ const terminalOutcome = exec.state?.outcome ?? exec.status;
306
+ await progressCallback({
307
+ progress: TOTAL_STEPS,
308
+ total: TOTAL_STEPS,
309
+ message: `Complete: ${terminalOutcome}`,
310
+ });
311
+ return;
302
312
  }
303
- else if (nodes.length > 0) {
304
- execProgress = SETUP_STEPS + 1;
305
- message = 'Workflow triggered, preparing...';
313
+ // --- Compute progress number ---
314
+ let execProgress;
315
+ let message;
316
+ if (stepsTaken > 0) {
317
+ // Agent is actively stepping — map into slots 7..27
318
+ if (stepsTaken > observedMaxSteps)
319
+ observedMaxSteps = stepsTaken + 5;
320
+ const stepSlots = TOTAL_STEPS - BACKEND_SETUP_END - 1; // 21 slots
321
+ execProgress = BACKEND_SETUP_END + Math.max(1, Math.round((stepsTaken / observedMaxSteps) * stepSlots));
322
+ execProgress = Math.min(execProgress, TOTAL_STEPS - 1);
323
+ // Use state.currentAction for the message (backend sends intent + actionType)
324
+ const ca = exec.state?.currentAction;
325
+ if (ca?.intent) {
326
+ const action = ca.actionType ?? ca.action_type ?? 'working';
327
+ message = `Step ${stepsTaken}: [${action}] ${ca.intent}`;
328
+ }
329
+ else {
330
+ message = `Agent evaluating... (step ${stepsTaken})`;
331
+ }
306
332
  }
307
333
  else {
308
- execProgress = SETUP_STEPS + 1;
309
- message = 'Waiting for execution to start...';
334
+ // No agent steps yet — show backend setup progress from node transitions
335
+ const hasSubworkflow = nodes.some(n => n.nodeType === 'subworkflow.run');
336
+ const hasBrowserSetup = nodes.some(n => n.nodeType === 'browser.setup');
337
+ const browserReady = nodes.some(n => n.nodeType === 'browser.setup' && n.status === 'success');
338
+ if (browserReady || hasSubworkflow) {
339
+ execProgress = BACKEND_SETUP_END;
340
+ message = 'Browser ready, agent starting...';
341
+ }
342
+ else if (hasBrowserSetup) {
343
+ execProgress = SETUP_STEPS + 2;
344
+ message = 'Launching browser...';
345
+ }
346
+ else if (nodes.length > 0) {
347
+ execProgress = SETUP_STEPS + 1;
348
+ message = 'Workflow triggered, preparing...';
349
+ }
350
+ else {
351
+ execProgress = SETUP_STEPS + 1;
352
+ message = 'Waiting for execution to start...';
353
+ }
310
354
  }
311
- }
312
- await progressCallback({ progress: execProgress, total: TOTAL_STEPS, message });
313
- }, abortController.signal);
355
+ await progressCallback({ progress: execProgress, total: TOTAL_STEPS, message });
356
+ }, abortController.signal);
357
+ // Decide retry vs exit: only retry on documented transient signatures
358
+ // AND while we still have budget. Otherwise break and surface whatever
359
+ // result the agent reached.
360
+ if (attempt > MAX_RETRIES)
361
+ break;
362
+ if (!isTransientWorkflowError(finalExecution))
363
+ break;
364
+ logger.warn(`Transient backend error detected (${transientReasonTag(finalExecution) ?? 'unknown'}) — ` +
365
+ `retrying (attempt ${attempt + 1}/${MAX_RETRIES + 1})`);
366
+ }
314
367
  const duration = Date.now() - startTime;
315
368
  // --- Format result ---
316
369
  const outcome = finalExecution.state?.outcome ?? finalExecution.status;
@@ -368,15 +421,41 @@ async function testPageChangesHandlerInner(input, context, rawProgressCallback)
368
421
  reason: sw.error || undefined,
369
422
  };
370
423
  }
424
+ const stepsTaken = finalExecution.state?.stepsTaken ?? subworkflowNode?.outputData?.stepsTaken ?? actionTrace.length;
425
+ const success = finalExecution.state?.success ?? subworkflowNode?.outputData?.success ?? false;
371
426
  const responsePayload = {
372
427
  outcome,
373
- success: finalExecution.state?.success ?? subworkflowNode?.outputData?.success ?? false,
428
+ success,
374
429
  status: finalExecution.status,
375
- stepsTaken: finalExecution.state?.stepsTaken ?? subworkflowNode?.outputData?.stepsTaken ?? actionTrace.length,
430
+ stepsTaken,
431
+ stepsBudget: MAX_EXEC_STEPS, // bead qmdd
432
+ stepsRemaining: Math.max(0, MAX_EXEC_STEPS - (stepsTaken ?? 0)), // bead qmdd
376
433
  targetUrl: originalUrl,
377
434
  executionId: executionUuid,
378
435
  durationMs: finalExecution.durationMs ?? duration,
379
436
  };
437
+ // Bead jqmj: failureCategory disambiguates the three meanings of 'fail':
438
+ // 'agent-error' — workflow/infra failure (Pydantic parse error,
439
+ // backend exception, transport issue). Caller's
440
+ // right move: retry-with-backoff.
441
+ // 'assertion-mismatch' — agent ran the scenario but page state didn't
442
+ // match expectations. Caller's right move: fix
443
+ // code or update the test description.
444
+ // ('page-error' is reserved for v2 — needs a structured signal from
445
+ // backend to distinguish from assertion-mismatch reliably; today's
446
+ // inferrable info is too fragile.)
447
+ // Field is OMITTED on success (no failure to categorize).
448
+ if (!success) {
449
+ // state.error is the AGENT's narrative — it can describe assertion
450
+ // failures ("expected heading to contain Welcome") OR infrastructure
451
+ // failures ("Pydantic JSON parse error"). Without a structured signal,
452
+ // we only count it as 'agent-error' when paired with workflow-level
453
+ // failure (status='failed') or a workflow-level errorMessage.
454
+ // status='failed' or errorMessage set → workflow-level / transport error.
455
+ const hasInfraFailure = finalExecution.status === 'failed'
456
+ || !!finalExecution.errorMessage;
457
+ responsePayload.failureCategory = hasInfraFailure ? 'agent-error' : 'assertion-mismatch';
458
+ }
380
459
  if (actionTrace.length > 0)
381
460
  responsePayload.actionTrace = actionTrace;
382
461
  if (evaluation)