autokap 1.0.6 → 1.0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (130) hide show
  1. package/assets/chrome/ios-statusbar-comparison-reference.jpg +0 -0
  2. package/assets/chrome/ios-statusbar-dark-reference.jpg +0 -0
  3. package/assets/chrome/ios-statusbar-light-reference.jpg +0 -0
  4. package/assets/devices/ipad-pro-11-m4.json +52 -0
  5. package/assets/devices/iphone-16-pro.json +53 -0
  6. package/assets/devices/macbook-air-13.json +45 -0
  7. package/assets/frames/MacBook Air 13.svg +242 -0
  8. package/assets/frames/Status bar - iPhone.png +0 -0
  9. Menu bar- iPad.png +0 -0
  10. package/assets/frames/iPad Pro M4 11_.png +0 -0
  11. package/assets/frames/iPhone 16 Pro.png +0 -0
  12. package/assets/icons/Cellular Connection.svg +3 -0
  13. package/assets/icons/Union.svg +6 -0
  14. package/assets/icons/Wifi.svg +3 -0
  15. package/assets/icons/battery.svg +5 -0
  16. package/assets/icons/battery_charging.svg +8 -0
  17. package/dist/abort.d.ts +5 -0
  18. package/dist/abort.js +44 -0
  19. package/dist/agent.d.ts +142 -0
  20. package/dist/agent.js +4511 -0
  21. package/dist/billing-operation-logging.d.ts +38 -0
  22. package/dist/billing-operation-logging.js +248 -0
  23. package/dist/browser-bar.d.ts +40 -0
  24. package/dist/browser-bar.js +147 -0
  25. package/dist/browser.d.ts +25 -0
  26. package/dist/browser.js +177 -9
  27. package/dist/capture-alt-text.d.ts +12 -0
  28. package/dist/capture-alt-text.js +51 -0
  29. package/dist/capture-encryption.d.ts +10 -0
  30. package/dist/capture-encryption.js +41 -0
  31. package/dist/capture-language-preflight.d.ts +41 -0
  32. package/dist/capture-language-preflight.js +286 -0
  33. package/dist/capture-llm-page-identity.d.ts +15 -0
  34. package/dist/capture-llm-page-identity.js +116 -0
  35. package/dist/capture-model-resolution.d.ts +9 -0
  36. package/dist/capture-model-resolution.js +21 -0
  37. package/dist/capture-page-identity.d.ts +9 -0
  38. package/dist/capture-page-identity.js +219 -0
  39. package/dist/capture-preset-credentials.d.ts +12 -0
  40. package/dist/capture-preset-credentials.js +57 -0
  41. package/dist/capture-request-plan.d.ts +58 -0
  42. package/dist/capture-request-plan.js +216 -0
  43. package/dist/capture-run-optimizer.d.ts +139 -0
  44. package/dist/capture-run-optimizer.js +848 -0
  45. package/dist/capture-selector-memory.d.ts +26 -0
  46. package/dist/capture-selector-memory.js +327 -0
  47. package/dist/capture-session-profile-encryption.d.ts +2 -0
  48. package/dist/capture-session-profile-encryption.js +22 -0
  49. package/dist/capture-step-timeout.d.ts +10 -0
  50. package/dist/capture-step-timeout.js +30 -0
  51. package/dist/capture-studio-sync.d.ts +22 -0
  52. package/dist/capture-studio-sync.js +166 -0
  53. package/dist/capture-variant-state.d.ts +54 -0
  54. package/dist/capture-variant-state.js +156 -0
  55. package/dist/cli.js +15 -0
  56. package/dist/clip-orchestrator.d.ts +148 -0
  57. package/dist/clip-orchestrator.js +950 -0
  58. package/dist/clip-postprocess.d.ts +42 -0
  59. package/dist/clip-postprocess.js +192 -0
  60. package/dist/cost-logging.d.ts +27 -0
  61. package/dist/cost-logging.js +128 -0
  62. package/dist/credential-templates.d.ts +5 -0
  63. package/dist/credential-templates.js +60 -0
  64. package/dist/element-capture.d.ts +53 -0
  65. package/dist/element-capture.js +766 -0
  66. package/dist/hybrid-navigator.d.ts +138 -0
  67. package/dist/hybrid-navigator.js +468 -0
  68. package/dist/index.d.ts +15 -0
  69. package/dist/index.js +11 -0
  70. package/dist/llm-usage.d.ts +17 -0
  71. package/dist/llm-usage.js +45 -0
  72. package/dist/mockup-html.d.ts +119 -0
  73. package/dist/mockup-html.js +253 -0
  74. package/dist/mockup.d.ts +94 -0
  75. package/dist/mockup.js +608 -0
  76. package/dist/mouse-animation.d.ts +46 -0
  77. package/dist/mouse-animation.js +100 -0
  78. package/dist/overlay-utils.d.ts +14 -0
  79. package/dist/overlay-utils.js +13 -0
  80. package/dist/posthog.d.ts +4 -0
  81. package/dist/posthog.js +26 -0
  82. package/dist/prompt-cache.d.ts +10 -0
  83. package/dist/prompt-cache.js +24 -0
  84. package/dist/prompts.d.ts +167 -0
  85. package/dist/prompts.js +1165 -0
  86. package/dist/remote-browser.d.ts +191 -0
  87. package/dist/remote-browser.js +305 -0
  88. package/dist/security.d.ts +20 -0
  89. package/dist/security.js +569 -0
  90. package/dist/server-capture-runtime.d.ts +123 -0
  91. package/dist/server-capture-runtime.js +638 -0
  92. package/dist/server-credit-usage.d.ts +12 -0
  93. package/dist/server-credit-usage.js +41 -0
  94. package/dist/server-posthog.d.ts +2 -0
  95. package/dist/server-posthog.js +16 -0
  96. package/dist/server-project-webhooks.d.ts +45 -0
  97. package/dist/server-project-webhooks.js +97 -0
  98. package/dist/server-screenshot-watermark.d.ts +7 -0
  99. package/dist/server-screenshot-watermark.js +38 -0
  100. package/dist/session-profile.d.ts +86 -0
  101. package/dist/session-profile.js +1373 -0
  102. package/dist/sf-pro-fonts.d.ts +4 -0
  103. package/dist/sf-pro-fonts.js +7 -0
  104. package/dist/status-bar-l10n.d.ts +14 -0
  105. package/dist/status-bar-l10n.js +177 -0
  106. package/dist/status-bar.d.ts +44 -0
  107. package/dist/status-bar.js +336 -0
  108. package/dist/tools.d.ts +4 -0
  109. package/dist/tools.js +578 -0
  110. package/dist/video-agent.d.ts +143 -0
  111. package/dist/video-agent.js +4783 -0
  112. package/dist/video-observation.d.ts +36 -0
  113. package/dist/video-observation.js +192 -0
  114. package/dist/video-planner.d.ts +12 -0
  115. package/dist/video-planner.js +500 -0
  116. package/dist/video-prompts.d.ts +37 -0
  117. package/dist/video-prompts.js +554 -0
  118. package/dist/video-tools.d.ts +3 -0
  119. package/dist/video-tools.js +59 -0
  120. package/dist/video-variant-state.d.ts +29 -0
  121. package/dist/video-variant-state.js +80 -0
  122. package/dist/vision-model.d.ts +17 -0
  123. package/dist/vision-model.js +74 -0
  124. package/dist/ws-auth.d.ts +20 -0
  125. package/dist/ws-auth.js +67 -0
  126. package/dist/ws-handler.d.ts +10 -0
  127. package/dist/ws-handler.js +1663 -0
  128. package/dist/ws-server.d.ts +9 -0
  129. package/dist/ws-server.js +52 -0
  130. package/package.json +93 -39
package/dist/agent.js ADDED
@@ -0,0 +1,4511 @@
1
+ import OpenAI from 'openai';
2
+ import { describeObservationChange } from './browser.js';
3
+ import { agentTools } from './tools.js';
4
+ import { buildSystemPrompt, buildStableAnchorUserMessage, buildIterationUserMessage, buildVerificationMessage, buildVisionObserverPrompt, } from './prompts.js';
5
+ import { logger, emitScreenshot, emitReasoningDelta } from './logger.js';
6
+ import { extractStepUsage } from './llm-usage.js';
7
+ import { hasManualMultiProviderOrder, resolvePromptCacheStrategy } from './prompt-cache.js';
8
+ import { dismissCookiesAndWidgets } from './cookie-dismiss.js';
9
+ import { describeSecurityTarget, evaluateActionSecurity } from './security.js';
10
+ import { getPostHog, DISTINCT_ID } from './posthog.js';
11
+ import { isAbortError, sleepWithAbort, throwIfAborted } from './abort.js';
12
+ import { evaluateRequestedLanguageState, evaluateRequestedThemeState, performDeterministicSessionRepair, } from './session-profile.js';
13
+ import { resolveActionCredentialArgs, sanitizeCredentialParams, } from './credential-templates.js';
14
+ import { callVisionCapableModel, VisionModelUnsupportedError, } from './vision-model.js';
15
+ import { createHash } from 'crypto';
16
/**
 * Produces an image URL suitable for embedding a screenshot in an LLM message.
 *
 * If an uploader is supplied (e.g. Supabase storage → HTTPS URL) it is tried
 * first; when it is absent or fails, the buffer is inlined as a base64 data URI.
 *
 * @param {Buffer} buffer - Raw screenshot bytes.
 * @param {string} mimeType - MIME type handed to the uploader / used in the data URI.
 * @param {(buffer: Buffer, mimeType: string) => Promise<string>} [uploadImage] - Optional uploader.
 * @returns {Promise<string>} The uploader's result, or a `data:` URI.
 */
async function makeImageUrl(buffer, mimeType, uploadImage) {
    const inlineDataUri = () => `data:${mimeType};base64,${buffer.toString('base64')}`;
    if (!uploadImage) {
        return inlineDataUri();
    }
    try {
        return await uploadImage(buffer, mimeType);
    }
    catch {
        // Upload is best-effort: any failure falls back to the inline data URI.
        return inlineDataUri();
    }
}
32
/**
 * Returns the hex-encoded SHA-1 digest of a screenshot buffer.
 *
 * @param {Buffer} buffer - Screenshot bytes to hash.
 * @returns {string} 40-character lowercase hex digest.
 */
function computeScreenshotFingerprint(buffer) {
    const hasher = createHash('sha1');
    hasher.update(buffer);
    return hasher.digest('hex');
}
35
/**
 * Builds a compact, single-line `key=value; key=value` summary of a variant
 * manifest.
 *
 * Optional segments (identity, checkpoint) are omitted entirely when absent, so
 * the result never contains empty `; ;` segments or a trailing separator —
 * matching the `.filter(Boolean)` behaviour of the sibling summarizers.
 *
 * @param {object|null|undefined} manifest - Variant manifest; falsy yields ''.
 * @returns {string} Summary string, or '' when there is no manifest.
 */
function summarizeVariantManifestForPlanner(manifest) {
    if (!manifest)
        return '';
    const parts = [
        `expected=${manifest.expectedPageIds.join(',') || 'main'}`,
        `current=${manifest.currentPageId ?? 'main'}`,
        manifest.currentPageIdentity ? `identity=${manifest.currentPageIdentity.summary}` : '',
        `completed=${manifest.completedPages.join(',') || 'none'}`,
        `remaining=${manifest.remainingPages.join(',') || 'none'}`,
        manifest.lastCheckpointId ? `checkpoint=${manifest.lastCheckpointId}` : '',
    ];
    if (manifest.captureStatuses) {
        // Only the first six statuses are listed.
        parts.push(`statuses=${Object.entries(manifest.captureStatuses)
            .slice(0, 6)
            .map(([pageId, status]) => `${pageId}:${status}`)
            .join('|') || 'none'}`);
    }
    if (manifest.previousValidatedCaptures.length > 0) {
        // Only the three most recent validated captures, assessments cut to 80 chars.
        parts.push(`validated=${manifest.previousValidatedCaptures
            .slice(-3)
            .map((capture) => `${capture.pageId}${capture.identity ? `[${capture.identity.summary}]` : ''}:${capture.assessment.slice(0, 80)}`)
            .join(' | ')}`);
    }
    // Drop the '' placeholders left by absent optional segments before joining.
    return parts.filter(Boolean).join('; ');
}
60
/**
 * Renders `config.captureCursor` as a `key=value; ...` line.
 *
 * @param {object} config - Object that may carry a `captureCursor`.
 * @returns {string} Summary string, or '' when no cursor is set.
 */
function summarizeCaptureCursorForPlanner(config) {
    const cursor = config.captureCursor;
    if (!cursor) {
        return '';
    }
    const segments = [
        `page=${cursor.pageId}`,
        `target=${cursor.targetId}`,
        `phase=${cursor.phase}`,
        `resume=${cursor.resumeFromActionIndex}`,
    ];
    // The checkpoint segment only appears once a checkpoint has been verified.
    if (cursor.lastVerifiedCheckpointId) {
        segments.push(`checkpoint=${cursor.lastVerifiedCheckpointId}`);
    }
    return segments.join('; ');
}
73
/**
 * Renders `config.activeRepairTicket` as a `key=value; ...` line.
 *
 * The ticket's id, cause, status and summary are always emitted; each
 * expected-state field is emitted only when it is truthy.
 *
 * @param {object} config - Object that may carry an `activeRepairTicket`.
 * @returns {string} Summary string, or '' when there is no active ticket.
 */
function summarizeRepairTicketForPlanner(config) {
    const ticket = config.activeRepairTicket;
    if (!ticket) {
        return '';
    }
    const segments = [
        `ticket=${ticket.id}`,
        `cause=${ticket.cause}`,
        `status=${ticket.status}`,
        `summary=${ticket.summary}`,
    ];
    const expected = ticket.expectedState;
    // Output label → expected-state value, in the order they are reported.
    const optionalFields = [
        ['lang', expected.lang],
        ['theme', expected.theme],
        ['auth', expected.authState],
        ['url', expected.url],
        ['page', expected.pageId],
    ];
    for (const [label, value] of optionalFields) {
        if (value) {
            segments.push(`${label}=${value}`);
        }
    }
    return segments.join('; ');
}
89
/**
 * Looks for a previously validated capture of a *different* page whose
 * fingerprint equals the given one.
 *
 * @param {object|null|undefined} manifest - Variant manifest; must have a `currentPageId` to be checked.
 * @param {string} fingerprint - Fingerprint of the candidate screenshot.
 * @returns {string|null} The page id of the first matching capture, or null.
 */
function findDuplicateVariantCapture(manifest, fingerprint) {
    if (!manifest?.currentPageId) {
        return null;
    }
    for (const capture of manifest.previousValidatedCaptures) {
        // A capture of the current page is not a cross-page duplicate.
        if (capture.pageId === manifest.currentPageId) {
            continue;
        }
        // Captures without a fingerprint can never match.
        if (!capture.fingerprint) {
            continue;
        }
        if (capture.fingerprint === fingerprint) {
            return capture.pageId ?? null;
        }
    }
    return null;
}
97
// --- Heuristic patterns used by the variant-identity checks below -----------
// The first group is matched against the lowercased "pageId + prompt" context
// in inferVariantIdentityFailure to decide what kind of page was requested.
const PRESET_EDITOR_RE = /\b(edit[_ -]?preset|preset[_ -]editor|preset[_ -]edit|preset[_ -]form)\b/i;
const PRESET_CONTEXT_RE = /\bpreset\b/i;
const CONFIGURATION_RE = /\b(edit|editor|config|configuration|configure|settings|modify|modifier|form)\b/i;
const EDITOR_PROMPT_RE = /\b(additional instructions|instructions suppl[eé]mentaires|continue with ai|continuer avec l['’]ia)\b/i;
const DIALOG_TARGET_RE = /\b(modal|dialog|drawer|popup)\b/i;
// Route patterns, matched against the current URL.
const GALLERY_ROUTE_RE = /\/(gallery|captures?|screenshots?)\b/i;
const SETTINGS_ROUTE_RE = /\/settings\b/i;
// Matched against the lowercased visible text to score "looks like a gallery".
const GALLERY_SURFACE_RE = /\b(gallery|galerie|captures?|screenshots?|thumbnail|thumbnails|miniatures?)\b/i;
// Matched against the context: does the prompt ask for a gallery detail view?
const GALLERY_DETAIL_PROMPT_RE = /\b(first (?:image|screenshot|result|card)|open the first|click on the first|after clicking|individual screenshots?|detail view|detailed view|subsequent page|premier resultat|premier résultat|captures individuelles?)\b/i;
// Both are matched against normalizeEvidenceText(visibleText) in
// inferGallerySubstateFailure to detect that the overview is still showing.
const GALLERY_OVERVIEW_CONTROL_RE = /\b(filter by preset|filtrer par preset|rechercher|search|filter|preset)\b/i;
const GALLERY_GROUP_CARD_RE = /\b(download all|tout telecharger|tout télécharger|\d+\s*(?:screenshots?|captures?(?: d['’]ecran| d’écran)?))\b/i;
// Matched against the lowercased visible text to score "looks like settings".
const SETTINGS_SURFACE_RE = /\b(settings|param[eè]tres|project name|team members|billing|workspace settings|general settings)\b/i;
// Substrings whose presence in the visible text counts as evidence that a
// template gallery / picker is on screen.
const TEMPLATE_GALLERY_MARKERS = [
    'new preset',
    'nouveau preset',
    'search templates',
    'rechercher des templates',
    'start from scratch',
    'partir de zéro',
    'homepage hero',
    'pricing page',
    'mobile responsive',
    'dark mode showcase',
    'multi-language',
    'feature showcase',
    'marketing',
    'responsive',
    'video',
];
// Subject tokens that countDialogSubjectTokenMatches ignores when deciding
// which tokens are "strong" evidence for a dialog.
const MODAL_GENERIC_SUBJECT_TOKENS = new Set([
    'modal',
    'dialog',
    'drawer',
    'popup',
    'overlay',
    'preset',
    'presets',
    'template',
    'templates',
    'selection',
    'select',
    'choose',
    'details',
    'detail',
    'configuration',
    'config',
    'gallery',
    'captures',
    'screenshots',
    'settings',
    'editor',
]);
// Matched against normalizeEvidenceText(visibleText) in hasExpectedDialogSurface,
// hence the accent-free / apostrophe-free spellings.
const MODAL_CONFIGURATION_SURFACE_RE = /\b(additional instructions|instructions supplementaires|continue with ai|continuer avec (?:l )?ia|ai prompt|prompt ia)\b/i;
150
/**
 * Normalizes free text for evidence matching: lowercases it, removes
 * diacritics, collapses every run of non-alphanumeric characters into a single
 * space and trims the result.
 *
 * Diacritics are removed rather than replaced by a space so that accented
 * words stay in one piece ("supplémentaires" → "supplementaires"). The
 * accent-free patterns applied to normalized text (e.g.
 * "instructions supplementaires", "modele", "tout telecharger") depend on this.
 *
 * @param {string} value - Text to normalize.
 * @returns {string} Lowercase ASCII text with single-space separators.
 */
function normalizeEvidenceText(value) {
    return value
        .toLowerCase()
        // NFD splits accented letters into base letter + combining mark...
        .normalize('NFD')
        // ...so the mark can be stripped without splitting the word.
        .replace(/[\u0300-\u036f]/g, '')
        .replace(/[^a-z0-9]+/g, ' ')
        .trim();
}
158
/**
 * Splits text into normalized evidence tokens, keeping only tokens that are at
 * least three characters long.
 *
 * @param {string} value - Text to tokenize.
 * @returns {string[]} Normalized tokens in their original order.
 */
function tokenizeEvidenceText(value) {
    const tokens = [];
    for (const piece of normalizeEvidenceText(value).split(/\s+/g)) {
        const token = piece.trim();
        if (token.length >= 3) {
            tokens.push(token);
        }
    }
    return tokens;
}
164
/**
 * Counts how many "strong" subject tokens appear among the evidence tokens
 * taken from the visible text and the current URL.
 *
 * A token is strong when it has at least four characters and is not listed in
 * MODAL_GENERIC_SUBJECT_TOKENS.
 *
 * @param {{subjectTokens: string[], visibleText: string, currentUrl: string}} params
 * @returns {number} Number of strong tokens found in the evidence (0 when there are none).
 */
function countDialogSubjectTokenMatches(params) {
    const isStrong = (token) => token.length >= 4 && !MODAL_GENERIC_SUBJECT_TOKENS.has(token);
    const strong = params.subjectTokens.filter(isStrong);
    if (strong.length === 0) {
        return 0;
    }
    const evidence = new Set(tokenizeEvidenceText(`${params.visibleText} ${params.currentUrl}`));
    let matches = 0;
    for (const token of strong) {
        if (evidence.has(normalizeEvidenceText(token))) {
            matches += 1;
        }
    }
    return matches;
}
171
/**
 * Decides whether the visible dialog content is plausible for the expected
 * page identity.
 *
 * - `modal_selection`: needs a template-picker marker or a strong subject-token match.
 * - `modal_configuration`: needs a picker marker, a configuration marker, or a
 *   strong subject-token match.
 * - any other kind: always accepted.
 *
 * @param {{identity: object, visibleText: string, currentUrl: string}} params
 * @returns {boolean}
 */
function hasExpectedDialogSurface(params) {
    const { identity, visibleText, currentUrl } = params;
    const normalized = normalizeEvidenceText(visibleText);
    // One point per gallery marker present, plus one for a generic picker word.
    let selectionSurfaceScore = 0;
    for (const marker of TEMPLATE_GALLERY_MARKERS) {
        if (normalized.includes(normalizeEvidenceText(marker))) {
            selectionSurfaceScore += 1;
        }
    }
    if (/\b(template|templates|modele|modeles|picker|selection)\b/i.test(normalized)) {
        selectionSurfaceScore += 1;
    }
    // The configuration score builds on the selection score.
    let configurationSurfaceScore = selectionSurfaceScore;
    if (MODAL_CONFIGURATION_SURFACE_RE.test(normalized)) {
        configurationSurfaceScore += 2;
    }
    const subjectTokenMatches = countDialogSubjectTokenMatches({
        subjectTokens: identity.subjectTokens,
        visibleText,
        currentUrl,
    });
    switch (identity.kind) {
        case 'modal_selection':
            return selectionSurfaceScore > 0 || subjectTokenMatches > 0;
        case 'modal_configuration':
            return configurationSurfaceScore > 0 || subjectTokenMatches > 0;
        default:
            return true;
    }
}
190
/**
 * Detects the case where a gallery detail view was requested but the page
 * still shows the gallery overview.
 *
 * @param {{context: string, visibleText: string}} params
 * @returns {string|null} A failure message, or null when nothing is wrong or
 *   the context does not ask for a detail view.
 */
function inferGallerySubstateFailure(params) {
    const expectsDetailView = GALLERY_DETAIL_PROMPT_RE.test(params.context);
    if (!expectsDetailView) {
        return null;
    }
    const normalized = normalizeEvidenceText(params.visibleText);
    // Overview controls AND grouped collection cards together mean "still on overview".
    const overviewStillVisible = GALLERY_OVERVIEW_CONTROL_RE.test(normalized)
        && GALLERY_GROUP_CARD_RE.test(normalized);
    return overviewStillVisible
        ? 'Expected the opened gallery result/details view, but the gallery overview/filter controls are still visible.'
        : null;
}
202
/**
 * Heuristically checks whether the page currently on screen matches the page
 * variant that was requested, using the page id, the prompt, the visible text,
 * the current URL, the number of open dialogs and an optional page identity.
 *
 * Checks run in this order and the first failure wins:
 *   1. dialog targets (when `pageIdentity.dialogTarget` is set),
 *   2. gallery targets (when `pageIdentity.kind === 'gallery'`),
 *   3. preset-editor targets (from the identity, or inferred from the context
 *      when no identity is given).
 *
 * @param {object} params - Reads `pageId`, `prompt`, `visibleText`, `currentUrl`,
 *   `dialogCount` and `pageIdentity`.
 * @returns {string|null} A human-readable failure reason, or null when no
 *   mismatch was detected.
 */
export function inferVariantIdentityFailure(params) {
    const pageId = (params.pageId || '').toLowerCase();
    const prompt = params.prompt.toLowerCase();
    // "context" is what the request-side regexes are matched against.
    const context = `${pageId} ${prompt}`;
    const visibleText = params.visibleText.toLowerCase();
    const identity = params.pageIdentity ?? null;
    const galleryMarkerCount = TEMPLATE_GALLERY_MARKERS.filter((marker) => visibleText.includes(marker)).length;
    const hasMultipleKnownTemplates = ['homepage hero', 'pricing page', 'mobile responsive', 'dark mode showcase', 'multi-language', 'feature showcase']
        .filter((marker) => visibleText.includes(marker))
        .length >= 2;
    // Template gallery: three or more markers, or a "new preset" label plus two known template names.
    const looksLikeTemplateGallery = galleryMarkerCount >= 3
        || (visibleText.includes('new preset') && hasMultipleKnownTemplates)
        || (visibleText.includes('nouveau preset') && hasMultipleKnownTemplates);
    // With an identity, only its own fields decide; the regex fallback is used only without one.
    const expectsPresetEditor = identity?.dedicatedRoute
        || identity?.kind === 'editor_route'
        || (!identity
            && (PRESET_EDITOR_RE.test(context)
                || (PRESET_CONTEXT_RE.test(context) && CONFIGURATION_RE.test(context))
                || (EDITOR_PROMPT_RE.test(context) && !DIALOG_TARGET_RE.test(context))));
    const expectsDialogTarget = identity?.dialogTarget ?? false;
    const expectsGalleryTarget = identity?.kind === 'gallery';
    const onPresetEditorRoute = /\/projects\/[^/]+\/presets\/[^/?#]+/i.test(params.currentUrl) || /\/presets\/[^/?#]+/i.test(params.currentUrl);
    const onGalleryRoute = GALLERY_ROUTE_RE.test(params.currentUrl);
    const onSettingsRoute = SETTINGS_ROUTE_RE.test(params.currentUrl);
    // One point per "new preset" label present, plus two when gallery wording is visible.
    const gallerySurfaceScore = TEMPLATE_GALLERY_MARKERS
        .filter((marker) => marker === 'new preset' || marker === 'nouveau preset')
        .reduce((score, marker) => score + (visibleText.includes(marker) ? 1 : 0), 0)
        + (GALLERY_SURFACE_RE.test(visibleText) ? 2 : 0);
    const settingsSurfaceScore = SETTINGS_SURFACE_RE.test(visibleText) ? 2 : 0;
    // 1. Dialog targets: a dialog must be open and its content must fit the identity.
    if (expectsDialogTarget) {
        if (params.dialogCount === 0) {
            return 'Expected a dialog/modal capture target, but no dialog is currently open.';
        }
        if (identity && !hasExpectedDialogSurface({
            identity,
            visibleText,
            currentUrl: params.currentUrl,
        })) {
            return identity.kind === 'modal_configuration'
                ? 'Expected the configured modal state, but the current dialog content does not match the requested template/details view.'
                : 'Expected the template picker modal, but the current dialog content does not match the requested selection state.';
        }
        return null;
    }
    // 2. Gallery targets.
    if (expectsGalleryTarget) {
        const expectsDetailView = GALLERY_DETAIL_PROMPT_RE.test(context);
        // An open dialog is only a failure when the overview (not a detail view) was requested.
        if (!expectsDetailView && params.dialogCount > 0) {
            return 'Expected the gallery overview, but a dialog/modal is still open on top of the page.';
        }
        if (onSettingsRoute || (settingsSurfaceScore > 0 && !onGalleryRoute && gallerySurfaceScore === 0)) {
            return 'Expected the gallery overview, but the page still looks like settings or another non-gallery section.';
        }
        const gallerySubstateFailure = inferGallerySubstateFailure({ context, visibleText });
        if (gallerySubstateFailure)
            return gallerySubstateFailure;
        return null;
    }
    // 3. Preset editor targets.
    if (!expectsPresetEditor)
        return null;
    if (looksLikeTemplateGallery) {
        return 'Expected the preset editor page, but the template gallery/sidebar is still visible.';
    }
    if (params.dialogCount > 0) {
        return 'Expected the preset editor page, but a dialog/modal is still open on top of the page.';
    }
    // The route check only applies when the context actually mentions "preset".
    if (PRESET_CONTEXT_RE.test(context) && !onPresetEditorRoute) {
        return 'Expected the preset editor page, but the browser is not on a preset editor route.';
    }
    return null;
}
272
// OAuth guard: block clicks on OAuth elements when credentials are provided
// Provider names looked for in an element's text or aria-label (see isOAuthElement).
const OAUTH_TEXT_RE = /\b(google|apple|microsoft|github|facebook|twitter|linkedin|sso)\b/i;
// Provider domains (or any "oauth" substring) looked for in an element's href.
const OAUTH_HREF_RE = /google\.com|apple\.com|microsoft\.com|github\.com|facebook\.com|twitter\.com|linkedin\.com|auth0\.com|oauth/i;
// English/French phrases stating that a password is missing or unavailable.
// NOTE(review): not referenced in the visible part of this file — confirm usage.
const HIDDEN_PASSWORD_GIVE_UP_RE = /\b(password (is )?missing|missing password|mot de passe manquant|password unavailable|no password provided)\b/i;
// Whole-string match for generic placeholder password values.
// NOTE(review): not referenced in the visible part of this file — confirm usage.
const GENERIC_PASSWORD_INPUT_RE = /^(password|your password|motdepasse|mot de passe|password123|secret)$/i;
277
/**
 * Tells whether an interactive element looks like an OAuth / SSO entry point,
 * judging by its text, its aria-label or its href.
 *
 * @param {{text?: string, ariaLabel?: string, href?: string}} el
 * @returns {boolean}
 */
function isOAuthElement(el) {
    if (OAUTH_TEXT_RE.test(el.text)) {
        return true;
    }
    if (OAUTH_TEXT_RE.test(el.ariaLabel || '')) {
        return true;
    }
    return OAUTH_HREF_RE.test(el.href || '');
}
280
/**
 * Tells whether an action's target element is a password field.
 *
 * The target is resolved by `args.index` first and, if that finds nothing, by
 * `args.selector`. It counts as a password field when its input type is
 * `password` or when its type/text/aria-label/selector mention a password.
 *
 * @param {{index?: number, selector?: string}} args - Action arguments.
 * @param {Array<object>} interactiveElements - Elements known on the page.
 * @returns {boolean} False when no target can be resolved.
 */
function isPasswordFieldTarget(args, interactiveElements) {
    let target;
    if (args.index !== undefined) {
        target = interactiveElements.find((el) => el.index === args.index);
    }
    // Fall back to the selector only when the index lookup found nothing.
    if (!target && typeof args.selector === 'string') {
        target = interactiveElements.find((el) => el.selector === args.selector);
    }
    if (!target) {
        return false;
    }
    if (target.inputType === 'password') {
        return true;
    }
    const haystack = `${target.inputType || ''} ${target.text || ''} ${target.ariaLabel || ''} ${target.selector}`.toLowerCase();
    return /\b(password|mot de passe|passcode)\b/i.test(haystack);
}
293
/**
 * Creates an OpenAI SDK client that talks to the OpenRouter API.
 *
 * @param {string} apiKey - API key passed to the client.
 * @returns {OpenAI} Client configured with the OpenRouter base URL and the
 *   attribution headers below.
 */
function createClient(apiKey) {
    const defaultHeaders = {
        'HTTP-Referer': 'https://github.com/screenshot-agent',
        'X-Title': 'Screenshot Agent',
    };
    const options = {
        baseURL: 'https://openrouter.ai/api/v1',
        apiKey,
        defaultHeaders,
    };
    return new OpenAI(options);
}
303
/**
 * Tells whether a model id refers to a Grok (xAI) model: either the id contains
 * "grok" or it starts with the "x-ai/" prefix (case-insensitive).
 * These models have broken streaming tool calls via OpenRouter.
 */
function isGrokModel(model) {
    const id = model.toLowerCase();
    if (id.startsWith('x-ai/')) {
        return true;
    }
    return id.includes('grok');
}
308
/**
 * Tells whether a model accepts OpenRouter's reasoning/thinking parameter.
 * Currently this is exactly the set of Grok models.
 */
function supportsReasoning(model) {
    const reasoningCapable = isGrokModel(model);
    return reasoningCapable;
}
312
/**
 * Builds the request-body fragment that enables reasoning on OpenRouter.
 * Returns an empty object when the effort is unset or 'off', or when the model
 * does not support reasoning, so the result can always be spread.
 */
function reasoningBody(model, effort) {
    const enabled = Boolean(effort) && effort !== 'off' && supportsReasoning(model);
    return enabled ? { reasoning: { effort } } : {};
}
320
/**
 * Builds the `provider` request-body fragment for a model, meant to be spread
 * into OpenRouter request bodies. Per-model routing preferences are copied
 * from `prefsMap` when present.
 */
function providerBody(model, prefsMap) {
    const modelPrefs = prefsMap ? prefsMap[model] : undefined;
    // Zero Data Retention is enforced on every request (GDPR compliance); it is
    // applied last so preferences cannot switch it off.
    const provider = Object.assign({}, modelPrefs, { zdr: true });
    return { provider };
}
326
// Module-wide counter incremented by callWithRetry for every streamed request;
// it gives each stream a distinct `reasoning-<n>` message id.
let _reasoningMessageCounter = 0;
327
/**
 * Extracts tool calls that a model wrote as XML inside its text content rather
 * than as native tool_calls (e.g., MiniMax M2.5 via OpenRouter).
 *
 * Two formats are understood:
 * - <minimax:tool_call><invoke name="X"><parameter name="Y">Z</parameter></invoke></minimax:tool_call>
 * - <tool_code>function_name\n{"arg": "value"}</tool_code>
 *
 * The second format is only tried when the first yields nothing. In the first
 * format every parameter value is kept as a trimmed string; in the second the
 * text after the first newline is passed through as the arguments string.
 *
 * @param {string} content - Assistant text that may contain XML tool calls.
 * @param {string} [completionId] - Used in generated ids; falls back to Date.now().
 * @returns {Array<{id: string, type: 'function', function: {name: string, arguments: string}}>}
 */
function parseXmlToolCalls(content, completionId) {
    const MINIMAX_BLOCK_RE = /<minimax:tool_call>([\s\S]*?)<\/minimax:tool_call>/gi;
    const INVOKE_RE = /<invoke\s+name="([^"]+)">([\s\S]*?)<\/invoke>/gi;
    const PARAMETER_RE = /<parameter\s+name="([^"]+)">([\s\S]*?)<\/parameter>/gi;
    const TOOL_CODE_RE = /<tool_code>([\s\S]*?)<\/tool_code>/gi;
    const calls = [];
    // Ids are numbered by position in the result list.
    const addCall = (name, serializedArgs) => {
        calls.push({
            id: `xml-${completionId || Date.now()}-${calls.length}`,
            type: 'function',
            function: { name, arguments: serializedArgs },
        });
    };
    // Format 1: <minimax:tool_call> blocks containing one or more <invoke> elements.
    for (const [, block] of content.matchAll(MINIMAX_BLOCK_RE)) {
        for (const [, name, paramsBlock] of block.matchAll(INVOKE_RE)) {
            const args = {};
            for (const [, paramName, paramValue] of paramsBlock.matchAll(PARAMETER_RE)) {
                args[paramName] = paramValue.trim();
            }
            addCall(name, JSON.stringify(args));
        }
    }
    if (calls.length > 0) {
        return calls;
    }
    // Format 2: <tool_code> with the function name on the first line.
    for (const [, rawInner] of content.matchAll(TOOL_CODE_RE)) {
        const inner = rawInner.trim();
        const newlineIdx = inner.indexOf('\n');
        // Entries without a name line followed by arguments are ignored.
        if (newlineIdx <= 0) {
            continue;
        }
        addCall(inner.slice(0, newlineIdx).trim(), inner.slice(newlineIdx + 1).trim());
    }
    return calls;
}
374
/**
 * Requests a chat completion that is expected to contain tool calls, retrying
 * on rate limits (HTTP 429) and on responses without any usable tool call.
 *
 * Non-Grok models are streamed and their content deltas forwarded through
 * emitReasoningDelta; Grok models use one non-streaming request. In both cases
 * the outcome is re-packaged into a single `chat.completion`-shaped object.
 *
 * @param client - Client exposing `chat.completions.create`.
 * @param params - Request body; `params.model` and `params.messages` are read here.
 * @param maxRetries - Maximum number of attempts (default 3).
 * @param signal - Optional abort signal, checked before each attempt and passed to the request.
 * @param providerPreferences - Per-model provider routing preferences (see providerBody).
 * @param reasoningEffort - Reasoning effort forwarded through reasoningBody.
 * @returns The assembled completion object (one choice, plus usage).
 * @throws Re-throws abort errors and every error that is not a retryable 429;
 *   throws 'Max retries exceeded' if the loop ends without returning.
 */
async function callWithRetry(client, params, maxRetries = 3, signal, providerPreferences, reasoningEffort) {
    // Grok models have broken streaming tool calls via OpenRouter — the provider
    // silently drops tool_calls from SSE deltas, resulting in 0 tool calls received
    // despite the model producing them. Use non-streaming for these models.
    const useStreaming = !isGrokModel(params.model);
    // Replaced by an extended message list when a coercion retry is needed.
    let messagesToUse = params.messages;
    for (let attempt = 1; attempt <= maxRetries; attempt++) {
        throwIfAborted(signal, 'Agent run cancelled.');
        try {
            let content = '';
            let toolCalls = [];
            let finishReason = null;
            let completionId = '';
            let model = params.model;
            let chunkCount = 0;
            const usageData = { prompt_tokens: 0, completion_tokens: 0, total_tokens: 0 };
            if (useStreaming) {
                // Streaming path for most models
                const stream = await client.chat.completions.create({ ...params, messages: messagesToUse, stream: true, ...providerBody(params.model, providerPreferences), ...reasoningBody(params.model, reasoningEffort) }, { signal });
                // Tool calls arrive in fragments keyed by their index; accumulate them here.
                const toolCallBuffers = new Map();
                const messageId = `reasoning-${++_reasoningMessageCounter}`;
                for await (const chunk of stream) {
                    chunkCount++;
                    if (chunk.id)
                        completionId = chunk.id;
                    if (chunk.model)
                        model = chunk.model;
                    if (chunk.usage) {
                        usageData.prompt_tokens = chunk.usage.prompt_tokens ?? 0;
                        usageData.completion_tokens = chunk.usage.completion_tokens ?? 0;
                        usageData.total_tokens = chunk.usage.total_tokens ?? 0;
                    }
                    const delta = chunk.choices?.[0]?.delta;
                    if (!delta)
                        continue;
                    if (chunk.choices[0].finish_reason) {
                        finishReason = chunk.choices[0].finish_reason;
                    }
                    if (delta.content) {
                        content += delta.content;
                        emitReasoningDelta(delta.content, messageId);
                    }
                    if (delta.tool_calls) {
                        for (const tc of delta.tool_calls) {
                            const existing = toolCallBuffers.get(tc.index);
                            if (existing) {
                                // Later fragments: name and id overwrite, arguments are appended.
                                if (tc.function?.name)
                                    existing.name = tc.function.name;
                                if (tc.id)
                                    existing.id = tc.id;
                                if (tc.function?.arguments)
                                    existing.arguments += tc.function.arguments;
                            }
                            else {
                                // First fragment for this index; synthesize an id if the provider sent none.
                                toolCallBuffers.set(tc.index, {
                                    id: tc.id || `tc-${completionId || `gen-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`}-${tc.index}`,
                                    name: tc.function?.name || '',
                                    arguments: tc.function?.arguments || '',
                                });
                            }
                        }
                    }
                }
                // Emit in index order and drop entries that never received a function name.
                toolCalls = Array.from(toolCallBuffers.entries())
                    .sort(([a], [b]) => a - b)
                    .filter(([, tc]) => tc.name.trim().length > 0)
                    .map(([, tc]) => ({
                    id: tc.id,
                    type: 'function',
                    function: { name: tc.name, arguments: tc.arguments },
                }));
            }
            else {
                // Non-streaming path for models with broken streaming (e.g. Grok via OpenRouter)
                const nonStreamStart = Date.now();
                logger.info('Réflexion en cours…');
                const response = await client.chat.completions.create({ ...params, messages: messagesToUse, stream: false, ...providerBody(params.model, providerPreferences), ...reasoningBody(params.model, reasoningEffort) }, { signal });
                logger.debug(`LLM response received in ${Date.now() - nonStreamStart}ms (model: ${params.model})`);
                completionId = response.id ?? '';
                model = response.model ?? params.model;
                finishReason = response.choices?.[0]?.finish_reason ?? null;
                content = response.choices?.[0]?.message?.content ?? '';
                if (response.usage) {
                    usageData.prompt_tokens = response.usage.prompt_tokens ?? 0;
                    usageData.completion_tokens = response.usage.completion_tokens ?? 0;
                    usageData.total_tokens = response.usage.total_tokens ?? 0;
                }
                if (response.choices?.[0]?.message?.tool_calls) {
                    toolCalls = response.choices[0].message.tool_calls
                        .filter((tc) => tc.type === 'function' && 'function' in tc)
                        .map((tc) => ({
                        id: tc.id,
                        type: 'function',
                        function: { name: tc.function.name, arguments: tc.function.arguments },
                    }));
                }
                // Only used in the diagnostics log below.
                chunkCount = 1;
            }
            // Some models (e.g., MiniMax M2.5) return tool calls as XML in the content
            // field instead of using the native tool_calls streaming delta. Parse them.
            if (toolCalls.length === 0 && content) {
                const xmlParsed = parseXmlToolCalls(content, completionId);
                if (xmlParsed.length > 0) {
                    toolCalls.push(...xmlParsed);
                    content = '';
                    logger.debug(`Parsed ${xmlParsed.length} tool call(s) from XML content (model: ${model})`);
                }
            }
            // If the model returned no tool calls, log diagnostics and retry with coercion.
            // "Substantive" means the content contains at least three consecutive ASCII letters.
            const hasSubstantiveContent = content ? /[a-zA-Z]{3,}/.test(content) : false;
            if (toolCalls.length === 0 && attempt < maxRetries && (hasSubstantiveContent || !content)) {
                logger.info(`Model ${model} returned no usable tool calls (attempt ${attempt}/${maxRetries}). `
                    + `Diagnostics: chunks=${chunkCount}, finish_reason=${finishReason}, `
                    + `content_length=${content.length}, `
                    + `usage=${JSON.stringify(usageData)}, content_preview=${JSON.stringify(content.slice(0, 500) || '(empty)')}`);
                // The retry always starts from the original messages, not from a previous retry's list.
                if (content) {
                    messagesToUse = [
                        ...params.messages,
                        { role: 'assistant', content },
                        {
                            role: 'user',
                            content: 'You must call one of the available tools. Do not respond with text — select the most appropriate tool and call it now.',
                        },
                    ];
                }
                else {
                    messagesToUse = [
                        ...params.messages,
                        {
                            role: 'user',
                            content: 'You must call one of the available tools. Do not respond with text — select the most appropriate tool and call it now.',
                        },
                    ];
                }
                continue;
            }
            if (toolCalls.length === 0 && content && !hasSubstantiveContent) {
                logger.info(`Model ${model} returned non-substantive content (${content.length} chars), skipping coercion retry. Preview: ${JSON.stringify(content.slice(0, 200))}`);
            }
            // Re-package the outcome as a chat.completion-shaped object; text content
            // is dropped whenever tool calls are present.
            const result = {
                id: completionId,
                object: 'chat.completion',
                created: Math.floor(Date.now() / 1000),
                model,
                choices: [{
                        index: 0,
                        message: {
                            role: 'assistant',
                            content: toolCalls.length > 0 ? null : (content || null),
                            tool_calls: toolCalls.length > 0 ? toolCalls : undefined,
                            refusal: null,
                        },
                        finish_reason: (finishReason || 'stop'),
                        logprobs: null,
                    }],
                usage: usageData,
            };
            return result;
        }
        catch (err) {
            // Cancellation is never retried.
            if (isAbortError(err)) {
                throw err;
            }
            const error = err;
            if (error.status === 429 && attempt < maxRetries) {
                // Exponential backoff: 2s, 4s, 8s, ...
                const delay = Math.pow(2, attempt) * 1000;
                logger.info(`Rate limited, retrying in ${delay}ms...`);
                await sleepWithAbort(delay, signal);
                continue;
            }
            throw err;
        }
    }
    throw new Error('Max retries exceeded');
}
549
/**
 * Sends a non-streaming chat completion request, retrying with exponential
 * backoff when the request is rejected with HTTP 429.
 *
 * @param client - Client exposing `chat.completions.create`.
 * @param params - Request body; `stream` is forced to false.
 * @param maxRetries - Maximum number of attempts (default 3).
 * @param signal - Optional abort signal, checked before each attempt and passed to the request.
 * @returns The completion returned by the client.
 * @throws Re-throws abort errors and every error that is not a retryable 429;
 *   throws 'Max retries exceeded' if the loop ends without returning.
 */
async function callTextOnlyWithRetry(client, params, maxRetries = 3, signal) {
    let attempt = 0;
    while (attempt < maxRetries) {
        attempt += 1;
        throwIfAborted(signal, 'Agent run cancelled.');
        try {
            const response = await client.chat.completions.create({ ...params, stream: false }, { signal });
            return response;
        }
        catch (err) {
            // Only a 429 with attempts left is retried; cancellation never is.
            const retryable = !isAbortError(err) && err.status === 429 && attempt < maxRetries;
            if (!retryable) {
                throw err;
            }
            const delay = (2 ** attempt) * 1000;
            logger.info(`Rate limited, retrying in ${delay}ms...`);
            await sleepWithAbort(delay, signal);
        }
    }
    throw new Error('Max retries exceeded');
}
571
/**
 * Positional-argument convenience wrapper around `extractStepUsage`:
 * bundles the step metadata into one object and forwards it with the response.
 */
function extractUsage(response, stepNumber, stepType, modelRequested, imagesInPrompt) {
    const stepMeta = { stepNumber, stepType, modelRequested, imagesInPrompt };
    return extractStepUsage(response, stepMeta);
}
579
/**
 * Combines two usage records produced for the same logical step.
 *
 * Step identity (`stepNumber`, `stepType`, `modelRequested`) comes from
 * `primary`; any other field not listed below is inherited from `secondary`.
 * Token counters are summed and `imagesInPrompt` is the larger of the two.
 * Missing counters are treated as 0 so the merge never yields NaN.
 *
 * @param primary Usage record that defines the step identity (may be nullish).
 * @param secondary Usage record merged into it (may be nullish).
 * @returns The merged record, or whichever input is present when one is missing.
 */
function mergeUsage(primary, secondary) {
    if (!primary)
        return secondary;
    if (!secondary)
        return primary;
    const sum = (field) => (primary[field] ?? 0) + (secondary[field] ?? 0);
    return {
        ...secondary,
        stepNumber: primary.stepNumber,
        stepType: primary.stepType,
        modelRequested: primary.modelRequested,
        // Math.max(undefined, n) is NaN, so default absent counts to 0 first.
        imagesInPrompt: Math.max(primary.imagesInPrompt ?? 0, secondary.imagesInPrompt ?? 0),
        promptTokens: sum('promptTokens'),
        completionTokens: sum('completionTokens'),
        totalTokens: sum('totalTokens'),
        cacheReadTokens: sum('cacheReadTokens'),
        cacheWriteTokens: sum('cacheWriteTokens'),
    };
}
597
/**
 * Normalizes an assistant message `content` value to a trimmed string.
 *
 * - string: returned trimmed.
 * - array: each entry contributes itself (if a string) or its string `text`
 *   property; anything else, including null/undefined entries, contributes
 *   an empty string. Entries are joined with newlines and trimmed.
 * - any other value: returns ''.
 */
function extractAssistantText(content) {
    if (typeof content === 'string')
        return content.trim();
    if (!Array.isArray(content))
        return '';
    return content
        .map((part) => {
            if (typeof part === 'string')
                return part;
            // Optional chaining keeps null/undefined parts from throwing.
            return typeof part?.text === 'string' ? part.text : '';
        })
        .join('\n')
        .trim();
}
611
/**
 * Cheap readiness check based on a browser observation plus the agent's own assessment.
 *
 * @param params.observation Observation with `readyState`, `loadingIndicatorCount` and `url`.
 * @param params.assessment Free-text assessment, parsed with `parseVerificationDecisionText`.
 * @param params.targetUrl URL the capture is expected to be on.
 * @param params.allowSuccess When strictly `false`, a positive verdict is never returned.
 * @returns `{ verified: false, reason }` while loading indicators are visible,
 *   `{ verified: true }` when the assessment is positive, the document is no
 *   longer loading and the URL roughly matches, otherwise `null` (inconclusive).
 */
export function inferDeterministicReadyDecision(params) {
    const { observation } = params;
    const decision = parseVerificationDecisionText(params.assessment);
    const documentStillLoading = observation.readyState === 'loading';
    const indicatorCount = observation.loadingIndicatorCount;
    // The match is tried in both directions.
    const originMatches = urlsRoughlyMatch(params.targetUrl, observation.url)
        || urlsRoughlyMatch(observation.url, params.targetUrl);
    // Dialog presence is intentionally not inspected here: the LLM sees the
    // screenshot and judges whether a dialog is the capture target or an obstruction.
    if (indicatorCount > 0) {
        return { verified: false, reason: `Loading indicators still visible (${indicatorCount}).` };
    }
    const accepted = Boolean(decision?.verified)
        && !documentStillLoading
        && originMatches
        && params.allowSuccess !== false;
    return accepted ? { verified: true } : null;
}
628
/**
 * Tells whether the page is, with high confidence, in the wrong language or theme.
 *
 * A variant counts as a hard mismatch only when its evaluated state is
 * inactive, not ambiguous, and reported with `confidence === 'high'`.
 * Language is checked only when `config.currentLang` is set, theme only when
 * `config.currentTheme` is set.
 */
function hasHardVariantMismatch(config, bundle) {
    const isConfidentMismatch = (state) => !state.active && !state.ambiguous && state.confidence === 'high';
    if (config.currentLang) {
        const langState = evaluateRequestedLanguageState({
            currentUrl: bundle.url,
            requestedLang: config.currentLang,
            signals: bundle.pageSignals,
        });
        if (isConfidentMismatch(langState)) {
            return true;
        }
    }
    if (config.currentTheme) {
        const themeState = evaluateRequestedThemeState({
            requestedTheme: config.currentTheme,
            signals: bundle.pageSignals,
        });
        if (isConfidentMismatch(themeState)) {
            return true;
        }
    }
    return false;
}
650
/**
 * Manifest-driven readiness check for dialog targets and gallery pages.
 *
 * Applies only when the variant manifest names a current page whose identity
 * is a dialog target or has `kind === 'gallery'`, and the assessment parses
 * as positive.
 *
 * @returns `null` when the check does not apply or the page is not stable, is
 *   off the expected URL, or has a hard language/theme mismatch;
 *   `{ verified: false, reason }` when a dialog target is expected but the
 *   visible dialog does not match it; `{ verified: true }` otherwise.
 */
export function inferManifestReadyDecision(params) {
    const { config, bundle, observation } = params;
    const manifest = config.variantManifest;
    const identity = manifest?.currentPageIdentity;
    if (!manifest?.currentPageId || !identity) {
        return null;
    }
    if (!identity.dialogTarget && identity.kind !== 'gallery') {
        return null;
    }
    if (!parseVerificationDecisionText(params.assessment)?.verified) {
        return null;
    }
    const stable = observation.readyState !== 'loading'
        && observation.loadingIndicatorCount === 0;
    const originMatches = urlsRoughlyMatch(config.url, bundle.url)
        || urlsRoughlyMatch(bundle.url, config.url);
    if (!(stable && originMatches) || hasHardVariantMismatch(config, bundle)) {
        return null;
    }
    if (!identity.dialogTarget) {
        return { verified: true };
    }
    // Dialog targets additionally need the expected dialog surface and, when
    // the identity carries distinctive subject tokens, at least one of them.
    const signals = bundle.pageSignals;
    const visibleText = [
        signals.title,
        signals.headings.join(' '),
        signals.navLabels.join(' '),
        signals.visibleText,
    ].filter(Boolean).join(' ');
    const distinctiveTokens = identity.subjectTokens.filter((token) => token.length >= 4 && !MODAL_GENERIC_SUBJECT_TOKENS.has(token));
    const tokenHits = countDialogSubjectTokenMatches({
        subjectTokens: identity.subjectTokens,
        visibleText,
        currentUrl: bundle.url,
    });
    const surfaceMatches = hasExpectedDialogSurface({
        identity,
        visibleText,
        currentUrl: bundle.url,
    });
    const subjectMissing = distinctiveTokens.length > 0 && tokenHits === 0;
    if (surfaceMatches && !subjectMissing) {
        return { verified: true };
    }
    const reason = identity.kind === 'modal_configuration'
        ? 'Expected the configured modal state, but the open dialog does not match the requested content.'
        : 'Expected the requested dialog/modal target, but the open dialog content does not match it.';
    return { verified: false, reason };
}
696
/**
 * Deterministic acceptance based on page signals.
 *
 * Returns `{ verified: true }` only when none of these vetoes applies
 * (evaluated in order, stopping at the first hit):
 *   1. the assessment text does not parse as positive;
 *   2. `readyState` is not 'complete';
 *   3. loading indicators are visible;
 *   4. the current URL does not roughly match the configured URL;
 *   5. there is a hard language/theme mismatch;
 *   6. the caller supplied an identity failure;
 *   7. a dialog is open although the target is not a dialog.
 *
 * Any veto yields `null` (inconclusive, defer to the LLM). This function
 * never returns `{ verified: false }`.
 */
export function inferPageSignalReadyDecision(params) {
    const { config, bundle, observation } = params;
    const vetoes = [
        () => !parseVerificationDecisionText(params.assessment)?.verified,
        () => observation.readyState !== 'complete',
        () => observation.loadingIndicatorCount > 0,
        () => !urlsRoughlyMatch(config.url, bundle.url) && !urlsRoughlyMatch(bundle.url, config.url),
        () => hasHardVariantMismatch(config, bundle),
        () => Boolean(params.identityFailure),
        () => observation.dialogCount > 0 && !config.variantManifest?.currentPageIdentity?.dialogTarget,
    ];
    return vetoes.some((veto) => veto()) ? null : { verified: true };
}
735
/**
 * Builds short diagnostic strings for the requested language and theme.
 *
 * Each entry has the form `<status>/<confidence>: <reasons>`, where status is
 * 'match', 'ambiguous' or 'mismatch'. `lang` is present only when
 * `config.currentLang` is set and `theme` only when `config.currentTheme` is set.
 */
function summarizeVerificationDiagnostics(config, bundle) {
    const describe = (state, noSignalLabel) => {
        let status = 'mismatch';
        if (state.active) {
            status = 'match';
        }
        else if (state.ambiguous) {
            status = 'ambiguous';
        }
        return `${status}/${state.confidence}: ${state.reasons.join('; ') || noSignalLabel}`;
    };
    const diagnostics = {};
    if (config.currentLang) {
        const langState = evaluateRequestedLanguageState({
            currentUrl: bundle.url,
            requestedLang: config.currentLang,
            signals: bundle.pageSignals,
        });
        diagnostics.lang = describe(langState, 'no_language_signal');
    }
    if (config.currentTheme) {
        const themeState = evaluateRequestedThemeState({
            requestedTheme: config.currentTheme,
            signals: bundle.pageSignals,
        });
        diagnostics.theme = describe(themeState, 'no_theme_signal');
    }
    return diagnostics;
}
764
/**
 * Produces the failure message for a language/theme preflight.
 *
 * @param params.requestedLang Requested language (trimmed and lower-cased here).
 * @param params.requestedTheme Requested theme.
 * @param params.currentUrl URL passed to the language evaluation.
 * @param params.signals Page signals; without them only a generic message is possible.
 * @returns A generic message when nothing was requested or signals are missing,
 *   '' when every requested variant is confirmed, otherwise a message listing
 *   each unconfirmed variant separated by ' | '.
 */
function buildLanguagePreflightVerificationFailureReason(params) {
    const requestedLang = params.requestedLang?.trim().toLowerCase();
    const requestedTheme = params.requestedTheme;
    const nothingRequested = !requestedLang && !requestedTheme;
    if (nothingRequested || !params.signals) {
        return 'Language preflight failed: unable to confirm the requested fixed UI variant.';
    }
    const isUnconfirmed = (state) => !state.active || state.ambiguous;
    const failures = [];
    if (requestedLang) {
        const langState = evaluateRequestedLanguageState({
            currentUrl: params.currentUrl,
            requestedLang,
            signals: params.signals,
        });
        if (isUnconfirmed(langState)) {
            failures.push(langState.reasons.length > 0
                ? `requested "${requestedLang}", detected "${langState.detected ?? 'unknown'}" (${langState.reasons.join('; ')})`
                : `requested "${requestedLang}", but the fixed app UI is still not confirmed in that language`);
        }
    }
    if (requestedTheme) {
        const themeState = evaluateRequestedThemeState({
            requestedTheme,
            signals: params.signals,
        });
        if (isUnconfirmed(themeState)) {
            failures.push(themeState.reasons.length > 0
                ? `theme requested "${requestedTheme}", detected "${themeState.detected ?? 'unknown'}" (${themeState.reasons.join('; ')})`
                : `theme "${requestedTheme}" is still not confirmed`);
        }
    }
    return failures.length === 0
        ? ''
        : `Language preflight failed: ${failures.join(' | ')}.`;
}
805
/**
 * Returns true when a verification failure reason mentions one of a fixed set
 * of technical keywords (timeouts, parsing/transport problems, network
 * errors, ...). Empty or missing reasons return false.
 */
function isTechnicalVerificationFailureReason(reason) {
    const technicalPattern = /\b(timeout|timed out|parser|transport|stale verification snapshot|captureverificationbundle|invalid json|json parse|response format|temporary|network error|connection reset|econnreset|service unavailable)\b/i;
    return reason ? technicalPattern.test(reason) : false;
}
810
// Time budgets in milliseconds for agent steps.
// Upper bound for a single browser.captureObservation() call.
const ACTION_OBSERVATION_TIMEOUT_MS = 2500;
// NOTE(review): presumably bounds the verification screenshot step; not used in this part of the file — confirm.
const VERIFICATION_SCREENSHOT_TIMEOUT_MS = 5000;
// NOTE(review): presumably bounds verification bundle capture; not used in this part of the file — confirm.
const VERIFICATION_BUNDLE_TIMEOUT_MS = 8000;
// Pause passed to browser.wait() during action diagnostic recovery.
const ACTION_DIAGNOSTIC_WAIT_MS = 200;
814
/**
 * Error raised when an agent step exceeds its time budget.
 * Carries the step label and the budget (in ms) that was exceeded.
 */
class AgentStepTimeoutError extends Error {
    timeoutMs;
    stepLabel;
    /**
     * @param stepLabel Description of the step, inserted into the message.
     * @param timeoutMs Budget in milliseconds that elapsed.
     */
    constructor(stepLabel, timeoutMs) {
        const description = `Timed out after ${timeoutMs}ms while ${stepLabel}.`;
        super(description);
        Object.assign(this, { name: 'AgentStepTimeoutError', stepLabel, timeoutMs });
    }
}
824
/** Returns true when `error` is an instance of AgentStepTimeoutError. */
function isAgentStepTimeoutError(error) {
    return error instanceof AgentStepTimeoutError;
}
827
/**
 * Runs `work()` under a time budget.
 *
 * @param work Function producing the value (or a promise of it).
 * @param params.stepLabel Label used in the timeout error message.
 * @param params.timeoutMs Budget in milliseconds.
 * @returns The value produced by `work`.
 * @throws AgentStepTimeoutError when the budget elapses first; otherwise
 *   whatever `work` throws or rejects with.
 *
 * The timer is always cleared once the race settles, including when `work`
 * throws synchronously, so no stray timer keeps the event loop alive. The
 * underlying work is not cancelled on timeout; only the wait is abandoned.
 */
async function withAgentStepTimeout(work, params) {
    let timer;
    const timeout = new Promise((_resolve, reject) => {
        timer = setTimeout(() => {
            reject(new AgentStepTimeoutError(params.stepLabel, params.timeoutMs));
        }, params.timeoutMs);
    });
    try {
        // Promise.race subscribes to both promises, so a late rejection from
        // `work` after a timeout does not surface as an unhandled rejection.
        return await Promise.race([work(), timeout]);
    }
    finally {
        clearTimeout(timer);
    }
}
843
/**
 * Best-effort variant of `withAgentStepTimeout`: timeouts still propagate,
 * while every other failure of `work` is swallowed and reported as `null`.
 */
async function withOptionalAgentStepTimeout(work, params) {
    return withAgentStepTimeout(work, params).catch((failure) => {
        if (!isAgentStepTimeoutError(failure)) {
            return null;
        }
        throw failure;
    });
}
854
/**
 * Returns the execution time budget, in milliseconds, for a browser action.
 *
 * @param action Tool/action name.
 * @param args Action arguments; only `milliseconds` is read, for 'wait'.
 * @returns 25000 for navigation, 5000 for click-like actions, 3500 for the
 *   other listed actions and for unknown ones. For 'wait' the budget is the
 *   requested duration plus one second, clamped to [1000, 6000]. A missing
 *   or non-numeric duration falls back to 1000ms instead of producing NaN.
 */
function getActionExecutionTimeoutMs(action, args) {
    switch (action) {
        case 'navigate_to':
            return 25000;
        case 'click':
        case 'safe_expand':
        case 'select_option':
            return 5000;
        case 'type_text':
        case 'press_key':
        case 'scroll':
        case 'scroll_to_element':
        case 'resize_viewport':
        case 'dismiss_overlays':
        case 'hover':
            return 3500;
        case 'wait': {
            const requested = Number(args?.milliseconds ?? 1000);
            // NaN would flow through Math.min/Math.max and become a NaN timeout.
            const waitMs = Number.isNaN(requested) ? 1000 : requested;
            return Math.min(6000, Math.max(1000, waitMs + 1000));
        }
        default:
            return 3500;
    }
}
876
/** Formats the message reported when an action step runs out of time. */
function buildActionTimeoutMessage(stage, action, timeoutMs) {
    const detail = `${action}, ${timeoutMs}ms`;
    return `Action timeout during ${stage} (${detail})`;
}
879
/** Formats the message reported when a verification stage runs out of time. */
function buildVerificationTimeoutMessage(stage, timeoutMs) {
    const budget = `${timeoutMs}ms`;
    return `Verification timeout during ${stage} (${budget}).`;
}
882
/** Emits a debug log line recording that `action` reached the given `stage`. */
function logActionProgress(action, stage) {
    logger.debug(`Action progress: ${action} -> ${stage}`);
}
885
/**
 * Builds the message reported when an action timed out and diagnostic
 * recovery could not establish a usable result.
 *
 * The action-timeout headline is followed by the first applicable detail:
 * the recovery error, a "partially changed" warning for the
 * 'browser execution' stage, an "unstable post-action state" note when
 * `options.afterChanged` is set, or a generic "no reliable state change" note.
 */
function getDiagnosticRecoveryFailureMessage(stage, action, timeoutMs, options = {}) {
    const headline = buildActionTimeoutMessage(stage, action, timeoutMs);
    let detail;
    if (options.recoveryError) {
        detail = options.recoveryError;
    }
    else if (stage === 'browser execution') {
        detail = 'state may be partially changed.';
    }
    else if (options.afterChanged) {
        detail = 'diagnostic recovery could not confirm a stable post-action state.';
    }
    else {
        detail = 'no reliable state change detected.';
    }
    return `${headline}; ${detail}`;
}
898
/**
 * After an action problem, tries to bring the page into an observable state
 * and reports how it changed relative to `before`.
 *
 * Steps, each under its own time budget: dismiss overlays, wait
 * ACTION_DIAGNOSTIC_WAIT_MS, capture a fresh observation.
 *
 * @returns `{ reaction }` describing the change between `before` and the new
 *   observation, or `{ reaction: null, error }` when no baseline exists, a
 *   step timed out, or a step failed. This function does not throw for step failures.
 */
async function performActionDiagnosticRecovery(browser, action, before) {
    logActionProgress(action, 'diagnostic recovery');
    const runStep = (stepName, timeoutMs, task) => withAgentStepTimeout(task, {
        stepLabel: `${action} diagnostic ${stepName}`,
        timeoutMs,
    });
    try {
        await runStep('dismissOverlays', getActionExecutionTimeoutMs('dismiss_overlays', {}), () => browser.dismissOverlays());
        await runStep('wait', getActionExecutionTimeoutMs('wait', { milliseconds: ACTION_DIAGNOSTIC_WAIT_MS }), () => browser.wait(ACTION_DIAGNOSTIC_WAIT_MS));
        const after = await runStep('captureObservation', ACTION_OBSERVATION_TIMEOUT_MS, () => browser.captureObservation());
        if (before) {
            return {
                reaction: describeObservationChange(before, after),
            };
        }
        return {
            reaction: null,
            error: 'baseline observation was unavailable, so diagnostic recovery could not compare page state.',
        };
    }
    catch (failure) {
        const error = isAgentStepTimeoutError(failure)
            ? `${buildActionTimeoutMessage('diagnostic recovery', action, failure.timeoutMs)}.`
            : `diagnostic recovery failed: ${failure.message}`;
        return { reaction: null, error };
    }
}
936
/**
 * Captures a browser observation under a time budget and feeds it to
 * `inferDeterministicReadyDecision`.
 *
 * @returns The deterministic decision (possibly `null`);
 *   `{ verified: false, reason }` when the capture timed out; `null` when
 *   the capture or the decision failed for any other reason.
 */
async function deterministicReadyObservationCheck(params) {
    logger.debug('Verification progress: captureObservation');
    try {
        const { browser, config, assessment, skipDialogCheck, allowSuccess } = params;
        const observation = await withAgentStepTimeout(() => browser.captureObservation(), {
            stepLabel: 'verification captureObservation',
            timeoutMs: ACTION_OBSERVATION_TIMEOUT_MS,
        });
        return inferDeterministicReadyDecision({
            observation,
            assessment,
            targetUrl: config.url,
            skipDialogCheck,
            allowSuccess,
        });
    }
    catch (failure) {
        if (!isAgentStepTimeoutError(failure)) {
            return null;
        }
        return {
            verified: false,
            reason: buildVerificationTimeoutMessage('captureObservation', failure.timeoutMs),
        };
    }
}
961
/**
 * Fast-path optimization: extract a PASS/FAIL verdict from LLM text output
 * using keyword matching, avoiding a separate LLM call when the signal is clear.
 *
 * Resolution order:
 *   1. A leading PASS / FAIL token (FAIL text after the token becomes the reason).
 *   2. English/French keyword signals: at least one positive and no negative
 *      signal gives verified; a negative and no positive signal gives a
 *      failure whose reason is the first 160 characters of the text.
 *
 * Returns null when uncertain — the caller MUST fall back to an LLM vision call.
 * Do not add more patterns here; expand LLM fallback coverage instead.
 *
 * Implementation note: JavaScript's `\b` only knows ASCII word characters, so
 * a `\b` placed right after or before an accented letter fails next to
 * whitespace or punctuation (e.g. "tronqué." or " à corriger"). Patterns that
 * touch an accented letter at their edge therefore accept either the legacy
 * `\b` or a Unicode-aware "not adjacent to a letter/digit/underscore" assertion.
 *
 * @param content Raw model text.
 * @returns `{ verified: true }`, `{ verified: false, reason }`, or `null`.
 */
export function parseVerificationDecisionText(content) {
    const normalized = content.replace(/\s+/g, ' ').trim();
    if (!normalized)
        return null;
    const upper = normalized.toUpperCase();
    if (upper === 'PASS' || upper.startsWith('PASS ')) {
        return { verified: true };
    }
    if (upper.startsWith('FAIL:')) {
        return { verified: false, reason: normalized.slice(5).trim() || 'Verification failed' };
    }
    if (upper.startsWith('FAIL ')) {
        return { verified: false, reason: normalized.slice(5).trim() || 'Verification failed' };
    }
    if (/^PASS\b/i.test(normalized)) {
        return { verified: true };
    }
    if (/^FAIL\b/i.test(normalized)) {
        return { verified: false, reason: normalized.replace(/^FAIL\b[:\s-]*/i, '').trim() || 'Verification failed' };
    }
    const positiveSignals = [
        /\bready to capture\b/i,
        /\bready for capture\b/i,
        /\bcapture can proceed\b/i,
        /\bpage (?:is|looks|appears)? ?clean\b/i,
        /\bpage est propre\b/i,
        /\bpage propre\b/i,
        /\bno (?:overlay|overlays|banner|banners|spinner|spinners|modal|modals|popup|popups)\b/i,
        /\baucun(?:e)? (?:overlay|banni[eè]re|spinner|modal|popup|widget)\b/i,
        /\bsans overlay\b/i,
        /\bfully loaded\b/i,
        /\bcompl[eè]tement charg[ée](?:\b|(?![\p{L}\p{N}_]))/iu,
        /\bchargement (?:est )?termin[ée](?:\b|(?![\p{L}\p{N}_]))/iu,
        /\ball elements visible\b/i,
        /\btous les [ée]l[ée]ments visibles\b/i,
        /\bmatches (?:the )?(?:user request|request)\b/i,
        /\bcorrespond(?: parfaitement)? [àa] la demande\b/i,
        /\bpr[êe]te? pour la capture\b/i,
    ];
    const negativeSignals = [
        /\bnot ready\b/i,
        /\bpas pr[êe]te?\b/i,
        /\bnot clean\b/i,
        /\bpas propre\b/i,
        /\bwrong page\b/i,
        /\bmauvaise page\b/i,
        /\bstill visible\b/i,
        /\bencore visible\b/i,
        /\btoujours visible\b/i,
        /\bstill loading\b/i,
        /\bencore en chargement\b/i,
        /\bspinner visible\b/i,
        /\bcookie banner visible\b/i,
        /\bbanni[eè]re cookie visible\b/i,
        /\boverlay present\b/i,
        /\bmodal visible\b/i,
        /\bpopup visible\b/i,
        /\bcropped\b/i,
        /\bcut off\b/i,
        /\bpartially visible\b/i,
        /\btronqu[ée](?:\b|(?![\p{L}\p{N}_]))/iu,
        /\bcoup[ée](?:\b|(?![\p{L}\p{N}_]))/iu,
        /\bpartiellement visible\b/i,
        /\bneeds fixing\b/i,
        /\bmust be fixed\b/i,
        /(?:\b|(?<![\p{L}\p{N}_]))[àa] corriger\b/iu,
    ];
    const positiveCount = positiveSignals.filter((pattern) => pattern.test(normalized)).length;
    const hasNegativeSignal = negativeSignals.some((pattern) => pattern.test(normalized));
    if (positiveCount >= 1 && !hasNegativeSignal) {
        return { verified: true };
    }
    if (hasNegativeSignal && positiveCount === 0) {
        return { verified: false, reason: normalized.slice(0, 160) };
    }
    // Mixed or absent signals: inconclusive.
    return null;
}
1045
/**
 * Extracts a wait duration from free text.
 *
 * Looks first for a millisecond quantity ("500ms", "250 milliseconds"), then
 * for a seconds quantity ("2 s", "1,5 secondes"). The unit must be a whole
 * word, so text such as "2 spinners" is not read as "2 s".
 *
 * @param content Free text, typically model output.
 * @returns Duration in milliseconds clamped to [300, 5000], or 1000 when no
 *   duration is found.
 */
function parseWaitDurationMs(content) {
    const normalized = content.toLowerCase();
    const msMatch = normalized.match(/(\d{2,5})\s*(ms|millisecond|milliseconds)\b/i);
    if (msMatch) {
        return Math.min(5000, Math.max(300, Number(msMatch[1])));
    }
    const secMatch = normalized.match(/(\d{1,2}(?:[.,]\d+)?)\s*(s|sec|secs|second|seconds|seconde|secondes)\b/i);
    if (secMatch) {
        // Accept a decimal comma ("1,5") as well as a decimal point.
        const seconds = Number(secMatch[1].replace(',', '.'));
        if (!Number.isNaN(seconds)) {
            return Math.min(5000, Math.max(300, Math.round(seconds * 1000)));
        }
    }
    return 1000;
}
1060
/** Known tool names for validating embedded JSON tool calls in text. */
const KNOWN_TOOL_NAMES = new Set([
    'click', 'type_text', 'select_option', 'scroll', 'press_key',
    'dismiss_overlays', 'wait', 'search_text', 'navigate_to',
    'resize_viewport', 'take_screenshot', 'ready_to_capture',
    'give_up', 'begin_subgoal', 'note', 'capture_by_selector',
]);
/**
 * Try to extract a tool call embedded as JSON in the model's text output.
 * Some models emit tool calls as plain text in the response content instead of
 * using the structured tool_calls array, especially during streaming or format confusion:
 *   {"name": "click", "arguments": {"index": 5, "reason": "..."}}
 * This function gracefully recovers from such cases.
 *
 * Two shapes are recognised, in this order:
 *   1. a JSON object with "name" followed by "arguments";
 *   2. function-call syntax: toolName({...}).
 * The argument object is located by brace matching that skips braces inside
 * JSON strings, so empty (`{}`) and nested argument objects are supported.
 * All candidates are scanned in text order; the first one with a known tool
 * name and parseable arguments wins.
 *
 * @param text Assistant text to scan.
 * @returns `{ name, args }` for the first valid embedded call, or `null`.
 */
function parseEmbeddedToolCall(text) {
    // Returns the balanced `{...}` substring starting at `start`, or null when
    // the braces never close.
    const extractBalancedObject = (start) => {
        if (text[start] !== '{')
            return null;
        let depth = 0;
        let inString = false;
        let escaped = false;
        for (let i = start; i < text.length; i++) {
            const ch = text[i];
            if (inString) {
                if (escaped)
                    escaped = false;
                else if (ch === '\\')
                    escaped = true;
                else if (ch === '"')
                    inString = false;
                continue;
            }
            if (ch === '"') {
                inString = true;
            }
            else if (ch === '{') {
                depth += 1;
            }
            else if (ch === '}') {
                depth -= 1;
                if (depth === 0)
                    return text.slice(start, i + 1);
            }
        }
        return null;
    };
    // Validates the tool name and parses the argument object found at `argsStart`.
    const readCall = (name, argsStart) => {
        if (!KNOWN_TOOL_NAMES.has(name))
            return null;
        const argsStr = extractBalancedObject(argsStart);
        if (argsStr === null)
            return null;
        try {
            return { name, args: JSON.parse(argsStr), end: argsStart + argsStr.length };
        }
        catch {
            // Not valid JSON: treat this candidate as absent and keep scanning.
            return null;
        }
    };
    // Pattern 1: JSON object with "name" and "arguments" keys
    for (const match of text.matchAll(/\{[^{}]*"name"\s*:\s*"(\w+)"[^{}]*"arguments"\s*:\s*(?=\{)/gs)) {
        const call = readCall(match[1], match.index + match[0].length);
        if (call) {
            return { name: call.name, args: call.args };
        }
    }
    // Pattern 2: function-call-like syntax: toolName({...})
    for (const match of text.matchAll(/\b(\w+)\s*\(\s*(?=\{)/g)) {
        const call = readCall(match[1], match.index + match[0].length);
        // The argument object must be followed by the closing parenthesis.
        if (call && /^\s*\)/.test(text.slice(call.end))) {
            return { name: call.name, args: call.args };
        }
    }
    return null;
}
1101
/**
 * Derives a tool call from plain assistant text when the model failed to
 * emit a structured tool call.
 *
 * Tried in order:
 *   1. a tool call embedded in the text (see `parseEmbeddedToolCall`);
 *   2. a verification verdict: positive gives `ready_to_capture`, negative gives `give_up`;
 *   3. a "wait for load/settle/navigation" phrasing (English or French stems), giving `wait`.
 *
 * @param content Assistant text.
 * @returns `{ name, args }` or `null` when nothing can be inferred.
 */
export function inferActionFromAssistantText(content) {
    const normalized = content.replace(/\s+/g, ' ').trim();
    if (!normalized) {
        return null;
    }
    // Some models emit tool calls as plain text in content instead of
    // structured tool_calls; recover those first.
    const embedded = parseEmbeddedToolCall(normalized);
    if (embedded) {
        logger.debug(`Recovered embedded tool call from text: ${embedded.name}`);
        return embedded;
    }
    const verdict = parseVerificationDecisionText(normalized);
    if (verdict) {
        return verdict.verified
            ? { name: 'ready_to_capture', args: { assessment: normalized.slice(0, 800) } }
            : { name: 'give_up', args: { reason: verdict.reason ?? normalized.slice(0, 400) } };
    }
    const lower = normalized.toLowerCase();
    const mentionsWaiting = [/\bwait\b/, /\battend/].some((pattern) => pattern.test(lower));
    const mentionsLoading = [/\bsettle\b/, /\bcharg/, /\bload/, /\bnavigation\b/].some((pattern) => pattern.test(lower));
    if (!mentionsWaiting || !mentionsLoading) {
        return null;
    }
    return {
        name: 'wait',
        args: {
            milliseconds: parseWaitDurationMs(normalized),
            reason: 'assistant_text_fallback',
        },
    };
}
1138
/**
 * Dual-model mode helper: asks the (cheaper) vision model to describe a
 * screenshot in text.
 *
 * @param client OpenAI-style client.
 * @param config Run configuration; `visionModel` enables this path.
 * @param screenshot Screenshot passed to `makeImageUrl` as 'image/jpeg'.
 * @param context Text sent alongside the image.
 * @param stepNumber Step number recorded in the usage entry.
 * @returns `{ observation, usage }`, or `null` when no vision model is
 *   configured or the call failed (non-abort failures are logged).
 * @throws Abort errors are rethrown.
 */
async function getVisionObservation(client, config, screenshot, context, stepNumber) {
    const observerModel = config.visionModel;
    if (!observerModel) {
        return null;
    }
    const imageUrl = await makeImageUrl(screenshot, 'image/jpeg', config.uploadImage);
    let systemPrompt = 'You are a page state observer for screenshot verification. Describe the current page concisely: layout, visible content, overlays, loading state, and whether the page actually matches the requested target.';
    if (config.runMode === 'video_navigation_preflight') {
        systemPrompt += ' For video navigation preflight, explicitly say whether the screenshot is the EXACT pre-recording start state. Call out generic /home or dashboard states, unopened projects/sections/dialogs, and wrong fixed UI language/theme in the chrome.';
    }
    const messages = [
        { role: 'system', content: systemPrompt },
        {
            role: 'user',
            content: [
                { type: 'image_url', image_url: { url: imageUrl } },
                { type: 'text', text: context },
            ],
        },
    ];
    try {
        const request = {
            model: observerModel,
            messages,
            max_tokens: 300,
            stream: false,
            ...providerBody(observerModel, config.providerPreferences),
        };
        const response = await client.chat.completions.create(request, { signal: config.abortSignal });
        const observation = response.choices?.[0]?.message?.content?.trim() ?? '';
        const usage = extractUsage(response, stepNumber, 'verification', observerModel, 1);
        return { observation, usage };
    }
    catch (err) {
        if (isAbortError(err)) {
            throw err;
        }
        logger.error(`Vision observer for verification failed: ${err.message}`);
        // Returning null lets the caller fall through to the mono-model path.
        return null;
    }
}
1173
/**
 * Assembles the newline-separated text context sent to the vision observer.
 *
 * Always includes the user request and the agent assessment. The current
 * URL, page title, expected language and expected theme are added only when
 * set. Three extra instructions are appended in 'video_navigation_preflight' mode.
 */
function buildVerificationVisionContext(params) {
    const { config, pageContext, assessment } = params;
    const lines = [
        'Describe this page for verification.',
        `User request: ${config.prompt}`,
        `Agent assessment: ${assessment}`,
    ];
    if (pageContext.currentUrl) {
        lines.push(`Current URL: ${pageContext.currentUrl}`);
    }
    if (pageContext.pageTitle) {
        lines.push(`Current page title: ${pageContext.pageTitle}`);
    }
    if (config.currentLang) {
        lines.push(`Expected language: ${config.currentLang}`);
    }
    if (config.currentTheme) {
        lines.push(`Expected theme: ${config.currentTheme}`);
    }
    if (config.runMode === 'video_navigation_preflight') {
        lines.push('Verification target: exact pre-recording start state.');
        lines.push('Explicitly say if the page is still only a generic home/dashboard/list state instead of the requested project/section/dialog.');
        lines.push('Explicitly call out wrong fixed app chrome language/theme if buttons, navigation, headings, or breadcrumbs do not match the requested variant.');
    }
    return lines.join('\n');
}
1188
/**
 * Returns a new content-part list in which every `image_url` part is dropped
 * and a text part carrying the vision model's observation comes first.
 * The input array is not modified.
 */
function replaceImagesWithObservation(parts, observation) {
    const observationPart = {
        type: 'text',
        text: `Page observation (from vision model):\n${observation}`,
    };
    const nonImageParts = [];
    for (const part of parts) {
        if (part.type !== 'image_url') {
            nonImageParts.push(part);
        }
    }
    return [observationPart, ...nonImageParts];
}
1198
/**
 * Fallback screenshot verification that asks the model for a strict JSON
 * verdict: `{ "ready": true }` or `{ "ready": false, "reason": "..." }`.
 *
 * Two paths:
 *  - Dual-model: when a vision observation is available (precomputed or
 *    obtained via `getVisionObservation`), the image parts are replaced by
 *    that text and the active model is called text-only.
 *  - Mono-model: the screenshot is sent directly through
 *    `callVisionCapableModel`, which may switch `modelState.active` to the
 *    configured fallback model.
 *
 * An empty model response delegates to `fallbackVerifyScreenshotText`.
 * Unparseable JSON content is reported as a failed verification, not thrown.
 * Malformed `give_up` tool-call arguments are also tolerated and yield the
 * generic reason.
 *
 * @param precomputedVisionObs Pass `undefined` to let this function call the
 *   vision observer; any other value (including `null`) is used as-is.
 * @returns Verification result with `verified`, `usage`, `pageFingerprint`
 *   and `mode: 'text_fallback'`, plus `matchedPageId` on success or
 *   `reason`/`blockingReason` on failure.
 */
async function fallbackVerifyScreenshotJson(client, config, modelState, screenshot, assessment, pageContext, stepNumber, pageFingerprint, precomputedVisionObs) {
    const verificationTarget = config.runMode === 'video_navigation_preflight'
        ? 'exact pre-recording start state'
        : 'requested page';
    // Dual-model mode: reuse precomputed observation or call vision observer
    const visionObs = precomputedVisionObs !== undefined
        ? precomputedVisionObs
        : await getVisionObservation(client, config, screenshot, buildVerificationVisionContext({
            config,
            pageContext,
            assessment,
        }), stepNumber);
    // No image upload is needed when the observation text replaces the image.
    const screenshotUrl = visionObs ? '' : await makeImageUrl(screenshot, 'image/jpeg', config.uploadImage);
    const rawParts = buildVerificationMessage({
        userPrompt: config.prompt,
        screenshotUrl,
        previousAssessment: assessment,
        runMode: config.runMode,
        currentLang: config.currentLang,
        currentTheme: config.currentTheme,
        pageContext,
        runHints: config.runHints,
        variantManifest: config.variantManifest,
    });
    const userContent = visionObs ? replaceImagesWithObservation(rawParts, visionObs.observation) : rawParts;
    const fallbackMessages = [
        {
            role: 'system',
            content: `You are a screenshot quality inspector. Respond with a JSON object: { "ready": true } if the screenshot is clean, fully loaded, and matches the ${verificationTarget}. Otherwise respond with { "ready": false, "reason": "<short explanation>" }. Output ONLY valid JSON, nothing else.`,
        },
        { role: 'user', content: userContent },
    ];
    if (visionObs) {
        // Text-only call — no vision fallback needed
        const result = await client.chat.completions.create({ model: modelState.active, messages: fallbackMessages, max_tokens: 256, response_format: { type: 'json_object' }, stream: false, ...providerBody(modelState.active, config.providerPreferences) }, { signal: config.abortSignal });
        const primaryUsage = extractUsage(result, stepNumber, 'verification', modelState.active, 0);
        const usage = visionObs.usage ? mergeUsage(visionObs.usage, primaryUsage) : primaryUsage;
        const content = extractAssistantText(result.choices?.[0]?.message?.content);
        if (!content || !content.trim()) {
            logger.debug('Verification JSON fallback (dual-model) returned empty response — trying plain-text fallback');
            return fallbackVerifyScreenshotText(client, config, modelState, screenshot, assessment, stepNumber, pageFingerprint, visionObs);
        }
        try {
            const json = JSON.parse(content);
            if (json.ready === true) {
                logger.success('Qualité du screenshot vérifiée');
                return { verified: true, usage, matchedPageId: config.variantManifest?.currentPageId ?? null, pageFingerprint, mode: 'text_fallback' };
            }
            const reason = (typeof json.reason === 'string' && json.reason.trim()) || 'Verification failed';
            logger.ai(`Problème de vérification : ${reason}`);
            return { verified: false, reason, usage, blockingReason: reason, pageFingerprint, mode: 'text_fallback' };
        }
        catch {
            const snippet = (content || '').trim().slice(0, 160);
            return { verified: false, reason: `Verification fallback returned invalid JSON: ${snippet}`, usage, blockingReason: `Verification fallback returned invalid JSON: ${snippet}`, pageFingerprint, mode: 'text_fallback' };
        }
    }
    // Mono-model mode: send image directly with vision fallback
    const fallbackResult = await callVisionCapableModel({
        primaryModel: modelState.active,
        fallbackModel: modelState.active === config.model ? config.fallbackModel : undefined,
        onFallbackActivated: (model, reason) => {
            modelState.active = model;
            logger.info(`Vision fallback activated for JSON verification: ${model} (reason: ${reason})`);
        },
        callModel: (model) => client.chat.completions.create({
            model,
            messages: fallbackMessages,
            max_tokens: 256,
            response_format: { type: 'json_object' },
            stream: false,
            ...providerBody(model, config.providerPreferences),
        }, { signal: config.abortSignal }),
    });
    modelState.active = fallbackResult.model;
    const usage = extractUsage(fallbackResult.result, stepNumber, 'verification', fallbackResult.model, 1);
    // Some models return tool_calls even when no tools are provided — check before parsing content
    const fallbackToolCall = fallbackResult.result.choices?.[0]?.message?.tool_calls?.[0];
    if (fallbackToolCall && 'function' in fallbackToolCall) {
        if (fallbackToolCall.function.name === 'ready_to_capture') {
            logger.success('Qualité du screenshot vérifiée');
            return { verified: true, usage, matchedPageId: config.variantManifest?.currentPageId ?? null, pageFingerprint, mode: 'text_fallback' };
        }
        if (fallbackToolCall.function.name === 'give_up') {
            // Tool-call arguments are model-generated text and may not be valid
            // JSON (or may parse to null); the verdict is still a failure.
            let tcArgs = {};
            try {
                tcArgs = JSON.parse(fallbackToolCall.function.arguments) ?? {};
            }
            catch {
                tcArgs = {};
            }
            const reason = (typeof tcArgs.reason === 'string' && tcArgs.reason.trim()) || 'Verification failed';
            logger.ai(`Problème de vérification : ${reason}`);
            return { verified: false, reason, usage, blockingReason: reason, pageFingerprint, mode: 'text_fallback' };
        }
    }
    const content = extractAssistantText(fallbackResult.result.choices?.[0]?.message?.content);
    // Guard against empty responses — some models don't support response_format:json_object.
    // Fall through to a plain-text verification call as a last resort.
    if (!content || !content.trim()) {
        logger.debug('Verification JSON fallback returned empty response — trying plain-text fallback');
        return fallbackVerifyScreenshotText(client, config, modelState, screenshot, assessment, stepNumber, pageFingerprint);
    }
    try {
        const json = JSON.parse(content);
        const ready = json.ready === true;
        if (ready) {
            logger.success('Qualité du screenshot vérifiée');
            return {
                verified: true,
                usage,
                matchedPageId: config.variantManifest?.currentPageId ?? null,
                pageFingerprint,
                mode: 'text_fallback',
            };
        }
        const reason = (typeof json.reason === 'string' && json.reason.trim()) || 'Verification failed';
        logger.ai(`Problème de vérification : ${reason}`);
        return {
            verified: false,
            reason,
            usage,
            blockingReason: reason,
            pageFingerprint,
            mode: 'text_fallback',
        };
    }
    catch {
        // JSON parsing failed — treat as verification failure
        const snippet = (content || '').trim().slice(0, 160);
        logger.debug(`Verification JSON fallback returned invalid JSON: ${snippet}`);
        return {
            verified: false,
            reason: snippet ? `Verification fallback returned invalid JSON: ${snippet}` : 'Verification returned no valid decision',
            usage,
            blockingReason: snippet ? `Verification fallback returned invalid JSON: ${snippet}` : 'Verification returned no valid decision',
            pageFingerprint,
            mode: 'text_fallback',
        };
    }
}
1333
/**
 * Plain-text screenshot verification: asks the model for a READY / NOT_READY
 * verdict instead of a tool call or a JSON object.
 *
 * Two paths exist:
 *  - Observation path: when a vision observation is available (either passed in
 *    via `precomputedVisionObs` or obtained from `getVisionObservation`), the
 *    observation text is sent to `modelState.active` in a text-only request.
 *  - Image path: otherwise the screenshot itself is sent through
 *    `callVisionCapableModel`, which may switch to `config.fallbackModel`; the
 *    model that answered is written back to `modelState.active`.
 *
 * @param client Chat-completions client (`client.chat.completions.create` is called).
 * @param config Run configuration (prompt, runMode, abortSignal, providerPreferences, ...).
 * @param modelState Mutable holder of the active model name; may be updated here.
 * @param screenshot Screenshot passed to the vision observer or to `makeImageUrl`.
 * @param assessment Agent assessment text; its first 300 characters go into the prompt.
 * @param stepNumber Step number forwarded to usage extraction.
 * @param pageFingerprint Echoed back unchanged on every result object.
 * @param precomputedVisionObs Optional observation to reuse. Any value other than
 *   `undefined` (including `null`) suppresses the call to `getVisionObservation`.
 * @returns A result object with `mode: 'text_fallback'`; `verified` tells whether
 *   the model answered READY. `usage` is `null` when the model call itself failed.
 * @throws Re-throws errors recognised by `isAbortError`; errors raised before the
 *   guarded model call (observation lookup, image URL creation) also propagate.
 */
async function fallbackVerifyScreenshotText(client, config, modelState, screenshot, assessment, stepNumber, pageFingerprint, precomputedVisionObs) {
    let verificationTarget = 'requested page';
    if (config.runMode === 'video_navigation_preflight') {
        verificationTarget = 'exact pre-recording start state';
    }
    const verifyPromptText = `Requested page: ${config.prompt.slice(0, 300)}\nAgent assessment: ${assessment.slice(0, 300)}\nIs this screenshot at the ${verificationTarget}?`;

    // The two paths use the same instructions except for the noun describing what is judged.
    const inspectorPrompt = (subject) => `You are a screenshot quality inspector. Reply with exactly one word: READY if the ${subject} is clean, fully loaded, and matches the ${verificationTarget}. Otherwise reply NOT_READY followed by a colon and a short reason. Example: "NOT_READY: cookie banner visible". Output nothing else.`;

    // Result builders shared by both paths.
    const rejected = (reason, usage) => ({
        verified: false,
        reason,
        usage,
        blockingReason: reason,
        pageFingerprint,
        mode: 'text_fallback',
    });
    const accepted = (usage) => {
        logger.success('Qualité du screenshot vérifiée');
        return {
            verified: true,
            usage,
            matchedPageId: config.variantManifest?.currentPageId ?? null,
            pageFingerprint,
            mode: 'text_fallback',
        };
    };
    // Turns a non-empty model reply into a verdict.
    const judgeReply = (reply, usage) => {
        // A prefix test also covers the exact "READY" answer.
        if (reply.trim().toUpperCase().startsWith('READY')) {
            return accepted(usage);
        }
        const notReadyMatch = reply.match(/NOT[_\s-]?READY\s*[:\-—]\s*(.*)/i);
        // Fall back to the start of the raw reply when no explicit reason was given.
        const reason = notReadyMatch?.[1]?.trim() || reply.trim().slice(0, 200);
        logger.ai(`Problème de vérification : ${reason}`);
        return rejected(reason, usage);
    };

    // Dual-model mode: reuse the supplied observation, or ask the vision observer for one.
    let visionObs = precomputedVisionObs;
    if (visionObs === undefined) {
        visionObs = await getVisionObservation(client, config, screenshot, verifyPromptText, stepNumber);
    }
    if (visionObs) {
        try {
            const request = {
                model: modelState.active,
                messages: [
                    { role: 'system', content: inspectorPrompt('page') },
                    { role: 'user', content: `Page observation (from vision model):\n${visionObs.observation}\n\n${verifyPromptText}` },
                ],
                max_tokens: 64,
                stream: false,
                ...providerBody(modelState.active, config.providerPreferences),
            };
            const completion = await client.chat.completions.create(request, { signal: config.abortSignal });
            const primaryUsage = extractUsage(completion, stepNumber, 'verification', modelState.active, 0);
            // Include the observer's usage when it reported any.
            const usage = visionObs.usage ? mergeUsage(visionObs.usage, primaryUsage) : primaryUsage;
            const reply = extractAssistantText(completion.choices?.[0]?.message?.content);
            if (!reply?.trim()) {
                return rejected('Verification model returned empty response', usage);
            }
            return judgeReply(reply, usage);
        }
        catch (err) {
            if (isAbortError(err)) {
                throw err;
            }
            logger.error(`Verification text fallback (dual-model) call failed: ${err.message}`);
            return rejected('Verification text fallback call failed', null);
        }
    }

    // Mono-model mode: send image directly with vision fallback
    const screenshotUrl = await makeImageUrl(screenshot, 'image/jpeg', config.uploadImage);
    const messages = [
        { role: 'system', content: inspectorPrompt('screenshot') },
        {
            role: 'user',
            content: [
                { type: 'image_url', image_url: { url: screenshotUrl } },
                { type: 'text', text: verifyPromptText },
            ],
        },
    ];
    try {
        // The configured fallback model is only offered while the primary model is still active.
        const fallbackModel = modelState.active === config.model ? config.fallbackModel : undefined;
        const answered = await callVisionCapableModel({
            primaryModel: modelState.active,
            fallbackModel,
            onFallbackActivated: (model, reason) => {
                modelState.active = model;
                logger.info(`Vision fallback activated for text verification: ${model} (reason: ${reason})`);
            },
            callModel: (model) => {
                const request = {
                    model,
                    messages,
                    max_tokens: 64,
                    stream: false,
                    ...providerBody(model, config.providerPreferences),
                };
                return client.chat.completions.create(request, { signal: config.abortSignal });
            },
        });
        modelState.active = answered.model;
        const usage = extractUsage(answered.result, stepNumber, 'verification', answered.model, 1);
        const reply = extractAssistantText(answered.result.choices?.[0]?.message?.content);
        if (!reply?.trim()) {
            logger.debug('Verification text fallback returned empty response');
            return rejected('Verification model returned empty response', usage);
        }
        return judgeReply(reply, usage);
    }
    catch (err) {
        if (isAbortError(err)) {
            throw err;
        }
        logger.error(`Verification text fallback call failed: ${err.message}`);
        return rejected('Verification text fallback call failed', null);
    }
}
1430
/**
 * Dispatches a single agent action to the matching browser-driver method.
 *
 * Supported actions: click, safe_expand, hover, type_text, select_option,
 * scroll, scroll_to_element, press_key, navigate_to, wait, resize_viewport,
 * dismiss_overlays.
 *
 * @param browser Browser driver; only the methods required by the requested
 *   action are called on it.
 * @param {string} action Action name.
 * @param args Action arguments (index / selector / x,y targets, text, url, ...).
 * @returns {Promise<void>} Resolves once the driver call(s) have completed.
 * @throws {Error} When `action` is unknown, or when click/hover receive no
 *   index, selector, or x/y target. Driver errors propagate unchanged.
 */
async function performBrowserAction(browser, action, args) {
    // Overlay dismissal that also works for drivers without a dismissOverlays method:
    // in that case the page-level helper is applied to the driver's current page.
    const dismissPageOverlays = async () => {
        if (typeof browser.dismissOverlays === 'function') {
            await browser.dismissOverlays();
            return;
        }
        await dismissCookiesAndWidgets(browser.currentPage);
    };
    switch (action) {
        case 'click':
        case 'safe_expand': {
            // safe_expand never clicks: it is always carried out as a hover.
            const hoverOnly = args.hover_only === true || action === 'safe_expand';
            // Replay actions may need force:true to bypass pointer-event interception
            // (e.g., after dark theme switch, <html class="dark"> intercepts events briefly)
            const forceClick = args.__forceClick === true;
            if (args.index !== undefined) {
                if (hoverOnly) {
                    await browser.hoverByIndex(args.index);
                }
                else {
                    await browser.clickByIndex(args.index);
                }
            }
            else if (args.selector) {
                if (hoverOnly) {
                    await browser.hoverBySelector(args.selector);
                }
                else {
                    // The force flag is only forwarded for selector-based clicks.
                    await browser.clickBySelector(args.selector, { force: forceClick });
                }
            }
            else if (args.x !== undefined && args.y !== undefined) {
                if (hoverOnly) {
                    await browser.hoverByCoordinates(args.x, args.y);
                }
                else {
                    await browser.clickByCoordinates(args.x, args.y);
                }
            }
            else {
                throw new Error('click requires index, selector, or x/y coordinates');
            }
            return;
        }
        case 'hover':
            if (args.index !== undefined) {
                await browser.hoverByIndex(args.index);
            }
            else if (args.selector) {
                await browser.hoverBySelector(args.selector);
            }
            else if (args.x !== undefined && args.y !== undefined) {
                await browser.hoverByCoordinates(args.x, args.y);
            }
            else {
                throw new Error('hover requires index, selector, or x/y coordinates');
            }
            return;
        case 'type_text':
            await browser.typeText(args.text, {
                index: args.index,
                selector: args.selector,
                clearFirst: args.clearFirst,
            });
            return;
        case 'select_option':
            await browser.selectOption({
                index: args.index,
                selector: args.selector,
                optionLabel: args.optionLabel,
                optionValue: args.optionValue,
                optionIndex: args.optionIndex,
            });
            return;
        case 'scroll':
            // With an element index the element is scrolled into view; otherwise
            // a directional scroll is performed.
            if (args.index !== undefined) {
                await browser.scrollElementIntoView(args.index, {
                    align: args.align,
                    margin: args.margin,
                });
            }
            else {
                await browser.scroll(args.direction, args.amount, args.selector);
            }
            return;
        case 'scroll_to_element':
            await browser.scrollElementIntoView(args.index, {
                align: args.align,
                margin: args.margin,
            });
            return;
        case 'press_key':
            await browser.pressKey(args.key);
            return;
        case 'navigate_to':
            await browser.navigateTo(args.url);
            await dismissPageOverlays();
            return;
        case 'wait':
            // Defaults to 1000 ms when no duration is given and never waits longer than 5000 ms.
            await browser.wait(Math.min(args.milliseconds || 1000, 5000));
            return;
        case 'resize_viewport':
            await browser.resizeViewport(args.width, args.height);
            await browser.wait(500);
            await dismissPageOverlays();
            return;
        case 'dismiss_overlays':
            // Goes through the same helper as navigate_to / resize_viewport so the
            // explicit action also works on drivers that lack dismissOverlays().
            await dismissPageOverlays();
            return;
        default:
            throw new Error(`Unknown action: ${action}`);
    }
}
1543
/**
 * Runs one agent action against the browser and reports what happened.
 *
 * Steps:
 *  1. Capture a baseline observation (best effort; failures are only logged).
 *  2. Execute the action under a timeout.
 *  3. When a baseline exists, wait for the page reaction under a timeout.
 *  4. When a non-navigate_to action changed the URL and the new page reports
 *     dialogs, loading indicators or expanded elements, dismiss overlays and
 *     append the cleanup result to the outcome text.
 *
 * A timeout in step 2 or 3 triggers a diagnostic recovery; the action is then
 * reported as successful only if that recovery observed a change.
 *
 * @param browser Browser driver.
 * @param {string} action Action name.
 * @param args Action arguments.
 * @returns {Promise<object>} `{ success: true, outcome, stateChanged }` on success
 *   (`outcome` / `stateChanged` are undefined when no reaction was measured), or
 *   `{ success: false, error }` on failure.
 */
export async function executeAction(browser, action, args) {
    let before = null;
    let reaction = null;

    // Shared handling for a timed-out step: probe the page once more and decide
    // from that diagnostic observation whether the action had an effect after all.
    const recoverFromTimeout = async (phase, timeoutError) => {
        const recovery = await performActionDiagnosticRecovery(browser, action, before);
        if (recovery.reaction?.changed) {
            return {
                success: true,
                outcome: `${buildActionTimeoutMessage(phase, action, timeoutError.timeoutMs)}; recovered via diagnostic observation: ${recovery.reaction.summary}`,
                stateChanged: true,
            };
        }
        return {
            success: false,
            error: getDiagnosticRecoveryFailureMessage(phase, action, timeoutError.timeoutMs, { afterChanged: recovery.reaction?.changed, recoveryError: recovery.error }),
        };
    };

    // 1. Baseline observation; the action proceeds without it when unavailable.
    logActionProgress(action, 'capturing baseline observation');
    try {
        before = await withAgentStepTimeout(() => browser.captureObservation(), {
            stepLabel: `${action} baseline captureObservation`,
            timeoutMs: ACTION_OBSERVATION_TIMEOUT_MS,
        });
    }
    catch (error) {
        const note = isAgentStepTimeoutError(error)
            ? `${buildActionTimeoutMessage('baseline observation', action, error.timeoutMs)}; continuing without baseline.`
            : `Action progress: ${action} -> baseline observation unavailable (${error.message})`;
        logger.info(note);
        before = null;
    }

    const authSubmitAction = isLikelyAuthenticationSubmitAction(action, args, before);
    const reactionOptions = getReactionOptions(action, args, authSubmitAction);

    // 2. The action itself.
    logActionProgress(action, 'executing browser action');
    try {
        await withAgentStepTimeout(() => performBrowserAction(browser, action, args), {
            stepLabel: `${action} browser execution`,
            timeoutMs: getActionExecutionTimeoutMs(action, args),
        });
    }
    catch (error) {
        if (!isAgentStepTimeoutError(error)) {
            return { success: false, error: enrichErrorMessage(error.message) };
        }
        return recoverFromTimeout('browser execution', error);
    }

    // 3. Page reaction, measurable only against a baseline.
    if (before) {
        logActionProgress(action, 'waiting for page reaction');
        try {
            reaction = await withAgentStepTimeout(() => browser.waitForPageReaction(before, reactionOptions), {
                stepLabel: `${action} post-action waitForPageReaction`,
                timeoutMs: reactionOptions.timeoutMs + 1200,
            });
        }
        catch (error) {
            if (isAgentStepTimeoutError(error)) {
                return recoverFromTimeout('post-action reaction', error);
            }
            logger.info(`Action progress: ${action} -> page reaction unavailable (${error.message})`);
            reaction = null;
        }
    }

    if (!reaction) {
        // Nothing was measured: the action ran, but its effect is unknown.
        return { success: true, outcome: undefined, stateChanged: undefined };
    }

    let outcome = reaction.summary;
    let stateChanged = reaction.changed;

    // 4. Post-navigation cleanup.
    if (action !== 'navigate_to' && reaction.before.url !== reaction.after.url) {
        // Only attempt overlay dismissal when the new page signals possible overlays
        // (dialogs, loading indicators, or expanded elements). Skipping saves 1-3.5s per navigation.
        const landed = reaction.after;
        const hasOverlaySignals = landed.dialogCount > 0
            || landed.loadingIndicatorCount > 0
            || landed.expandedCount > 0;
        if (hasOverlaySignals) {
            logActionProgress(action, 'post-navigation cleanup');
            try {
                const cleanup = await withAgentStepTimeout(() => browser.dismissOverlays(), {
                    stepLabel: `${action} post-navigation dismissOverlays`,
                    timeoutMs: getActionExecutionTimeoutMs('dismiss_overlays', {}),
                });
                if (!cleanup.dismissed) {
                    outcome = `${reaction.summary}; page cleanup checked after navigation.`;
                }
                else {
                    // A failure while re-measuring after the cleanup is tolerated (null).
                    const cleanupReaction = await withAgentStepTimeout(() => browser.waitForPageReaction(reaction.after, {
                        timeoutMs: 1200,
                        settleMs: 200,
                    }), {
                        stepLabel: `${action} post-navigation waitForPageReaction`,
                        timeoutMs: 2400,
                    }).catch(() => null);
                    if (cleanupReaction) {
                        outcome = `${reaction.summary}; cleanup: ${cleanupReaction.summary}; page cleanup reapplied after navigation.`;
                    }
                    else {
                        outcome = `${reaction.summary}; page cleanup reapplied after navigation.`;
                    }
                    stateChanged = reaction.changed || cleanupReaction?.changed || stateChanged;
                }
            }
            catch (error) {
                outcome = isAgentStepTimeoutError(error)
                    ? `${reaction.summary}; post-navigation cleanup timed out after ${error.timeoutMs}ms.`
                    : `${reaction.summary}; post-navigation cleanup skipped: ${error.message}`;
            }
        }
    }

    return { success: true, outcome, stateChanged };
}
1664
/**
 * Appends a recovery hint to a browser-action error message when the message
 * matches a known failure pattern. Matching is case-insensitive and the first
 * matching pattern wins; messages matching nothing are returned unchanged.
 *
 * @param {unknown} message Error text. Non-string values are tolerated:
 *   `null` / `undefined` become 'Unknown error', anything else is stringified.
 * @returns {string} The message, optionally followed by ". HINT: ...".
 */
function enrichErrorMessage(message) {
    // Thrown values are not guaranteed to be Error instances, so the `.message`
    // handed in may be undefined. Normalize instead of throwing from inside the
    // caller's error-handling path.
    const text = typeof message === 'string' ? message : String(message ?? 'Unknown error');
    const msg = text.toLowerCase();
    if (msg.includes('timeout') && (msg.includes('element') || msg.includes('selector') || msg.includes('locator'))) {
        return `${text}. HINT: Element not found or not interactable. Try search_text to locate it, scroll to reveal it, or use a different element/approach.`;
    }
    if (msg.includes('not visible') || msg.includes('hidden') || msg.includes('display: none')) {
        return `${text}. HINT: Element is hidden. It may be inside a collapsed menu or behind an overlay. Try safe_expand on a parent trigger, or dismiss_overlays first.`;
    }
    if (msg.includes('outside') && msg.includes('viewport')) {
        return `${text}. HINT: Element is outside the viewport. Use scroll_to_element to bring it into view first.`;
    }
    if (msg.includes('navigation') && msg.includes('timeout')) {
        return `${text}. HINT: Page load timed out. The site may be slow. Try wait(2000) and then continue.`;
    }
    if (msg.includes('resolved to 0') || msg.includes('no element') || msg.includes('strict mode')) {
        return `${text}. HINT: CSS selector matched nothing or too many elements. Use element index instead, or search_text to find the element.`;
    }
    if (msg.includes('intercept') || msg.includes('other element would receive')) {
        return `${text}. HINT: Another element is covering the target (overlay, banner, or modal). Use dismiss_overlays or press_key("Escape") first.`;
    }
    return text;
}
1686
+ async function verifyScreenshot(client, config, modelState, browser, assessment, stepNumber, options) {
1687
+ throwIfAborted(config.abortSignal, 'Agent verification cancelled.');
1688
+ logger.info('Vérification de la qualité du screenshot…');
1689
+ let verificationBundle;
1690
+ try {
1691
+ verificationBundle = await withAgentStepTimeout(() => browser.captureVerificationBundle(), {
1692
+ stepLabel: 'verification captureVerificationBundle',
1693
+ timeoutMs: VERIFICATION_BUNDLE_TIMEOUT_MS,
1694
+ });
1695
+ logger.debug(`Verification snapshot ready [${verificationBundle.coherenceKey}] @ ${verificationBundle.url}`);
1696
+ }
1697
+ catch (error) {
1698
+ if (isAgentStepTimeoutError(error)) {
1699
+ const reason = buildVerificationTimeoutMessage('captureVerificationBundle', error.timeoutMs);
1700
+ logger.ai(`Problème de vérification : ${reason}`);
1701
+ return {
1702
+ verified: false,
1703
+ reason,
1704
+ usage: null,
1705
+ blockingReason: reason,
1706
+ mode: 'deterministic',
1707
+ };
1708
+ }
1709
+ const reason = error.message;
1710
+ logger.ai(`Problème de vérification : ${reason}`);
1711
+ return {
1712
+ verified: false,
1713
+ reason,
1714
+ usage: null,
1715
+ blockingReason: reason,
1716
+ mode: 'deterministic',
1717
+ };
1718
+ }
1719
+ if (config.runMode === 'language_preflight') {
1720
+ try {
1721
+ const reason = buildLanguagePreflightVerificationFailureReason({
1722
+ requestedLang: config.currentLang,
1723
+ requestedTheme: config.currentTheme,
1724
+ currentUrl: verificationBundle.url,
1725
+ signals: verificationBundle.pageSignals,
1726
+ });
1727
+ if (!reason) {
1728
+ logger.success('Qualité du screenshot vérifiée');
1729
+ return {
1730
+ verified: true,
1731
+ usage: null,
1732
+ matchedPageId: config.variantManifest?.currentPageId ?? null,
1733
+ mode: 'deterministic',
1734
+ };
1735
+ }
1736
+ logger.ai(`Problème de vérification : ${reason}`);
1737
+ return {
1738
+ verified: false,
1739
+ reason,
1740
+ usage: null,
1741
+ blockingReason: reason,
1742
+ mode: 'deterministic',
1743
+ };
1744
+ }
1745
+ catch (error) {
1746
+ const reason = error.message;
1747
+ logger.ai(`Problème de vérification : ${reason}`);
1748
+ return {
1749
+ verified: false,
1750
+ reason,
1751
+ usage: null,
1752
+ blockingReason: reason,
1753
+ mode: 'deterministic',
1754
+ };
1755
+ }
1756
+ }
1757
+ const allowDeterministicSuccess = options?.allowDeterministicSuccess
1758
+ ?? (!config.variantManifest?.currentPageId
1759
+ && (config.variantManifest?.previousValidatedCaptures.length ?? 0) === 0);
1760
+ {
1761
+ const deterministicCheck = inferDeterministicReadyDecision({
1762
+ observation: verificationBundle.observation,
1763
+ assessment,
1764
+ targetUrl: config.url,
1765
+ skipDialogCheck: options?.skipDialogCheck,
1766
+ allowSuccess: allowDeterministicSuccess,
1767
+ });
1768
+ if (deterministicCheck?.verified) {
1769
+ // For video navigation preflight, also check that we're not on a generic dashboard.
1770
+ // The deterministic check may pass on /home when the assessment mentions the project,
1771
+ // but the agent hasn't actually navigated into it.
1772
+ const isGenericDashboard = config.runMode === 'video_navigation_preflight'
1773
+ && verificationBundle.url
1774
+ && /^\/(home|dashboard|app)?\/?$/.test(new URL(verificationBundle.url).pathname);
1775
+ if (isGenericDashboard) {
1776
+ logger.ai('Problème de vérification : la page est un dashboard générique, le projet/section demandé n\'est pas encore ouvert.');
1777
+ return {
1778
+ verified: false,
1779
+ reason: 'Still on generic dashboard (/home). Navigate into the specific project/section before calling ready_to_capture.',
1780
+ usage: null,
1781
+ blockingReason: 'generic_dashboard',
1782
+ pageFingerprint: computeScreenshotFingerprint(verificationBundle.screenshot),
1783
+ mode: 'deterministic',
1784
+ };
1785
+ }
1786
+ logger.success('Qualité du screenshot vérifiée');
1787
+ return {
1788
+ verified: true,
1789
+ usage: null,
1790
+ matchedPageId: config.variantManifest?.currentPageId ?? null,
1791
+ pageFingerprint: computeScreenshotFingerprint(verificationBundle.screenshot),
1792
+ mode: 'deterministic',
1793
+ };
1794
+ }
1795
+ if (deterministicCheck && !deterministicCheck.verified) {
1796
+ logger.ai(`Problème de vérification : ${deterministicCheck.reason}`);
1797
+ return {
1798
+ verified: false,
1799
+ reason: deterministicCheck.reason,
1800
+ usage: null,
1801
+ blockingReason: deterministicCheck.reason,
1802
+ pageFingerprint: computeScreenshotFingerprint(verificationBundle.screenshot),
1803
+ mode: 'deterministic',
1804
+ };
1805
+ }
1806
+ }
1807
+ const screenshotBuf = verificationBundle.screenshot;
1808
+ const pageFingerprint = computeScreenshotFingerprint(screenshotBuf);
1809
+ const duplicateOfPageId = findDuplicateVariantCapture(config.variantManifest, pageFingerprint);
1810
+ if (duplicateOfPageId) {
1811
+ const reason = `Duplicate capture detected: current page matches previously validated page "${duplicateOfPageId}". Reach a distinct state for "${config.variantManifest?.currentPageId ?? 'current'}" before capturing.`;
1812
+ logger.ai(`Problème de vérification : ${reason}`);
1813
+ return {
1814
+ verified: false,
1815
+ reason,
1816
+ usage: null,
1817
+ duplicateOfPageId,
1818
+ blockingReason: reason,
1819
+ pageFingerprint,
1820
+ mode: 'vision',
1821
+ };
1822
+ }
1823
+ const pageSignals = verificationBundle.pageSignals;
1824
+ const latestObservation = verificationBundle.observation;
1825
+ // Content-based identity checks are advisory — the LLM sees the screenshot
1826
+ // and can judge correctly, so we pass these as hints instead of hard rejections.
1827
+ const identityHint = config.runMode === 'video_navigation_preflight' && !config.variantManifest?.currentPageIdentity
1828
+ ? null
1829
+ : inferVariantIdentityFailure({
1830
+ pageId: config.variantManifest?.currentPageId ?? null,
1831
+ prompt: config.prompt,
1832
+ currentUrl: verificationBundle.url,
1833
+ visibleText: [
1834
+ pageSignals.title,
1835
+ pageSignals.headings.join(' '),
1836
+ pageSignals.navLabels.join(' '),
1837
+ pageSignals.visibleText,
1838
+ ].filter(Boolean).join(' '),
1839
+ dialogCount: latestObservation.dialogCount,
1840
+ pageIdentity: config.variantManifest?.currentPageIdentity,
1841
+ });
1842
+ if (identityHint) {
1843
+ logger.debug(`Identity hint for LLM verification: ${identityHint}`);
1844
+ }
1845
+ const variantMismatchHint = config.runMode === 'video_navigation_preflight'
1846
+ ? buildLanguagePreflightVerificationFailureReason({
1847
+ requestedLang: config.currentLang,
1848
+ requestedTheme: config.currentTheme,
1849
+ currentUrl: verificationBundle.url,
1850
+ signals: verificationBundle.pageSignals,
1851
+ }) || null
1852
+ : null;
1853
+ if (variantMismatchHint) {
1854
+ logger.debug(`Variant hint for LLM verification: ${variantMismatchHint}`);
1855
+ }
1856
+ // Manifest-aware deterministic check — only trust its ACCEPT decisions.
1857
+ // Rejections based on content analysis are passed to the LLM as hints.
1858
+ const manifestReadyDecision = inferManifestReadyDecision({
1859
+ observation: latestObservation,
1860
+ assessment,
1861
+ config,
1862
+ bundle: verificationBundle,
1863
+ });
1864
+ if (manifestReadyDecision?.verified) {
1865
+ logger.success('Qualité du screenshot vérifiée');
1866
+ return {
1867
+ verified: true,
1868
+ usage: null,
1869
+ matchedPageId: config.variantManifest?.currentPageId ?? null,
1870
+ pageFingerprint,
1871
+ mode: 'deterministic',
1872
+ };
1873
+ }
1874
+ // Manifest rejection is downgraded to an advisory hint for the LLM
1875
+ const manifestHint = manifestReadyDecision && !manifestReadyDecision.verified
1876
+ ? manifestReadyDecision.reason
1877
+ : null;
1878
+ if (manifestHint) {
1879
+ logger.debug(`Manifest hint for LLM verification: ${manifestHint}`);
1880
+ }
1881
+ // Page-signal deterministic check: combine assessment parse, identity check,
1882
+ // stability, variant match, and dialog absence into a single accept gate.
1883
+ // Never rejects — only accepts or returns null (inconclusive → LLM).
1884
+ const pageSignalDecision = inferPageSignalReadyDecision({
1885
+ observation: latestObservation,
1886
+ assessment,
1887
+ config,
1888
+ bundle: verificationBundle,
1889
+ identityFailure: identityHint,
1890
+ });
1891
+ if (pageSignalDecision?.verified) {
1892
+ logger.success('Qualité du screenshot vérifiée');
1893
+ return {
1894
+ verified: true,
1895
+ usage: null,
1896
+ matchedPageId: config.variantManifest?.currentPageId ?? null,
1897
+ pageFingerprint,
1898
+ mode: 'deterministic',
1899
+ };
1900
+ }
1901
+ // Dialog presence hint — let the LLM judge whether the dialog is the intended
1902
+ // capture target or an obstruction, rather than making a deterministic call.
1903
+ const dialogHint = latestObservation.dialogCount > 0
1904
+ ? `A dialog/modal is currently open (${latestObservation.dialogCount}). Judge whether this dialog is the intended capture target or an unwanted overlay blocking the page.`
1905
+ : null;
1906
+ const pageContext = {
1907
+ currentUrl: verificationBundle.url,
1908
+ pageTitle: verificationBundle.title,
1909
+ };
1910
+ const verificationDiagnostics = summarizeVerificationDiagnostics(config, verificationBundle);
1911
+ const verificationTools = agentTools.filter(t => ['ready_to_capture', 'give_up'].includes(t.function.name));
1912
+ // Dual-model mode: use vision observer to describe the page, then text-only primary model
1913
+ const visionObs = await getVisionObservation(client, config, screenshotBuf, buildVerificationVisionContext({
1914
+ config,
1915
+ pageContext,
1916
+ assessment,
1917
+ }), stepNumber);
1918
+ const screenshotUrl = visionObs ? '' : await makeImageUrl(screenshotBuf, 'image/jpeg', config.uploadImage);
1919
+ const rawParts = buildVerificationMessage({
1920
+ userPrompt: config.prompt,
1921
+ screenshotUrl,
1922
+ previousAssessment: assessment,
1923
+ runMode: config.runMode,
1924
+ currentLang: config.currentLang,
1925
+ currentTheme: config.currentTheme,
1926
+ pageContext,
1927
+ runHints: config.runHints,
1928
+ variantManifest: config.variantManifest,
1929
+ verificationDiagnostics,
1930
+ identityHints: [identityHint, manifestHint, dialogHint, variantMismatchHint].filter(Boolean),
1931
+ });
1932
+ const userContent = visionObs ? replaceImagesWithObservation(rawParts, visionObs.observation) : rawParts;
1933
+ const systemPrompt = config.runMode === 'video_navigation_preflight'
1934
+ ? 'You are a video navigation verification inspector. You MUST respond by calling exactly one tool — do NOT reply with text.\n'
1935
+ + '- Call ready_to_capture ONLY if the screenshot shows the EXACT pre-recording start state.\n'
1936
+ + '- Reject generic /home or dashboard states when the requested project, section, tab, or dialog is not actually open yet.\n'
1937
+ + '- Reject if the fixed app chrome is in the wrong language or theme.\n'
1938
+ + '- Reject if overlays, popups, spinners, or unrelated dialogs remain.\n'
1939
+ + '- Call give_up with a short reason for any mismatch or uncertainty.\n'
1940
+ + 'Pick one tool and call it now.'
1941
+ : 'You are a screenshot quality inspector. You MUST respond by calling exactly one tool — do NOT reply with text.\n'
1942
+ + '- Call ready_to_capture if the screenshot is clean, fully loaded, free of overlays/spinners, and matches the user request.\n'
1943
+ + '- Call give_up with a reason if there are issues (overlays, wrong page, loading state, etc.).\n'
1944
+ + 'Pick one tool and call it now.';
1945
+ const messages = [
1946
+ {
1947
+ role: 'system',
1948
+ content: systemPrompt,
1949
+ },
1950
+ { role: 'user', content: userContent },
1951
+ ];
1952
+ try {
1953
+ let visionResult;
1954
+ if (visionObs) {
1955
+ // Text-only call — no vision fallback needed
1956
+ logger.info('Vérification de la capture…');
1957
+ const result = await callWithRetry(client, {
1958
+ model: modelState.active,
1959
+ messages,
1960
+ tools: verificationTools,
1961
+ tool_choice: 'required',
1962
+ max_tokens: 1024,
1963
+ }, 3, config.abortSignal, config.providerPreferences);
1964
+ visionResult = { result, model: modelState.active, fellBack: false };
1965
+ }
1966
+ else {
1967
+ // Mono-model mode: send image directly with vision fallback
1968
+ logger.info('Vérification de la capture…');
1969
+ visionResult = await callVisionCapableModel({
1970
+ primaryModel: modelState.active,
1971
+ fallbackModel: modelState.active === config.model ? config.fallbackModel : undefined,
1972
+ onFallbackActivated: (model, reason) => {
1973
+ modelState.active = model;
1974
+ logger.debug(`Vision fallback activated for verification: ${model} (reason: ${reason})`);
1975
+ },
1976
+ callModel: (model) => callWithRetry(client, {
1977
+ model,
1978
+ messages,
1979
+ tools: verificationTools,
1980
+ tool_choice: 'required',
1981
+ max_tokens: 1024,
1982
+ }, 3, config.abortSignal, config.providerPreferences),
1983
+ });
1984
+ }
1985
+ const response = visionResult.result;
1986
+ if (visionResult.model && visionResult.model !== modelState.active) {
1987
+ logger.debug(`OpenRouter model substitution detected: requested "${modelState.active}", got "${visionResult.model}"`);
1988
+ }
1989
+ modelState.active = visionResult.model;
1990
+ const primaryUsage = extractUsage(response, stepNumber, 'verification', visionResult.model, visionObs ? 0 : 1);
1991
+ // Merge vision observer usage if dual-model mode was used
1992
+ const usage = visionObs?.usage ? mergeUsage(visionObs.usage, primaryUsage) : primaryUsage;
1993
+ const toolCall = response.choices?.[0]?.message?.tool_calls?.[0];
1994
+ const assistantText = extractAssistantText(response.choices?.[0]?.message?.content);
1995
+ if (toolCall && 'function' in toolCall) {
1996
+ if (toolCall.function.name === 'ready_to_capture') {
1997
+ logger.success('Qualité du screenshot vérifiée');
1998
+ return {
1999
+ verified: true,
2000
+ usage,
2001
+ matchedPageId: config.variantManifest?.currentPageId ?? null,
2002
+ pageFingerprint,
2003
+ mode: 'vision',
2004
+ };
2005
+ }
2006
+ if (toolCall.function.name === 'give_up') {
2007
+ const args = JSON.parse(toolCall.function.arguments);
2008
+ logger.ai(`Problème de vérification : ${args.reason}`);
2009
+ return {
2010
+ verified: false,
2011
+ reason: args.reason,
2012
+ usage,
2013
+ blockingReason: args.reason,
2014
+ pageFingerprint,
2015
+ mode: 'vision',
2016
+ };
2017
+ }
2018
+ }
2019
+ // No tool call found — try to extract a verdict from the assistant's text response
2020
+ // before falling back to a separate JSON verification call.
2021
+ if (assistantText) {
2022
+ const textDecision = parseVerificationDecisionText(assistantText);
2023
+ if (textDecision?.verified) {
2024
+ logger.success('Qualité du screenshot vérifiée');
2025
+ return {
2026
+ verified: true,
2027
+ usage,
2028
+ matchedPageId: config.variantManifest?.currentPageId ?? null,
2029
+ pageFingerprint,
2030
+ mode: 'text_fallback',
2031
+ };
2032
+ }
2033
+ if (textDecision && !textDecision.verified) {
2034
+ const reason = textDecision.reason ?? assistantText.slice(0, 400);
2035
+ logger.ai(`Problème de vérification : ${reason}`);
2036
+ return {
2037
+ verified: false,
2038
+ reason,
2039
+ usage,
2040
+ blockingReason: reason,
2041
+ pageFingerprint,
2042
+ mode: 'text_fallback',
2043
+ };
2044
+ }
2045
+ }
2046
+ // For models that generally support tool use well, skip the JSON fallback
2047
+ // (which costs an extra LLM call) and go straight to the cheaper text fallback.
2048
+ // The JSON fallback is only useful for models that support response_format but not tool_choice.
2049
+ const modelLower = (modelState.active || '').toLowerCase();
2050
+ const hasReliableToolUse = /claude|gpt-4|gemini|sonnet|opus|haiku/.test(modelLower);
2051
+ if (hasReliableToolUse) {
2052
+ logger.debug(`Verification returned no tool call from reliable model; skipping JSON fallback, trying text fallback directly. Model text: ${(assistantText || '(empty)').slice(0, 300)}`);
2053
+ return fallbackVerifyScreenshotText(client, config, modelState, screenshotBuf, assessment, stepNumber, pageFingerprint, visionObs);
2054
+ }
2055
+ logger.debug(`Verification returned no structured tool call; retrying with JSON fallback. Model text: ${(assistantText || '(empty)').slice(0, 300)}`);
2056
+ const fallback = await fallbackVerifyScreenshotJson(client, config, modelState, screenshotBuf, assessment, pageContext, stepNumber, pageFingerprint, visionObs);
2057
+ return {
2058
+ ...fallback,
2059
+ usage: mergeUsage(usage, fallback.usage),
2060
+ };
2061
+ }
2062
+ catch (err) {
2063
+ if (isAbortError(err)) {
2064
+ throw err;
2065
+ }
2066
+ logger.error(`Verification call failed: ${err.message}`);
2067
+ return {
2068
+ verified: false,
2069
+ reason: err.message,
2070
+ usage: null,
2071
+ fatal: err instanceof VisionModelUnsupportedError,
2072
+ blockingReason: err.message,
2073
+ mode: 'vision',
2074
+ };
2075
+ }
2076
+ }
2077
/**
 * Runs a one-off screenshot verification for the page the browser is currently on.
 *
 * Builds a fresh client from `apiKey` and a model state seeded with `config.model`,
 * then delegates to `verifyScreenshot`.
 *
 * @param browser Browser handle forwarded to `verifyScreenshot`.
 * @param config Capture configuration; `config.model` is the starting model.
 * @param apiKey Key handed to `createClient`.
 * @param options Optional `assessment` text and `stepNumber` (defaults: preflight text, 1).
 * @returns Whatever `verifyScreenshot` resolves to.
 */
export async function verifyCaptureReadiness(browser, config, apiKey, options = {}) {
    const client = createClient(apiKey);
    const modelState = { active: config.model };
    const assessment = options.assessment ?? 'Preflight readiness check for a carried-over page state.';
    const stepNumber = options.stepNumber ?? 1;
    return verifyScreenshot(client, config, modelState, browser, assessment, stepNumber);
}
2082
// Meta-actions never interact with the browser, so stuck/no-effect detection ignores them.
const META_ACTIONS = new Set([
    'note',
    'begin_subgoal',
    'ready_to_capture',
]);
// Actions that isBootstrapStabilizationAction always treats as page stabilization.
const BOOTSTRAP_ACTIONS = new Set([
    'dismiss_overlays',
    'wait',
]);
2085
/**
 * Tells whether a recorded action had no effect.
 *
 * An action counts as "no effect" when it explicitly failed, explicitly reported
 * no state change, or carries the canonical "no visible change" outcome text.
 *
 * @param action Recorded action (reads `success`, `stateChanged`, `outcome`).
 * @returns {boolean}
 */
function isNoEffectAction(action) {
    if (action.success === false) {
        return true;
    }
    if (action.stateChanged === false) {
        return true;
    }
    return action.outcome === 'No visible state change detected after the action.';
}
2090
/**
 * Tells whether an action only stabilizes the page: one of BOOTSTRAP_ACTIONS,
 * or a `press_key` whose key is Escape (compared case-insensitively).
 *
 * @param action Recorded action (reads `action` and, for key presses, `params.key`).
 * @returns {boolean}
 */
function isBootstrapStabilizationAction(action) {
    if (BOOTSTRAP_ACTIONS.has(action.action)) {
        return true;
    }
    if (action.action !== 'press_key') {
        return false;
    }
    const pressedKey = String(action.params.key || '').toLowerCase();
    return pressedKey === 'escape';
}
2095
/**
 * Tells whether the history contains at least one action that is neither a
 * meta-action nor a bootstrap/stabilization action.
 *
 * @param actionHistory Array of recorded actions.
 * @returns {boolean}
 */
function hasMeaningfulBrowserAction(actionHistory) {
    return actionHistory.some((entry) => {
        if (META_ACTIONS.has(entry.action)) {
            return false;
        }
        return !isBootstrapStabilizationAction(entry);
    });
}
2098
/**
 * Builds a `|`-joined signature identifying what an action targeted.
 *
 * The signature starts with the action name, then adds at most one target
 * segment (selector, else index, else url truncated to 160 chars, else x/y),
 * at most one option segment (label preferred over value), and the key if any.
 *
 * @param action Recorded action (reads `action` and `params`).
 * @returns {string}
 */
function buildRecoveryActionSignature(action) {
    const { params } = action;
    // Returns the trimmed text for a non-blank string, otherwise null.
    const trimmedOrNull = (value) => {
        if (typeof value !== 'string') {
            return null;
        }
        const trimmed = value.trim();
        return trimmed ? trimmed : null;
    };
    const segments = [action.action];

    const selector = trimmedOrNull(params.selector);
    const url = trimmedOrNull(params.url);
    if (selector !== null) {
        segments.push(`selector:${selector}`);
    }
    else if (params.index !== undefined) {
        segments.push(`index:${String(params.index)}`);
    }
    else if (url !== null) {
        segments.push(`url:${url.slice(0, 160)}`);
    }
    else if (params.x !== undefined && params.y !== undefined) {
        segments.push(`xy:${String(params.x)},${String(params.y)}`);
    }

    const optionLabel = trimmedOrNull(params.optionLabel);
    const optionValue = trimmedOrNull(params.optionValue);
    if (optionLabel !== null) {
        segments.push(`optionLabel:${optionLabel}`);
    }
    else if (optionValue !== null) {
        segments.push(`optionValue:${optionValue}`);
    }

    const key = trimmedOrNull(params.key);
    if (key !== null) {
        segments.push(`key:${key}`);
    }
    return segments.join('|');
}
2123
/**
 * Loose URL comparison: same origin, and the current path is either the
 * expected path or a sub-path of it separated by '/'.
 * For example /products matches /products/123 but not /products-edit.
 *
 * When either value cannot be parsed as a URL, falls back to strict equality.
 *
 * @param expectedUrl URL the caller expects.
 * @param currentUrl URL to check against the expectation.
 * @returns {boolean} false when either argument is falsy.
 */
function urlsRoughlyMatch(expectedUrl, currentUrl) {
    if (!expectedUrl || !currentUrl) {
        return false;
    }
    let expected;
    let current;
    try {
        expected = new URL(expectedUrl);
        current = new URL(currentUrl);
    }
    catch {
        return currentUrl === expectedUrl;
    }
    if (expected.origin !== current.origin) {
        return false;
    }
    const expectedPath = expected.pathname || '/';
    const currentPath = current.pathname || '/';
    if (currentPath === expectedPath) {
        return true;
    }
    if (!currentPath.startsWith(expectedPath)) {
        return false;
    }
    // Only accept a prefix that ends on a path-segment boundary.
    return expectedPath.endsWith('/') || currentPath[expectedPath.length] === '/';
}
2146
/**
 * Reduces a URL to `origin + pathname` with a single trailing slash removed,
 * for use as a page key.
 *
 * Unparseable strings are returned trimmed, with one trailing slash removed.
 * Non-strings and blank strings yield ''.
 *
 * @param value Candidate URL.
 * @returns {string}
 */
function normalizeGuardPageUrl(value) {
    const isUsable = typeof value === 'string' && value.trim().length !== 0;
    if (!isUsable) {
        return '';
    }
    let parsed;
    try {
        parsed = new URL(value);
    }
    catch {
        return value.trim().replace(/\/$/, '');
    }
    const withoutTrailingSlash = `${parsed.origin}${parsed.pathname}`.replace(/\/$/, '');
    return withoutTrailingSlash || parsed.origin;
}
2157
/**
 * Builds a `|`-joined anchor describing a click target from whichever of
 * selector, index, element label (lower-cased) and href (normalized) are present.
 *
 * @param params Click parameters.
 * @returns {string|null} null when no anchor part is available.
 */
function buildClickGuardAnchor(params) {
    // Trimmed text for strings, '' for anything else.
    const trimmedText = (value) => (typeof value === 'string' ? value.trim() : '');
    const segments = [];

    const selector = trimmedText(params.selector);
    if (selector) {
        segments.push(`selector:${selector}`);
    }
    if (params.index !== undefined) {
        segments.push(`index:${String(params.index)}`);
    }
    const label = trimmedText(params.elementLabel);
    if (label) {
        segments.push(`label:${label.toLowerCase()}`);
    }
    if (trimmedText(params.href)) {
        segments.push(`href:${normalizeGuardPageUrl(params.href)}`);
    }
    return segments.length === 0 ? null : segments.join('|');
}
2173
/**
 * Combines the click anchor with a normalized page key into a `click|page|anchor` signature.
 * The page key prefers `params.preActionUrl`, then `params.postActionUrl`, then `currentUrl`.
 *
 * @param params Click parameters.
 * @param currentUrl Fallback page URL.
 * @returns {string|null} null when the click has no anchor.
 */
function buildClickGuardSignature(params, currentUrl) {
    const anchor = buildClickGuardAnchor(params);
    if (!anchor) {
        return null;
    }
    const pageUrl = params.preActionUrl ?? params.postActionUrl ?? currentUrl;
    return ['click', normalizeGuardPageUrl(pageUrl), anchor].join('|');
}
2180
/**
 * Picks the pause, in milliseconds, to apply after an action has executed.
 *
 * Failed actions get no delay. Auth-submit actions get the longest delays.
 * Otherwise the delay depends on the action type and on whether state changed.
 *
 * @param action Action name.
 * @param execResult Execution result (reads `success`, `stateChanged`).
 * @param options Optional flags; `authSubmitAction` selects the auth delays.
 * @returns {number}
 */
function getPostActionDelayMs(action, execResult, options = {}) {
    if (!execResult.success) {
        return 0;
    }
    const changed = execResult.stateChanged;
    if (options.authSubmitAction) {
        return changed ? 1200 : 700;
    }
    const noDelayActions = ['navigate_to', 'wait', 'resize_viewport', 'dismiss_overlays'];
    if (noDelayActions.includes(action)) {
        return 0;
    }
    if (action === 'scroll' || action === 'scroll_to_element') {
        return changed ? 90 : 0;
    }
    if (action === 'type_text') {
        return 60;
    }
    const interactionActions = ['click', 'safe_expand', 'select_option', 'press_key', 'hover'];
    if (interactionActions.includes(action)) {
        return changed ? 120 : 40;
    }
    return 80;
}
2207
/**
 * Returns the screenshot to show in live preview: `cleanScreenshot` when it is
 * neither null nor undefined, otherwise `screenshot`.
 *
 * @param pageState Page state holding the screenshots.
 */
export function getLivePreviewScreenshot(pageState) {
    const clean = pageState.cleanScreenshot;
    if (clean !== null && clean !== undefined) {
        return clean;
    }
    return pageState.screenshot;
}
2210
// Words that mark a URL (or other text) as belonging to a login/auth flow.
const LOGIN_URL_RE = /\b(login|log-in|signin|sign-in|auth|session|connexion|connect)\b/i;
// Words that mark a field as a credential input (password, OTP, verification code).
const LOGIN_FIELD_RE = /\b(password|mot de passe|passcode|otp|verification.?code)\b/i;
// Union of the URL and field vocabularies plus email terms, tested against click targets.
const LOGIN_CLICK_RE = /\b(login|log-in|signin|sign-in|auth|session|connexion|connect|password|mot de passe|passcode|otp|verification.?code|email|e-mail)\b/i;
// Generic submit/continue verbs (English and French); callers combine this with auth context.
const AUTH_SUBMIT_RE = /\b(continue|next|submit|verify|unlock|access|enter|continuer|suivant|soumettre|verifier|v[eé]rifier|acc[eé]der|entrer)\b/i;
// Matches selectors that rely on data-ak-* attributes.
const INTERNAL_AUTOMATION_SELECTOR_RE = /\[data-ak-[^\]]+\]|data-ak-interactive-index/i;
2215
/**
 * Joins the descriptive string arguments of an action into one space-separated
 * text, in the order selector, elementLabel, href, text, reason.
 * Non-string and blank values are skipped; kept values are not trimmed.
 *
 * @param args Action arguments.
 * @returns {string}
 */
function getActionLabelHaystack(args) {
    const fields = ['selector', 'elementLabel', 'href', 'text', 'reason'];
    const kept = [];
    for (const field of fields) {
        const value = args[field];
        if (typeof value === 'string' && value.trim().length > 0) {
            kept.push(value);
        }
    }
    return kept.join(' ');
}
2226
/**
 * Tells whether a page observation looks like a login/auth screen, judged by
 * running the login regexes over its url, title and text sample.
 *
 * @param observation Page observation, or a falsy value.
 * @returns {boolean} false for a missing observation.
 */
function observationLooksLikeAuthSurface(observation) {
    if (!observation) {
        return false;
    }
    const { url, title, textSample } = observation;
    const combined = [url, title, textSample].filter(Boolean).join(' ');
    if (LOGIN_URL_RE.test(combined)) {
        return true;
    }
    if (LOGIN_FIELD_RE.test(combined)) {
        return true;
    }
    return LOGIN_CLICK_RE.test(combined);
}
2234
/**
 * Heuristic: does this action look like submitting a login/auth step?
 *
 * A click qualifies when its label text matches the login vocabulary, or when it
 * matches a generic submit verb while the page observed before the action looks
 * like an auth surface. Pressing Enter qualifies on an auth surface.
 *
 * @param action Action name.
 * @param args Action arguments.
 * @param before Observation taken before the action.
 * @returns {boolean}
 */
function isLikelyAuthenticationSubmitAction(action, args, before) {
    const labelText = getActionLabelHaystack(args);
    switch (action) {
        case 'click':
            if (LOGIN_CLICK_RE.test(labelText)) {
                return true;
            }
            return AUTH_SUBMIT_RE.test(labelText) && observationLooksLikeAuthSurface(before);
        case 'press_key':
            return args.key === 'Enter' && observationLooksLikeAuthSurface(before);
        default:
            return false;
    }
}
2247
/**
 * Chooses the reaction-wait timing for an action.
 *
 * Auth-submit actions get the most generous window. Otherwise the timing depends
 * on the action type. For `wait`, the timeout is the requested duration plus
 * 400 ms, clamped to the range 800..2200 ms.
 *
 * @param action Action name.
 * @param args Action arguments; only `args.milliseconds` is read (for `wait`).
 * @param authSubmitAction Truthy when the action is an authentication submit.
 * @returns {{timeoutMs: number, settleMs: number, idleGraceMs?: number}}
 */
function getReactionOptions(action, args, authSubmitAction) {
    if (authSubmitAction) {
        return { timeoutMs: 4200, settleMs: 550, idleGraceMs: 1400 };
    }
    switch (action) {
        case 'navigate_to':
            return { timeoutMs: 3200, settleMs: 450 };
        case 'click':
        case 'safe_expand':
        case 'select_option':
        case 'press_key':
            return { timeoutMs: 2400, settleMs: 350 };
        case 'hover':
        case 'scroll':
        case 'scroll_to_element':
        case 'resize_viewport':
        case 'dismiss_overlays':
            return { timeoutMs: 1600, settleMs: 250 };
        case 'type_text':
            return { timeoutMs: 1400, settleMs: 250 };
        case 'wait': {
            const requestedMs = Number(args.milliseconds ?? 1000);
            // A non-numeric duration would otherwise turn into NaN, survive both
            // Math.max and Math.min, and produce a NaN timeout. Use the default instead.
            const waitMs = Number.isNaN(requestedMs) ? 1000 : requestedMs;
            return {
                timeoutMs: Math.min(2200, Math.max(800, waitMs + 400)),
                settleMs: 250,
            };
        }
        default:
            return { timeoutMs: 1800, settleMs: 300 };
    }
}
2276
/**
 * Tells whether a selector string matches INTERNAL_AUTOMATION_SELECTOR_RE.
 *
 * @param selector Candidate selector; non-strings yield false.
 * @returns {boolean}
 */
function containsInternalAutomationSelector(selector) {
    if (typeof selector !== 'string') {
        return false;
    }
    return INTERNAL_AUTOMATION_SELECTOR_RE.test(selector);
}
2279
/**
 * Tells whether a recorded action targets a page element and so needs a
 * reusable anchor before it can be replayed.
 *
 * @param action Recorded action (reads `action`).
 * @returns {boolean}
 */
function replayActionRequiresAnchor(action) {
    switch (action.action) {
        case 'click':
        case 'type_text':
        case 'select_option':
        case 'scroll':
        case 'safe_expand':
        case 'hover':
        case 'scroll_to_element':
            return true;
        default:
            return false;
    }
}
2290
/**
 * Tells whether recorded parameters carry anything that can re-locate the
 * target element: a non-internal selector, a numeric index, numeric x/y
 * coordinates, a non-blank href, or a non-blank element label.
 *
 * @param params Recorded action parameters.
 * @returns {boolean}
 */
function hasReplayAnchor(params) {
    const isNonBlankString = (value) => typeof value === 'string' && value.trim().length > 0;
    const { selector, index, x, y, href, elementLabel } = params;
    if (isNonBlankString(selector) && !containsInternalAutomationSelector(selector)) {
        return true;
    }
    const hasIndex = typeof index === 'number';
    const hasCoordinates = typeof x === 'number' && typeof y === 'number';
    return hasIndex || hasCoordinates || isNonBlankString(href) || isNonBlankString(elementLabel);
}
2308
/**
 * Decides whether a recorded action sequence can be replayed from the current
 * browser state, and returns the actions worth replaying.
 *
 * Steps: compact the recording, refuse when a dialog is open although the target
 * is a dedicated/editor route, drop element actions that have no reusable anchor
 * (refusing outright if the very first one lacks it), and finally refuse when
 * the first action's recorded starting URL does not roughly match the current URL.
 *
 * @param recordedActions Recorded actions.
 * @param params Optional context: currentUrl, targetUrl, currentViewport,
 *   isAuthenticated, currentDialogCount, pageIdentity.
 * @returns {{replayableActions: Array, skipReason: (string|null)}}
 */
export function analyzeReplayCandidate(recordedActions, params = {}) {
    const skip = (skipReason) => ({ replayableActions: [], skipReason });

    const compacted = compactReplayActions(recordedActions, {
        currentUrl: params.currentUrl,
        targetUrl: params.targetUrl,
        currentViewport: params.currentViewport,
        isAuthenticated: params.isAuthenticated,
    });
    if (compacted.length === 0) {
        return skip('no replayable actions remain after filtering bootstrap/auth steps');
    }

    const openDialogs = params.currentDialogCount ?? 0;
    const pageIdentity = params.pageIdentity ?? null;
    const expectsUnderlyingPage = !pageIdentity?.dialogTarget
        && (pageIdentity?.dedicatedRoute || pageIdentity?.kind === 'editor_route');
    if (expectsUnderlyingPage && openDialogs > 0) {
        return skip('a dialog/modal is still open, but the target expects the underlying page/editor route');
    }

    const kept = [];
    for (const candidate of compacted) {
        const lacksAnchor = replayActionRequiresAnchor(candidate) && !hasReplayAnchor(candidate.params);
        if (!lacksAnchor) {
            kept.push(candidate);
            continue;
        }
        // An anchorless action is fatal only when nothing has been kept before it.
        if (kept.length === 0) {
            return skip(`the first replay action "${candidate.action}" has no reusable selector, coordinates, href, or label anchor`);
        }
    }
    if (kept.length === 0) {
        return skip('no replayable actions remain after removing non-reusable interaction steps');
    }

    // The recording must start from (roughly) the page the browser is on now,
    // checked in both directions so either URL may be the more specific one.
    const recordedStartUrl = kept[0].params.preActionUrl;
    const firstActionPreUrl = typeof recordedStartUrl === 'string' ? recordedStartUrl : null;
    if (firstActionPreUrl && params.currentUrl) {
        const startsFromHere = urlsRoughlyMatch(firstActionPreUrl, params.currentUrl)
            || urlsRoughlyMatch(params.currentUrl, firstActionPreUrl);
        if (!startsFromHere) {
            return skip(`browser is on ${params.currentUrl} but recorded actions expect to start from ${firstActionPreUrl}`);
        }
    }
    return { replayableActions: kept, skipReason: null };
}
2366
/**
 * Normalizes a value for replay matching: strings are trimmed and lower-cased,
 * anything else becomes ''.
 *
 * @param value Any value.
 * @returns {string}
 */
function normalizeReplayText(value) {
    if (typeof value !== 'string') {
        return '';
    }
    return value.trim().toLowerCase();
}
2369
/**
 * Word-token overlap ratio between two strings.
 *
 * Both strings are split on whitespace; tokens of length 2 or less are dropped
 * as noise and duplicates are collapsed. The result is the number of shared
 * tokens divided by the size of the larger token set. This gives a non-zero
 * score across translations that share a word, e.g. "Filter by preset" and
 * "Filtrer par preset" share "preset".
 *
 * @param a First string.
 * @param b Second string.
 * @returns {number} Ratio in [0, 1]; 0 when either side is empty or has no usable tokens.
 */
function computeTokenOverlap(a, b) {
    if (!a || !b) {
        return 0;
    }
    const significantTokens = (text) => new Set(text.split(/\s+/).filter((token) => token.length > 2));
    const left = significantTokens(a);
    const right = significantTokens(b);
    if (left.size === 0 || right.size === 0) {
        return 0;
    }
    const shared = [...left].filter((token) => right.has(token)).length;
    return shared / Math.max(left.size, right.size);
}
2389
/**
 * Finds the smallest interactive element whose bounding box contains the point.
 * Box edges are inclusive.
 *
 * @param interactiveElements Elements that may carry a `boundingBox` {x, y, width, height}.
 * @param x Point x coordinate.
 * @param y Point y coordinate.
 * @returns The containing element with the smallest box area, or null when none contains the point.
 */
function findElementForPoint(interactiveElements, x, y) {
    const containsPoint = (box) => Boolean(box)
        && x >= box.x
        && x <= box.x + box.width
        && y >= box.y
        && y <= box.y + box.height;
    const areaOf = (element) => {
        const width = element.boundingBox?.width ?? Number.MAX_SAFE_INTEGER;
        const height = element.boundingBox?.height ?? Number.MAX_SAFE_INTEGER;
        return width * height;
    };
    const candidates = interactiveElements.filter((element) => containsPoint(element.boundingBox));
    // Smallest area first, so the most specific element wins.
    candidates.sort((left, right) => areaOf(left) - areaOf(right));
    return candidates[0] ?? null;
}
2406
/**
 * Finds the interactive element on the current page that best corresponds to a
 * recorded action.
 *
 * An exact match on a non-internal selector wins immediately. Otherwise every
 * element receives a composite score built from selector, href, label,
 * tag/role, recorded centre proximity and legacy click coordinates, and the
 * highest-scoring element is returned if it reaches the confidence threshold.
 *
 * @param action Recorded action; reads `action.params` (selector, href,
 *   elementLabel, elementTag, elementRole, elementCx, elementCy, x, y).
 * @param interactiveElements Elements to search; the fields read are selector,
 *   text, ariaLabel, href, tag, role and boundingBox. Missing tag/role values
 *   are tolerated and simply do not score.
 * @returns The best-matching element, or null when no element scores at least 65.
 */
export function matchRecordedActionToElement(action, interactiveElements) {
    // 1. Exact stable selector match (highest confidence — language-independent)
    if (typeof action.params.selector === 'string' && !containsInternalAutomationSelector(action.params.selector)) {
        const exactSelectorMatch = interactiveElements.find((element) => element.selector === action.params.selector);
        if (exactSelectorMatch) {
            return exactSelectorMatch;
        }
    }
    // 2. Fuzzy scoring — all signals contribute to a composite score.
    //    Coordinates are not an early-return path; they participate in scoring
    //    so that a strong label/href match isn't overridden by a stale coordinate hit.
    const desiredHref = normalizeReplayText(action.params.href);
    const desiredLabel = normalizeReplayText(action.params.elementLabel);
    const desiredSelector = normalizeReplayText(action.params.selector);
    const desiredTag = normalizeReplayText(action.params.elementTag);
    const desiredRole = normalizeReplayText(action.params.elementRole);
    const recordedCx = action.params.elementCx;
    const recordedCy = action.params.elementCy;
    let best = null;
    for (const element of interactiveElements) {
        let score = 0;
        const elementText = normalizeReplayText(element.text);
        const elementAria = normalizeReplayText(element.ariaLabel);
        const elementHref = normalizeReplayText(element.href);
        const elementSelector = normalizeReplayText(element.selector);
        // Normalized like every other field so a missing tag/role yields ''
        // instead of throwing on `.toLowerCase()`.
        const elementTag = normalizeReplayText(element.tag);
        const elementRole = normalizeReplayText(element.role);
        // --- Selector match (language-independent, very stable) ---
        if (desiredSelector && !containsInternalAutomationSelector(desiredSelector) && elementSelector === desiredSelector) {
            score += 900;
        }
        // --- href match (language-independent, very stable) ---
        if (desiredHref) {
            if (elementHref === desiredHref) {
                score += 700;
            }
            else if (elementHref && urlsRoughlyMatch(elementHref, desiredHref)) {
                score += 550;
            }
        }
        // --- Label match (language-dependent, tiered confidence) ---
        if (desiredLabel) {
            if (elementText === desiredLabel || elementAria === desiredLabel) {
                score += 450;
            }
            else if ((elementText && (elementText.includes(desiredLabel) || desiredLabel.includes(elementText)))
                || (elementAria && (elementAria.includes(desiredLabel) || desiredLabel.includes(elementAria)))) {
                score += 260;
            }
            else {
                // Cross-language token overlap: "filter by preset" ↔ "filtrer par preset" share "preset"
                const textOverlap = computeTokenOverlap(desiredLabel, elementText);
                const ariaOverlap = computeTokenOverlap(desiredLabel, elementAria);
                const bestOverlap = Math.max(textOverlap, ariaOverlap);
                if (bestOverlap >= 0.3) {
                    score += Math.round(200 * bestOverlap);
                }
            }
        }
        // --- Structural match: tag + role (language-independent) ---
        // Role only counts when the tag matches too; together they act as a
        // tiebreaker for fuzzy label matches.
        if (desiredTag && desiredTag === elementTag) {
            score += 50;
            if (desiredRole && desiredRole === elementRole) {
                score += 60;
            }
        }
        // --- Bounding box proximity (layout-dependent but language-independent) ---
        if (recordedCx !== undefined && recordedCy !== undefined && element.boundingBox) {
            const elCx = element.boundingBox.x + element.boundingBox.width / 2;
            const elCy = element.boundingBox.y + element.boundingBox.height / 2;
            const dist = Math.sqrt((elCx - recordedCx) ** 2 + (elCy - recordedCy) ** 2);
            if (dist < 50) {
                score += 80;
            }
            else if (dist < 120) {
                score += 40;
            }
        }
        // --- Legacy coordinate match (weakest signal — layout shift can misfire) ---
        if (typeof action.params.x === 'number' && typeof action.params.y === 'number' && element.boundingBox) {
            const box = element.boundingBox;
            if (action.params.x >= box.x
                && action.params.x <= box.x + box.width
                && action.params.y >= box.y
                && action.params.y <= box.y + box.height) {
                score += 60;
            }
        }
        if (score <= 0) {
            continue;
        }
        // Strict '>' keeps the earliest element on ties.
        if (!best || score > best.score) {
            best = { element, score };
        }
    }
    // Require a minimum confidence threshold to avoid false positives.
    // A score below 65 rests on very weak signals only (e.g. coordinates
    // alone = 60), which is too risky to replay.
    if (best && best.score < 65) {
        return null;
    }
    return best?.element ?? null;
}
2504
/**
 * Tells whether replay arguments are sufficient to execute the given action
 * as-is, without re-resolving its target on the page.
 *
 * @param action Action name.
 * @param params Candidate replay arguments.
 * @returns {boolean} false for unknown action names.
 */
function hasExecutableReplayArgs(action, params) {
    const isNumber = (value) => typeof value === 'number';
    const isString = (value) => typeof value === 'string';
    const isNonBlankString = (value) => isString(value) && value.trim().length > 0;

    const hasStableSelector = isNonBlankString(params.selector)
        && !containsInternalAutomationSelector(params.selector);
    const hasIndex = isNumber(params.index);
    const hasCoordinates = isNumber(params.x) && isNumber(params.y);
    const hasElementTarget = hasStableSelector || hasIndex;

    if (action === 'click' || action === 'safe_expand' || action === 'hover') {
        return hasElementTarget || hasCoordinates;
    }
    if (action === 'type_text') {
        return isString(params.text) && hasElementTarget;
    }
    if (action === 'select_option') {
        const hasOption = isString(params.optionLabel)
            || isString(params.optionValue)
            || isNumber(params.optionIndex);
        return hasOption && hasElementTarget;
    }
    if (action === 'scroll') {
        return hasElementTarget || isString(params.direction);
    }
    if (action === 'scroll_to_element') {
        return hasIndex;
    }
    if (action === 'navigate_to') {
        return isNonBlankString(params.url);
    }
    if (action === 'resize_viewport') {
        return isNumber(params.width) && isNumber(params.height);
    }
    if (action === 'press_key') {
        return isNonBlankString(params.key);
    }
    return action === 'wait' || action === 'dismiss_overlays';
}
2539
/**
 * Produces executable arguments for replaying a recorded action.
 *
 * The recorded params are used unchanged when they are already executable.
 * Otherwise the action is matched against the current interactive elements and,
 * for element-targeting actions, re-pointed at the matched element's index.
 *
 * @param action Recorded action (`action`, `params`).
 * @param interactiveElements Interactive elements of the current page.
 * @returns {{args: (object|null), reason: (string|null)}} `args` is a copy of the
 *   params on success; on failure `args` is null and `reason` explains why.
 */
export function resolveReplayActionArgs(action, interactiveElements) {
    const replayArgs = { ...action.params };
    const resolved = () => ({ args: replayArgs, reason: null });
    if (hasExecutableReplayArgs(action.action, replayArgs)) {
        return resolved();
    }

    const retargetableActions = [
        'click',
        'safe_expand',
        'hover',
        'type_text',
        'select_option',
        'scroll',
        'scroll_to_element',
    ];
    const matchedElement = matchRecordedActionToElement(action, interactiveElements);
    if (matchedElement && retargetableActions.includes(action.action)) {
        // Re-point the action at the matched element and drop the stale locators.
        replayArgs.index = matchedElement.index;
        delete replayArgs.selector;
        delete replayArgs.x;
        delete replayArgs.y;
    }
    // A remaining selector takes precedence over an index.
    if (replayArgs.selector && replayArgs.index !== undefined) {
        delete replayArgs.index;
    }
    if (hasExecutableReplayArgs(action.action, replayArgs)) {
        return resolved();
    }

    const { selector, href, elementLabel } = action.params;
    const anchors = [selector, href, elementLabel]
        .map((value) => (typeof value === 'string' ? value : null))
        .filter((value) => !!value && value.trim().length > 0);
    const anchorSummary = anchors.length > 0 ? anchors.join(' / ').slice(0, 160) : 'no replay anchor';
    return {
        args: null,
        reason: `replay action "${action.action}" could not be resolved on the current page (${anchorSummary})`,
    };
}
2576
/**
 * Tells whether a recorded action is unmistakably part of a login flow:
 * navigating to a login-like URL, typing into a credential-like selector,
 * typing a `{{credential.…}}` placeholder, or clicking a login-like target.
 *
 * @param action Recorded action (`action`, `params`).
 * @returns {boolean}
 */
function isExplicitLoginAction(action) {
    const { action: kind, params } = action;
    switch (kind) {
        case 'navigate_to':
            return typeof params.url === 'string' && LOGIN_URL_RE.test(params.url);
        case 'type_text': {
            const targetsCredentialField = typeof params.selector === 'string' && LOGIN_FIELD_RE.test(params.selector);
            if (targetsCredentialField) {
                return true;
            }
            return typeof params.text === 'string' && /\{\{credential\./i.test(params.text);
        }
        case 'click': {
            const targetText = [params.selector, params.elementLabel, params.href]
                .filter((value) => typeof value === 'string')
                .join(' ');
            return LOGIN_CLICK_RE.test(targetText);
        }
        default:
            return false;
    }
}
2600
/**
 * Tells whether any of the last three actions is an explicit login action.
 *
 * @param previousActions Actions that precede the one being classified.
 * @returns {boolean}
 */
function hasRecentExplicitLoginAction(previousActions) {
    const lastThree = previousActions.slice(-3);
    return lastThree.some((entry) => isExplicitLoginAction(entry));
}
2603
/**
 * Tells whether an action belongs to a login flow, either explicitly or by
 * context: a generic submit click, or an Enter key press, that closely follows
 * an explicit login action.
 *
 * @param action Recorded action.
 * @param previousActions Actions recorded before it (default: none).
 * @returns {boolean}
 */
function isLoginAction(action, previousActions = []) {
    if (isExplicitLoginAction(action)) {
        return true;
    }
    const { action: kind, params } = action;
    if (kind === 'click') {
        const targetText = [params.selector, params.elementLabel, params.href]
            .filter((value) => typeof value === 'string')
            .join(' ');
        return AUTH_SUBMIT_RE.test(targetText) && hasRecentExplicitLoginAction(previousActions);
    }
    if (kind === 'press_key') {
        return params.key === 'Enter' && hasRecentExplicitLoginAction(previousActions);
    }
    return false;
}
2626
/**
 * Reduces a recording to the actions worth replaying.
 *
 * Keeps only REPLAYABLE_ACTIONS, strips login actions when the session is
 * already authenticated, then drops the leading run of redundant steps: waits,
 * overlay dismissals, resizes to the current viewport size, and navigations to
 * a URL that roughly matches the current or target URL.
 *
 * @param recordedActions Recorded actions.
 * @param params Optional context: isAuthenticated, currentViewport, currentUrl, targetUrl.
 * @returns {Array} A new array of remaining actions, in original order.
 */
export function compactReplayActions(recordedActions, params = {}) {
    let replayable = recordedActions.filter((entry) => REPLAYABLE_ACTIONS.includes(entry.action));
    if (params.isAuthenticated) {
        // Classify each action against the unfiltered prefix so contextual
        // login detection still sees the explicit login steps before it.
        const beforeAuthFilter = replayable;
        replayable = beforeAuthFilter.filter((entry, position) => !isLoginAction(entry, beforeAuthFilter.slice(0, position)));
    }
    const isRedundantLeadingAction = (entry) => {
        switch (entry.action) {
            case 'wait':
            case 'dismiss_overlays':
                return true;
            case 'resize_viewport':
                return Boolean(params.currentViewport)
                    && Number(entry.params.width) === params.currentViewport.width
                    && Number(entry.params.height) === params.currentViewport.height;
            case 'navigate_to':
                return typeof entry.params.url === 'string'
                    && (urlsRoughlyMatch(entry.params.url, params.currentUrl)
                        || urlsRoughlyMatch(entry.params.url, params.targetUrl));
            default:
                return false;
        }
    };
    const firstKept = replayable.findIndex((entry) => !isRedundantLeadingAction(entry));
    return firstKept === -1 ? [] : replayable.slice(firstKept);
}
2662
/**
 * Counts the trailing run of no-effect actions at the end of the history.
 *
 * Meta-actions are skipped without breaking the run, because they never change
 * state and are not a sign of being stuck.
 *
 * @param actionHistory Recorded actions, oldest first.
 * @returns {number}
 */
export function countRecentNoEffectActions(actionHistory) {
    let streak = 0;
    for (let position = actionHistory.length; position-- > 0;) {
        const entry = actionHistory[position];
        if (META_ACTIONS.has(entry.action)) {
            continue;
        }
        if (!isNoEffectAction(entry)) {
            break;
        }
        streak += 1;
    }
    return streak;
}
2677
/**
 * Decides whether the agent looks stuck and recovery should start.
 *
 * Only browser actions (non-meta) are considered, and at least one of them must
 * be meaningful. Recovery triggers on any of:
 *  - an A→B→A→B oscillation across the last four actions, all without effect;
 *  - the last two actions failing with the same signature and the same error prefix;
 *  - two or more trailing no-effect actions.
 *
 * @param actionHistory Recorded actions, oldest first.
 * @returns {boolean}
 */
export function shouldTriggerRecovery(actionHistory) {
    const browserActions = actionHistory.filter((entry) => !META_ACTIONS.has(entry.action));
    if (browserActions.length < 2 || !hasMeaningfulBrowserAction(browserActions)) {
        return false;
    }

    if (browserActions.length >= 4) {
        const lastFour = browserActions.slice(-4);
        const [first, second, third, fourth] = lastFour.map((entry) => buildRecoveryActionSignature(entry));
        const oscillating = lastFour.every((entry) => isNoEffectAction(entry))
            && first === third
            && second === fourth
            && first !== second;
        if (oscillating) {
            return true;
        }
    }

    const [previous, last] = browserActions.slice(-2);
    const errorPrefix = (entry) => String(entry.error || '').slice(0, 120);
    const repeatedFailure = !last.success
        && !previous.success
        && buildRecoveryActionSignature(last) === buildRecoveryActionSignature(previous)
        && errorPrefix(last) === errorPrefix(previous);
    return repeatedFailure || countRecentNoEffectActions(actionHistory) >= 2;
}
2706
/**
 * Returns the actions that are neither meta-actions nor bootstrap/stabilization actions.
 *
 * @param actionHistory Recorded actions.
 * @returns {Array}
 */
function getMeaningfulBrowserActions(actionHistory) {
    return actionHistory.filter((entry) => {
        if (META_ACTIONS.has(entry.action)) {
            return false;
        }
        return !isBootstrapStabilizationAction(entry);
    });
}
2709
/**
 * Counts how many distinct action signatures appear among the meaningful browser actions.
 *
 * @param actionHistory Recorded actions.
 * @returns {number}
 */
function countDistinctActionSignatures(actionHistory) {
    const signatures = new Set();
    for (const entry of getMeaningfulBrowserActions(actionHistory)) {
        signatures.add(buildRecoveryActionSignature(entry));
    }
    return signatures.size;
}
2712
// Give-up reasons describing hard failures (HTTP errors, crashes, network/TLS
// problems, blank pages); inferPrematureGiveUpCorrection never overrides these.
const HARD_GIVE_UP_RE = /\b(5xx|500\b|404\b|page not found|not found|http error|server error|js crash|javascript crash|browser crashed|connection refused|dns|net::|ssl|certificate|blank page|white page|no content|infinite spinner)\b/i;
// Give-up reasons that inferPrematureGiveUpCorrection treats as still recoverable.
const RECOVERABLE_GIVE_UP_RE = /\b(verification|ready_to_capture|dialog|modal|overlay|gallery|editor|route|navigation|assistant|conversation|wrong page|duplicate capture|capture target)\b/i;
2714
/**
 * Decides whether a `give_up` from the agent is premature.
 *
 * Giving up is allowed (null) for hard failures, for contradictory verification
 * results, and after a long run of no-effect actions. Otherwise a corrective
 * message is returned while the last verification failure still looks
 * recoverable, while too few distinct strategies have been tried, or while the
 * stated reason looks recoverable and the iteration budget is not nearly spent.
 *
 * @param params `reason` (string), optional `lastVerificationFailure`,
 *   `actionHistory`, `iteration`, `maxIterations`.
 * @returns {string|null} Corrective instruction, or null when giving up is acceptable.
 */
export function inferPrematureGiveUpCorrection(params) {
    const reason = params.reason.trim();
    const lastVerificationFailure = params.lastVerificationFailure?.trim();
    const isHardFailure = HARD_GIVE_UP_RE.test(reason)
        || (lastVerificationFailure && HARD_GIVE_UP_RE.test(lastVerificationFailure));
    if (isHardFailure) {
        return null;
    }

    // Verification contradiction: 3+ recent ready_to_capture failures spread over
    // 2-3 distinct reasons means the validators keep cycling — allow give_up.
    const recentCaptureFailures = params.actionHistory
        .filter((entry) => entry.action === 'ready_to_capture' && !entry.success && entry.error)
        .slice(-6);
    if (recentCaptureFailures.length >= 3) {
        const normalizedReason = (entry) => (entry.error || '')
            .replace(/^Verification failed:\s*/i, '')
            .trim()
            .slice(0, 120);
        const reasonCount = new Set(recentCaptureFailures.map(normalizedReason)).size;
        if (reasonCount >= 2 && reasonCount <= 3) {
            return null;
        }
    }

    // Many consecutive no-effect actions: the objective is likely unreachable,
    // so allow give_up rather than burning the remaining budget.
    if (countRecentNoEffectActions(params.actionHistory) >= 8) {
        return null;
    }

    if (lastVerificationFailure && RECOVERABLE_GIVE_UP_RE.test(lastVerificationFailure)) {
        return `Do not give up yet. The last verification failure is still recoverable: ${lastVerificationFailure}. Try a materially different navigation or repair step first.`;
    }

    const meaningfulCount = getMeaningfulBrowserActions(params.actionHistory).length;
    const distinctSignatures = countDistinctActionSignatures(params.actionHistory);
    const triedEnough = meaningfulCount >= 4 && distinctSignatures >= 3;
    const budgetNearlySpent = params.iteration >= Math.max(6, params.maxIterations - 2);
    if (budgetNearlySpent) {
        return null;
    }
    if (!triedEnough) {
        return 'Do not give up yet. You have not tried enough materially different actions. Change strategy before giving up.';
    }
    if (RECOVERABLE_GIVE_UP_RE.test(reason)) {
        return 'Do not give up yet. The current issue still looks recoverable. Try a different navigation, search, or repair approach first.';
    }
    return null;
}
2751
// Actions subject to the repeat guards below; any other action skips Guards 1-3.
const REPEAT_GUARD_ACTIONS = new Set([
    'click',
    'safe_expand',
    'hover',
    'select_option',
    'press_key',
]);
/**
 * Inspect a candidate action against recent history and return a warning/blocking message
 * when the agent appears to be looping.
 *
 * @param {object} params
 * @param {string} params.action - Name of the action about to be executed.
 * @param {object} params.args - Arguments of the candidate action.
 * @param {string} [params.currentUrl] - URL of the page the agent is currently on.
 * @param {Array<object>} params.actionHistory - Previously recorded actions.
 * @returns {string|null} A `WARNING:`/`BLOCKED:` message, or `null` when the action may proceed.
 */
export function inferRepeatedActionGuard(params) {
    // Guard 0: navigate-back detection — block navigating to a URL the agent just left
    if (params.action === 'navigate_to' && typeof params.args.url === 'string' && params.currentUrl) {
        const targetUrl = params.args.url.replace(/\/$/, '');
        // Navigating to the URL we are already on is always pointless.
        if (urlsRoughlyMatch(params.currentUrl, targetUrl)) {
            return 'WARNING: You are already on this URL. No navigation needed. Focus on interacting with the current page instead.';
        }
        // EXCEPTION: if the agent has recent no-effect actions, it's stuck and legitimately trying
        // a different approach (e.g., navigating to the correct page). Don't block recovery navigation.
        const hasRecentNoEffect = params.actionHistory
            .filter(a => !META_ACTIONS.has(a.action))
            .slice(-4)
            .some(a => isNoEffectAction(a));
        if (!hasRecentNoEffect) {
            // URLs reached by the last few non-failed browser actions (href for clicks, url for navigations).
            const recentUrls = params.actionHistory
                .filter(a => !META_ACTIONS.has(a.action) && a.success !== false)
                .slice(-5)
                .filter(a => typeof a.params.href === 'string' || typeof a.params.url === 'string')
                .slice(-3)
                .map(a => String(a.params.href || a.params.url || '').replace(/\/$/, ''));
            // We are not on the target (checked above), so a recent visit means we just left it.
            if (recentUrls.some(url => urlsRoughlyMatch(url, targetUrl))) {
                return `WARNING: You just navigated away from ${targetUrl} — going back suggests you are confused about the goal. Re-read the <task>/<goal> and <variant_manifest> carefully. If you need a modal, open it from the current page instead of navigating back.`;
            }
        }
    }
    if (!REPEAT_GUARD_ACTIONS.has(params.action))
        return null;
    const recentBrowserActions = params.actionHistory
        .filter(action => !META_ACTIONS.has(action.action))
        .slice(-2);
    if (recentBrowserActions.length < 2)
        return null;
    // Guard 1: exact same action signature repeated with no effect
    const candidateSignature = buildRecoveryActionSignature({
        iteration: 0,
        action: params.action,
        params: params.args,
        success: false,
    });
    // Clicks use a dedicated signature; it is computed once and reused by Guard 3.
    const candidateClickSignature = params.action === 'click'
        ? buildClickGuardSignature(params.args, params.currentUrl)
        : null;
    const repeatedNoEffect = recentBrowserActions.every(action => isNoEffectAction(action)
        && (candidateClickSignature && action.action === 'click'
            ? buildClickGuardSignature(action.params, params.currentUrl) === candidateClickSignature
            : buildRecoveryActionSignature(action) === candidateSignature));
    if (repeatedNoEffect) {
        return 'BLOCKED: The previous attempts on this same target had no effect. Try a different control, search_text, scrolling, or a repair step instead of repeating it.';
    }
    // Guard 2: consecutive failed/blocked browser actions (different targets) —
    // the agent is thrashing without making progress. Triggers after 4+ recent failures.
    const recentActions = params.actionHistory
        .filter(action => !META_ACTIONS.has(action.action))
        .slice(-4);
    if (recentActions.length >= 4 && recentActions.every(a => isNoEffectAction(a))) {
        return 'BLOCKED: The last 4 browser actions all failed or had no visible effect. You are stuck. Step back and reconsider: verify the current URL, check if you are on the right page, try navigate_to to reach the correct page, or call give_up if this capture is impossible.';
    }
    // Guard 3: same element clicked 3+ times in recent history (regardless of effect).
    // Catches multi-select dropdown toggling where each click "succeeds" but makes no progress.
    if (params.action === 'click' && candidateClickSignature) {
        const candidateTarget = String((typeof params.args.elementLabel === 'string' && params.args.elementLabel.trim())
            ? params.args.elementLabel
            : params.args.selector ?? params.args.index ?? 'this target');
        const recentClicks = params.actionHistory
            .filter(a => a.action === 'click' && !META_ACTIONS.has(a.action))
            .slice(-6);
        const sameTargetCount = recentClicks.filter(a => buildClickGuardSignature(a.params, params.currentUrl) === candidateClickSignature).length;
        if (sameTargetCount >= 3) {
            return `BLOCKED: You have clicked "${candidateTarget}" ${sameTargetCount} times recently without progress. This is likely a toggle/multi-select control. Press Escape to close any open dropdown, then look for a DIFFERENT button (e.g., an edit/settings icon) to achieve your goal.`;
        }
    }
    return null;
}
2844
/** Canonical form of a subgoal name: surrounding whitespace removed, lower-cased. */
function normalizeSubgoalName(value) {
    const trimmed = value.trim();
    return trimmed.toLowerCase();
}
/**
 * Look up a cached workflow that can be reused for `subgoalName`.
 *
 * Entries are scanned newest-first. Entries without selectors or with a blank name are
 * ignored. An exact (normalised) name match wins immediately; otherwise the newest entry
 * whose name contains, or is contained in, the requested name is returned.
 *
 * @param {Array<{subgoalName: string, selectors: Array}>} workflowCache
 * @param {string} subgoalName
 * @returns {object|null} The matching cache entry, or `null` when nothing matches.
 */
export function findReusableWorkflow(workflowCache, subgoalName) {
    const wanted = normalizeSubgoalName(subgoalName);
    if (wanted === '') {
        return null;
    }
    let newestFuzzy = null;
    const newestFirst = workflowCache.slice().reverse();
    for (const entry of newestFirst) {
        if (entry.selectors.length === 0) {
            continue;
        }
        const entryName = normalizeSubgoalName(entry.subgoalName);
        if (entryName === '') {
            continue;
        }
        if (entryName === wanted) {
            return entry;
        }
        // Only the first (i.e. most recent) fuzzy hit is remembered.
        if (newestFuzzy === null && (entryName.includes(wanted) || wanted.includes(entryName))) {
            newestFuzzy = entry;
        }
    }
    return newestFuzzy;
}
2868
/**
 * Swap screenshot parts in older user messages for a short text marker.
 *
 * Walks the thread from newest to oldest (index 0 is never visited) and counts user
 * messages whose array content holds at least one `image_url` part. The newest
 * `keepRecentImages` of those are left alone; in every older one each `image_url`
 * part is replaced by a text placeholder. Messages are modified in place.
 *
 * @param {Array<object>} messages - Conversation thread, mutated in place.
 * @param {number} [keepRecentImages=3] - How many recent screenshot messages to keep intact.
 */
function compressOldScreenshots(messages, keepRecentImages = 3) {
    const isImagePart = (part) => part.type === 'image_url';
    let screenshotMessagesSeen = 0;
    let cursor = messages.length;
    while (--cursor >= 1) {
        const message = messages[cursor];
        const isUserPartList = message.role === 'user' && Array.isArray(message.content);
        if (!isUserPartList || !message.content.some(isImagePart)) {
            continue;
        }
        screenshotMessagesSeen += 1;
        if (screenshotMessagesSeen > keepRecentImages) {
            // Older than the retention window: drop the image payloads, keep everything else.
            message.content = message.content.map((part) => (isImagePart(part)
                ? { type: 'text', text: '[screenshot removed — older context]' }
                : part));
        }
    }
}
2892
/**
 * Collapse `<page_dom>...</page_dom>` blocks in older user messages into a short placeholder.
 *
 * Scans from the newest message down to index `preservedPrefixMessages`. User messages that
 * contain a DOM block (in string content or in any text part of array content) are counted;
 * the newest `keepRecentWithDom` stay untouched and older ones have every DOM block replaced.
 * Only text is rewritten — non-text parts are left as they are. Mutates `messages` in place.
 *
 * @param {Array<object>} messages - Conversation thread, mutated in place.
 * @param {number} [keepRecentWithDom=6] - Number of recent DOM-bearing user messages to keep.
 * @param {number} [preservedPrefixMessages=1] - Leading messages that are never inspected.
 */
function compressOldDomBlocks(messages, keepRecentWithDom = 6, preservedPrefixMessages = 1) {
    const PLACEHOLDER = '<page_dom>[older context — see current iteration for latest DOM]</page_dom>';
    // Detection uses a non-global pattern so there is no lastIndex state to reset.
    const containsDom = (text) => /<page_dom>[\s\S]*?<\/page_dom>/.test(text);
    const collapseDom = (text) => text.replace(/<page_dom>[\s\S]*?<\/page_dom>/g, PLACEHOLDER);
    const isTextPart = (part) => typeof part === 'object' && 'text' in part && typeof part.text === 'string';
    let domMessagesSeen = 0;
    for (let i = messages.length - 1; i >= preservedPrefixMessages; i--) {
        const message = messages[i];
        if (message.role !== 'user') {
            continue;
        }
        const parts = Array.isArray(message.content) ? message.content : null;
        let hasDom;
        if (parts) {
            hasDom = parts.some((part) => isTextPart(part) && containsDom(part.text));
        }
        else {
            hasDom = typeof message.content === 'string' && containsDom(message.content);
        }
        if (!hasDom) {
            continue;
        }
        domMessagesSeen += 1;
        if (domMessagesSeen <= keepRecentWithDom) {
            continue;
        }
        // Outside the retention window: replace every DOM block with the placeholder.
        if (parts) {
            parts.forEach((part) => {
                if (isTextPart(part)) {
                    part.text = collapseDom(part.text);
                }
            });
        }
        else if (typeof message.content === 'string') {
            message.content = collapseDom(message.content);
        }
    }
}
2931
/**
 * Trim the conversation thread in place to avoid context window overflow.
 *
 * Old screenshots and old `<page_dom>` blocks are always compressed first. If the thread
 * is longer than `maxMessages + preservedPrefixMessages`, it is then reduced to the first
 * `preservedPrefixMessages` entries followed by the last `maxMessages` entries.
 *
 * @param {Array<object>} messages - Conversation thread, mutated in place.
 * @param {number} [maxMessages=48] - Number of most recent messages to keep after the prefix.
 *   Zero (or a negative value) keeps only the prefix.
 * @param {number} [preservedPrefixMessages=1] - Number of leading messages that are always kept.
 * @returns {void}
 */
export function trimConversationHistory(messages, maxMessages = 48, preservedPrefixMessages = 1) {
    // Compress old screenshots before trimming to maximise token savings
    compressOldScreenshots(messages, 1);
    // Strip <page_dom> from older user messages — the current DOM is always available
    compressOldDomBlocks(messages, 6, preservedPrefixMessages);
    if (messages.length <= maxMessages + preservedPrefixMessages)
        return;
    const preservedPrefix = messages.slice(0, preservedPrefixMessages);
    // slice(-0) is slice(0) and would copy the WHOLE array, growing the thread instead of
    // trimming it — so a non-positive maxMessages must map to an empty tail explicitly.
    const recent = maxMessages > 0 ? messages.slice(-maxMessages) : [];
    messages.splice(0, messages.length, ...preservedPrefix, ...recent);
}
2947
/**
 * Render the text the LLM sees as the result of a tool call.
 *
 * - Failed executions yield `FAILED: <error>`.
 * - Successful executions with `stateChanged === false` get a `NO_EFFECT: ` prefix, except
 *   for `wait`, `note` and `begin_subgoal`.
 * - Index-addressed `click` / `type_text` / `select_option` / `scroll` calls are shown as
 *   `[index] "label": outcome`, where the label comes from the matching element, if any.
 *
 * @param {string} name - Tool name.
 * @param {object} args - Tool arguments (`index` is read for element-targeted tools).
 * @param {object} execResult - Execution result (`success`, `error`, `outcome`, `stateChanged`).
 * @param {Array<object>} elements - Interactive elements used to resolve the label.
 * @returns {string}
 */
function formatToolResult(name, args, execResult, elements) {
    if (!execResult.success) {
        return `FAILED: ${execResult.error || 'Unknown error'}`;
    }
    const exemptFromNoEffect = ['wait', 'note', 'begin_subgoal'];
    const indexAddressable = ['click', 'type_text', 'select_option', 'scroll'];
    // Warn the model when the action succeeded but nothing visibly changed.
    const noEffectPrefix = execResult.stateChanged === false && !exemptFromNoEffect.includes(name)
        ? 'NO_EFFECT: '
        : '';
    const outcomeText = execResult.outcome || 'ok';
    if (!indexAddressable.includes(name) || args.index === undefined) {
        return `${noEffectPrefix}${outcomeText}`;
    }
    // Enrich the result with the element's human-readable label when one is known.
    const matched = elements.find((candidate) => candidate.index === args.index);
    let quotedLabel = '';
    if (matched) {
        const label = matched.text || matched.ariaLabel || matched.inputType || matched.tag;
        if (label) {
            quotedLabel = ` "${label}"`;
        }
    }
    return `${noEffectPrefix}[${args.index}]${quotedLabel}: ${outcomeText}`;
}
2972
/**
 * Ask the model for a short numbered navigation plan based on a screenshot and the goal.
 *
 * Two modes:
 * - Dual-model (`options.visionModel` set): the vision model first describes the screenshot,
 *   then `options.model` plans from that text description.
 * - Mono-model: the screenshot is sent to `options.model` through `callVisionCapableModel`,
 *   which may switch to `options.fallbackModel`.
 *
 * Any error is swallowed (logged at debug level unless it is an abort error) and reported
 * as `{ plan: null, usage: null }` — planning is best-effort.
 *
 * @param {object} client - Chat-completions client.
 * @param {string} prompt - The user goal.
 * @param {string} url - URL of the current page.
 * @param {*} firstScreenshot - Screenshot passed to `makeImageUrl`.
 * @param {object} options - Planner context (models, language/theme, summaries, signal, ...).
 * @returns {Promise<{plan: (string|null), usage: *, model?: string}>}
 */
async function callPlanner(client, prompt, url, firstScreenshot, options) {
    try {
        const isReplanning = (options.completedMilestones ?? 0) > 0;
        const systemContent = isReplanning
            ? `You are a web navigation planner. Given a screenshot of the current page and a goal, output the REMAINING steps (3–5 steps) needed to complete the workflow. ${options.completedMilestones} milestone(s) have already been captured. Focus on what still needs to be done. Never propose blocked actions such as "Continue with AI", AI/generate/create buttons, logout/sign out, billing/purchase flows, save/submit/publish actions, or other account-changing/destructive steps unless the goal explicitly is the login screen. No preamble — just the numbered steps.`
            : 'You are a web navigation planner. Given a screenshot of the starting page and a goal, output a concise numbered action plan (3–7 steps) to achieve the goal. This plan is a rough guide — the agent will adapt if the actual page differs from expectations. Be specific about what to click or navigate, but acknowledge that labels and layouts may differ. Never propose blocked actions such as "Continue with AI", AI/generate/create buttons, logout/sign out, billing/purchase flows, save/submit/publish actions, or other account-changing/destructive steps unless the goal explicitly is the login screen. No preamble — just the numbered steps.';
        const userGuidanceText = options.userGuidance && options.userGuidance.length > 0
            ? `\n\n⚠️ USER OVERRIDE — the operator has provided explicit guidance that MUST take priority over any previous plan:\n${options.userGuidance.map((g, i) => ` ${i + 1}. ${g}`).join('\n')}\nYour new plan MUST follow this guidance. If the guidance contradicts the previous plan, discard the previous plan entirely.`
            : '';
        const failedAttemptsText = options.failedAttemptsSummary
            ? `\n\n⚠️ PREVIOUS ATTEMPTS FAILED — the agent already tried these approaches and they did NOT work:\n${options.failedAttemptsSummary}\nDo NOT repeat these failed approaches. The agent may be on the WRONG PAGE — check if navigating to a different section/page is needed first.`
            : '';
        // The goal prompt is assembled from optional segments; absent context contributes ''.
        const goalText = [
            `Goal: ${prompt}\nCurrent URL: ${url}`,
            options.lang ? `\nTarget language: ${options.lang}` : '',
            options.theme ? `\nTarget theme: ${options.theme}` : '',
            options.currentObjective ? `\nCurrent objective: ${options.currentObjective}` : '',
            options.captureCursorSummary ? `\nCapture cursor: ${options.captureCursorSummary}` : '',
            options.remainingCaptureQueue && options.remainingCaptureQueue.length > 0
                ? `\nRemaining capture queue: ${options.remainingCaptureQueue.join(', ')}`
                : '',
            options.repairTicketSummary
                ? `\nActive repair ticket: ${options.repairTicketSummary}\nUse repair steps only to unblock the current cursor, then resume the same capture.`
                : '',
            options.authState === 'authenticated'
                ? '\nAuthenticated browser state is already active. Do NOT plan login, OAuth, or sign-in steps again unless the explicit goal is the login screen or the screenshot clearly shows a login screen.'
                : '',
            options.handoffContextSummary
                ? `\nLive handoff context: ${options.handoffContextSummary}\nContinue from the carried-over page state before deciding to navigate.`
                : '',
            options.handoffNavigationHints && options.handoffNavigationHints.length > 0
                ? `\nLikely next controls from the carried-over state: ${options.handoffNavigationHints.join(' | ')}`
                : '',
            options.variantManifestSummary
                ? `\nVariant manifest: ${options.variantManifestSummary}\nThe next steps must satisfy the current page id specifically and avoid duplicating already completed pages.`
                : '',
            failedAttemptsText,
            userGuidanceText,
            '\nForbidden planning actions: Continue with AI, generate/create with AI, logout/sign out, billing/purchase, save/submit/publish, or other mutating/account-changing steps unless the explicit goal is the login screen.\nIf a blocked control is visible, plan the safe alternative route instead.\n\n',
            `Write the ${isReplanning ? 'remaining' : 'numbered action'} plan.`,
        ].join('');
        // Both modes need the screenshot as an image URL.
        const screenshotUrl = await makeImageUrl(firstScreenshot, 'image/jpeg', options.uploadImage);
        const requestOptions = { signal: options.signal };
        const baseStep = options.stepCounter ?? 0;
        if (options.visionModel) {
            // Dual-model mode: cheap vision model describes the page, primary model plans from text.
            const observerRequest = {
                model: options.visionModel,
                messages: [
                    { role: 'system', content: 'You are a page state observer. Describe the current page layout, visible elements, and navigation options concisely. Focus on what a planner needs to know to navigate the page.' },
                    { role: 'user', content: [
                            { type: 'image_url', image_url: { url: screenshotUrl } },
                            { type: 'text', text: `Describe this page for a navigation planner. URL: ${url}` },
                        ] },
                ],
                max_tokens: 300,
                stream: false,
                ...providerBody(options.visionModel, options.providerPreferences),
            };
            const observerResult = await client.chat.completions.create(observerRequest, requestOptions);
            const observation = observerResult.choices?.[0]?.message?.content?.trim() ?? '';
            const observerUsage = extractUsage(observerResult, baseStep, 'agent_iteration', options.visionModel, 1);
            const planRequest = {
                model: options.model,
                messages: [
                    { role: 'system', content: systemContent },
                    { role: 'user', content: `Page observation:\n${observation}\n\n${goalText}` },
                ],
                max_tokens: 256,
                ...providerBody(options.model, options.providerPreferences),
            };
            const planResult = await client.chat.completions.create(planRequest, requestOptions);
            const planText = planResult.choices?.[0]?.message?.content?.trim() ?? null;
            const planUsage = extractUsage(planResult, baseStep + 1, 'agent_iteration', options.model, 0);
            // Only one usage record is returned: the plan call's, or the observer's when the plan
            // call produced none.
            // NOTE(review): observer usage is dropped whenever planUsage exists — confirm the caller
            // accounts for it elsewhere.
            return { plan: planText, usage: planUsage ?? observerUsage, model: options.model };
        }
        // Mono-model mode: send the image straight to the primary model (with vision fallback).
        const planningMessages = [
            { role: 'system', content: systemContent },
            {
                role: 'user',
                content: [
                    { type: 'image_url', image_url: { url: screenshotUrl } },
                    { type: 'text', text: goalText },
                ],
            },
        ];
        const sendPlanningRequest = (model) => client.chat.completions.create({
            model,
            messages: planningMessages,
            max_tokens: 256,
            ...providerBody(model, options.providerPreferences),
        }, requestOptions);
        const plannerResult = await callVisionCapableModel({
            primaryModel: options.model,
            fallbackModel: options.fallbackModel,
            onFallbackActivated: (m, reason) => logger.debug(`Planning vision fallback activated: ${m} (reason: ${reason})`),
            callModel: sendPlanningRequest,
        });
        const planText = plannerResult.result.choices?.[0]?.message?.content?.trim() ?? null;
        const planUsage = extractUsage(plannerResult.result, baseStep, 'agent_iteration', plannerResult.model, 1);
        return { plan: planText, usage: planUsage, model: plannerResult.model };
    }
    catch (err) {
        // Planning is non-fatal: aborts are silent, anything else is logged at debug level.
        if (!isAbortError(err)) {
            const modelTried = options.fallbackModel
                ? `${options.model} → ${options.fallbackModel}`
                : options.model;
            logger.debug(`Planning call failed (non-fatal) [${modelTried}]: ${err.message}`);
        }
        return { plan: null, usage: null };
    }
}
3048
/**
 * Map the run configuration to the origin tag stored on action-history entries.
 *
 * @param {{runMode?: string, currentObjective?: string}} config
 * @returns {'preflight'|'repair_subplan'|'main_plan'}
 */
function resolveActionOrigin(config) {
    const { runMode, currentObjective } = config;
    if (runMode === 'language_preflight') {
        return 'preflight';
    }
    // Either a repair run mode or a repair objective marks the action as part of a repair sub-plan.
    const isRepair = runMode === 'repair' || currentObjective === 'repair';
    return isRepair ? 'repair_subplan' : 'main_plan';
}
3055
/**
 * Push `action` onto `actionHistory`, filling in bookkeeping fields it does not carry.
 *
 * - `origin` falls back to `resolveActionOrigin(config)`.
 * - `phase` falls back to `config.captureCursor.phase`.
 * - `checkpointId` falls back to `config.captureCursor.lastVerifiedCheckpointId`, or `null`.
 *   An explicit `null` on the action is kept; only `undefined` triggers the fallback.
 *
 * @param {Array<object>} actionHistory - History array, mutated in place.
 * @param {object} config - Run configuration.
 * @param {object} action - Action record to append (not mutated).
 */
function appendActionHistory(actionHistory, config, action) {
    const cursor = config.captureCursor;
    const entry = { ...action };
    entry.origin = action.origin ?? resolveActionOrigin(config);
    entry.phase = action.phase ?? cursor?.phase;
    if (action.checkpointId !== undefined) {
        entry.checkpointId = action.checkpointId;
    }
    else {
        entry.checkpointId = cursor?.lastVerifiedCheckpointId ?? null;
    }
    actionHistory.push(entry);
}
3065
/*
 * Documentation for buildTrajectoryLog(), which is defined further below (after the
 * prompt-caching helpers):
 * Build a compact trajectory log from action history.
 * Re-injected every iteration to preserve action context across conversation trimming.
 * Format: +* iter: action [index] "reason" (+ success, - fail, * state changed)
 * Capped at the last 25 actions to avoid unbounded token growth.
 * Meta-actions (note, begin_subgoal) are rendered compactly to reduce noise.
 */
3072
/**
 * Module-level cache so the OpenRouter model list is fetched at most once per process
 * after a successful lookup.
 * key = model id, value = supports cache_control
 */
const cachingCapabilityCache = new Map();
/**
 * Checks whether a model supports explicit prompt caching via cache_control by
 * inspecting its `pricing.input_cache_read` field in the OpenRouter models API.
 *
 * Results are memoised for the lifetime of the process. A model that is absent from a
 * successfully fetched list is memoised as unsupported, so unknown model ids do not
 * trigger a new network request on every call. Failed lookups are NOT memoised, so a
 * transient network error is retried on the next call.
 *
 * @param {string} model - Model id to look up.
 * @param {string} apiKey - API key sent as a Bearer token.
 * @returns {Promise<boolean>} `true` when the model advertises a positive cache-read price;
 *   `false` otherwise or when the lookup fails.
 */
async function modelSupportsCaching(model, apiKey) {
    const memoised = cachingCapabilityCache.get(model);
    if (memoised !== undefined)
        return memoised;
    try {
        const res = await fetch('https://openrouter.ai/api/v1/models', {
            headers: { Authorization: `Bearer ${apiKey}` },
        });
        if (!res.ok)
            throw new Error(`HTTP ${res.status}`);
        const body = await res.json();
        // Record every model in the response, not just the requested one.
        for (const m of body.data ?? []) {
            const supported = Number(m.pricing?.input_cache_read ?? 0) > 0;
            cachingCapabilityCache.set(m.id, supported);
        }
        // Remember misses too; otherwise an unlisted model refetches the full list every call.
        if (!cachingCapabilityCache.has(model)) {
            cachingCapabilityCache.set(model, false);
        }
        return cachingCapabilityCache.get(model);
    }
    catch (err) {
        logger.info(`Could not detect caching support for ${model}: ${err.message}. Disabling cache_control.`);
        return false;
    }
}
3103
/**
 * Render the last 25 action-history entries as a compact, newline-separated log.
 *
 * Regular actions: `<status><changed> <iteration>: <action>[ [index]][ "reason"]` where
 * status is `+` (success) or `-` (failure) and changed is `*` when `stateChanged` is truthy.
 * `begin_subgoal` and `note` entries are rendered in a short bracketed form instead.
 * Subgoal names and reasons are cut to 30 characters.
 *
 * @param {Array<object>} actionHistory
 * @returns {string} The log, or `''` for an empty history.
 */
function buildTrajectoryLog(actionHistory) {
    if (actionHistory.length === 0) {
        return '';
    }
    const lines = [];
    for (const entry of actionHistory.slice(-25)) {
        const { action, iteration, params } = entry;
        // Meta-actions: show their name/content inline without index/reason noise
        if (action === 'begin_subgoal') {
            lines.push(` ${iteration}: [subgoal: ${String(params.name ?? '').slice(0, 30)}]`);
            continue;
        }
        if (action === 'note') {
            lines.push(` ${iteration}: [note]`);
            continue;
        }
        const statusMark = entry.success ? '+' : '-';
        const changedMark = entry.stateChanged ? '*' : ' ';
        const targetText = params.index !== undefined ? ` [${params.index}]` : '';
        const reasonText = params.reason ? ` "${String(params.reason).slice(0, 30)}"` : '';
        lines.push(`${statusMark}${changedMark} ${iteration}: ${action}${targetText}${reasonText}`);
    }
    return lines.join('\n');
}
3119
/**
 * Summarise recent failed or no-effect actions as a bullet list for the planner.
 *
 * Looks at the last 12 history entries, keeps those that failed or reported
 * `stateChanged === false`, and ignores `note`, `begin_subgoal` and `wait`.
 * Each line reads `- <action> <target> → <outcome>`, where the target is the element
 * label, the index, or the query (first one available, labels/queries cut to 40 chars)
 * and the outcome is `FAILED: <error, max 60 chars>` or `NO EFFECT`.
 *
 * @param {Array<object>} actionHistory
 * @returns {string|undefined} The summary, or `undefined` when nothing qualifies.
 */
function buildFailedAttemptsSummary(actionHistory) {
    const ignoredActions = new Set(['note', 'begin_subgoal', 'wait']);
    const describeTarget = ({ params }) => {
        if (params.elementLabel) {
            return `"${String(params.elementLabel).slice(0, 40)}"`;
        }
        if (params.index !== undefined) {
            return `[${params.index}]`;
        }
        if (params.query) {
            return `"${String(params.query).slice(0, 40)}"`;
        }
        return '';
    };
    const describeOutcome = (entry) => (entry.success
        ? 'NO EFFECT'
        : `FAILED: ${(entry.error || 'unknown').slice(0, 60)}`);
    const problems = actionHistory
        .slice(-12)
        .filter((entry) => (!entry.success || entry.stateChanged === false) && !ignoredActions.has(entry.action));
    if (problems.length === 0) {
        return undefined;
    }
    return problems
        .map((entry) => `- ${entry.action} ${describeTarget(entry)} → ${describeOutcome(entry)}`)
        .join('\n');
}
3142
/**
 * Ask a vision model for a text observation of the current screenshot.
 *
 * Used in dual-model mode, where the vision model describes the page and a text model
 * reasons over that description. The response is capped at 300 tokens.
 *
 * Abort errors are rethrown; any other failure is logged and reported as an empty
 * observation with `usage: null`. Note that the image-URL and prompt construction happen
 * before the guarded section, so errors raised there propagate to the caller.
 *
 * @param {object} client - Chat-completions client.
 * @param {string} visionModel - Model id used for the observation.
 * @param {*} screenshot - Screenshot passed to `makeImageUrl`.
 * @param {object} pageState - Page state; its `interactiveElements` are given to the prompt.
 * @param {object} config - Agent configuration (URL, prompt, language/theme, manifest, signal).
 * @param {number} stepCounter - Step number recorded with the usage entry.
 * @returns {Promise<{observation: string, usage: *}>}
 */
async function callVisionObserver(client, visionModel, screenshot, pageState, config, stepCounter) {
    const screenshotUrl = await makeImageUrl(screenshot, 'image/jpeg', config.uploadImage);
    const manifest = config.variantManifest;
    const observerPrompt = buildVisionObserverPrompt({
        screenshotUrl,
        currentUrl: config.url,
        interactiveElements: pageState.interactiveElements,
        userGoal: config.prompt,
        currentLang: config.currentLang,
        currentTheme: config.currentTheme,
        currentPageId: manifest?.currentPageId ?? undefined,
        pageIdentitySummary: manifest?.currentPageIdentity?.summary ?? undefined,
        currentObjective: config.currentObjective,
    });
    const messages = [
        {
            role: 'system',
            content: 'You are a page state observer for a web navigation agent. Describe the current page concisely and factually. Output ONLY the structured observation — no commentary.',
        },
        { role: 'user', content: observerPrompt },
    ];
    try {
        const request = {
            model: visionModel,
            messages,
            max_tokens: 300,
            stream: false,
            ...providerBody(visionModel, config.providerPreferences),
        };
        const result = await client.chat.completions.create(request, { signal: config.abortSignal });
        const observation = result.choices?.[0]?.message?.content?.trim() ?? '';
        return {
            observation,
            usage: extractUsage(result, stepCounter, 'agent_iteration', visionModel, 1),
        };
    }
    catch (err) {
        // Cancellation must reach the caller; other failures degrade to "no observation".
        if (isAbortError(err)) {
            throw err;
        }
        logger.error(`Vision observer call failed: ${err.message}`);
        return { observation: '', usage: null };
    }
}
3182
+ export async function runAgent(browser, config, apiKey) {
3183
+ const client = createClient(apiKey);
3184
+ const modelState = { active: config.model };
3185
+ const actionHistory = [];
3186
+ const workflowScreenshots = [];
3187
+ const usageLog = [];
3188
+ let stepCounter = 0;
3189
+ const hasCredentials = !!(config.credentials?.email || config.credentials?.password);
3190
+ let usedDeterministicRecovery = false;
3191
+ const userGuidanceMessages = [];
3192
+ let lastVerificationFailure;
3193
+ let lastVerificationResult;
3194
+ let consecutiveDialogFailures = 0;
3195
+ let consecutiveVerificationFailures = 0;
3196
+ let consecutiveTechnicalVerificationFailures = 0;
3197
+ let rejectedGiveUps = 0;
3198
+ let lastReplanIteration = 0; // Cooldown: skip replanning if we just replanned
3199
+ // Working memory: persistent notes stored by the agent via the `note` tool
3200
+ // Re-injected into every prompt to survive conversation trimming (Agent-E / HiAgent pattern)
3201
+ const agentNotes = [];
3202
+ // Hierarchical working memory (HiAgent pattern):
3203
+ // Subgoals organize notes into named phases; completed subgoals are archived as 1-line summaries
3204
+ let currentSubgoal = null;
3205
+ const completedSubgoals = [];
3206
+ // AWM within-run: cache of selectors that worked per subgoal for cross-subgoal hints
3207
+ const workflowCache = [];
3208
+ // Index in actionHistory where the current subgoal started (for scoped AWM extraction)
3209
+ let subgoalStartIndex = 0;
3210
+ // Screenshot hash history for visual loop detection
3211
+ // Tracks MD5 hashes of recent screenshots to detect when the agent is stuck on the same page
3212
+ const screenshotHashHistory = [];
3213
+ // Multi-turn conversation thread — persisted across all iterations
3214
+ // The LLM naturally sees its own history without us reconstructing context each time
3215
+ const cacheLayoutV2Enabled = process.env.SCREENSHOT_AGENT_CACHE_LAYOUT_V2 === '1';
3216
+ const promptCacheStrategy = resolvePromptCacheStrategy(config.model, {
3217
+ enableGeminiExplicitBreakpoints: process.env.SCREENSHOT_AGENT_GEMINI_EXPLICIT_CACHE_BREAKPOINTS === '1',
3218
+ });
3219
+ const systemPromptText = buildSystemPrompt({ reasoningLocale: config.reasoningLocale });
3220
+ const supportsCache = promptCacheStrategy === 'explicit_breakpoints'
3221
+ && await modelSupportsCaching(config.model, apiKey);
3222
+ const systemMessage = supportsCache
3223
+ ? {
3224
+ role: 'system',
3225
+ content: [{ type: 'text', text: systemPromptText, cache_control: { type: 'ephemeral' } }],
3226
+ }
3227
+ : { role: 'system', content: systemPromptText };
3228
+ const conversationMessages = [systemMessage];
3229
+ if (hasManualMultiProviderOrder(config.providerPreferences?.[config.model])) {
3230
+ logger.info(`[cache] ${config.model} uses a multi-provider provider.order override; OpenRouter sticky prompt caching may be reduced.`);
3231
+ }
3232
+ // Pre-loop planning: take an initial screenshot and generate a step-by-step plan
3233
+ // This gives the agent direction before it starts acting, reducing aimless exploration.
3234
+ // Skip planning when the agent is already on the target URL — the capture-first rule
3235
+ // will make it call ready_to_capture immediately, so planning is wasted.
3236
+ const currentUrl = browser.currentPage.url();
3237
+ const alreadyOnTarget = urlsRoughlyMatch(config.url, currentUrl);
3238
+ const planningScreenshot = alreadyOnTarget ? null : await browser.takeScreenshotForAI().catch(() => null);
3239
+ let taskPlan = null;
3240
+ if (planningScreenshot) {
3241
+ const plannerResult = await callPlanner(client, config.prompt, currentUrl, planningScreenshot, {
3242
+ model: modelState.active,
3243
+ fallbackModel: config.fallbackModel,
3244
+ visionModel: config.visionModel,
3245
+ lang: config.currentLang,
3246
+ theme: config.currentTheme,
3247
+ currentObjective: config.currentObjective,
3248
+ captureCursorSummary: summarizeCaptureCursorForPlanner(config),
3249
+ repairTicketSummary: summarizeRepairTicketForPlanner(config),
3250
+ remainingCaptureQueue: config.remainingCaptureQueue,
3251
+ authState: config.handoffContext?.authState ?? config.sessionProfile?.authState,
3252
+ handoffContextSummary: config.handoffContext?.summary,
3253
+ handoffNavigationHints: config.handoffContext?.navigationHints,
3254
+ variantManifestSummary: summarizeVariantManifestForPlanner(config.variantManifest),
3255
+ signal: config.abortSignal,
3256
+ uploadImage: config.uploadImage,
3257
+ stepCounter: ++stepCounter,
3258
+ providerPreferences: config.providerPreferences,
3259
+ });
3260
+ taskPlan = plannerResult.plan;
3261
+ if (plannerResult.usage)
3262
+ usageLog.push(plannerResult.usage);
3263
+ if (plannerResult.model && plannerResult.model !== modelState.active) {
3264
+ modelState.active = plannerResult.model;
3265
+ }
3266
+ if (taskPlan) {
3267
+ logger.info(`Task plan generated:\n${taskPlan}`);
3268
+ }
3269
+ }
3270
+ if (cacheLayoutV2Enabled) {
3271
+ const anchorPrompt = buildStableAnchorUserMessage({
3272
+ userPrompt: config.prompt,
3273
+ credentials: config.credentials,
3274
+ currentLang: config.currentLang,
3275
+ currentTheme: config.currentTheme,
3276
+ langInstructions: config.langInstructions,
3277
+ themeInstructions: config.themeInstructions,
3278
+ viewports: config.viewports,
3279
+ runHints: config.runHints,
3280
+ selectorMemory: config.selectorMemory,
3281
+ sessionProfile: config.sessionProfile,
3282
+ handoffContext: config.handoffContext,
3283
+ variantManifest: config.variantManifest,
3284
+ });
3285
+ // Mark the last content part of the anchor with cache_control so Anthropic's
3286
+ // prompt caching covers both the system message and the stable anchor.
3287
+ const anchorContent = supportsCache
3288
+ ? anchorPrompt.content.map((part, idx, arr) => idx === arr.length - 1 && part.type === 'text'
3289
+ ? { ...part, cache_control: { type: 'ephemeral' } }
3290
+ : part)
3291
+ : anchorPrompt.content;
3292
+ conversationMessages.push({ role: 'user', content: anchorContent });
3293
+ }
3294
+ // DOM fingerprint tracking: skip sending full DOM when page hasn't changed
3295
+ let lastDomFingerprint = null;
3296
+ for (let iteration = 1; iteration <= config.maxIterations; iteration++) {
3297
+ throwIfAborted(config.abortSignal, 'Agent run cancelled.');
3298
+ // Check for user guidance (pause & guide)
3299
+ let guidanceReceivedThisIteration = false;
3300
+ if (config.guidanceCallback) {
3301
+ const guidance = await config.guidanceCallback();
3302
+ if (guidance) {
3303
+ logger.info(`Indication reçue : ${guidance}`);
3304
+ userGuidanceMessages.push(guidance);
3305
+ guidanceReceivedThisIteration = true;
3306
+ }
3307
+ }
3308
+ // 1. Capture current page state
3309
+ // Pre-compute whether vision will likely be auto-triggered this iteration.
3310
+ // When vision is not needed, skip the expensive SoM annotation (sharp compositing).
3311
+ const preNoEffectCount = countRecentNoEffectActions(actionHistory);
3312
+ const likelyNeedsVision = preNoEffectCount >= 2 || (preNoEffectCount >= 1 && screenshotHashHistory.length >= 3);
3313
+ const pageState = await browser.getPageState({ skipAnnotation: !likelyNeedsVision });
3314
+ // Stream live screenshot to the web UI
3315
+ emitScreenshot(getLivePreviewScreenshot(pageState).toString('base64'));
3316
+ // Visual loop detection: track screenshot hashes to detect when the agent is stuck.
3317
+ // Use a larger sample (first 32KB) for more robust detection — small samples miss
3318
+ // subtle layout differences while being too sensitive to viewport size changes.
3319
+ const sampleSize = Math.min(32768, pageState.screenshot.length);
3320
+ const screenshotHash = createHash('md5').update(pageState.screenshot.subarray(0, sampleSize)).digest('hex');
3321
+ // Also track a URL-based hash to catch navigation loops where the page content
3322
+ // is near-identical but the viewport or minor rendering differences change the image hash.
3323
+ const urlHash = createHash('md5').update(browser.currentPage.url()).digest('hex').slice(0, 8);
3324
+ const compositeKey = `${urlHash}:${screenshotHash}`;
3325
+ screenshotHashHistory.push(compositeKey);
3326
+ if (screenshotHashHistory.length > 8)
3327
+ screenshotHashHistory.shift();
3328
+ const hashOccurrences = screenshotHashHistory.filter(h => h === compositeKey).length;
3329
+ // Also check URL-only loops: same URL appearing 5+ times in the recent history window indicates stuck navigation
3330
+ const urlOccurrences = screenshotHashHistory.filter(h => h.startsWith(urlHash + ':')).length;
3331
+ const isVisualLoop = hashOccurrences >= 3 || urlOccurrences >= 5;
3332
+ // Deterministic session repair when stuck (fast, no LLM cost)
3333
+ if (config.enableDeterministicRecovery !== false && shouldTriggerRecovery(actionHistory) && config.selectorMemory) {
3334
+ logger.info('Tentative de récupération automatique…');
3335
+ const repair = await performDeterministicSessionRepair(browser, {
3336
+ startUrl: config.url,
3337
+ requestedLang: config.currentLang,
3338
+ requestedTheme: config.currentTheme,
3339
+ credentials: config.credentials,
3340
+ profile: config.sessionProfile,
3341
+ selectorMemory: config.selectorMemory,
3342
+ }).catch(() => null);
3343
+ if (repair?.repaired) {
3344
+ usedDeterministicRecovery = true;
3345
+ const outcome = `Deterministic recovery succeeded via ${repair.pathUsed ?? 'selector memory'}.`;
3346
+ appendActionHistory(actionHistory, config, {
3347
+ iteration,
3348
+ action: 'wait',
3349
+ params: { reason: 'deterministic_repair' },
3350
+ success: true,
3351
+ outcome,
3352
+ stateChanged: true,
3353
+ origin: 'deterministic',
3354
+ phase: 'recover',
3355
+ });
3356
+ logger.success('Récupération automatique réussie');
3357
+ continue;
3358
+ }
3359
+ if (repair?.pathUsed)
3360
+ usedDeterministicRecovery = true;
3361
+ }
3362
+ // Dynamic replanning: triggered by stuck detection OR user guidance.
3363
+ // User guidance bypasses the cooldown and immediately forces a replan so the
3364
+ // agent does not waste iterations following an obsolete plan.
3365
+ const replanCooldownMet = iteration - lastReplanIteration >= 3;
3366
+ const isStuckEnoughToReplan = countRecentNoEffectActions(actionHistory) >= 3;
3367
+ const shouldReplan = guidanceReceivedThisIteration
3368
+ || (replanCooldownMet && (isVisualLoop || (isStuckEnoughToReplan && iteration > 3)));
3369
+ if (shouldReplan) {
3370
+ lastReplanIteration = iteration;
3371
+ logger.info('Régénération du plan d\'action…');
3372
+ const replanScreenshot = await browser.takeScreenshot();
3373
+ const stuckReplan = await callPlanner(client, config.prompt, browser.currentPage.url(), replanScreenshot, {
3374
+ model: modelState.active,
3375
+ fallbackModel: config.fallbackModel,
3376
+ visionModel: config.visionModel,
3377
+ lang: config.currentLang,
3378
+ theme: config.currentTheme,
3379
+ currentObjective: config.currentObjective,
3380
+ captureCursorSummary: summarizeCaptureCursorForPlanner(config),
3381
+ repairTicketSummary: summarizeRepairTicketForPlanner(config),
3382
+ remainingCaptureQueue: config.remainingCaptureQueue,
3383
+ variantManifestSummary: summarizeVariantManifestForPlanner(config.variantManifest),
3384
+ completedMilestones: workflowScreenshots.length || 1,
3385
+ signal: config.abortSignal,
3386
+ uploadImage: config.uploadImage,
3387
+ stepCounter: ++stepCounter,
3388
+ providerPreferences: config.providerPreferences,
3389
+ userGuidance: userGuidanceMessages.length > 0 ? userGuidanceMessages : undefined,
3390
+ failedAttemptsSummary: buildFailedAttemptsSummary(actionHistory),
3391
+ }).catch(() => ({ plan: null, usage: null }));
3392
+ if (stuckReplan.usage)
3393
+ usageLog.push(stuckReplan.usage);
3394
+ if (stuckReplan.model && stuckReplan.model !== modelState.active) {
3395
+ modelState.active = stuckReplan.model;
3396
+ }
3397
+ if (stuckReplan.plan) {
3398
+ taskPlan = stuckReplan.plan;
3399
+ logger.info(`Nouveau plan d'action :\n${stuckReplan.plan}`);
3400
+ }
3401
+ }
3402
+ // 2. Build observation and add to conversation thread
3403
+ // First iteration: includes full task context (prompt, session, memory, instructions)
3404
+ // Subsequent iterations: compact page-state-only observation
3405
+ //
3406
+ // Dual-model mode: when visionModel is set, call the cheap vision model to extract
3407
+ // a text observation, then pass that to the main text model (no images in main context).
3408
+ // Mono-model mode: send images directly as before.
3409
+ // DOM-first: the main model receives the simplified DOM as primary context.
3410
+ // Vision analysis is available on-demand via the analyze_screenshot tool.
3411
+ // Auto-trigger: inject a vision observation when the agent is genuinely stuck.
3412
+ // Only fires on repeated no-effect actions or visual loops — NOT on every iteration
3413
+ // after a verification failure (lastVerificationFailure is already in the text context).
3414
+ // Skip vision when a replan or user guidance just happened — the agent has fresh direction.
3415
+ let visionObservationText;
3416
+ const noEffectCount = countRecentNoEffectActions(actionHistory);
3417
+ const justReplanned = shouldReplan; // replan happened this iteration — let the agent try the new plan first
3418
+ const shouldAutoTriggerVision = !justReplanned
3419
+ && !guidanceReceivedThisIteration
3420
+ && (noEffectCount >= 2 || (isVisualLoop && noEffectCount >= 1));
3421
+ if (shouldAutoTriggerVision) {
3422
+ const visionModel = config.visionModel || modelState.active;
3423
+ logger.info('Analyse visuelle de la page…');
3424
+ const visionResult = await callVisionObserver(client, visionModel, pageState.screenshot, pageState, config, ++stepCounter);
3425
+ if (visionResult.usage)
3426
+ usageLog.push(visionResult.usage);
3427
+ visionObservationText = visionResult.observation || undefined;
3428
+ if (visionObservationText) {
3429
+ logger.debug(`Auto-triggered vision (stuck=${noEffectCount}, visualLoop=${isVisualLoop}, verifyFail=${!!lastVerificationFailure}): ${visionObservationText.slice(0, 200)}`);
3430
+ // If vision says the page matches the target and we're stuck, add a strong capture hint
3431
+ const matchLine = visionObservationText.split('\n').find(l => /^MATCH:/i.test(l.trim()));
3432
+ if (matchLine && /\b(yes|oui)\b/i.test(matchLine) && (noEffectCount >= 2 || isVisualLoop)) {
3433
+ visionObservationText += config.runMode === 'video_navigation_preflight'
3434
+ ? '\n\n⚠️ VISION SUGGESTS THE PAGE IS CLOSE TO THE GOAL. Before calling ready_to_capture, confirm that the EXACT pre-recording start state is visible now: the requested project/section/dialog must already be open, and the recorded interaction must not be done yet.'
3435
+ : '\n\n⚠️ VISION CONFIRMS PAGE MATCHES TARGET. You are stuck clicking without progress. Call ready_to_capture NOW — the current state is likely what you need to capture.';
3436
+ }
3437
+ }
3438
+ }
3439
+ // DOM-first: skip images in the main iteration prompt.
3440
+ // The simplified DOM + interactive elements provide page context.
3441
+ // Vision is available on-demand via analyze_screenshot tool.
3442
+ const screenshotUrl = undefined;
3443
+ const cleanScreenshotUrl = undefined;
3444
+ // DOM skip: compute triple fingerprint (DOM + element count + current URL).
3445
+ // If identical to last iteration, send a compact placeholder instead of full DOM.
3446
+ // Safety net: always send full DOM on iteration 1, every 4th iteration, and when stuck.
3447
+ const currentDomFingerprint = createHash('sha1')
3448
+ .update(pageState.simplifiedDOM || '')
3449
+ .update(String(pageState.interactiveElements.length))
3450
+ .update(browser.currentPage.url())
3451
+ .digest('hex');
3452
+ const noEffectCountForDom = countRecentNoEffectActions(actionHistory);
3453
+ const domUnchanged = iteration > 1
3454
+ && iteration % 4 !== 0
3455
+ && noEffectCountForDom === 0
3456
+ && lastDomFingerprint !== null
3457
+ && currentDomFingerprint === lastDomFingerprint;
3458
+ lastDomFingerprint = currentDomFingerprint;
3459
+ const iterationPrompt = buildIterationUserMessage({
3460
+ userPrompt: config.prompt,
3461
+ cleanScreenshotUrl,
3462
+ screenshotUrl: screenshotUrl || '',
3463
+ visionObservation: visionObservationText,
3464
+ simplifiedDOM: domUnchanged ? undefined : (pageState.simplifiedDOM || undefined),
3465
+ domUnchanged,
3466
+ accessibilityTree: pageState.accessibilityTree,
3467
+ interactiveElements: pageState.interactiveElements,
3468
+ screenshotsTaken: workflowScreenshots,
3469
+ iteration,
3470
+ maxIterations: config.maxIterations,
3471
+ credentials: config.credentials,
3472
+ currentLang: config.currentLang,
3473
+ currentTheme: config.currentTheme,
3474
+ langInstructions: config.langInstructions,
3475
+ themeInstructions: config.themeInstructions,
3476
+ viewports: config.viewports,
3477
+ runHints: config.runHints,
3478
+ selectorMemory: config.selectorMemory,
3479
+ sessionProfile: config.sessionProfile,
3480
+ hasCredentials,
3481
+ salienceCompressionEnabled: config.enableSalienceCompression !== false,
3482
+ viewport: browser.currentPage.viewportSize() ?? config.viewport,
3483
+ currentUrl: browser.currentPage.url(),
3484
+ scrollInfo: pageState.scrollInfo,
3485
+ agentNotes: agentNotes.length > 0 ? [...agentNotes] : undefined,
3486
+ currentSubgoal: currentSubgoal ?? undefined,
3487
+ completedSubgoals: completedSubgoals.length > 0 ? [...completedSubgoals] : undefined,
3488
+ trajectoryLog: buildTrajectoryLog(actionHistory) || undefined,
3489
+ handoffContext: iteration <= 3 ? config.handoffContext : undefined,
3490
+ variantManifest: config.variantManifest,
3491
+ variantReference: config.variantReference,
3492
+ currentObjective: config.currentObjective,
3493
+ captureCursor: config.captureCursor,
3494
+ activeRepairTicket: config.activeRepairTicket,
3495
+ remainingCaptureQueue: config.remainingCaptureQueue,
3496
+ stuckLoopWarning: (shouldTriggerRecovery(actionHistory) || isVisualLoop)
3497
+ ? (() => {
3498
+ const currentPageUrl = browser.currentPage.url();
3499
+ const targetUrl = config.url;
3500
+ // Detect wrong-page: if current URL path is completely different from target,
3501
+ // the agent is likely on the wrong page and needs to navigate first.
3502
+ let wrongPageHint = '';
3503
+ try {
3504
+ const currentPath = new URL(currentPageUrl).pathname;
3505
+ const targetPath = new URL(targetUrl).pathname;
3506
+ // If paths share no common segments beyond '/', flag it
3507
+ const currentSegments = currentPath.split('/').filter(Boolean);
3508
+ const targetSegments = targetPath.split('/').filter(Boolean);
3509
+ const commonSegments = currentSegments.filter(s => targetSegments.includes(s));
3510
+ if (currentSegments.length > 0 && targetSegments.length > 0 && commonSegments.length === 0) {
3511
+ wrongPageHint = ` ⚠️ WRONG PAGE: you are on ${currentPath} but the target is ${targetPath}. Navigate to the correct page FIRST before trying to interact with elements.`;
3512
+ }
3513
+ }
3514
+ catch { /* ignore URL parse errors */ }
3515
+ if (config.runMode === 'video_navigation_preflight') {
3516
+ return `STUCK: recent actions were ineffective${isVisualLoop ? ' (page appears visually unchanged)' : ''}.${wrongPageHint} BEFORE trying more actions, CHECK: is the browser on the EXACT pre-recording start state? The requested project/section/dialog must already be open, and the recorded interaction must not have been executed yet. Only then call ready_to_capture. Otherwise, continue the missing preparation step.`;
3517
+ }
3518
+ return `STUCK: recent actions were ineffective${isVisualLoop ? ' (page appears visually unchanged)' : ''}.${wrongPageHint} BEFORE trying more actions, CHECK: does the current page state ALREADY match the capture target? If a modal/panel/overlay is open that matches what you need to capture, call ready_to_capture NOW instead of clicking more things. If not, close any open dropdown/popover (press Escape) and try a completely different approach.`;
3519
+ })()
3520
+ : undefined,
3521
+ taskPlan: taskPlan ?? undefined,
3522
+ lastVerificationFailure,
3523
+ userGuidance: userGuidanceMessages.length > 0 ? [...userGuidanceMessages] : undefined,
3524
+ expansionLevel: Math.min(3, Math.max(0, countRecentNoEffectActions(actionHistory) - 1)),
3525
+ isFirstIteration: iteration === 1,
3526
+ cacheLayoutV2: cacheLayoutV2Enabled,
3527
+ failedAttemptsSummary: buildFailedAttemptsSummary(actionHistory),
3528
+ });
3529
+ conversationMessages.push({ role: 'user', content: iterationPrompt.content });
3530
+ // Trim thread to avoid context window overflow
3531
+ trimConversationHistory(conversationMessages, 24, cacheLayoutV2Enabled ? 2 : 1);
3532
+ // 3. Call OpenRouter with full conversation thread
3533
+ let response;
3534
+ try {
3535
+ throwIfAborted(config.abortSignal, 'Agent run cancelled.');
3536
+ const visionResult = await callVisionCapableModel({
3537
+ primaryModel: modelState.active,
3538
+ fallbackModel: modelState.active === config.model ? config.fallbackModel : undefined,
3539
+ onFallbackActivated: (model, reason) => {
3540
+ modelState.active = model;
3541
+ logger.debug(`Vision fallback activated: ${model} (reason: ${reason})`);
3542
+ },
3543
+ callModel: (model) => callWithRetry(client, {
3544
+ model,
3545
+ messages: conversationMessages,
3546
+ tools: agentTools,
3547
+ tool_choice: 'required',
3548
+ max_tokens: 2048,
3549
+ }, 3, config.abortSignal, config.providerPreferences, config.reasoningEffort),
3550
+ });
3551
+ response = visionResult.result;
3552
+ if (visionResult.model && visionResult.model !== modelState.active) {
3553
+ logger.debug(`OpenRouter model substitution detected: requested "${modelState.active}", got "${visionResult.model}"`);
3554
+ }
3555
+ modelState.active = visionResult.model;
3556
+ const imagesInPrompt = iterationPrompt.content.filter((part) => 'type' in part && part.type === 'image_url').length;
3557
+ const stepUsage = extractUsage(response, ++stepCounter, 'agent_iteration', visionResult.model, imagesInPrompt);
3558
+ const systemContent = conversationMessages[0].content;
3559
+ stepUsage.systemPromptChars = typeof systemContent === 'string' ? systemContent.length : 0;
3560
+ stepUsage.toolSchemaChars = JSON.stringify(agentTools).length;
3561
+ const userContent = iterationPrompt.content;
3562
+ stepUsage.userPayloadChars = Array.isArray(userContent)
3563
+ ? userContent.reduce((sum, part) => sum + ('text' in part && typeof part.text === 'string' ? part.text.length : 0), 0)
3564
+ : typeof userContent === 'string' ? userContent.length : 0;
3565
+ stepUsage.accessibilityChars = pageState.accessibilityTree.length;
3566
+ stepUsage.interactiveElementCount = pageState.interactiveElements.length;
3567
+ stepUsage.actionHistoryCount = actionHistory.length;
3568
+ stepUsage.elementsChars = iterationPrompt.metrics.elementsChars;
3569
+ stepUsage.sessionSummaryChars = iterationPrompt.metrics.sessionSummaryChars;
3570
+ stepUsage.selectorMemoryChars = iterationPrompt.metrics.selectorMemoryChars;
3571
+ stepUsage.agentContextChars = iterationPrompt.metrics.agentContextChars;
3572
+ stepUsage.profileValidationStatus = config.sessionProfile?.validationStatus;
3573
+ stepUsage.repairPathUsed = usedDeterministicRecovery ? 'deterministic_repair' : 'llm_full';
3574
+ usageLog.push(stepUsage);
3575
+ // Log per-iteration cache efficiency so cache behavior is visible in runtime logs.
3576
+ if (stepUsage.cacheReadTokens) {
3577
+ const hitPct = stepUsage.promptTokens
3578
+ ? Math.round((stepUsage.cacheReadTokens / stepUsage.promptTokens) * 100)
3579
+ : 0;
3580
+ logger.debug(`[cache] iter ${iteration}: ${stepUsage.cacheReadTokens} cached / ${stepUsage.promptTokens ?? '?'} total (${hitPct}% hit rate)`);
3581
+ }
3582
+ if (stepUsage.reasoningTokens) {
3583
+ logger.debug(`[reasoning] iter ${iteration}: ${stepUsage.reasoningTokens} reasoning tokens`);
3584
+ }
3585
+ }
3586
+ catch (err) {
3587
+ if (isAbortError(err))
3588
+ throw err;
3589
+ logger.error(`API call failed at iteration ${iteration}: ${err.message}`);
3590
+ if (err instanceof VisionModelUnsupportedError) {
3591
+ getPostHog().capture({
3592
+ distinctId: config.analyticsId ?? DISTINCT_ID,
3593
+ event: 'agent_gave_up',
3594
+ properties: {
3595
+ url: config.url,
3596
+ model: modelState.active,
3597
+ theme: config.currentTheme,
3598
+ lang: config.currentLang,
3599
+ reason: err.message,
3600
+ iterations: iteration,
3601
+ total_tokens: usageLog.reduce((s, u) => s + (u.totalTokens ?? 0), 0),
3602
+ },
3603
+ });
3604
+ return {
3605
+ success: false,
3606
+ screenshotPath: null,
3607
+ screenshots: workflowScreenshots,
3608
+ iterations: iteration,
3609
+ actions: actionHistory,
3610
+ assessment: err.message,
3611
+ diagnostic: {
3612
+ screenshot: pageState.screenshot,
3613
+ url: browser.currentPage.url(),
3614
+ interactiveElements: pageState.interactiveElements,
3615
+ accessibilityTreeSnippet: pageState.accessibilityTree.slice(0, 5000),
3616
+ giveUpReason: err.message,
3617
+ },
3618
+ usage: usageLog,
3619
+ runtimeStrategy: 'full_llm',
3620
+ deterministicRecoveryUsed: usedDeterministicRecovery,
3621
+ evaluatorUsed: false,
3622
+ };
3623
+ }
3624
+ appendActionHistory(actionHistory, config, {
3625
+ iteration,
3626
+ action: 'wait',
3627
+ params: { reason: 'API error' },
3628
+ success: false,
3629
+ error: err.message,
3630
+ });
3631
+ continue;
3632
+ }
3633
+ // 4. Extract tool calls and add assistant message to conversation thread
3634
+ const message = response.choices?.[0]?.message;
3635
+ let toolCalls = message?.tool_calls;
3636
+ const assistantText = extractAssistantText(message?.content);
3637
+ if (assistantText) {
3638
+ logger.ai(assistantText);
3639
+ }
3640
+ // Always add the assistant's response to keep the conversation coherent
3641
+ conversationMessages.push({
3642
+ role: 'assistant',
3643
+ content: message?.content ?? null,
3644
+ tool_calls: toolCalls,
3645
+ });
3646
+ if (!toolCalls || toolCalls.length === 0) {
3647
+ const inferredAction = assistantText ? inferActionFromAssistantText(assistantText) : null;
3648
+ if (inferredAction) {
3649
+ logger.debug(`No tool calls at iteration ${iteration}; inferred ${inferredAction.name} from assistant text.`);
3650
+ toolCalls = [
3651
+ {
3652
+ id: `synthetic-${iteration}-0`,
3653
+ type: 'function',
3654
+ function: {
3655
+ name: inferredAction.name,
3656
+ arguments: JSON.stringify(inferredAction.args),
3657
+ },
3658
+ },
3659
+ ];
3660
+ }
3661
+ else {
3662
+ logger.error(`No tool calls at iteration ${iteration}, skipping. Model: ${modelState.active}. Response: ${assistantText?.slice(0, 400) || '(empty)'}`);
3663
+ continue;
3664
+ }
3665
+ }
3666
+ // 5. Execute each tool call and add results to conversation thread
3667
+ for (const toolCall of toolCalls) {
3668
+ throwIfAborted(config.abortSignal, 'Agent run cancelled.');
3669
+ if (!('function' in toolCall))
3670
+ continue;
3671
+ const name = toolCall.function.name;
3672
+ let args;
3673
+ try {
3674
+ args = JSON.parse(toolCall.function.arguments);
3675
+ }
3676
+ catch {
3677
+ logger.error(`Invalid JSON in tool arguments: ${toolCall.function.arguments}`);
3678
+ conversationMessages.push({
3679
+ role: 'tool',
3680
+ tool_call_id: toolCall.id,
3681
+ content: 'ERROR: Invalid JSON in tool arguments.',
3682
+ });
3683
+ continue;
3684
+ }
3685
+ args = resolveActionCredentialArgs(name, args, config.credentials);
3686
+ if (name === 'type_text'
3687
+ && config.credentials?.password
3688
+ && typeof args.text === 'string'
3689
+ && GENERIC_PASSWORD_INPUT_RE.test(args.text.trim())
3690
+ && isPasswordFieldTarget(args, pageState.interactiveElements)) {
3691
+ args.text = config.credentials.password;
3692
+ }
3693
+ // Log the agent's reasoning before executing
3694
+ const reasoning = args.reason || args.reasoning || args.assessment;
3695
+ if (reasoning)
3696
+ logger.ai(reasoning);
3697
+ // Enrich log args with element label
3698
+ const logArgs = sanitizeCredentialParams({ ...args }, config.credentials);
3699
+ delete logArgs.reason;
3700
+ if ((name === 'click' || name === 'type_text' || name === 'select_option' || name === 'scroll') && args.index !== undefined) {
3701
+ const el = pageState.interactiveElements.find(e => e.index === args.index);
3702
+ if (el) {
3703
+ const label = el.text || el.ariaLabel || el.inputType || el.tag;
3704
+ if (label)
3705
+ logArgs.elementLabel = label;
3706
+ }
3707
+ }
3708
+ logger.action(iteration, config.maxIterations, name, logArgs, { lang: config.currentLang, theme: config.currentTheme });
3709
+ // --- begin_subgoal (hierarchical working memory + AWM within-run) ---
3710
+ if (name === 'begin_subgoal') {
3711
+ const subgoalName = String(args.name ?? 'step').slice(0, 40);
3712
+ const progressSummary = String(args.progress_summary ?? '').slice(0, 120);
3713
+ // AWM: extract selectors from the CURRENT subgoal only (scoped by subgoalStartIndex)
3714
+ if (currentSubgoal) {
3715
+ const successfulSelectors = Array.from(new Set(actionHistory
3716
+ .slice(subgoalStartIndex)
3717
+ .filter(a => a.success && a.stateChanged && typeof a.params.selector === 'string')
3718
+ .map(a => String(a.params.selector)))).slice(0, 8);
3719
+ if (successfulSelectors.length > 0) {
3720
+ workflowCache.push({ subgoalName: currentSubgoal, actionSummary: progressSummary, selectors: successfulSelectors });
3721
+ }
3722
+ // Archive current subgoal with its summary
3723
+ if (agentNotes.length > 0 || progressSummary) {
3724
+ completedSubgoals.push({ name: currentSubgoal, summary: progressSummary || `${agentNotes.length} note(s)` });
3725
+ }
3726
+ }
3727
+ // Start new subgoal: clear working notes and advance scope index
3728
+ currentSubgoal = subgoalName;
3729
+ agentNotes.length = 0;
3730
+ subgoalStartIndex = actionHistory.length; // scope future AWM extraction to this subgoal
3731
+ // AWM: inject selector hints from a similar past subgoal into the new subgoal's notes
3732
+ const similarWorkflow = findReusableWorkflow(workflowCache, subgoalName);
3733
+ if (similarWorkflow && similarWorkflow.selectors.length > 0) {
3734
+ agentNotes.push(`Selectors that worked for previous subgoal "${similarWorkflow.subgoalName}": ${similarWorkflow.selectors.slice(0, 4).join(', ')}`);
3735
+ }
3736
+ appendActionHistory(actionHistory, config, { iteration, action: 'begin_subgoal', params: args, success: true, outcome: `subgoal_started:${subgoalName}`, stateChanged: false });
3737
+ conversationMessages.push({ role: 'tool', tool_call_id: toolCall.id, content: `Subgoal "${subgoalName}" started. Working memory cleared.` });
3738
+ continue;
3739
+ }
3740
+ // --- note (working memory) ---
3741
+ if (name === 'note') {
3742
+ const content = String(args.content ?? '').slice(0, 120);
3743
+ agentNotes.push(content);
3744
+ appendActionHistory(actionHistory, config, { iteration, action: 'note', params: args, success: true, outcome: 'note_recorded', stateChanged: false });
3745
+ conversationMessages.push({ role: 'tool', tool_call_id: toolCall.id, content: 'Note recorded.' });
3746
+ continue;
3747
+ }
3748
+ // Element index validation: catch hallucinated indices before browser execution
3749
+ if (['click', 'type_text', 'select_option', 'scroll'].includes(name)
3750
+ && args.index !== undefined
3751
+ && args.selector === undefined) {
3752
+ const idx = args.index;
3753
+ const exists = pageState.interactiveElements.some(el => el.index === idx);
3754
+ if (!exists) {
3755
+ const visibleIds = pageState.interactiveElements
3756
+ .filter(el => el.visible)
3757
+ .slice(0, 12)
3758
+ .map(el => `[${el.index}]`)
3759
+ .join(' ');
3760
+ const msg = `Element [${idx}] not found in current page. Visible: ${visibleIds}. Use search_text to locate by text, or scroll to reach off-screen elements.`;
3761
+ logger.error(msg);
3762
+ appendActionHistory(actionHistory, config, { iteration, action: name, params: args, success: false, error: `index_not_found:${idx}`, stateChanged: false });
3763
+ conversationMessages.push({ role: 'tool', tool_call_id: toolCall.id, content: msg });
3764
+ continue;
3765
+ }
3766
+ }
3767
+ // Security check
3768
+ const securityDecision = evaluateActionSecurity(name, args, {
3769
+ rootUrl: config.url,
3770
+ currentUrl: browser.currentPage.url(),
3771
+ credentials: config.credentials,
3772
+ interactiveElements: pageState.interactiveElements,
3773
+ currentLang: config.currentLang,
3774
+ currentTheme: config.currentTheme,
3775
+ runMode: config.runMode,
3776
+ currentObjective: config.currentObjective,
3777
+ activeRepairCause: config.activeRepairTicket?.cause ?? null,
3778
+ });
3779
+ if (!securityDecision.allowed) {
3780
+ const targetLabel = describeSecurityTarget(securityDecision.target);
3781
+ const blockMsg = `BLOCKED: ${securityDecision.reason} (${targetLabel})`;
3782
+ logger.error(blockMsg);
3783
+ appendActionHistory(actionHistory, config, { iteration, action: name, params: args, success: false, error: securityDecision.reason });
3784
+ conversationMessages.push({ role: 'tool', tool_call_id: toolCall.id, content: blockMsg });
3785
+ break;
3786
+ }
3787
+ // --- search_text ---
3788
+ if (name === 'search_text') {
3789
+ const query = args.query || '';
3790
+ logger.info(`Recherche de "${query}" sur la page`);
3791
+ const results = await browser.searchText(query);
3792
+ const viewport = browser.currentPage.viewportSize();
3793
+ // Cross-reference search results with interactive elements to provide clickable indices.
3794
+ // When a search result overlaps spatially with an interactive element, include its index
3795
+ // so the agent can `click index=N` instead of using stale coordinates.
3796
+ const interactiveElements = pageState.interactiveElements;
3797
+ const resultText = results.length > 0
3798
+ ? results.map((r, i) => {
3799
+ const cx = Math.round(r.boundingBox.x + r.boundingBox.width / 2);
3800
+ const cy = Math.round(r.boundingBox.y + r.boundingBox.height / 2);
3801
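+ // Find matching interactive element: same selector as the result, same selector as the result's container, or the result's center point lying inside the element's bounding box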
3802
+ const matchingElement = interactiveElements.find(el => {
3803
+ if (el.selector && r.selector && el.selector === r.selector)
3804
+ return true;
3805
+ if (r.container?.selector && el.selector === r.container.selector)
3806
+ return true;
3807
+ // Spatial overlap: search result center is inside interactive element bounds
3808
+ const bb = el.boundingBox;
3809
+ if (bb) {
3810
+ return cx >= bb.x && cx <= bb.x + bb.width && cy >= bb.y && cy <= bb.y + bb.height;
3811
+ }
3812
+ return false;
3813
+ });
3814
+ const indexHint = matchingElement ? ` ⇒ click index=${matchingElement.index}` : '';
3815
+ const flags = [
3816
+ r.clickable ? 'clickable' : 'not-clickable',
3817
+ r.visibilityState === 'full' ? 'fully-visible' : r.visibilityState === 'partial' ? 'partially-visible' : 'off-screen',
3818
+ ].join(', ');
3819
+ const hrefInfo = r.href ? ` href="${r.href}"` : '';
3820
+ const selectorInfo = r.selector ? ` sel="${r.selector}"` : '';
3821
+ // For off-screen elements, coordinates will be stale after scrolling.
3822
+ // Guide the agent to scroll first, then use the selector or re-search.
3823
+ const isOffScreen = r.visibilityState !== 'full' && r.visibilityState !== 'partial';
3824
+ let scrollHint = '';
3825
+ if (viewport && r.visibilityState !== 'full') {
3826
+ if (r.boundingBox.y < 0) {
3827
+ scrollHint = ` scroll up about ${Math.abs(r.boundingBox.y) + 24}px first`;
3828
+ }
3829
+ else if (r.boundingBox.y + r.boundingBox.height > viewport.height) {
3830
+ const delta = r.boundingBox.y + r.boundingBox.height - Math.max(48, viewport.height - 96);
3831
+ scrollHint = ` scroll down about ${Math.max(80, Math.round(delta))}px first`;
3832
+ }
3833
+ }
3834
+ const clickTarget = isOffScreen
3835
+ ? `(${flags})${scrollHint} — after scrolling, use selector="${r.selector}" or re-search to get updated coordinates`
3836
+ : `→ click x=${cx} y=${cy} (${flags})`;
3837
+ const containerInfo = r.container
3838
+ ? `\n ↳ container: <${r.container.tag}> ${r.container.boundingBox.width}x${r.container.boundingBox.height} (${r.container.reason}) sel="${r.container.selector}"`
3839
+ : '';
3840
+ return ` ${i}. <${r.tag}> role="${r.role}" "${r.text}"${hrefInfo}${selectorInfo} ${clickTarget}${indexHint}${containerInfo}`;
3841
+ }).join('\n')
3842
+ : ' (no matches found)';
3843
+ logger.debug(`Search results:\n${resultText}`);
3844
+ const searchResult = results.length === 0
3845
+ ? `No elements found matching "${query}". Try a different search term, or scroll to reveal more content.`
3846
+ : `Found ${results.length} match(es). For visible elements: click by x/y coordinates or by selector. For off-screen elements: scroll first, then click by selector or re-search.\n${resultText}`;
3847
+ appendActionHistory(actionHistory, config, { iteration, action: 'search_text', params: { query }, success: results.length > 0, error: searchResult });
3848
+ conversationMessages.push({ role: 'tool', tool_call_id: toolCall.id, content: searchResult });
3849
+ break; // Next iteration will show updated page state
3850
+ }
3851
+ if (name === 'analyze_screenshot') {
3852
+ const reason = args.reason || 'visual check';
3853
+ logger.info('Analyse visuelle de la page…');
3854
+ const freshScreenshot = await browser.takeScreenshotForAI();
3855
+ const visionModel = config.visionModel || modelState.active;
3856
+ const visionResult = await callVisionObserver(client, visionModel, freshScreenshot, pageState, config, ++stepCounter);
3857
+ if (visionResult.usage)
3858
+ usageLog.push(visionResult.usage);
3859
+ const observation = visionResult.observation || 'Unable to analyze screenshot.';
3860
+ logger.debug(`Visual analysis: ${observation.slice(0, 200)}`);
3861
+ appendActionHistory(actionHistory, config, {
3862
+ iteration, action: 'analyze_screenshot', params: { reason },
3863
+ success: !!visionResult.observation, outcome: observation.slice(0, 120),
3864
+ });
3865
+ conversationMessages.push({
3866
+ role: 'tool', tool_call_id: toolCall.id,
3867
+ content: `Visual analysis:\n${observation}`,
3868
+ });
3869
+ break;
3870
+ }
3871
+ // --- take_screenshot ---
3872
+ if (name === 'take_screenshot') {
3873
+ throwIfAborted(config.abortSignal, 'Agent run cancelled.');
3874
+ const label = args.label || `Screenshot ${workflowScreenshots.length + 1}`;
3875
+ const assessment = args.assessment || '';
3876
+ logger.info(`Prise du screenshot : "${label}"`);
3877
+ const screenshotBuffer = await browser.takeScreenshot();
3878
+ workflowScreenshots.push({ index: workflowScreenshots.length, iteration, label, buffer: screenshotBuffer, path: null });
3879
+ appendActionHistory(actionHistory, config, { iteration, action: 'take_screenshot', params: { label, assessment }, success: true });
3880
+ logger.success(`Screenshot #${workflowScreenshots.length} capturé : "${label}"`);
3881
+ // Dynamic replanning: after each milestone, refine the plan for remaining steps
3882
+ // This is the "soft Planner/Executor split" — the planner re-focuses on what's left
3883
+ const replanResult = await callPlanner(client, config.prompt, browser.currentPage.url(), screenshotBuffer, {
3884
+ model: modelState.active,
3885
+ fallbackModel: config.fallbackModel,
3886
+ visionModel: config.visionModel,
3887
+ lang: config.currentLang,
3888
+ theme: config.currentTheme,
3889
+ currentObjective: config.currentObjective,
3890
+ captureCursorSummary: summarizeCaptureCursorForPlanner(config),
3891
+ repairTicketSummary: summarizeRepairTicketForPlanner(config),
3892
+ remainingCaptureQueue: config.remainingCaptureQueue,
3893
+ variantManifestSummary: summarizeVariantManifestForPlanner(config.variantManifest),
3894
+ completedMilestones: workflowScreenshots.length,
3895
+ signal: config.abortSignal,
3896
+ uploadImage: config.uploadImage,
3897
+ stepCounter: ++stepCounter,
3898
+ providerPreferences: config.providerPreferences,
3899
+ }).catch(() => ({ plan: null, usage: null }));
3900
+ if (replanResult.usage)
3901
+ usageLog.push(replanResult.usage);
3902
+ if (replanResult.model && replanResult.model !== modelState.active) {
3903
+ modelState.active = replanResult.model;
3904
+ }
3905
+ if (replanResult.plan) {
3906
+ taskPlan = replanResult.plan;
3907
+ logger.info(`Plan affiné après le screenshot #${workflowScreenshots.length} :\n${replanResult.plan}`);
3908
+ }
3909
+ conversationMessages.push({ role: 'tool', tool_call_id: toolCall.id, content: `Workflow screenshot "${label}" captured successfully. Continue with the next step.` });
3910
+ break;
3911
+ }
3912
+ // --- ready_to_capture ---
3913
+ if (name === 'ready_to_capture') {
3914
+ throwIfAborted(config.abortSignal, 'Agent run cancelled.');
3915
+ const assessment = args.assessment || 'Ready';
3916
+ const forceFlag = args.force === true;
3917
+ const expectsDialogTarget = config.variantManifest?.currentPageIdentity?.dialogTarget === true;
3918
+ const expectsGalleryDetail = config.variantManifest?.currentPageIdentity?.kind === 'gallery'
3919
+ && GALLERY_DETAIL_PROMPT_RE.test(`${config.variantManifest?.currentPageId || ''} ${config.prompt}`.toLowerCase());
3920
+ // Skip the deterministic dialog check when:
3921
+ // 1. The agent explicitly sets force=true (dialog is the intended target)
3922
+ // 2. The dialog check has already failed 2+ times consecutively (auto-bypass to vision)
3923
+ // 3. The current capture intentionally targets a dialog/modal
3924
+ // 4. The gallery prompt expects a detail view (which opens as a dialog)
3925
+ // 5. The run is a language-only preflight
3926
+ const skipDialogCheck = forceFlag
3927
+ || consecutiveDialogFailures >= 2
3928
+ || expectsDialogTarget
3929
+ || expectsGalleryDetail
3930
+ || config.runMode === 'language_preflight';
3931
+ const verification = await verifyScreenshot(client, config, modelState, browser, assessment, ++stepCounter, {
3932
+ skipDialogCheck,
3933
+ allowDeterministicSuccess: !forceFlag && !config.variantManifest?.currentPageId,
3934
+ });
3935
+ const { verified, reason, usage: verifyUsage, fatal } = verification;
3936
+ lastVerificationResult = verification;
3937
+ if (verifyUsage)
3938
+ usageLog.push(verifyUsage);
3939
+ if (fatal) {
3940
+ conversationMessages.push({ role: 'tool', tool_call_id: toolCall.id, content: `Verification error: ${reason || 'Fatal error'}` });
3941
+ return {
3942
+ success: false, screenshotPath: null, screenshots: workflowScreenshots, iterations: iteration,
3943
+ actions: actionHistory, assessment: reason || 'Verification failed',
3944
+ diagnostic: { screenshot: pageState.screenshot, url: browser.currentPage.url(), interactiveElements: pageState.interactiveElements, accessibilityTreeSnippet: pageState.accessibilityTree.slice(0, 5000), giveUpReason: reason || 'Verification failed' },
3945
+ usage: usageLog, runtimeStrategy: 'full_llm', deterministicRecoveryUsed: usedDeterministicRecovery, evaluatorUsed: false,
3946
+ verification,
3947
+ };
3948
+ }
3949
+ if (verified) {
3950
+ consecutiveDialogFailures = 0;
3951
+ consecutiveVerificationFailures = 0;
3952
+ consecutiveTechnicalVerificationFailures = 0;
3953
+ conversationMessages.push({ role: 'tool', tool_call_id: toolCall.id, content: 'Verification passed. Workflow complete.' });
3954
+ getPostHog().capture({
3955
+ distinctId: config.analyticsId ?? DISTINCT_ID,
3956
+ event: 'agent_run_succeeded',
3957
+ properties: {
3958
+ url: config.url, model: modelState.active, theme: config.currentTheme, lang: config.currentLang,
3959
+ iterations: iteration, workflow_screenshots: workflowScreenshots.length,
3960
+ total_tokens: usageLog.reduce((s, u) => s + (u.totalTokens ?? 0), 0),
3961
+ cache_read_tokens: usageLog.reduce((s, u) => s + (u.cacheReadTokens ?? 0), 0),
3962
+ cache_hit_rate: (() => {
3963
+ const prompt = usageLog.reduce((s, u) => s + (u.promptTokens ?? 0), 0);
3964
+ const cached = usageLog.reduce((s, u) => s + (u.cacheReadTokens ?? 0), 0);
3965
+ return prompt > 0 ? Math.round((cached / prompt) * 100) : 0;
3966
+ })(),
3967
+ },
3968
+ });
3969
+ return {
3970
+ success: true, screenshotPath: null, screenshots: workflowScreenshots, iterations: iteration,
3971
+ actions: actionHistory, assessment, usage: usageLog, runtimeStrategy: 'full_llm',
3972
+ deterministicRecoveryUsed: usedDeterministicRecovery, evaluatorUsed: false,
3973
+ verification,
3974
+ };
3975
+ }
3976
+ lastVerificationFailure = reason || 'Verification failed';
3977
+ consecutiveVerificationFailures++;
3978
+ if (isTechnicalVerificationFailureReason(lastVerificationFailure)) {
3979
+ consecutiveTechnicalVerificationFailures++;
3980
+ }
3981
+ else {
3982
+ consecutiveTechnicalVerificationFailures = 0;
3983
+ }
3984
+ // Bail-out: if verification has failed too many times in a row, auto-accept the capture.
3985
+ // This prevents infinite loops when the verification model consistently fails to return
3986
+ // structured decisions despite the page being visually correct.
3987
+ const VERIFICATION_BAILOUT_THRESHOLD = 3;
3988
+ if (consecutiveTechnicalVerificationFailures >= VERIFICATION_BAILOUT_THRESHOLD) {
3989
+ logger.info('Vérification ignorée après plusieurs échecs techniques — capture acceptée automatiquement.');
3990
+ consecutiveVerificationFailures = 0;
3991
+ consecutiveTechnicalVerificationFailures = 0;
3992
+ consecutiveDialogFailures = 0;
3993
+ appendActionHistory(actionHistory, config, { iteration, action: 'ready_to_capture', params: args, success: true, outcome: `Auto-accepted after ${VERIFICATION_BAILOUT_THRESHOLD} consecutive verification failures` });
3994
+ conversationMessages.push({ role: 'tool', tool_call_id: toolCall.id, content: 'Verification passed (bail-out after repeated failures). Workflow complete.' });
3995
+ return {
3996
+ success: true, screenshotPath: null, screenshots: workflowScreenshots, iterations: iteration,
3997
+ actions: actionHistory, assessment, usage: usageLog, runtimeStrategy: 'full_llm',
3998
+ deterministicRecoveryUsed: usedDeterministicRecovery, evaluatorUsed: false,
3999
+ verification: { ...verification, verified: true, mode: 'bailout' },
4000
+ };
4001
+ }
4002
+ // Track consecutive dialog-related failures so we auto-bypass on the next attempt
4003
+ if (reason && reason.includes('Blocking dialog')) {
4004
+ consecutiveDialogFailures++;
4005
+ }
4006
+ else {
4007
+ consecutiveDialogFailures = 0;
4008
+ }
4009
+ appendActionHistory(actionHistory, config, { iteration, action: 'ready_to_capture', params: args, success: false, error: `Verification failed: ${lastVerificationFailure}` });
4010
+ const hint = reason && reason.includes('Blocking dialog')
4011
+ ? ` A dialog/modal is blocking the page. Close it first by pressing Escape or clicking outside. If the dialog/modal IS the intended capture target mentioned in the task, call ready_to_capture again with force=true.`
4012
+ : verification.duplicateOfPageId
4013
+ ? ` The current screen duplicates page "${verification.duplicateOfPageId}". Reach the distinct page/state for "${config.variantManifest?.currentPageId ?? 'current'}" before capturing.`
4014
+ : '';
4015
+ // After 2+ consecutive verification failures, regenerate the plan from current state
4016
+ // so the agent gets fresh direction instead of retrying the same approach.
4017
+ if (consecutiveVerificationFailures === 2 && iteration - lastReplanIteration >= 3) {
4018
+ lastReplanIteration = iteration;
4019
+ const replanScreenshot = await browser.takeScreenshot();
4020
+ const verificationReplan = await callPlanner(client, config.prompt, browser.currentPage.url(), replanScreenshot, {
4021
+ model: modelState.active,
4022
+ fallbackModel: config.fallbackModel,
4023
+ visionModel: config.visionModel,
4024
+ lang: config.currentLang,
4025
+ theme: config.currentTheme,
4026
+ currentObjective: config.currentObjective,
4027
+ captureCursorSummary: summarizeCaptureCursorForPlanner(config),
4028
+ remainingCaptureQueue: config.remainingCaptureQueue,
4029
+ variantManifestSummary: summarizeVariantManifestForPlanner(config.variantManifest),
4030
+ completedMilestones: workflowScreenshots.length || 1,
4031
+ signal: config.abortSignal,
4032
+ uploadImage: config.uploadImage,
4033
+ stepCounter: ++stepCounter,
4034
+ providerPreferences: config.providerPreferences,
4035
+ }).catch(() => ({ plan: null, usage: null }));
4036
+ if (verificationReplan.usage)
4037
+ usageLog.push(verificationReplan.usage);
4038
+ if (verificationReplan.model && verificationReplan.model !== modelState.active) {
4039
+ modelState.active = verificationReplan.model;
4040
+ }
4041
+ if (verificationReplan.plan) {
4042
+ taskPlan = verificationReplan.plan;
4043
+ logger.info(`Nouveau plan d'action après échec de vérification :\n${verificationReplan.plan}`);
4044
+ }
4045
+ }
4046
+ const loopHint = consecutiveTechnicalVerificationFailures >= 2
4047
+ ? ' IMPORTANT: Verification has failed multiple times due to technical validator issues. If the page is visually correct, you may retry ready_to_capture with force=true.'
4048
+ : '';
4049
+ conversationMessages.push({ role: 'tool', tool_call_id: toolCall.id, content: `Verification failed: ${lastVerificationFailure}.${hint}${loopHint} Fix the issues (overlays, spinners, wrong page) and call ready_to_capture again.` });
4050
+ break;
4051
+ }
4052
+ // --- give_up ---
4053
+ if (name === 'give_up') {
4054
+ const reason = args.reason || 'Unknown reason';
4055
+ if (config.credentials?.password && HIDDEN_PASSWORD_GIVE_UP_RE.test(reason)) {
4056
+ const correction = 'The password is available in credentials but intentionally hidden from the prompt. Continue with the email/password login flow.';
4057
+ logger.debug(`Rejecting incorrect give_up: ${correction}`);
4058
+ appendActionHistory(actionHistory, config, { iteration, action: 'wait', params: { reason: 'hidden_password_available' }, success: false, error: `INVALID_GIVE_UP: ${reason}. ${correction}` });
4059
+ conversationMessages.push({ role: 'tool', tool_call_id: toolCall.id, content: `REJECTED: ${correction}` });
4060
+ break;
4061
+ }
4062
+ const prematureGiveUpCorrection = rejectedGiveUps < 2
4063
+ ? inferPrematureGiveUpCorrection({
4064
+ reason,
4065
+ actionHistory,
4066
+ lastVerificationFailure,
4067
+ iteration,
4068
+ maxIterations: config.maxIterations,
4069
+ })
4070
+ : null;
4071
+ if (prematureGiveUpCorrection) {
4072
+ rejectedGiveUps += 1;
4073
+ logger.debug(`Rejecting premature give_up: ${prematureGiveUpCorrection}`);
4074
+ appendActionHistory(actionHistory, config, {
4075
+ iteration,
4076
+ action: 'wait',
4077
+ params: { reason: 'premature_give_up_rejected' },
4078
+ success: false,
4079
+ error: `INVALID_GIVE_UP: ${reason}. ${prematureGiveUpCorrection}`,
4080
+ });
4081
+ conversationMessages.push({ role: 'tool', tool_call_id: toolCall.id, content: `REJECTED: ${prematureGiveUpCorrection}` });
4082
+ break;
4083
+ }
4084
+ logger.error(`L'agent a abandonné : ${reason}`);
4085
+ let diagnostic;
4086
+ try {
4087
+ const diagScreenshot = await browser.takeScreenshot();
4088
+ diagnostic = { screenshot: diagScreenshot, url: browser.currentPage.url(), interactiveElements: pageState.interactiveElements, accessibilityTreeSnippet: pageState.accessibilityTree.slice(0, 5000), giveUpReason: reason };
4089
+ }
4090
+ catch {
4091
+ logger.error('Failed to capture diagnostic state');
4092
+ }
4093
+ conversationMessages.push({ role: 'tool', tool_call_id: toolCall.id, content: 'Acknowledged.' });
4094
+ getPostHog().capture({
4095
+ distinctId: config.analyticsId ?? DISTINCT_ID,
4096
+ event: 'agent_gave_up',
4097
+ properties: {
4098
+ url: config.url, model: modelState.active, theme: config.currentTheme, lang: config.currentLang,
4099
+ reason, iterations: iteration,
4100
+ total_tokens: usageLog.reduce((s, u) => s + (u.totalTokens ?? 0), 0),
4101
+ cache_read_tokens: usageLog.reduce((s, u) => s + (u.cacheReadTokens ?? 0), 0),
4102
+ cache_hit_rate: (() => {
4103
+ const prompt = usageLog.reduce((s, u) => s + (u.promptTokens ?? 0), 0);
4104
+ const cached = usageLog.reduce((s, u) => s + (u.cacheReadTokens ?? 0), 0);
4105
+ return prompt > 0 ? Math.round((cached / prompt) * 100) : 0;
4106
+ })(),
4107
+ },
4108
+ });
4109
+ return {
4110
+ success: false, screenshotPath: null, screenshots: workflowScreenshots, iterations: iteration,
4111
+ actions: actionHistory, assessment: reason, diagnostic, usage: usageLog, runtimeStrategy: 'full_llm',
4112
+ deterministicRecoveryUsed: usedDeterministicRecovery, evaluatorUsed: false,
4113
+ verification: lastVerificationResult,
4114
+ };
4115
+ }
4116
+ // --- OAuth guard ---
4117
+ if (name === 'click' && hasCredentials) {
4118
+ let blockedText = null;
4119
+ if (args.index !== undefined) {
4120
+ const targetEl = pageState.interactiveElements.find(el => el.index === args.index);
4121
+ if (targetEl && isOAuthElement(targetEl))
4122
+ blockedText = targetEl.text;
4123
+ }
4124
+ if (!blockedText && args.selector) {
4125
+ const sel = args.selector.toLowerCase();
4126
+ const oauthEl = pageState.interactiveElements.find(el => isOAuthElement(el) && el.selector.toLowerCase() === sel);
4127
+ if (oauthEl)
4128
+ blockedText = oauthEl.text;
4129
+ }
4130
+ if (!blockedText && args.x !== undefined && args.y !== undefined) {
4131
+ const x = args.x;
4132
+ const y = args.y;
4133
+ const oauthEl = pageState.interactiveElements.find(el => {
4134
+ if (!isOAuthElement(el) || !el.boundingBox)
4135
+ return false;
4136
+ const bb = el.boundingBox;
4137
+ return x >= bb.x && x <= bb.x + bb.width && y >= bb.y && y <= bb.y + bb.height;
4138
+ });
4139
+ if (oauthEl)
4140
+ blockedText = oauthEl.text;
4141
+ }
4142
+ if (blockedText) {
4143
+ const blockMsg = `BLOCKED: OAuth element "${blockedText}". Look for the email/password login option instead.`;
4144
+ logger.error(`BLOCKED: click on OAuth element "${blockedText}" — use email login instead`);
4145
+ appendActionHistory(actionHistory, config, { iteration, action: 'click', params: args, success: false, error: blockMsg });
4146
+ conversationMessages.push({ role: 'tool', tool_call_id: toolCall.id, content: blockMsg });
4147
+ break;
4148
+ }
4149
+ }
4150
+ // --- Regular browser actions ---
4151
+ const repeatedActionGuard = inferRepeatedActionGuard({
4152
+ actionHistory,
4153
+ action: name,
4154
+ args,
4155
+ currentUrl: browser.currentPage.url(),
4156
+ });
4157
+ if (repeatedActionGuard) {
4158
+ logger.debug(`Blocking repeated ineffective action: ${name}`);
4159
+ appendActionHistory(actionHistory, config, {
4160
+ iteration,
4161
+ action: name,
4162
+ params: args,
4163
+ success: false,
4164
+ error: repeatedActionGuard,
4165
+ stateChanged: false,
4166
+ });
4167
+ conversationMessages.push({ role: 'tool', tool_call_id: toolCall.id, content: repeatedActionGuard });
4168
+ break;
4169
+ }
4170
+ const execResult = await executeAction(browser, name, args);
4171
+ throwIfAborted(config.abortSignal, 'Agent run cancelled.');
4172
+ // Store in action history (for replay and telemetry)
4173
+ const storedParams = sanitizeCredentialParams({ ...args }, config.credentials);
4174
+ delete storedParams.reason;
4175
+ if (name === 'click' || name === 'type_text' || name === 'select_option' || name === 'scroll') {
4176
+ const resolvedElement = args.index !== undefined
4177
+ ? pageState.interactiveElements.find((element) => element.index === args.index)
4178
+ : (args.x !== undefined && args.y !== undefined)
4179
+ ? findElementForPoint(pageState.interactiveElements, args.x, args.y)
4180
+ : null;
4181
+ if (resolvedElement) {
4182
+ if (!containsInternalAutomationSelector(resolvedElement.selector)) {
4183
+ storedParams.selector = resolvedElement.selector;
4184
+ }
4185
+ else {
4186
+ delete storedParams.selector;
4187
+ }
4188
+ if (resolvedElement.href)
4189
+ storedParams.href = resolvedElement.href;
4190
+ const label = resolvedElement.text || resolvedElement.ariaLabel || resolvedElement.inputType || resolvedElement.tag;
4191
+ if (label)
4192
+ storedParams.elementLabel = label;
4193
+ // Store structural metadata for cross-language replay matching
4194
+ storedParams.elementTag = resolvedElement.tag;
4195
+ storedParams.elementRole = resolvedElement.role;
4196
+ if (resolvedElement.boundingBox) {
4197
+ storedParams.elementCx = Math.round(resolvedElement.boundingBox.x + resolvedElement.boundingBox.width / 2);
4198
+ storedParams.elementCy = Math.round(resolvedElement.boundingBox.y + resolvedElement.boundingBox.height / 2);
4199
+ }
4200
+ delete storedParams.index;
4201
+ delete storedParams.x;
4202
+ delete storedParams.y;
4203
+ }
4204
+ }
4205
+ // Record URLs for replay validation — allows detecting divergence
4206
+ // when the same actions are replayed in a different language/variant.
4207
+ storedParams.preActionUrl = browser.currentPage.url();
4208
+ if (execResult.success && execResult.stateChanged && (name === 'click' || name === 'navigate_to')) {
4209
+ storedParams.postActionUrl = browser.currentPage.url();
4210
+ }
4211
+ appendActionHistory(actionHistory, config, { iteration, action: name, params: storedParams, success: execResult.success, error: execResult.error, outcome: execResult.outcome, stateChanged: execResult.stateChanged });
4212
+ // Add tool result to conversation thread — this is how the LLM learns what happened
4213
+ const toolResultContent = formatToolResult(name, args, execResult, pageState.interactiveElements);
4214
+ conversationMessages.push({ role: 'tool', tool_call_id: toolCall.id, content: toolResultContent });
4215
+ if (!execResult.success) {
4216
+ logger.error(`Échec de l'action : ${execResult.error}`);
4217
+ }
4218
+ const postActionDelayMs = getPostActionDelayMs(name, execResult, {
4219
+ authSubmitAction: isLikelyAuthenticationSubmitAction(name, args),
4220
+ });
4221
+ if (postActionDelayMs > 0) {
4222
+ await browser.wait(postActionDelayMs);
4223
+ }
4224
+ }
4225
+ }
4226
+ // Max iterations exhausted
4227
+ logger.error('Max iterations reached');
4228
+ getPostHog().capture({
4229
+ distinctId: config.analyticsId ?? DISTINCT_ID,
4230
+ event: 'agent_max_iterations_reached',
4231
+ properties: {
4232
+ url: config.url,
4233
+ model: modelState.active,
4234
+ theme: config.currentTheme,
4235
+ lang: config.currentLang,
4236
+ max_iterations: config.maxIterations,
4237
+ total_tokens: usageLog.reduce((s, u) => s + (u.totalTokens ?? 0), 0),
4238
+ cache_read_tokens: usageLog.reduce((s, u) => s + (u.cacheReadTokens ?? 0), 0),
4239
+ cache_hit_rate: (() => {
4240
+ const prompt = usageLog.reduce((s, u) => s + (u.promptTokens ?? 0), 0);
4241
+ const cached = usageLog.reduce((s, u) => s + (u.cacheReadTokens ?? 0), 0);
4242
+ return prompt > 0 ? Math.round((cached / prompt) * 100) : 0;
4243
+ })(),
4244
+ },
4245
+ });
4246
+ return {
4247
+ success: false,
4248
+ screenshotPath: null,
4249
+ screenshots: workflowScreenshots,
4250
+ iterations: config.maxIterations,
4251
+ actions: actionHistory,
4252
+ assessment: 'Max iterations reached without completing the task.',
4253
+ usage: usageLog,
4254
+ runtimeStrategy: 'full_llm',
4255
+ deterministicRecoveryUsed: usedDeterministicRecovery,
4256
+ evaluatorUsed: false,
4257
+ verification: lastVerificationResult,
4258
+ };
4259
+ }
4260
/**
 * Action names that are safe to replay from a recording.
 * Terminal and meta actions are deliberately left out of this list.
 */
const REPLAYABLE_ACTIONS = [
    'navigate_to',
    'click',
    'type_text',
    'select_option',
    'scroll',
    'press_key',
    'wait',
    'resize_viewport',
    'dismiss_overlays',
    // Legacy aliases, retained so that older recordings can still be replayed.
    'hover',
    'safe_expand',
    'scroll_to_element',
];
4267
/**
 * Replay a previous successful capture's recorded actions instead of running the full AI agent.
 * Falls back to a full `runAgent` if replay is skipped, fails, or its verification does not pass
 * (unless the caller opted into replay-only mode).
 * Much cheaper than a full agent run: only 1 verification call vs. N agent iterations.
 *
 * @param browser - Browser wrapper used to drive and inspect the page.
 * @param config - Agent run configuration (url, model, prompt, credentials, abortSignal, variantManifest, ...).
 * @param apiKey - API key used to create the LLM client and for the fallback agent run.
 * @param recordedActions - Action history recorded by an earlier run.
 * @param options - Optional settings.
 * @param options.allowFullAgentFallback - Defaults to true. When explicitly `false`, the function
 *   never calls `runAgent`; it returns a failed result with `runtimeStrategy: 'action_replay'` instead.
 * @returns A result object. `runtimeStrategy` is `'action_replay'` when the outcome comes from the
 *   replay itself, or `'action_replay_fallback'` when the full agent was run instead.
 * @throws Re-throws abort errors (from `throwIfAborted` or detected via `isAbortError`), and any
 *   error raised by the fallback `runAgent` call.
 */
export async function replayAgent(browser, config, apiKey, recordedActions, options = {}) {
    const client = createClient(apiKey);
    const modelState = { active: config.model };
    const usageLog = [];
    const allowFullAgentFallback = options.allowFullAgentFallback !== false;

    // Builds the failed result returned when the outcome stays attributed to the replay itself.
    // `verification` is only attached when a verification call actually happened, so the
    // result shape matches each call site (key absent rather than `undefined`).
    const buildReplayOnlyFailure = async (message, verification) => ({
        success: false,
        screenshotPath: null,
        screenshots: [],
        iterations: 0,
        actions: recordedActions,
        assessment: message,
        diagnostic: {
            screenshot: await browser.takeScreenshot().catch(() => Buffer.alloc(0)),
            url: browser.currentPage.url(),
            interactiveElements: [],
            accessibilityTreeSnippet: '',
            giveUpReason: message,
        },
        usage: usageLog,
        runtimeStrategy: 'action_replay',
        deterministicRecoveryUsed: false,
        evaluatorUsed: false,
        ...(verification ? { verification } : {}),
    });

    // Returns to the target URL if needed, runs the full agent, and tags the result as a fallback.
    const fallBackToFullAgent = async () => {
        if (!urlsRoughlyMatch(config.url, browser.currentPage.url())) {
            await browser.navigateTo(config.url);
        }
        const fallback = await runAgent(browser, config, apiKey);
        const result = {
            ...fallback,
            runtimeStrategy: 'action_replay_fallback',
        };
        // Keep the usage already spent during replay (the verification call); without this the
        // tokens consumed before the fallback would be silently dropped from the result.
        if (usageLog.length > 0) {
            result.usage = [...usageLog, ...(Array.isArray(fallback.usage) ? fallback.usage : [])];
        }
        return result;
    };

    const replayObservation = await browser.captureObservation().catch(() => null);
    const replayAnalysis = analyzeReplayCandidate(recordedActions, {
        currentUrl: browser.currentPage.url(),
        targetUrl: config.url,
        currentViewport: browser.currentPage.viewportSize(),
        isAuthenticated: config.sessionProfile?.authState === 'authenticated',
        currentDialogCount: replayObservation?.dialogCount ?? null,
        pageIdentity: config.variantManifest?.currentPageIdentity ?? null,
    });
    const replayable = replayAnalysis.replayableActions;
    if (replayAnalysis.skipReason) {
        if (!allowFullAgentFallback) {
            logger.error(`Replay skipped (${replayAnalysis.skipReason}), staying in replay-only mode`);
            return buildReplayOnlyFailure(`Action replay skipped: ${replayAnalysis.skipReason}`);
        }
        logger.info(`Action replay skipped (${replayAnalysis.skipReason}), falling back to full agent run`);
        return fallBackToFullAgent();
    }
    logger.info(`Action replay: replaying ${replayable.length} actions...`);
    // Actions that work regardless of URL — these are often used between page transitions and
    // don't depend on being on a specific URL, so pre-action URL validation is skipped for them.
    const URL_INDEPENDENT_ACTIONS = new Set(['press_key', 'scroll', 'scroll_to_element', 'wait', 'dismiss_overlays', 'resize_viewport']);
    try {
        for (const recorded of replayable) {
            throwIfAborted(config.abortSignal, 'Replay cancelled.');
            // Pre-action URL validation: if the recorded action stored the URL it was
            // executed from, verify the browser is on the same page before attempting it.
            const expectedPreUrl = typeof recorded.params.preActionUrl === 'string' ? recorded.params.preActionUrl : null;
            if (expectedPreUrl && !URL_INDEPENDENT_ACTIONS.has(recorded.action)) {
                const currentUrl = browser.currentPage.url();
                if (!urlsRoughlyMatch(expectedPreUrl, currentUrl) && !urlsRoughlyMatch(currentUrl, expectedPreUrl)) {
                    throw new Error(`Replay starting URL mismatch for "${recorded.action}": expected ${expectedPreUrl}, browser is on ${currentUrl}`);
                }
            }
            // Only fetch the interactive elements when the action needs one to anchor on.
            const interactiveElements = replayActionRequiresAnchor(recorded)
                ? await browser.getInteractiveElements().catch(() => [])
                : [];
            const resolvedReplayAction = resolveReplayActionArgs(recorded, interactiveElements);
            if (!resolvedReplayAction.args) {
                throw new Error(resolvedReplayAction.reason ?? `Action "${recorded.action}" cannot be replayed on the current page`);
            }
            const resolvedReplayArgs = resolveActionCredentialArgs(recorded.action, resolvedReplayAction.args, config.credentials);
            // Force clicks during replay to bypass pointer-event interception
            // (e.g., after dark theme switch, <html class="dark"> may briefly intercept events)
            if (recorded.action === 'click' || recorded.action === 'safe_expand') {
                resolvedReplayArgs.__forceClick = true;
            }
            const result = await executeAction(browser, recorded.action, resolvedReplayArgs);
            throwIfAborted(config.abortSignal, 'Replay cancelled.');
            if (!result.success) {
                throw new Error(`Action "${recorded.action}" failed: ${result.error}`);
            }
            // Replay needs longer delays than live agent — the live agent adapts to
            // page reactions but replay fires actions blindly. Use the full delay for
            // state-changing clicks (page transitions, modals) to let async operations settle.
            const baseDelay = getPostActionDelayMs(recorded.action, result, {
                authSubmitAction: isLikelyAuthenticationSubmitAction(recorded.action, recorded.params),
            });
            const postActionDelayMs = result.stateChanged ? Math.max(baseDelay, 300) : baseDelay;
            if (postActionDelayMs > 0) {
                await browser.wait(postActionDelayMs);
            }
            // Validate intermediate URL: if the recorded action caused a navigation,
            // verify the replay landed on a matching URL. This catches divergence early
            // (e.g., a translated button click that navigates to a different page).
            const expectedUrl = typeof recorded.params.postActionUrl === 'string' ? recorded.params.postActionUrl : null;
            if (expectedUrl && result.stateChanged) {
                const actualUrl = browser.currentPage.url();
                if (!urlsRoughlyMatch(expectedUrl, actualUrl) && !urlsRoughlyMatch(actualUrl, expectedUrl)) {
                    throw new Error(`Replay URL divergence after "${recorded.action}": expected ${expectedUrl}, got ${actualUrl}`);
                }
            }
        }
    }
    catch (err) {
        // Cancellation must propagate; it is never a reason to fall back.
        if (isAbortError(err)) {
            throw err;
        }
        // A thrown value is not guaranteed to be an Error, so don't assume `.message` exists.
        const replayError = err instanceof Error ? err.message : String(err);
        if (!allowFullAgentFallback) {
            logger.error(`Replay failed (${replayError}), staying in replay-only mode`);
            return buildReplayOnlyFailure(`Action replay failed: ${replayError}`);
        }
        logger.error(`Replay failed (${replayError}), falling back to full agent run`);
        return fallBackToFullAgent();
    }
    // One verification call to confirm the page state looks correct
    logger.info('Replay done, verifying...');
    throwIfAborted(config.abortSignal, 'Replay cancelled.');
    const replayExpectsGalleryDetail = config.variantManifest?.currentPageIdentity?.kind === 'gallery'
        && GALLERY_DETAIL_PROMPT_RE.test(`${config.variantManifest?.currentPageId || ''} ${config.prompt}`.toLowerCase());
    const verification = await verifyScreenshot(client, config, modelState, browser, 'Page prepared via action replay', 1, {
        skipDialogCheck: config.variantManifest?.currentPageIdentity?.dialogTarget
            || replayExpectsGalleryDetail,
    });
    const { verified, reason, usage: verifyUsage, fatal } = verification;
    if (verifyUsage) {
        usageLog.push(verifyUsage);
    }
    if (fatal) {
        // Fatal verification errors never trigger the full-agent fallback.
        return buildReplayOnlyFailure(reason || 'Replay verification failed', verification);
    }
    if (verified) {
        logger.success('Action replay verified');
        getPostHog().capture({
            distinctId: config.analyticsId ?? DISTINCT_ID,
            event: 'agent_run_succeeded',
            properties: {
                url: config.url,
                model: modelState.active,
                theme: config.currentTheme,
                lang: config.currentLang,
                iterations: 0,
                replay: true,
                total_tokens: usageLog.reduce((s, u) => s + (u.totalTokens ?? 0), 0),
            },
        });
        return {
            success: true,
            screenshotPath: null,
            screenshots: [],
            iterations: 0,
            actions: recordedActions,
            assessment: 'Captured via action replay',
            usage: usageLog,
            runtimeStrategy: 'action_replay',
            deterministicRecoveryUsed: false,
            evaluatorUsed: false,
            verification,
        };
    }
    // Verification failed — reset and fall back to full agent run
    if (!allowFullAgentFallback) {
        logger.error(`Replay verification failed (${reason}), staying in replay-only mode`);
        return buildReplayOnlyFailure(reason || 'Replay verification failed', verification);
    }
    logger.error(`Replay verification failed (${reason}), falling back to full agent run`);
    return fallBackToFullAgent();
}
4511
+ //# sourceMappingURL=agent.js.map