autokap 1.0.5 → 1.0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/assets/chrome/ios-statusbar-comparison-reference.jpg +0 -0
- package/assets/chrome/ios-statusbar-dark-reference.jpg +0 -0
- package/assets/chrome/ios-statusbar-light-reference.jpg +0 -0
- package/assets/devices/ipad-pro-11-m4.json +52 -0
- package/assets/devices/iphone-16-pro.json +53 -0
- package/assets/devices/macbook-air-13.json +45 -0
- package/assets/frames/MacBook Air 13.svg +242 -0
- package/assets/frames/Status bar - iPhone.png +0 -0
- Menu bar- iPad.png +0 -0
- package/assets/frames/iPad Pro M4 11_.png +0 -0
- package/assets/frames/iPhone 16 Pro.png +0 -0
- package/assets/icons/Cellular Connection.svg +3 -0
- package/assets/icons/Union.svg +6 -0
- package/assets/icons/Wifi.svg +3 -0
- package/assets/icons/battery.svg +5 -0
- package/assets/icons/battery_charging.svg +8 -0
- package/dist/abort.d.ts +5 -0
- package/dist/abort.js +44 -0
- package/dist/agent.d.ts +142 -0
- package/dist/agent.js +4511 -0
- package/dist/billing-operation-logging.d.ts +38 -0
- package/dist/billing-operation-logging.js +248 -0
- package/dist/browser-bar.d.ts +40 -0
- package/dist/browser-bar.js +147 -0
- package/dist/browser.d.ts +25 -0
- package/dist/browser.js +177 -9
- package/dist/capture-alt-text.d.ts +12 -0
- package/dist/capture-alt-text.js +51 -0
- package/dist/capture-encryption.d.ts +10 -0
- package/dist/capture-encryption.js +41 -0
- package/dist/capture-language-preflight.d.ts +41 -0
- package/dist/capture-language-preflight.js +286 -0
- package/dist/capture-llm-page-identity.d.ts +15 -0
- package/dist/capture-llm-page-identity.js +116 -0
- package/dist/capture-model-resolution.d.ts +9 -0
- package/dist/capture-model-resolution.js +21 -0
- package/dist/capture-page-identity.d.ts +9 -0
- package/dist/capture-page-identity.js +219 -0
- package/dist/capture-preset-credentials.d.ts +12 -0
- package/dist/capture-preset-credentials.js +57 -0
- package/dist/capture-request-plan.d.ts +58 -0
- package/dist/capture-request-plan.js +216 -0
- package/dist/capture-run-optimizer.d.ts +139 -0
- package/dist/capture-run-optimizer.js +848 -0
- package/dist/capture-selector-memory.d.ts +26 -0
- package/dist/capture-selector-memory.js +327 -0
- package/dist/capture-session-profile-encryption.d.ts +2 -0
- package/dist/capture-session-profile-encryption.js +22 -0
- package/dist/capture-step-timeout.d.ts +10 -0
- package/dist/capture-step-timeout.js +30 -0
- package/dist/capture-studio-sync.d.ts +22 -0
- package/dist/capture-studio-sync.js +166 -0
- package/dist/capture-variant-state.d.ts +54 -0
- package/dist/capture-variant-state.js +156 -0
- package/dist/cli.js +21 -0
- package/dist/clip-orchestrator.d.ts +148 -0
- package/dist/clip-orchestrator.js +950 -0
- package/dist/clip-postprocess.d.ts +42 -0
- package/dist/clip-postprocess.js +192 -0
- package/dist/cost-logging.d.ts +27 -0
- package/dist/cost-logging.js +128 -0
- package/dist/credential-templates.d.ts +5 -0
- package/dist/credential-templates.js +60 -0
- package/dist/element-capture.d.ts +53 -0
- package/dist/element-capture.js +766 -0
- package/dist/hybrid-navigator.d.ts +138 -0
- package/dist/hybrid-navigator.js +468 -0
- package/dist/index.d.ts +15 -0
- package/dist/index.js +11 -0
- package/dist/llm-usage.d.ts +17 -0
- package/dist/llm-usage.js +45 -0
- package/dist/mockup-html.d.ts +119 -0
- package/dist/mockup-html.js +253 -0
- package/dist/mockup.d.ts +94 -0
- package/dist/mockup.js +608 -0
- package/dist/mouse-animation.d.ts +46 -0
- package/dist/mouse-animation.js +100 -0
- package/dist/overlay-utils.d.ts +14 -0
- package/dist/overlay-utils.js +13 -0
- package/dist/posthog.d.ts +4 -0
- package/dist/posthog.js +26 -0
- package/dist/prompt-cache.d.ts +10 -0
- package/dist/prompt-cache.js +24 -0
- package/dist/prompts.d.ts +167 -0
- package/dist/prompts.js +1165 -0
- package/dist/remote-browser.d.ts +191 -0
- package/dist/remote-browser.js +305 -0
- package/dist/security.d.ts +20 -0
- package/dist/security.js +569 -0
- package/dist/server-capture-runtime.d.ts +123 -0
- package/dist/server-capture-runtime.js +638 -0
- package/dist/server-credit-usage.d.ts +12 -0
- package/dist/server-credit-usage.js +41 -0
- package/dist/server-posthog.d.ts +2 -0
- package/dist/server-posthog.js +16 -0
- package/dist/server-project-webhooks.d.ts +45 -0
- package/dist/server-project-webhooks.js +97 -0
- package/dist/server-screenshot-watermark.d.ts +7 -0
- package/dist/server-screenshot-watermark.js +38 -0
- package/dist/session-profile.d.ts +86 -0
- package/dist/session-profile.js +1373 -0
- package/dist/sf-pro-fonts.d.ts +4 -0
- package/dist/sf-pro-fonts.js +7 -0
- package/dist/status-bar-l10n.d.ts +14 -0
- package/dist/status-bar-l10n.js +177 -0
- package/dist/status-bar.d.ts +44 -0
- package/dist/status-bar.js +336 -0
- package/dist/tools.d.ts +4 -0
- package/dist/tools.js +578 -0
- package/dist/video-agent.d.ts +143 -0
- package/dist/video-agent.js +4783 -0
- package/dist/video-observation.d.ts +36 -0
- package/dist/video-observation.js +192 -0
- package/dist/video-planner.d.ts +12 -0
- package/dist/video-planner.js +500 -0
- package/dist/video-prompts.d.ts +37 -0
- package/dist/video-prompts.js +554 -0
- package/dist/video-tools.d.ts +3 -0
- package/dist/video-tools.js +59 -0
- package/dist/video-variant-state.d.ts +29 -0
- package/dist/video-variant-state.js +80 -0
- package/dist/vision-model.d.ts +17 -0
- package/dist/vision-model.js +74 -0
- package/dist/ws-auth.d.ts +20 -0
- package/dist/ws-auth.js +67 -0
- package/dist/ws-handler.d.ts +10 -0
- package/dist/ws-handler.js +1663 -0
- package/dist/ws-server.d.ts +9 -0
- package/dist/ws-server.js +52 -0
- package/package.json +93 -39
package/dist/agent.js
ADDED
|
@@ -0,0 +1,4511 @@
|
|
|
1
|
+
import OpenAI from 'openai';
|
|
2
|
+
import { describeObservationChange } from './browser.js';
|
|
3
|
+
import { agentTools } from './tools.js';
|
|
4
|
+
import { buildSystemPrompt, buildStableAnchorUserMessage, buildIterationUserMessage, buildVerificationMessage, buildVisionObserverPrompt, } from './prompts.js';
|
|
5
|
+
import { logger, emitScreenshot, emitReasoningDelta } from './logger.js';
|
|
6
|
+
import { extractStepUsage } from './llm-usage.js';
|
|
7
|
+
import { hasManualMultiProviderOrder, resolvePromptCacheStrategy } from './prompt-cache.js';
|
|
8
|
+
import { dismissCookiesAndWidgets } from './cookie-dismiss.js';
|
|
9
|
+
import { describeSecurityTarget, evaluateActionSecurity } from './security.js';
|
|
10
|
+
import { getPostHog, DISTINCT_ID } from './posthog.js';
|
|
11
|
+
import { isAbortError, sleepWithAbort, throwIfAborted } from './abort.js';
|
|
12
|
+
import { evaluateRequestedLanguageState, evaluateRequestedThemeState, performDeterministicSessionRepair, } from './session-profile.js';
|
|
13
|
+
import { resolveActionCredentialArgs, sanitizeCredentialParams, } from './credential-templates.js';
|
|
14
|
+
import { callVisionCapableModel, VisionModelUnsupportedError, } from './vision-model.js';
|
|
15
|
+
import { createHash } from 'crypto';
|
|
16
|
+
/**
 * Produces the image URL that is embedded in an LLM message for a screenshot.
 *
 * When an uploader callback is supplied it is tried first and its result is
 * returned as-is. If no uploader is given, or the uploader rejects, the buffer
 * is inlined as a base64 `data:` URI instead.
 *
 * @param {Buffer} buffer - Raw screenshot bytes.
 * @param {string} mimeType - MIME type used for the upload and the data URI.
 * @param {(buffer: Buffer, mimeType: string) => Promise<string>} [uploadImage] - Optional uploader.
 * @returns {Promise<string>} Uploaded URL or a base64 data URI.
 */
async function makeImageUrl(buffer, mimeType, uploadImage) {
    const toDataUri = () => `data:${mimeType};base64,${buffer.toString('base64')}`;
    if (!uploadImage) {
        return toDataUri();
    }
    try {
        return await uploadImage(buffer, mimeType);
    }
    catch {
        // Upload is best-effort: any failure falls back to the inline data URI.
        return toDataUri();
    }
}
|
|
32
|
+
/**
 * Returns the hex-encoded SHA-1 digest of a screenshot buffer.
 *
 * @param {Buffer|string} buffer - Screenshot bytes to hash.
 * @returns {string} 40-character lowercase hex digest.
 */
function computeScreenshotFingerprint(buffer) {
    const hasher = createHash('sha1');
    hasher.update(buffer);
    return hasher.digest('hex');
}
|
|
35
|
+
/**
 * Builds a one-line `key=value; key=value` summary of a variant manifest.
 *
 * Optional segments (`identity`, `checkpoint`) are omitted entirely when the
 * manifest does not carry them, so the result never contains empty `; ; `
 * gaps or a trailing separator. This matches the cursor and repair-ticket
 * summarizers in this file, which also drop empty segments.
 *
 * @param {object|null|undefined} manifest - Variant manifest; falsy yields ''.
 * @returns {string} Summary string, or '' when no manifest is given.
 */
function summarizeVariantManifestForPlanner(manifest) {
    if (!manifest)
        return '';
    const parts = [
        `expected=${manifest.expectedPageIds.join(',') || 'main'}`,
        `current=${manifest.currentPageId ?? 'main'}`,
        manifest.currentPageIdentity ? `identity=${manifest.currentPageIdentity.summary}` : '',
        `completed=${manifest.completedPages.join(',') || 'none'}`,
        `remaining=${manifest.remainingPages.join(',') || 'none'}`,
        manifest.lastCheckpointId ? `checkpoint=${manifest.lastCheckpointId}` : '',
    ];
    if (manifest.captureStatuses) {
        // Only the first six page statuses are included.
        const statuses = Object.entries(manifest.captureStatuses)
            .slice(0, 6)
            .map(([pageId, status]) => `${pageId}:${status}`)
            .join('|');
        parts.push(`statuses=${statuses || 'none'}`);
    }
    if (manifest.previousValidatedCaptures.length > 0) {
        // Only the three most recent validated captures, each assessment cut to 80 chars.
        const validated = manifest.previousValidatedCaptures
            .slice(-3)
            .map((capture) => `${capture.pageId}${capture.identity ? `[${capture.identity.summary}]` : ''}:${capture.assessment.slice(0, 80)}`)
            .join(' | ');
        parts.push(`validated=${validated}`);
    }
    // Drop the '' placeholders left by absent optional segments.
    return parts.filter(Boolean).join('; ');
}
|
|
60
|
+
/**
 * Renders `config.captureCursor` as a `key=value; ...` line.
 *
 * @param {object} config - Object that may carry a `captureCursor`.
 * @returns {string} Summary line, or '' when there is no cursor.
 */
function summarizeCaptureCursorForPlanner(config) {
    const cursor = config.captureCursor;
    if (!cursor) {
        return '';
    }
    const segments = [
        `page=${cursor.pageId}`,
        `target=${cursor.targetId}`,
        `phase=${cursor.phase}`,
        `resume=${cursor.resumeFromActionIndex}`,
    ];
    // The checkpoint segment only appears once a checkpoint has been verified.
    if (cursor.lastVerifiedCheckpointId) {
        segments.push(`checkpoint=${cursor.lastVerifiedCheckpointId}`);
    }
    return segments.join('; ');
}
|
|
73
|
+
/**
 * Renders `config.activeRepairTicket` as a `key=value; ...` line.
 *
 * The ticket id, cause, status and summary are always present; each
 * expected-state field is appended only when it holds a truthy value.
 *
 * @param {object} config - Object that may carry an `activeRepairTicket`.
 * @returns {string} Summary line, or '' when there is no active ticket.
 */
function summarizeRepairTicketForPlanner(config) {
    const ticket = config.activeRepairTicket;
    if (!ticket) {
        return '';
    }
    const segments = [
        `ticket=${ticket.id}`,
        `cause=${ticket.cause}`,
        `status=${ticket.status}`,
        `summary=${ticket.summary}`,
    ];
    // [label in the summary, property on ticket.expectedState]
    const expectedStateFields = [
        ['lang', 'lang'],
        ['theme', 'theme'],
        ['auth', 'authState'],
        ['url', 'url'],
        ['page', 'pageId'],
    ];
    for (const [label, key] of expectedStateFields) {
        const expected = ticket.expectedState[key];
        if (expected) {
            segments.push(`${label}=${expected}`);
        }
    }
    return segments.join('; ');
}
|
|
89
|
+
/**
 * Looks for an already-validated capture of a different page whose
 * fingerprint equals the given one.
 *
 * @param {object|null|undefined} manifest - Variant manifest; needs a truthy `currentPageId`.
 * @param {string} fingerprint - Fingerprint of the new screenshot.
 * @returns {string|null} Page id of the first matching capture, else null.
 */
function findDuplicateVariantCapture(manifest, fingerprint) {
    const currentPageId = manifest?.currentPageId;
    if (!currentPageId) {
        return null;
    }
    for (const capture of manifest.previousValidatedCaptures) {
        const isOtherPage = capture.pageId !== currentPageId;
        // Captures without a fingerprint can never count as duplicates.
        if (isOtherPage && capture.fingerprint && capture.fingerprint === fingerprint) {
            return capture.pageId ?? null;
        }
    }
    return null;
}
|
|
97
|
+
// --- Prompt / page-id context patterns (tested against `${pageId} ${prompt}`) ---
const PRESET_EDITOR_RE = /\b(edit[_ -]?preset|preset[_ -]editor|preset[_ -]edit|preset[_ -]form)\b/i;
const PRESET_CONTEXT_RE = /\bpreset\b/i;
const CONFIGURATION_RE = /\b(edit|editor|config|configuration|configure|settings|modify|modifier|form)\b/i;
const EDITOR_PROMPT_RE = /\b(additional instructions|instructions suppl[eé]mentaires|continue with ai|continuer avec l['’]ia)\b/i;
const DIALOG_TARGET_RE = /\b(modal|dialog|drawer|popup)\b/i;

// --- URL route patterns (tested against the current URL) ---
const GALLERY_ROUTE_RE = /\/(gallery|captures?|screenshots?)\b/i;
const SETTINGS_ROUTE_RE = /\/settings\b/i;

// --- Visible-text surface patterns (English and French wording) ---
const GALLERY_SURFACE_RE = /\b(gallery|galerie|captures?|screenshots?|thumbnail|thumbnails|miniatures?)\b/i;
const GALLERY_DETAIL_PROMPT_RE = /\b(first (?:image|screenshot|result|card)|open the first|click on the first|after clicking|individual screenshots?|detail view|detailed view|subsequent page|premier resultat|premier résultat|captures individuelles?)\b/i;
const GALLERY_OVERVIEW_CONTROL_RE = /\b(filter by preset|filtrer par preset|rechercher|search|filter|preset)\b/i;
const GALLERY_GROUP_CARD_RE = /\b(download all|tout telecharger|tout télécharger|\d+\s*(?:screenshots?|captures?(?: d['’]ecran| d’écran)?))\b/i;
const SETTINGS_SURFACE_RE = /\b(settings|param[eè]tres|project name|team members|billing|workspace settings|general settings)\b/i;

// Lower-case phrases looked up with `includes()` in the visible text to
// recognise the template gallery / picker surface.
const TEMPLATE_GALLERY_MARKERS = [
    'new preset', 'nouveau preset',
    'search templates', 'rechercher des templates',
    'start from scratch', 'partir de zéro',
    'homepage hero', 'pricing page', 'mobile responsive',
    'dark mode showcase', 'multi-language', 'feature showcase',
    'marketing', 'responsive', 'video',
];

// Subject tokens that are excluded when counting dialog subject matches.
const MODAL_GENERIC_SUBJECT_TOKENS = new Set([
    'modal', 'dialog', 'drawer', 'popup', 'overlay',
    'preset', 'presets', 'template', 'templates',
    'selection', 'select', 'choose',
    'details', 'detail',
    'configuration', 'config',
    'gallery', 'captures', 'screenshots',
    'settings', 'editor',
]);

// Tested against normalized (lower-case, accent-free, punctuation-free) text.
const MODAL_CONFIGURATION_SURFACE_RE = /\b(additional instructions|instructions supplementaires|continue with ai|continuer avec (?:l )?ia|ai prompt|prompt ia)\b/i;
|
|
150
|
+
/**
 * Normalizes free text for evidence matching: lower-case, accents removed,
 * every run of non-alphanumeric characters collapsed to one space, trimmed.
 *
 * Combining marks are deleted rather than replaced by a space, so an accented
 * word stays a single word ("supplémentaires" -> "supplementaires"). The
 * accent-free patterns matched against normalized text in this file (for
 * example "instructions supplementaires" and "tout telecharger") depend on
 * this.
 *
 * @param {string} value - Raw text.
 * @returns {string} Normalized text containing only [a-z0-9] and single spaces.
 */
function normalizeEvidenceText(value) {
    return value
        .toLowerCase()
        .normalize('NFD')
        // NFD splits "é" into "e" + U+0301; drop the mark without splitting the word.
        .replace(/[\u0300-\u036f]/g, '')
        .replace(/[^a-z0-9]+/g, ' ')
        .trim();
}
|
|
158
|
+
/**
 * Splits text into normalized tokens, keeping only tokens of three or more
 * characters.
 *
 * @param {string} value - Raw text.
 * @returns {string[]} Normalized tokens in their original order.
 */
function tokenizeEvidenceText(value) {
    const tokens = [];
    for (const piece of normalizeEvidenceText(value).split(/\s+/g)) {
        const token = piece.trim();
        if (token.length >= 3) {
            tokens.push(token);
        }
    }
    return tokens;
}
|
|
164
|
+
/**
 * Counts how many "strong" subject tokens appear among the tokens of the
 * visible text and current URL.
 *
 * A token is strong when it is at least four characters long and is not in
 * MODAL_GENERIC_SUBJECT_TOKENS.
 *
 * @param {{subjectTokens: string[], visibleText: string, currentUrl: string}} params
 * @returns {number} Number of strong tokens found in the evidence.
 */
function countDialogSubjectTokenMatches(params) {
    const strongTokens = params.subjectTokens.filter((token) => token.length >= 4 && !MODAL_GENERIC_SUBJECT_TOKENS.has(token));
    if (strongTokens.length === 0) {
        return 0;
    }
    const evidence = new Set(tokenizeEvidenceText(`${params.visibleText} ${params.currentUrl}`));
    let matches = 0;
    for (const token of strongTokens) {
        if (evidence.has(normalizeEvidenceText(token))) {
            matches += 1;
        }
    }
    return matches;
}
|
|
171
|
+
/**
 * Decides whether the visible dialog content is plausible for the expected
 * modal identity.
 *
 * - `modal_selection`: needs a template-picker signal or a subject-token match.
 * - `modal_configuration`: needs a picker signal, a configuration-surface
 *   phrase, or a subject-token match.
 * - any other kind: always accepted.
 *
 * @param {{identity: {kind: string, subjectTokens: string[]}, visibleText: string, currentUrl: string}} params
 * @returns {boolean} True when the dialog surface matches the expectation.
 */
function hasExpectedDialogSurface(params) {
    const normalizedText = normalizeEvidenceText(params.visibleText);
    // One point for a generic picker word, plus one per gallery marker present.
    let selectionScore = /\b(template|templates|modele|modeles|picker|selection)\b/i.test(normalizedText) ? 1 : 0;
    for (const marker of TEMPLATE_GALLERY_MARKERS) {
        if (normalizedText.includes(normalizeEvidenceText(marker))) {
            selectionScore += 1;
        }
    }
    const configurationScore = selectionScore
        + (MODAL_CONFIGURATION_SURFACE_RE.test(normalizedText) ? 2 : 0);
    const subjectMatches = countDialogSubjectTokenMatches({
        subjectTokens: params.identity.subjectTokens,
        visibleText: params.visibleText,
        currentUrl: params.currentUrl,
    });
    switch (params.identity.kind) {
        case 'modal_selection':
            return selectionScore > 0 || subjectMatches > 0;
        case 'modal_configuration':
            return configurationScore > 0 || subjectMatches > 0;
        default:
            return true;
    }
}
|
|
190
|
+
/**
 * Detects the case where the prompt asks for a gallery detail view but the
 * page still shows the gallery overview.
 *
 * @param {{context: string, visibleText: string}} params
 * @returns {string|null} Failure message, or null when no mismatch is detected.
 */
function inferGallerySubstateFailure(params) {
    const wantsDetailView = GALLERY_DETAIL_PROMPT_RE.test(params.context);
    if (!wantsDetailView) {
        return null;
    }
    const normalizedText = normalizeEvidenceText(params.visibleText);
    // Both overview signals must be present before reporting a failure.
    const overviewStillVisible = GALLERY_OVERVIEW_CONTROL_RE.test(normalizedText)
        && GALLERY_GROUP_CARD_RE.test(normalizedText);
    return overviewStillVisible
        ? 'Expected the opened gallery result/details view, but the gallery overview/filter controls are still visible.'
        : null;
}
|
|
202
|
+
/**
 * Checks heuristically whether the current page state matches the capture
 * target and returns a human-readable reason when it does not.
 *
 * Evaluation order:
 *   1. Dialog targets (`pageIdentity.dialogTarget`): a dialog must be open and
 *      its content must match the expected modal surface.
 *   2. Gallery targets (`pageIdentity.kind === 'gallery'`): no stray dialog,
 *      not on a settings page, and not stuck on the overview when a detail
 *      view was requested.
 *   3. Preset-editor targets (from the identity, or inferred from the
 *      page id / prompt when no identity is given): no template gallery,
 *      no open dialog, and a preset route when the context mentions presets.
 *
 * @param {{pageId?: string, prompt: string, visibleText: string, currentUrl: string, dialogCount: number, pageIdentity?: object|null}} params
 * @returns {string|null} Failure description, or null when nothing looks wrong.
 */
export function inferVariantIdentityFailure(params) {
    const loweredPageId = (params.pageId || '').toLowerCase();
    const loweredPrompt = params.prompt.toLowerCase();
    const context = `${loweredPageId} ${loweredPrompt}`;
    const visibleText = params.visibleText.toLowerCase();
    const identity = params.pageIdentity ?? null;
    const currentUrl = params.currentUrl;
    const isVisible = (marker) => visibleText.includes(marker);

    // 1. Dialog / modal targets.
    if (identity?.dialogTarget ?? false) {
        if (params.dialogCount === 0) {
            return 'Expected a dialog/modal capture target, but no dialog is currently open.';
        }
        const surfaceMatches = hasExpectedDialogSurface({ identity, visibleText, currentUrl });
        if (!surfaceMatches) {
            return identity.kind === 'modal_configuration'
                ? 'Expected the configured modal state, but the current dialog content does not match the requested template/details view.'
                : 'Expected the template picker modal, but the current dialog content does not match the requested selection state.';
        }
        return null;
    }

    // 2. Gallery targets.
    if (identity?.kind === 'gallery') {
        const wantsDetailView = GALLERY_DETAIL_PROMPT_RE.test(context);
        if (!wantsDetailView && params.dialogCount > 0) {
            return 'Expected the gallery overview, but a dialog/modal is still open on top of the page.';
        }
        const onSettingsRoute = SETTINGS_ROUTE_RE.test(currentUrl);
        const onGalleryRoute = GALLERY_ROUTE_RE.test(currentUrl);
        const showsSettingsSurface = SETTINGS_SURFACE_RE.test(visibleText);
        // Two points for gallery wording, one per visible "new preset" marker.
        let gallerySurfaceScore = GALLERY_SURFACE_RE.test(visibleText) ? 2 : 0;
        for (const marker of TEMPLATE_GALLERY_MARKERS) {
            const isNewPresetMarker = marker === 'new preset' || marker === 'nouveau preset';
            if (isNewPresetMarker && isVisible(marker)) {
                gallerySurfaceScore += 1;
            }
        }
        if (onSettingsRoute || (showsSettingsSurface && !onGalleryRoute && gallerySurfaceScore === 0)) {
            return 'Expected the gallery overview, but the page still looks like settings or another non-gallery section.';
        }
        return inferGallerySubstateFailure({ context, visibleText }) || null;
    }

    // 3. Preset editor targets. With an identity, only the identity decides;
    //    without one, fall back to wording in the page id / prompt.
    const expectsPresetEditor = identity
        ? (identity.dedicatedRoute || identity.kind === 'editor_route')
        : (PRESET_EDITOR_RE.test(context)
            || (PRESET_CONTEXT_RE.test(context) && CONFIGURATION_RE.test(context))
            || (EDITOR_PROMPT_RE.test(context) && !DIALOG_TARGET_RE.test(context)));
    if (!expectsPresetEditor) {
        return null;
    }

    const galleryMarkerCount = TEMPLATE_GALLERY_MARKERS.filter(isVisible).length;
    const knownTemplateNames = ['homepage hero', 'pricing page', 'mobile responsive', 'dark mode showcase', 'multi-language', 'feature showcase'];
    const hasMultipleKnownTemplates = knownTemplateNames.filter(isVisible).length >= 2;
    const showsNewPresetEntry = isVisible('new preset') || isVisible('nouveau preset');
    if (galleryMarkerCount >= 3 || (showsNewPresetEntry && hasMultipleKnownTemplates)) {
        return 'Expected the preset editor page, but the template gallery/sidebar is still visible.';
    }
    if (params.dialogCount > 0) {
        return 'Expected the preset editor page, but a dialog/modal is still open on top of the page.';
    }
    const onPresetEditorRoute = /\/projects\/[^/]+\/presets\/[^/?#]+/i.test(currentUrl)
        || /\/presets\/[^/?#]+/i.test(currentUrl);
    if (PRESET_CONTEXT_RE.test(context) && !onPresetEditorRoute) {
        return 'Expected the preset editor page, but the browser is not on a preset editor route.';
    }
    return null;
}
|
|
272
|
+
// OAuth guard: block clicks on OAuth elements when credentials are provided.
// Provider names matched as whole words in an element's text or aria-label.
const OAUTH_TEXT_RE = /\b(google|apple|microsoft|github|facebook|twitter|linkedin|sso)\b/i;
// Provider domains (or the substring "oauth") matched anywhere in an href.
const OAUTH_HREF_RE = /google\.com|apple\.com|microsoft\.com|github\.com|facebook\.com|twitter\.com|linkedin\.com|auth0\.com|oauth/i;
// English/French phrases stating that a password is missing or unavailable.
const HIDDEN_PASSWORD_GIVE_UP_RE = /\b(password (is )?missing|missing password|mot de passe manquant|password unavailable|no password provided)\b/i;
// Whole-string match for placeholder-style password values.
const GENERIC_PASSWORD_INPUT_RE = /^(password|your password|motdepasse|mot de passe|password123|secret)$/i;
|
|
277
|
+
/**
 * Reports whether an interactive element looks like an OAuth / SSO entry
 * point, judged by its text, aria-label, or href.
 *
 * @param {{text: string, ariaLabel?: string, href?: string}} el
 * @returns {boolean}
 */
function isOAuthElement(el) {
    if (OAUTH_TEXT_RE.test(el.text)) {
        return true;
    }
    if (OAUTH_TEXT_RE.test(el.ariaLabel || '')) {
        return true;
    }
    return OAUTH_HREF_RE.test(el.href || '');
}
|
|
280
|
+
/**
 * Reports whether an action's target element is a password field.
 *
 * The target is resolved by `args.index` first; if that finds nothing and
 * `args.selector` is a string, it is resolved by selector instead. An element
 * counts as a password field when its input type is `password` or when its
 * type/text/aria-label/selector mention "password", "mot de passe" or
 * "passcode".
 *
 * @param {{index?: number, selector?: string}} args - Action arguments.
 * @param {Array<object>} interactiveElements - Known interactive elements.
 * @returns {boolean} False when no target element can be resolved.
 */
function isPasswordFieldTarget(args, interactiveElements) {
    let target = null;
    if (args.index !== undefined) {
        target = interactiveElements.find((el) => el.index === args.index) ?? null;
    }
    if (!target && typeof args.selector === 'string') {
        target = interactiveElements.find((el) => el.selector === args.selector) ?? null;
    }
    if (!target) {
        return false;
    }
    if (target.inputType === 'password') {
        return true;
    }
    const haystack = `${target.inputType || ''} ${target.text || ''} ${target.ariaLabel || ''} ${target.selector}`.toLowerCase();
    return /\b(password|mot de passe|passcode)\b/i.test(haystack);
}
|
|
293
|
+
/**
 * Creates an OpenAI SDK client that talks to the OpenRouter API endpoint.
 *
 * @param {string} apiKey - API key passed to the client.
 * @returns {OpenAI} Configured client instance.
 */
function createClient(apiKey) {
    const openRouterOptions = {
        baseURL: 'https://openrouter.ai/api/v1',
        apiKey,
        // Headers sent with every request made through this client.
        defaultHeaders: {
            'HTTP-Referer': 'https://github.com/screenshot-agent',
            'X-Title': 'Screenshot Agent',
        },
    };
    return new OpenAI(openRouterOptions);
}
|
|
303
|
+
/**
 * Tells whether a model id refers to a Grok (xAI) model. These models have
 * broken streaming tool calls via OpenRouter.
 *
 * @param {string} model - Model id; compared case-insensitively.
 * @returns {boolean} True when the id starts with `x-ai/` or contains `grok`.
 */
function isGrokModel(model) {
    const lowered = model.toLowerCase();
    if (lowered.startsWith('x-ai/')) {
        return true;
    }
    return lowered.includes('grok');
}
|
|
308
|
+
/**
 * Tells whether a model supports OpenRouter's reasoning/thinking parameter.
 * Currently this is true exactly for Grok models.
 *
 * @param {string} model - Model id.
 * @returns {boolean}
 */
function supportsReasoning(model) {
    const isSupported = isGrokModel(model);
    return isSupported;
}
|
|
312
|
+
/**
 * Builds the reasoning fragment to spread into an OpenRouter request body.
 *
 * @param {string} model - Model id.
 * @param {string} [effort] - Requested effort; falsy or 'off' disables reasoning.
 * @returns {{reasoning?: {effort: string}}} `{}` unless reasoning is requested
 *   and the model supports it.
 */
function reasoningBody(model, effort) {
    // The model is only inspected once an effort has actually been requested.
    const isEnabled = Boolean(effort) && effort !== 'off' && supportsReasoning(model);
    return isEnabled ? { reasoning: { effort } } : {};
}
|
|
320
|
+
/**
 * Builds the `provider` fragment to spread into an OpenRouter request body,
 * using the routing preferences registered for the given model.
 *
 * @param {string} model - Model id used as the lookup key.
 * @param {Record<string, object>} [prefsMap] - Per-model provider preferences.
 * @returns {{provider: object}} Preferences with `zdr` forced to true.
 */
function providerBody(model, prefsMap) {
    const modelPrefs = prefsMap?.[model];
    // Enforce Zero Data Retention on all requests (GDPR compliance); this
    // overrides any `zdr` value coming from the preferences.
    const provider = Object.assign({}, modelPrefs, { zdr: true });
    return { provider };
}
|
|
326
|
+
// Module-level counter; callWithRetry pre-increments it to build the
// `reasoning-<n>` message id passed to emitReasoningDelta for each streamed call.
let _reasoningMessageCounter = 0;
|
|
327
|
+
/**
 * Extracts tool calls that a model wrote as XML inside its text content
 * instead of using native `tool_calls` (e.g. MiniMax M2.5 via OpenRouter).
 *
 * Two formats are recognised:
 *   1. <minimax:tool_call><invoke name="X"><parameter name="Y">Z</parameter></invoke></minimax:tool_call>
 *      Parameter values are trimmed and kept as strings; the arguments are
 *      serialised with JSON.stringify.
 *   2. <tool_code>function_name\n{"arg": "value"}</tool_code>
 *      Only tried when format 1 produced nothing. The text after the first
 *      newline is passed through as the arguments string without parsing;
 *      blocks without a newline are ignored.
 *
 * @param {string} content - Assistant text content.
 * @param {string} [completionId] - Used in generated ids; falls back to Date.now().
 * @returns {Array<{id: string, type: 'function', function: {name: string, arguments: string}}>}
 */
function parseXmlToolCalls(content, completionId) {
    const calls = [];
    // Appends a call whose id carries its position in the result list.
    const addCall = (name, serializedArgs) => {
        calls.push({
            id: `xml-${completionId || Date.now()}-${calls.length}`,
            type: 'function',
            function: { name, arguments: serializedArgs },
        });
    };

    // Format 1: MiniMax <invoke>/<parameter> blocks.
    for (const [, block] of content.matchAll(/<minimax:tool_call>([\s\S]*?)<\/minimax:tool_call>/gi)) {
        for (const [, name, paramsBlock] of block.matchAll(/<invoke\s+name="([^"]+)">([\s\S]*?)<\/invoke>/gi)) {
            const args = {};
            for (const [, key, rawValue] of paramsBlock.matchAll(/<parameter\s+name="([^"]+)">([\s\S]*?)<\/parameter>/gi)) {
                args[key] = rawValue.trim();
            }
            addCall(name, JSON.stringify(args));
        }
    }
    if (calls.length > 0) {
        return calls;
    }

    // Format 2: <tool_code> with the name on the first line and arguments after it.
    for (const [, rawInner] of content.matchAll(/<tool_code>([\s\S]*?)<\/tool_code>/gi)) {
        const inner = rawInner.trim();
        const newlineIdx = inner.indexOf('\n');
        if (newlineIdx <= 0) {
            continue;
        }
        addCall(inner.slice(0, newlineIdx).trim(), inner.slice(newlineIdx + 1).trim());
    }
    return calls;
}
|
|
374
|
+
async function callWithRetry(client, params, maxRetries = 3, signal, providerPreferences, reasoningEffort) {
|
|
375
|
+
// Grok models have broken streaming tool calls via OpenRouter — the provider
|
|
376
|
+
// silently drops tool_calls from SSE deltas, resulting in 0 tool calls received
|
|
377
|
+
// despite the model producing them. Use non-streaming for these models.
|
|
378
|
+
const useStreaming = !isGrokModel(params.model);
|
|
379
|
+
let messagesToUse = params.messages;
|
|
380
|
+
for (let attempt = 1; attempt <= maxRetries; attempt++) {
|
|
381
|
+
throwIfAborted(signal, 'Agent run cancelled.');
|
|
382
|
+
try {
|
|
383
|
+
let content = '';
|
|
384
|
+
let toolCalls = [];
|
|
385
|
+
let finishReason = null;
|
|
386
|
+
let completionId = '';
|
|
387
|
+
let model = params.model;
|
|
388
|
+
let chunkCount = 0;
|
|
389
|
+
const usageData = { prompt_tokens: 0, completion_tokens: 0, total_tokens: 0 };
|
|
390
|
+
if (useStreaming) {
|
|
391
|
+
// Streaming path for most models
|
|
392
|
+
const stream = await client.chat.completions.create({ ...params, messages: messagesToUse, stream: true, ...providerBody(params.model, providerPreferences), ...reasoningBody(params.model, reasoningEffort) }, { signal });
|
|
393
|
+
const toolCallBuffers = new Map();
|
|
394
|
+
const messageId = `reasoning-${++_reasoningMessageCounter}`;
|
|
395
|
+
for await (const chunk of stream) {
|
|
396
|
+
chunkCount++;
|
|
397
|
+
if (chunk.id)
|
|
398
|
+
completionId = chunk.id;
|
|
399
|
+
if (chunk.model)
|
|
400
|
+
model = chunk.model;
|
|
401
|
+
if (chunk.usage) {
|
|
402
|
+
usageData.prompt_tokens = chunk.usage.prompt_tokens ?? 0;
|
|
403
|
+
usageData.completion_tokens = chunk.usage.completion_tokens ?? 0;
|
|
404
|
+
usageData.total_tokens = chunk.usage.total_tokens ?? 0;
|
|
405
|
+
}
|
|
406
|
+
const delta = chunk.choices?.[0]?.delta;
|
|
407
|
+
if (!delta)
|
|
408
|
+
continue;
|
|
409
|
+
if (chunk.choices[0].finish_reason) {
|
|
410
|
+
finishReason = chunk.choices[0].finish_reason;
|
|
411
|
+
}
|
|
412
|
+
if (delta.content) {
|
|
413
|
+
content += delta.content;
|
|
414
|
+
emitReasoningDelta(delta.content, messageId);
|
|
415
|
+
}
|
|
416
|
+
if (delta.tool_calls) {
|
|
417
|
+
for (const tc of delta.tool_calls) {
|
|
418
|
+
const existing = toolCallBuffers.get(tc.index);
|
|
419
|
+
if (existing) {
|
|
420
|
+
if (tc.function?.name)
|
|
421
|
+
existing.name = tc.function.name;
|
|
422
|
+
if (tc.id)
|
|
423
|
+
existing.id = tc.id;
|
|
424
|
+
if (tc.function?.arguments)
|
|
425
|
+
existing.arguments += tc.function.arguments;
|
|
426
|
+
}
|
|
427
|
+
else {
|
|
428
|
+
toolCallBuffers.set(tc.index, {
|
|
429
|
+
id: tc.id || `tc-${completionId || `gen-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`}-${tc.index}`,
|
|
430
|
+
name: tc.function?.name || '',
|
|
431
|
+
arguments: tc.function?.arguments || '',
|
|
432
|
+
});
|
|
433
|
+
}
|
|
434
|
+
}
|
|
435
|
+
}
|
|
436
|
+
}
|
|
437
|
+
toolCalls = Array.from(toolCallBuffers.entries())
|
|
438
|
+
.sort(([a], [b]) => a - b)
|
|
439
|
+
.filter(([, tc]) => tc.name.trim().length > 0)
|
|
440
|
+
.map(([, tc]) => ({
|
|
441
|
+
id: tc.id,
|
|
442
|
+
type: 'function',
|
|
443
|
+
function: { name: tc.name, arguments: tc.arguments },
|
|
444
|
+
}));
|
|
445
|
+
}
|
|
446
|
+
else {
|
|
447
|
+
// Non-streaming path for models with broken streaming (e.g. Grok via OpenRouter)
|
|
448
|
+
const nonStreamStart = Date.now();
|
|
449
|
+
logger.info('Réflexion en cours…');
|
|
450
|
+
const response = await client.chat.completions.create({ ...params, messages: messagesToUse, stream: false, ...providerBody(params.model, providerPreferences), ...reasoningBody(params.model, reasoningEffort) }, { signal });
|
|
451
|
+
logger.debug(`LLM response received in ${Date.now() - nonStreamStart}ms (model: ${params.model})`);
|
|
452
|
+
completionId = response.id ?? '';
|
|
453
|
+
model = response.model ?? params.model;
|
|
454
|
+
finishReason = response.choices?.[0]?.finish_reason ?? null;
|
|
455
|
+
content = response.choices?.[0]?.message?.content ?? '';
|
|
456
|
+
if (response.usage) {
|
|
457
|
+
usageData.prompt_tokens = response.usage.prompt_tokens ?? 0;
|
|
458
|
+
usageData.completion_tokens = response.usage.completion_tokens ?? 0;
|
|
459
|
+
usageData.total_tokens = response.usage.total_tokens ?? 0;
|
|
460
|
+
}
|
|
461
|
+
if (response.choices?.[0]?.message?.tool_calls) {
|
|
462
|
+
toolCalls = response.choices[0].message.tool_calls
|
|
463
|
+
.filter((tc) => tc.type === 'function' && 'function' in tc)
|
|
464
|
+
.map((tc) => ({
|
|
465
|
+
id: tc.id,
|
|
466
|
+
type: 'function',
|
|
467
|
+
function: { name: tc.function.name, arguments: tc.function.arguments },
|
|
468
|
+
}));
|
|
469
|
+
}
|
|
470
|
+
chunkCount = 1;
|
|
471
|
+
}
|
|
472
|
+
// Some models (e.g., MiniMax M2.5) return tool calls as XML in the content
|
|
473
|
+
// field instead of using the native tool_calls streaming delta. Parse them.
|
|
474
|
+
if (toolCalls.length === 0 && content) {
|
|
475
|
+
const xmlParsed = parseXmlToolCalls(content, completionId);
|
|
476
|
+
if (xmlParsed.length > 0) {
|
|
477
|
+
toolCalls.push(...xmlParsed);
|
|
478
|
+
content = '';
|
|
479
|
+
logger.debug(`Parsed ${xmlParsed.length} tool call(s) from XML content (model: ${model})`);
|
|
480
|
+
}
|
|
481
|
+
}
|
|
482
|
+
// If the model returned no tool calls, log diagnostics and retry with coercion.
|
|
483
|
+
const hasSubstantiveContent = content ? /[a-zA-Z]{3,}/.test(content) : false;
|
|
484
|
+
if (toolCalls.length === 0 && attempt < maxRetries && (hasSubstantiveContent || !content)) {
|
|
485
|
+
logger.info(`Model ${model} returned no usable tool calls (attempt ${attempt}/${maxRetries}). `
|
|
486
|
+
+ `Diagnostics: chunks=${chunkCount}, finish_reason=${finishReason}, `
|
|
487
|
+
+ `content_length=${content.length}, `
|
|
488
|
+
+ `usage=${JSON.stringify(usageData)}, content_preview=${JSON.stringify(content.slice(0, 500) || '(empty)')}`);
|
|
489
|
+
if (content) {
|
|
490
|
+
messagesToUse = [
|
|
491
|
+
...params.messages,
|
|
492
|
+
{ role: 'assistant', content },
|
|
493
|
+
{
|
|
494
|
+
role: 'user',
|
|
495
|
+
content: 'You must call one of the available tools. Do not respond with text — select the most appropriate tool and call it now.',
|
|
496
|
+
},
|
|
497
|
+
];
|
|
498
|
+
}
|
|
499
|
+
else {
|
|
500
|
+
messagesToUse = [
|
|
501
|
+
...params.messages,
|
|
502
|
+
{
|
|
503
|
+
role: 'user',
|
|
504
|
+
content: 'You must call one of the available tools. Do not respond with text — select the most appropriate tool and call it now.',
|
|
505
|
+
},
|
|
506
|
+
];
|
|
507
|
+
}
|
|
508
|
+
continue;
|
|
509
|
+
}
|
|
510
|
+
if (toolCalls.length === 0 && content && !hasSubstantiveContent) {
|
|
511
|
+
logger.info(`Model ${model} returned non-substantive content (${content.length} chars), skipping coercion retry. Preview: ${JSON.stringify(content.slice(0, 200))}`);
|
|
512
|
+
}
|
|
513
|
+
const result = {
|
|
514
|
+
id: completionId,
|
|
515
|
+
object: 'chat.completion',
|
|
516
|
+
created: Math.floor(Date.now() / 1000),
|
|
517
|
+
model,
|
|
518
|
+
choices: [{
|
|
519
|
+
index: 0,
|
|
520
|
+
message: {
|
|
521
|
+
role: 'assistant',
|
|
522
|
+
content: toolCalls.length > 0 ? null : (content || null),
|
|
523
|
+
tool_calls: toolCalls.length > 0 ? toolCalls : undefined,
|
|
524
|
+
refusal: null,
|
|
525
|
+
},
|
|
526
|
+
finish_reason: (finishReason || 'stop'),
|
|
527
|
+
logprobs: null,
|
|
528
|
+
}],
|
|
529
|
+
usage: usageData,
|
|
530
|
+
};
|
|
531
|
+
return result;
|
|
532
|
+
}
|
|
533
|
+
catch (err) {
|
|
534
|
+
if (isAbortError(err)) {
|
|
535
|
+
throw err;
|
|
536
|
+
}
|
|
537
|
+
const error = err;
|
|
538
|
+
if (error.status === 429 && attempt < maxRetries) {
|
|
539
|
+
const delay = Math.pow(2, attempt) * 1000;
|
|
540
|
+
logger.info(`Rate limited, retrying in ${delay}ms...`);
|
|
541
|
+
await sleepWithAbort(delay, signal);
|
|
542
|
+
continue;
|
|
543
|
+
}
|
|
544
|
+
throw err;
|
|
545
|
+
}
|
|
546
|
+
}
|
|
547
|
+
throw new Error('Max retries exceeded');
|
|
548
|
+
}
|
|
549
|
+
/**
 * Issue a non-streaming chat completion, retrying on HTTP 429 with exponential backoff.
 *
 * @param client - Object exposing `chat.completions.create(body, options)`.
 * @param params - Request body; `stream: false` is always forced on top of it.
 * @param maxRetries - Maximum number of attempts (default 3).
 * @param signal - Optional abort signal forwarded to the request and to the backoff sleep.
 * @returns The completion response of the first successful attempt.
 * @throws The original error for aborts, non-429 failures, or a 429 on the last attempt.
 */
async function callTextOnlyWithRetry(client, params, maxRetries = 3, signal) {
    let attempt = 1;
    while (attempt <= maxRetries) {
        throwIfAborted(signal, 'Agent run cancelled.');
        try {
            return await client.chat.completions.create({ ...params, stream: false }, { signal });
        }
        catch (failure) {
            // Aborts are never retried; only rate limits with attempts left are.
            const shouldRetry = !isAbortError(failure)
                && failure.status === 429
                && attempt < maxRetries;
            if (!shouldRetry) {
                throw failure;
            }
            const backoffMs = (2 ** attempt) * 1000;
            logger.info(`Rate limited, retrying in ${backoffMs}ms...`);
            await sleepWithAbort(backoffMs, signal);
        }
        attempt += 1;
    }
    throw new Error('Max retries exceeded');
}
|
|
571
|
+
/**
 * Positional-argument convenience wrapper around `extractStepUsage`.
 * Packs the step metadata into the options object that helper expects.
 */
function extractUsage(response, stepNumber, stepType, modelRequested, imagesInPrompt) {
    const stepContext = { stepNumber, stepType, modelRequested, imagesInPrompt };
    return extractStepUsage(response, stepContext);
}
|
|
579
|
+
/**
 * Combine two usage records into one.
 *
 * Step identity (`stepNumber`, `stepType`, `modelRequested`) comes from `primary`;
 * any other field not listed below is inherited from `secondary`. Token counters
 * are summed and `imagesInPrompt` is the larger of the two values. A missing
 * counter is treated as 0 so the result never contains NaN.
 *
 * @returns `secondary` when `primary` is falsy, `primary` when `secondary` is
 *   falsy, otherwise a new merged object (inputs are not mutated).
 */
function mergeUsage(primary, secondary) {
    if (!primary)
        return secondary;
    if (!secondary)
        return primary;
    const sum = (key) => (primary[key] ?? 0) + (secondary[key] ?? 0);
    return {
        ...secondary,
        stepNumber: primary.stepNumber,
        stepType: primary.stepType,
        modelRequested: primary.modelRequested,
        // Default to 0 like the token counters: Math.max(undefined, n) would be NaN.
        imagesInPrompt: Math.max(primary.imagesInPrompt ?? 0, secondary.imagesInPrompt ?? 0),
        promptTokens: sum('promptTokens'),
        completionTokens: sum('completionTokens'),
        totalTokens: sum('totalTokens'),
        cacheReadTokens: sum('cacheReadTokens'),
        cacheWriteTokens: sum('cacheWriteTokens'),
    };
}
|
|
597
|
+
/**
 * Flatten assistant message content into a trimmed string.
 *
 * @param content - Either a string, or an array whose items are strings or
 *   objects carrying a string `text` property.
 * @returns The trimmed string; array items are joined with newlines. Items
 *   without a string `text` (including null/undefined) contribute an empty
 *   line. Any other content type yields ''.
 */
function extractAssistantText(content) {
    if (typeof content === 'string')
        return content.trim();
    if (!Array.isArray(content))
        return '';
    return content
        .map((part) => {
            if (typeof part === 'string')
                return part;
            // Optional chaining: a null/undefined part must not throw.
            return typeof part?.text === 'string' ? part.text : '';
        })
        .join('\n')
        .trim();
}
|
|
611
|
+
/**
 * Decide from a page observation alone whether the page is ready to capture.
 *
 * @param params.assessment - Assessment text, parsed with `parseVerificationDecisionText`.
 * @param params.observation - Observation with `readyState`, `loadingIndicatorCount` and `url`.
 * @param params.targetUrl - URL the page is expected to be on.
 * @param params.allowSuccess - When exactly `false`, a positive verdict is never returned.
 * @returns `{ verified: false, reason }` while loading indicators are visible,
 *   `{ verified: true }` when the assessment is positive, the page is not
 *   loading and the URL roughly matches, otherwise `null` (inconclusive).
 *
 * Callers in this file also pass `skipDialogCheck`; it is intentionally not read
 * here because dialog presence no longer influences the decision.
 */
export function inferDeterministicReadyDecision(params) {
    const { observation, targetUrl } = params;
    const parsedAssessment = parseVerificationDecisionText(params.assessment);
    const pageLooksStable = observation.readyState !== 'loading';
    // The match is tried in both directions before giving up.
    const onExpectedOrigin = urlsRoughlyMatch(targetUrl, observation.url)
        || urlsRoughlyMatch(observation.url, targetUrl);
    // Dialog presence is no longer a hard rejection — the LLM sees the screenshot
    // and can judge whether the dialog is the intended capture target or an obstruction.
    if (observation.loadingIndicatorCount > 0) {
        return { verified: false, reason: `Loading indicators still visible (${observation.loadingIndicatorCount}).` };
    }
    if (parsedAssessment?.verified && pageLooksStable && onExpectedOrigin && params.allowSuccess !== false) {
        return { verified: true };
    }
    return null;
}
|
|
628
|
+
/**
 * Report whether the page is confidently in the wrong language or theme.
 *
 * A variant counts as a hard mismatch only when its evaluated state is not
 * active, not ambiguous, and has 'high' confidence. Variants that are not
 * requested in `config` are not evaluated.
 */
function hasHardVariantMismatch(config, bundle) {
    const isConfidentMismatch = (state) => !state.active && !state.ambiguous && state.confidence === 'high';
    const languageMismatch = Boolean(config.currentLang) && isConfidentMismatch(evaluateRequestedLanguageState({
        currentUrl: bundle.url,
        requestedLang: config.currentLang,
        signals: bundle.pageSignals,
    }));
    if (languageMismatch) {
        return true;
    }
    return Boolean(config.currentTheme) && isConfidentMismatch(evaluateRequestedThemeState({
        requestedTheme: config.currentTheme,
        signals: bundle.pageSignals,
    }));
}
|
|
650
|
+
/**
 * Manifest-driven readiness check for dialog and gallery targets.
 *
 * Only applies when the variant manifest names a current page whose identity
 * is a dialog target or has kind 'gallery', and the assessment parses as
 * positive. Returns `null` (inconclusive) when the page is unstable, on an
 * unexpected URL, or in a confidently wrong language/theme. For dialog
 * targets the visible text must also match the expected dialog surface and
 * subject tokens; otherwise `{ verified: false, reason }` is returned.
 */
export function inferManifestReadyDecision(params) {
    const { config, bundle, observation } = params;
    const manifest = config.variantManifest;
    const identity = manifest?.currentPageIdentity;
    const hasManifestTarget = Boolean(manifest?.currentPageId) && Boolean(identity);
    if (!hasManifestTarget) {
        return null;
    }
    const isDialogOrGallery = identity.dialogTarget || identity.kind === 'gallery';
    if (!isDialogOrGallery) {
        return null;
    }
    if (!parseVerificationDecisionText(params.assessment)?.verified) {
        return null;
    }
    const pageLooksStable = observation.readyState !== 'loading'
        && observation.loadingIndicatorCount === 0;
    const onExpectedOrigin = urlsRoughlyMatch(config.url, bundle.url)
        || urlsRoughlyMatch(bundle.url, config.url);
    const environmentOk = pageLooksStable && onExpectedOrigin && !hasHardVariantMismatch(config, bundle);
    if (!environmentOk) {
        return null;
    }
    if (!identity.dialogTarget) {
        return { verified: true };
    }
    // Collect every textual signal the dialog content could appear in.
    const signals = bundle.pageSignals;
    const visibleText = [
        signals.title,
        signals.headings.join(' '),
        signals.navLabels.join(' '),
        signals.visibleText,
    ].filter(Boolean).join(' ');
    const strongSubjectTokens = identity.subjectTokens.filter(token => token.length >= 4 && !MODAL_GENERIC_SUBJECT_TOKENS.has(token));
    const subjectTokenMatches = countDialogSubjectTokenMatches({
        subjectTokens: identity.subjectTokens,
        visibleText,
        currentUrl: bundle.url,
    });
    const dialogSurfaceMatches = hasExpectedDialogSurface({
        identity,
        visibleText,
        currentUrl: bundle.url,
    });
    // Subject tokens only count against the page when at least one strong token exists.
    const subjectMissing = strongSubjectTokens.length > 0 && subjectTokenMatches === 0;
    if (dialogSurfaceMatches && !subjectMissing) {
        return { verified: true };
    }
    const reason = identity.kind === 'modal_configuration'
        ? 'Expected the configured modal state, but the open dialog does not match the requested content.'
        : 'Expected the requested dialog/modal target, but the open dialog content does not match it.';
    return { verified: false, reason };
}
|
|
696
|
+
/**
 * Deterministic acceptance based on page signals. Every condition must hold:
 *   1. The assessment text parses as a positive verdict.
 *   2. `readyState` is 'complete' and no loading indicators are present.
 *   3. The current URL roughly matches the configured URL (either direction).
 *   4. There is no high-confidence language/theme mismatch.
 *   5. The caller reported no identity failure (`params.identityFailure`).
 *   6. No dialog is open, unless the manifest identity targets a dialog.
 *
 * Returns { verified: true } when all checks pass and null otherwise, so the
 * caller can fall back to the LLM. This function never returns
 * { verified: false }: a failed check means "inconclusive", not "rejected".
 */
export function inferPageSignalReadyDecision(params) {
    const { observation, config, bundle } = params;
    // The caller must already believe the page is ready.
    if (!parseVerificationDecisionText(params.assessment)?.verified) {
        return null;
    }
    // The page must be fully settled: no loading or transitional state.
    const pageSettled = observation.readyState === 'complete'
        && !(observation.loadingIndicatorCount > 0);
    if (!pageSettled) {
        return null;
    }
    // The page must be on the expected URL.
    const originMatches = urlsRoughlyMatch(config.url, bundle.url)
        || urlsRoughlyMatch(bundle.url, config.url);
    if (!originMatches) {
        return null;
    }
    // Inconclusive on a confident lang/theme mismatch, or when the caller flagged
    // that the page content does not match the expected identity.
    if (hasHardVariantMismatch(config, bundle) || params.identityFailure) {
        return null;
    }
    // An open dialog is only acceptable when the target itself is a dialog.
    const unexpectedDialogOpen = observation.dialogCount > 0
        && !config.variantManifest?.currentPageIdentity?.dialogTarget;
    return unexpectedDialogOpen ? null : { verified: true };
}
|
|
735
|
+
/**
 * Build short human-readable diagnostics for the requested language and theme.
 *
 * Each entry has the form `<status>/<confidence>: <reasons>` where status is
 * 'match', 'ambiguous' or 'mismatch'. Keys (`lang`, `theme`) are present only
 * for variants requested in `config`.
 */
function summarizeVerificationDiagnostics(config, bundle) {
    const describeState = (state, noSignalLabel) => {
        let status = 'mismatch';
        if (state.active) {
            status = 'match';
        }
        else if (state.ambiguous) {
            status = 'ambiguous';
        }
        return `${status}/${state.confidence}: ${state.reasons.join('; ') || noSignalLabel}`;
    };
    const summary = {};
    if (config.currentLang) {
        summary.lang = describeState(evaluateRequestedLanguageState({
            currentUrl: bundle.url,
            requestedLang: config.currentLang,
            signals: bundle.pageSignals,
        }), 'no_language_signal');
    }
    if (config.currentTheme) {
        summary.theme = describeState(evaluateRequestedThemeState({
            requestedTheme: config.currentTheme,
            signals: bundle.pageSignals,
        }), 'no_theme_signal');
    }
    return summary;
}
|
|
764
|
+
/**
 * Explain why the language/theme preflight could not confirm the requested variant.
 *
 * @param params.requestedLang - Requested language; trimmed and lower-cased before use.
 * @param params.requestedTheme - Requested theme, used as given.
 * @param params.signals - Page signals; when absent a generic failure text is returned.
 * @param params.currentUrl - Current URL, forwarded to the language evaluation.
 * @returns A "Language preflight failed: …" sentence, or '' when both requested
 *   variants are active and unambiguous.
 */
function buildLanguagePreflightVerificationFailureReason(params) {
    const requestedLang = params.requestedLang?.trim().toLowerCase();
    const requestedTheme = params.requestedTheme;
    const nothingRequested = !requestedLang && !requestedTheme;
    if (nothingRequested || !params.signals) {
        return 'Language preflight failed: unable to confirm the requested fixed UI variant.';
    }
    // A variant is confirmed only when it is active and not ambiguous.
    const isUnconfirmed = (state) => !(state.active && !state.ambiguous);
    const failures = [];
    if (requestedLang) {
        const languageState = evaluateRequestedLanguageState({
            currentUrl: params.currentUrl,
            requestedLang,
            signals: params.signals,
        });
        if (isUnconfirmed(languageState)) {
            failures.push(languageState.reasons.length > 0
                ? `requested "${requestedLang}", detected "${languageState.detected ?? 'unknown'}" (${languageState.reasons.join('; ')})`
                : `requested "${requestedLang}", but the fixed app UI is still not confirmed in that language`);
        }
    }
    if (requestedTheme) {
        const themeState = evaluateRequestedThemeState({
            requestedTheme,
            signals: params.signals,
        });
        if (isUnconfirmed(themeState)) {
            failures.push(themeState.reasons.length > 0
                ? `theme requested "${requestedTheme}", detected "${themeState.detected ?? 'unknown'}" (${themeState.reasons.join('; ')})`
                : `theme "${requestedTheme}" is still not confirmed`);
        }
    }
    return failures.length === 0
        ? ''
        : `Language preflight failed: ${failures.join(' | ')}.`;
}
|
|
805
|
+
/**
 * Tell whether a failure reason mentions one of a fixed list of technical
 * keywords (timeouts, parsing, transport, network…), matched as whole words
 * and case-insensitively. An empty or missing reason is never technical.
 */
function isTechnicalVerificationFailureReason(reason) {
    const technicalMarkers = /\b(timeout|timed out|parser|transport|stale verification snapshot|captureverificationbundle|invalid json|json parse|response format|temporary|network error|connection reset|econnreset|service unavailable)\b/i;
    return reason ? technicalMarkers.test(reason) : false;
}
|
|
810
|
+
// Upper bound for a single browser.captureObservation() call.
const ACTION_OBSERVATION_TIMEOUT_MS = 2_500;
// Presumably bounds the verification screenshot step (not used in this part of the file).
const VERIFICATION_SCREENSHOT_TIMEOUT_MS = 5_000;
// Presumably bounds the verification bundle capture (not used in this part of the file).
const VERIFICATION_BUNDLE_TIMEOUT_MS = 8_000;
// Pause between dismissing overlays and re-observing during diagnostic recovery.
const ACTION_DIAGNOSTIC_WAIT_MS = 200;
|
|
814
|
+
/**
 * Error raised when an agent step exceeds its time budget.
 * Exposes the step label and the budget (in ms) that was exceeded.
 */
class AgentStepTimeoutError extends Error {
    timeoutMs;
    stepLabel;
    constructor(stepLabel, timeoutMs) {
        const message = `Timed out after ${timeoutMs}ms while ${stepLabel}.`;
        super(message);
        Object.assign(this, { name: 'AgentStepTimeoutError', stepLabel, timeoutMs });
    }
}
|
|
824
|
+
/** Type guard: true only for errors created from AgentStepTimeoutError. */
function isAgentStepTimeoutError(error) {
    const isStepTimeout = error instanceof AgentStepTimeoutError;
    return isStepTimeout;
}
|
|
827
|
+
/**
 * Run `work` with a time budget.
 *
 * @param work - Zero-argument function; may return a promise or a plain value.
 * @param params.stepLabel - Label used in the timeout error message.
 * @param params.timeoutMs - Budget in milliseconds.
 * @returns The value `work` resolves (or returns).
 * @throws AgentStepTimeoutError when the budget elapses first; otherwise
 *   whatever `work` throws or rejects with.
 *
 * The timer is always cleared once the race settles, including when `work`
 * throws synchronously, so no stray timer keeps the event loop alive.
 * Note that `work` itself is not cancelled on timeout.
 */
async function withAgentStepTimeout(work, params) {
    let timer;
    const timeout = new Promise((_resolve, reject) => {
        timer = setTimeout(() => {
            reject(new AgentStepTimeoutError(params.stepLabel, params.timeoutMs));
        }, params.timeoutMs);
    });
    // The executor form starts `work` synchronously, turns a synchronous throw
    // into a rejection, and adopts thenables as well as plain return values.
    const outcome = new Promise((resolve) => {
        resolve(work());
    });
    try {
        return await Promise.race([outcome, timeout]);
    }
    finally {
        clearTimeout(timer);
    }
}
|
|
843
|
+
/**
 * Best-effort variant of withAgentStepTimeout: a timeout still propagates,
 * but any other failure of `work` is deliberately swallowed and reported as null.
 */
async function withOptionalAgentStepTimeout(work, params) {
    return withAgentStepTimeout(work, params).catch((failure) => {
        if (isAgentStepTimeoutError(failure)) {
            throw failure;
        }
        return null;
    });
}
|
|
854
|
+
/**
 * Time budget (ms) allowed for executing a browser action.
 *
 * @param action - Tool/action name.
 * @param args - Action arguments; only `milliseconds` is read, and only for 'wait'.
 * @returns 25000 for navigation, 5000 for click-like actions, the requested
 *   wait plus a 1000ms margin clamped to [1000, 6000] for 'wait', and 3500
 *   for everything else.
 */
function getActionExecutionTimeoutMs(action, args) {
    switch (action) {
        case 'navigate_to':
            return 25000;
        case 'click':
        case 'safe_expand':
        case 'select_option':
            return 5000;
        case 'type_text':
        case 'press_key':
        case 'scroll':
        case 'scroll_to_element':
        case 'resize_viewport':
        case 'dismiss_overlays':
        case 'hover':
            return 3500;
        case 'wait': {
            const requested = Number(args?.milliseconds ?? 1000);
            // A non-numeric value would make the whole expression NaN, and a NaN
            // delay makes setTimeout fire almost immediately; use the default instead.
            const waitMs = Number.isNaN(requested) ? 1000 : requested;
            return Math.min(6000, Math.max(1000, waitMs + 1000));
        }
        default:
            return 3500;
    }
}
|
|
876
|
+
/** Format the standard "action timed out" message for a stage of an action. */
function buildActionTimeoutMessage(stage, action, timeoutMs) {
    const detail = `${action}, ${timeoutMs}ms`;
    return `Action timeout during ${stage} (${detail})`;
}
|
|
879
|
+
/** Format the standard "verification timed out" message for a verification stage. */
function buildVerificationTimeoutMessage(stage, timeoutMs) {
    const budget = `${timeoutMs}ms`;
    return `Verification timeout during ${stage} (${budget}).`;
}
|
|
882
|
+
/** Emit a debug log line marking which stage an action has reached. */
function logActionProgress(action, stage) {
    const progressLine = `Action progress: ${action} -> ${stage}`;
    logger.debug(progressLine);
}
|
|
885
|
+
/**
 * Compose the failure message reported after an action timed out.
 *
 * The message is the standard action-timeout text followed by one detail,
 * chosen in this order: the recovery error if any, a "partially changed"
 * warning for the 'browser execution' stage, an "unstable state" note when
 * `options.afterChanged` is set, otherwise "no reliable state change".
 */
function getDiagnosticRecoveryFailureMessage(stage, action, timeoutMs, options = {}) {
    const prefix = buildActionTimeoutMessage(stage, action, timeoutMs);
    let detail;
    if (options.recoveryError) {
        detail = options.recoveryError;
    }
    else if (stage === 'browser execution') {
        detail = 'state may be partially changed.';
    }
    else if (options.afterChanged) {
        detail = 'diagnostic recovery could not confirm a stable post-action state.';
    }
    else {
        detail = 'no reliable state change detected.';
    }
    return `${prefix}; ${detail}`;
}
|
|
898
|
+
/**
 * After a timed-out action, try to bring the page back to an observable state:
 * dismiss overlays, pause briefly, then capture a fresh observation and compare
 * it with the pre-action baseline. Each step runs under its own time budget.
 *
 * @param browser - Object providing dismissOverlays(), wait(ms) and captureObservation().
 * @param action - Name of the action being diagnosed (used in labels and messages).
 * @param before - Observation taken before the action, if one was available.
 * @returns `{ reaction }` describing the observed change, or
 *   `{ reaction: null, error }` when no baseline exists or a step failed.
 *   This function does not throw for step failures.
 */
async function performActionDiagnosticRecovery(browser, action, before) {
    logActionProgress(action, 'diagnostic recovery');
    const runStep = (stepName, timeoutMs, work) => withAgentStepTimeout(work, {
        stepLabel: `${action} diagnostic ${stepName}`,
        timeoutMs,
    });
    try {
        await runStep('dismissOverlays', getActionExecutionTimeoutMs('dismiss_overlays', {}), () => browser.dismissOverlays());
        await runStep('wait', getActionExecutionTimeoutMs('wait', { milliseconds: ACTION_DIAGNOSTIC_WAIT_MS }), () => browser.wait(ACTION_DIAGNOSTIC_WAIT_MS));
        const after = await runStep('captureObservation', ACTION_OBSERVATION_TIMEOUT_MS, () => browser.captureObservation());
        if (before) {
            return {
                reaction: describeObservationChange(before, after),
            };
        }
        return {
            reaction: null,
            error: 'baseline observation was unavailable, so diagnostic recovery could not compare page state.',
        };
    }
    catch (failure) {
        const error = isAgentStepTimeoutError(failure)
            ? `${buildActionTimeoutMessage('diagnostic recovery', action, failure.timeoutMs)}.`
            : `diagnostic recovery failed: ${failure.message}`;
        return { reaction: null, error };
    }
}
|
|
936
|
+
/**
 * Capture a fresh observation (under a time budget) and run the deterministic
 * readiness decision on it.
 *
 * @returns The decision from inferDeterministicReadyDecision;
 *   `{ verified: false, reason }` when capturing the observation timed out;
 *   `null` when anything else inside the attempt failed (inconclusive).
 */
async function deterministicReadyObservationCheck(params) {
    logger.debug('Verification progress: captureObservation');
    const captureObservation = () => params.browser.captureObservation();
    try {
        const observation = await withAgentStepTimeout(captureObservation, {
            stepLabel: 'verification captureObservation',
            timeoutMs: ACTION_OBSERVATION_TIMEOUT_MS,
        });
        const decisionInput = {
            observation,
            assessment: params.assessment,
            targetUrl: params.config.url,
            skipDialogCheck: params.skipDialogCheck,
            allowSuccess: params.allowSuccess,
        };
        return inferDeterministicReadyDecision(decisionInput);
    }
    catch (failure) {
        // Only a timeout is a definite negative; other failures stay inconclusive.
        return isAgentStepTimeoutError(failure)
            ? {
                verified: false,
                reason: buildVerificationTimeoutMessage('captureObservation', failure.timeoutMs),
            }
            : null;
    }
}
|
|
961
|
+
/**
 * Compile phrase sources into case-insensitive, whole-phrase matchers.
 *
 * JavaScript's `\b` only knows ASCII word characters, so it never matches
 * next to accented letters (e.g. after "tronqué" or before "à corriger").
 * Unicode-aware lookarounds are used instead so the French phrases match.
 */
function compileVerificationSignalPatterns(sources) {
    return sources.map((source) => new RegExp(`(?<![\\p{L}\\p{N}_])(?:${source})(?![\\p{L}\\p{N}_])`, 'iu'));
}
// Phrases (English and French) that indicate the page is ready.
const VERIFICATION_POSITIVE_SIGNAL_PATTERNS = compileVerificationSignalPatterns([
    'ready to capture',
    'ready for capture',
    'capture can proceed',
    'page (?:is|looks|appears)? ?clean',
    'page est propre',
    'page propre',
    'no (?:overlay|overlays|banner|banners|spinner|spinners|modal|modals|popup|popups)',
    'aucun(?:e)? (?:overlay|banni[eè]re|spinner|modal|popup|widget)',
    'sans overlay',
    'fully loaded',
    'compl[eè]tement charg[ée]',
    'chargement (?:est )?termin[ée]',
    'all elements visible',
    'tous les [ée]l[ée]ments visibles',
    'matches (?:the )?(?:user request|request)',
    'correspond(?: parfaitement)? [àa] la demande',
    'pr[êe]te? pour la capture',
]);
// Phrases (English and French) that indicate the page is NOT ready.
const VERIFICATION_NEGATIVE_SIGNAL_PATTERNS = compileVerificationSignalPatterns([
    'not ready',
    'pas pr[êe]te?',
    'not clean',
    'pas propre',
    'wrong page',
    'mauvaise page',
    'still visible',
    'encore visible',
    'toujours visible',
    'still loading',
    'encore en chargement',
    'spinner visible',
    'cookie banner visible',
    'banni[eè]re cookie visible',
    'overlay present',
    'modal visible',
    'popup visible',
    'cropped',
    'cut off',
    'partially visible',
    'tronqu[ée]',
    'coup[ée]',
    'partiellement visible',
    'needs fixing',
    'must be fixed',
    '[àa] corriger',
]);
/**
 * Fast-path optimization: extract a PASS/FAIL verdict from LLM text output
 * using keyword matching, avoiding a separate LLM call when the signal is clear.
 *
 * Resolution order: an explicit leading PASS/FAIL marker wins; otherwise the
 * text is verified when it contains at least one positive phrase and no
 * negative phrase, rejected when it contains only negative phrases, and
 * inconclusive when both or neither are present.
 *
 * @param content - Raw text to classify; non-string input is inconclusive.
 * @returns `{ verified: true }`, `{ verified: false, reason }`, or `null`
 *   when uncertain — the caller MUST then fall back to an LLM vision call.
 *
 * Do not add more patterns here; expand LLM fallback coverage instead.
 */
export function parseVerificationDecisionText(content) {
    if (typeof content !== 'string')
        return null;
    const normalized = content.replace(/\s+/g, ' ').trim();
    if (!normalized)
        return null;
    const upper = normalized.toUpperCase();
    if (upper === 'PASS' || upper.startsWith('PASS ') || /^PASS\b/i.test(normalized)) {
        return { verified: true };
    }
    // "FAIL:" and "FAIL " keep the remainder verbatim (minus outer whitespace).
    if (upper.startsWith('FAIL:') || upper.startsWith('FAIL ')) {
        return { verified: false, reason: normalized.slice(5).trim() || 'Verification failed' };
    }
    // Other separators (e.g. "FAIL-", or a bare "FAIL") are stripped entirely.
    if (/^FAIL\b/i.test(normalized)) {
        return { verified: false, reason: normalized.replace(/^FAIL\b[:\s-]*/i, '').trim() || 'Verification failed' };
    }
    const positiveCount = VERIFICATION_POSITIVE_SIGNAL_PATTERNS.filter((pattern) => pattern.test(normalized)).length;
    const hasNegativeSignal = VERIFICATION_NEGATIVE_SIGNAL_PATTERNS.some((pattern) => pattern.test(normalized));
    if (positiveCount >= 1 && !hasNegativeSignal) {
        return { verified: true };
    }
    if (hasNegativeSignal && positiveCount === 0) {
        return { verified: false, reason: normalized.slice(0, 160) };
    }
    return null;
}
|
|
1045
|
+
/**
 * Extract a wait duration from free text.
 *
 * Looks first for a millisecond amount ("500ms", "250 milliseconds"), then for
 * a seconds amount ("2 seconds", "1,5 s", "3 secondes"). The unit must end at
 * a word boundary, so "2 spinners" or "10 msgs" are not mistaken for durations.
 *
 * @param content - Text to scan (case-insensitive).
 * @returns The duration in ms clamped to [300, 5000], or 1000 when none is found.
 */
function parseWaitDurationMs(content) {
    const normalized = String(content ?? '').toLowerCase();
    const clampMs = (ms) => Math.min(5000, Math.max(300, ms));
    const msMatch = normalized.match(/(\d{2,5})\s*(?:milliseconds?|ms)\b/);
    if (msMatch) {
        return clampMs(Number(msMatch[1]));
    }
    const secMatch = normalized.match(/(\d{1,2}(?:[.,]\d+)?)\s*(?:seconde?s?|secs?|s)\b/);
    if (secMatch) {
        // Accept a decimal comma ("1,5 s").
        const seconds = Number(secMatch[1].replace(',', '.'));
        return clampMs(Math.round(seconds * 1000));
    }
    return 1000;
}
|
|
1060
|
+
/** Known tool names for validating embedded JSON tool calls in text. */
const KNOWN_TOOL_NAMES = new Set([
    'click', 'type_text', 'select_option', 'scroll', 'press_key',
    'dismiss_overlays', 'wait', 'search_text', 'navigate_to',
    'resize_viewport', 'take_screenshot', 'ready_to_capture',
    'give_up', 'begin_subgoal', 'note', 'capture_by_selector',
]);
/**
 * Return the brace-balanced `{…}` substring of `text` that starts at `start`,
 * or null when `text[start]` is not `{` or the object is never closed.
 * Braces inside double-quoted strings (including escaped quotes) are ignored.
 */
function extractBalancedJsonObject(text, start) {
    if (text[start] !== '{')
        return null;
    let depth = 0;
    let inString = false;
    let escaped = false;
    for (let i = start; i < text.length; i++) {
        const ch = text[i];
        if (inString) {
            if (escaped)
                escaped = false;
            else if (ch === '\\')
                escaped = true;
            else if (ch === '"')
                inString = false;
            continue;
        }
        if (ch === '"') {
            inString = true;
        }
        else if (ch === '{') {
            depth += 1;
        }
        else if (ch === '}') {
            depth -= 1;
            if (depth === 0)
                return text.slice(start, i + 1);
        }
    }
    return null;
}
/**
 * Try to extract a tool call embedded as JSON in the model's text output.
 * Some models emit tool calls as plain text in the response content instead of
 * using the structured tool_calls array, especially during streaming or format confusion:
 *   {"name": "click", "arguments": {"index": 5, "reason": "..."}}
 * A function-call-like form is also recognized: click({"index": 5}).
 *
 * The arguments object is extracted by brace balancing, so empty (`{}`),
 * nested, and brace-containing string arguments are all handled.
 *
 * @param text - Assistant text to scan.
 * @returns `{ name, args }` for the first recognized call whose name is in
 *   KNOWN_TOOL_NAMES and whose arguments parse as JSON; otherwise null.
 */
function parseEmbeddedToolCall(text) {
    // Pattern 1: JSON object with "name" and "arguments" keys
    const jsonHeader = /\{[^{}]*"name"\s*:\s*"(\w+)"[^{}]*"arguments"\s*:\s*(?=\{)/s.exec(text);
    if (jsonHeader && KNOWN_TOOL_NAMES.has(jsonHeader[1])) {
        const argsStr = extractBalancedJsonObject(text, jsonHeader.index + jsonHeader[0].length);
        if (argsStr !== null) {
            try {
                return { name: jsonHeader[1], args: JSON.parse(argsStr) };
            }
            catch { /* best-effort recovery: malformed JSON falls through to pattern 2 */ }
        }
    }
    // Pattern 2: function-call-like syntax: toolName({...})
    const funcMatch = text.match(/\b(\w+)\s*\(\s*(\{[\s\S]*?\})\s*\)/);
    if (funcMatch) {
        const [, name, argsStr] = funcMatch;
        if (KNOWN_TOOL_NAMES.has(name)) {
            try {
                return { name, args: JSON.parse(argsStr) };
            }
            catch { /* best-effort recovery: malformed JSON means no tool call */ }
        }
    }
    return null;
}
|
|
1101
|
+
/**
 * Derive a tool action from free-form assistant text when no structured tool
 * call was produced.
 *
 * Resolution order:
 *   1. A tool call embedded in the text (JSON or function-call syntax).
 *   2. A verification verdict: positive → `ready_to_capture`, negative → `give_up`.
 *   3. Wording about waiting for loading/settling/navigation → `wait`.
 *
 * @param content - Raw assistant text.
 * @returns `{ name, args }` for the inferred action, or null when nothing can be inferred.
 */
export function inferActionFromAssistantText(content) {
    const normalized = content.replace(/\s+/g, ' ').trim();
    if (!normalized) {
        return null;
    }
    // Some models emit tool calls as plain text in content instead of structured tool_calls.
    const embedded = parseEmbeddedToolCall(normalized);
    if (embedded) {
        logger.debug(`Recovered embedded tool call from text: ${embedded.name}`);
        return embedded;
    }
    const verdict = parseVerificationDecisionText(normalized);
    if (verdict) {
        return verdict.verified
            ? { name: 'ready_to_capture', args: { assessment: normalized.slice(0, 800) } }
            : { name: 'give_up', args: { reason: verdict.reason ?? normalized.slice(0, 400) } };
    }
    const lower = normalized.toLowerCase();
    const mentionsWaiting = /\bwait\b/.test(lower) || /\battend/.test(lower);
    const mentionsLoading = /\bsettle\b/.test(lower)
        || /\bcharg/.test(lower)
        || /\bload/.test(lower)
        || /\bnavigation\b/.test(lower);
    if (!(mentionsWaiting && mentionsLoading)) {
        return null;
    }
    return {
        name: 'wait',
        args: {
            milliseconds: parseWaitDurationMs(normalized),
            reason: 'assistant_text_fallback',
        },
    };
}
|
|
1138
|
+
/**
 * Dual-model mode: ask the configured vision model to describe a screenshot in text.
 *
 * @param client - Object exposing `chat.completions.create(body, options)`.
 * @param config - Run configuration; `visionModel`, `uploadImage`, `runMode`,
 *   `providerPreferences` and `abortSignal` are read.
 * @param screenshot - Screenshot data, converted to an image URL as 'image/jpeg'.
 * @param context - Text sent alongside the image in the user message.
 * @param stepNumber - Step number recorded in the usage entry.
 * @returns `{ observation, usage }`, or null when no vision model is configured
 *   or the request failed for a reason other than an abort (aborts are rethrown).
 */
async function getVisionObservation(client, config, screenshot, context, stepNumber) {
    const { visionModel } = config;
    if (!visionModel) {
        return null;
    }
    const screenshotUrl = await makeImageUrl(screenshot, 'image/jpeg', config.uploadImage);
    let videoGuidance = '';
    if (config.runMode === 'video_navigation_preflight') {
        videoGuidance = ' For video navigation preflight, explicitly say whether the screenshot is the EXACT pre-recording start state. Call out generic /home or dashboard states, unopened projects/sections/dialogs, and wrong fixed UI language/theme in the chrome.';
    }
    const systemMessage = {
        role: 'system',
        content: 'You are a page state observer for screenshot verification. Describe the current page concisely: layout, visible content, overlays, loading state, and whether the page actually matches the requested target.'
            + videoGuidance,
    };
    const userMessage = {
        role: 'user',
        content: [
            { type: 'image_url', image_url: { url: screenshotUrl } },
            { type: 'text', text: context },
        ],
    };
    const messages = [systemMessage, userMessage];
    try {
        const result = await client.chat.completions.create({ model: config.visionModel, messages, max_tokens: 300, stream: false, ...providerBody(config.visionModel, config.providerPreferences) }, { signal: config.abortSignal });
        const observation = result.choices?.[0]?.message?.content?.trim() ?? '';
        const usage = extractUsage(result, stepNumber, 'verification', config.visionModel, 1);
        return { observation, usage };
    }
    catch (failure) {
        if (isAbortError(failure)) {
            throw failure;
        }
        logger.error(`Vision observer for verification failed: ${failure.message}`);
        // Returning null lets the caller fall through to the mono-model path.
        return null;
    }
}
|
|
1173
|
+
/**
 * Build the newline-separated text context sent with a verification screenshot.
 *
 * Always includes the user request and the agent assessment; the current URL,
 * page title, expected language and expected theme are added only when set.
 * In 'video_navigation_preflight' mode three extra instructions are appended.
 */
function buildVerificationVisionContext(params) {
    const { config, assessment, pageContext } = params;
    const lines = [
        'Describe this page for verification.',
        `User request: ${config.prompt}`,
        `Agent assessment: ${assessment}`,
    ];
    if (pageContext.currentUrl) {
        lines.push(`Current URL: ${pageContext.currentUrl}`);
    }
    if (pageContext.pageTitle) {
        lines.push(`Current page title: ${pageContext.pageTitle}`);
    }
    if (config.currentLang) {
        lines.push(`Expected language: ${config.currentLang}`);
    }
    if (config.currentTheme) {
        lines.push(`Expected theme: ${config.currentTheme}`);
    }
    if (config.runMode === 'video_navigation_preflight') {
        lines.push('Verification target: exact pre-recording start state.');
        lines.push('Explicitly say if the page is still only a generic home/dashboard/list state instead of the requested project/section/dialog.');
        lines.push('Explicitly call out wrong fixed app chrome language/theme if buttons, navigation, headings, or breadcrumbs do not match the requested variant.');
    }
    return lines.join('\n');
}
|
|
1188
|
+
/**
 * Produce text-only message content from multimodal content parts.
 *
 * Every part whose `type` is `'image_url'` is dropped, and a text part carrying
 * the vision model's observation is placed first. The input array is not modified.
 *
 * @param parts - Content parts (objects with a `type` field).
 * @param observation - Observation text to prepend.
 * @returns A new array: the observation part followed by the non-image parts in order.
 */
function replaceImagesWithObservation(parts, observation) {
    const textOnly = [
        { type: 'text', text: `Page observation (from vision model):\n${observation}` },
    ];
    parts.forEach((part) => {
        if (part.type !== 'image_url') {
            textOnly.push(part);
        }
    });
    return textOnly;
}
|
|
1198
|
+
/**
 * Verify a screenshot by asking the model for a `{ "ready": boolean, "reason"?: string }`
 * JSON decision.
 *
 * Two paths:
 * - Dual-model: when a vision observation is available (precomputed or obtained from
 *   `getVisionObservation`), the image parts are replaced by that text and the
 *   active model is called text-only.
 * - Mono-model: otherwise the screenshot is sent directly through
 *   `callVisionCapableModel`, which may switch `modelState.active`.
 *
 * If the model returns empty content, the call is delegated to
 * `fallbackVerifyScreenshotText`.
 *
 * @param client - Chat client exposing `chat.completions.create`.
 * @param config - Run configuration (prompt, models, run mode, variant manifest, ...).
 * @param modelState - Mutable holder of the active model name (`active`).
 * @param screenshot - Screenshot to verify.
 * @param assessment - The agent's own assessment text.
 * @param pageContext - Page context forwarded to the message builders.
 * @param stepNumber - Step index forwarded to `extractUsage`.
 * @param pageFingerprint - Fingerprint echoed back in the result.
 * @param precomputedVisionObs - Optional observation; pass `null` to force the
 *   mono-model path, leave `undefined` to let this function call the observer.
 * @returns A decision object: `{ verified, usage, pageFingerprint, mode: 'text_fallback' }`
 *   plus `matchedPageId` on success or `reason`/`blockingReason` on failure.
 */
async function fallbackVerifyScreenshotJson(client, config, modelState, screenshot, assessment, pageContext, stepNumber, pageFingerprint, precomputedVisionObs) {
    const verificationTarget = config.runMode === 'video_navigation_preflight'
        ? 'exact pre-recording start state'
        : 'requested page';
    // Result builders shared by every exit path below.
    const accept = (usage) => ({
        verified: true,
        usage,
        matchedPageId: config.variantManifest?.currentPageId ?? null,
        pageFingerprint,
        mode: 'text_fallback',
    });
    const reject = (reason, usage) => ({
        verified: false,
        reason,
        usage,
        blockingReason: reason,
        pageFingerprint,
        mode: 'text_fallback',
    });
    // Dual-model mode: reuse precomputed observation or call vision observer
    const visionObs = precomputedVisionObs !== undefined
        ? precomputedVisionObs
        : await getVisionObservation(client, config, screenshot, buildVerificationVisionContext({
            config,
            pageContext,
            assessment,
        }), stepNumber);
    // The image URL is only needed when the screenshot itself is sent to the model.
    const screenshotUrl = visionObs ? '' : await makeImageUrl(screenshot, 'image/jpeg', config.uploadImage);
    const rawParts = buildVerificationMessage({
        userPrompt: config.prompt,
        screenshotUrl,
        previousAssessment: assessment,
        runMode: config.runMode,
        currentLang: config.currentLang,
        currentTheme: config.currentTheme,
        pageContext,
        runHints: config.runHints,
        variantManifest: config.variantManifest,
    });
    const userContent = visionObs ? replaceImagesWithObservation(rawParts, visionObs.observation) : rawParts;
    const fallbackMessages = [
        {
            role: 'system',
            content: `You are a screenshot quality inspector. Respond with a JSON object: { "ready": true } if the screenshot is clean, fully loaded, and matches the ${verificationTarget}. Otherwise respond with { "ready": false, "reason": "<short explanation>" }. Output ONLY valid JSON, nothing else.`,
        },
        { role: 'user', content: userContent },
    ];
    if (visionObs) {
        // Text-only call — no vision fallback needed
        const result = await client.chat.completions.create({ model: modelState.active, messages: fallbackMessages, max_tokens: 256, response_format: { type: 'json_object' }, stream: false, ...providerBody(modelState.active, config.providerPreferences) }, { signal: config.abortSignal });
        const primaryUsage = extractUsage(result, stepNumber, 'verification', modelState.active, 0);
        const usage = visionObs.usage ? mergeUsage(visionObs.usage, primaryUsage) : primaryUsage;
        const content = extractAssistantText(result.choices?.[0]?.message?.content);
        if (!content || !content.trim()) {
            logger.debug('Verification JSON fallback (dual-model) returned empty response — trying plain-text fallback');
            return fallbackVerifyScreenshotText(client, config, modelState, screenshot, assessment, stepNumber, pageFingerprint, visionObs);
        }
        try {
            const json = JSON.parse(content);
            if (json.ready === true) {
                logger.success('Qualité du screenshot vérifiée');
                return accept(usage);
            }
            const reason = (typeof json.reason === 'string' && json.reason.trim()) || 'Verification failed';
            logger.ai(`Problème de vérification : ${reason}`);
            return reject(reason, usage);
        }
        catch {
            const snippet = (content || '').trim().slice(0, 160);
            return reject(`Verification fallback returned invalid JSON: ${snippet}`, usage);
        }
    }
    // Mono-model mode: send image directly with vision fallback
    const fallbackResult = await callVisionCapableModel({
        primaryModel: modelState.active,
        fallbackModel: modelState.active === config.model ? config.fallbackModel : undefined,
        onFallbackActivated: (model, reason) => {
            modelState.active = model;
            logger.info(`Vision fallback activated for JSON verification: ${model} (reason: ${reason})`);
        },
        callModel: (model) => client.chat.completions.create({
            model,
            messages: fallbackMessages,
            max_tokens: 256,
            response_format: { type: 'json_object' },
            stream: false,
            ...providerBody(model, config.providerPreferences),
        }, { signal: config.abortSignal }),
    });
    modelState.active = fallbackResult.model;
    const usage = extractUsage(fallbackResult.result, stepNumber, 'verification', fallbackResult.model, 1);
    // Some models return tool_calls even when no tools are provided — check before parsing content
    const fallbackToolCall = fallbackResult.result.choices?.[0]?.message?.tool_calls?.[0];
    if (fallbackToolCall && 'function' in fallbackToolCall) {
        if (fallbackToolCall.function.name === 'ready_to_capture') {
            logger.success('Qualité du screenshot vérifiée');
            return accept(usage);
        }
        if (fallbackToolCall.function.name === 'give_up') {
            // Tool-call arguments are model-generated text: they may be malformed,
            // empty or the literal `null`. The call is still a give_up, so fall back
            // to the generic reason instead of throwing out of the verifier.
            let tcArgs = null;
            try {
                tcArgs = JSON.parse(fallbackToolCall.function.arguments);
            }
            catch {
                tcArgs = null;
            }
            const reason = (typeof tcArgs?.reason === 'string' && tcArgs.reason.trim()) || 'Verification failed';
            logger.ai(`Problème de vérification : ${reason}`);
            return reject(reason, usage);
        }
    }
    const content = extractAssistantText(fallbackResult.result.choices?.[0]?.message?.content);
    // Guard against empty responses — some models don't support response_format:json_object.
    // Fall through to a plain-text verification call as a last resort.
    if (!content || !content.trim()) {
        logger.debug('Verification JSON fallback returned empty response — trying plain-text fallback');
        return fallbackVerifyScreenshotText(client, config, modelState, screenshot, assessment, stepNumber, pageFingerprint);
    }
    try {
        const json = JSON.parse(content);
        if (json.ready === true) {
            logger.success('Qualité du screenshot vérifiée');
            return accept(usage);
        }
        const reason = (typeof json.reason === 'string' && json.reason.trim()) || 'Verification failed';
        logger.ai(`Problème de vérification : ${reason}`);
        return reject(reason, usage);
    }
    catch {
        // JSON parsing failed — treat as verification failure
        const snippet = (content || '').trim().slice(0, 160);
        logger.debug(`Verification JSON fallback returned invalid JSON: ${snippet}`);
        return reject(snippet ? `Verification fallback returned invalid JSON: ${snippet}` : 'Verification returned no valid decision', usage);
    }
}
|
|
1333
|
+
/**
 * Last-resort verification: ask the model for a plain `READY` / `NOT_READY: <reason>` reply.
 *
 * When a vision observation is available (precomputed or obtained from
 * `getVisionObservation`), the active model is called text-only with that
 * observation. Otherwise the screenshot is sent directly through
 * `callVisionCapableModel`, which may switch `modelState.active`.
 *
 * @param client - Chat client exposing `chat.completions.create`.
 * @param config - Run configuration (prompt, models, run mode, variant manifest, ...).
 * @param modelState - Mutable holder of the active model name (`active`).
 * @param screenshot - Screenshot to verify.
 * @param assessment - The agent's own assessment text (truncated to 300 chars in the prompt).
 * @param stepNumber - Step index forwarded to `extractUsage`.
 * @param pageFingerprint - Fingerprint echoed back in the result.
 * @param precomputedVisionObs - Optional observation; pass `null` to force the image
 *   path, leave `undefined` to let this function call the observer.
 * @returns A decision object with `mode: 'text_fallback'`; `usage` is `null` when the
 *   model call itself failed.
 * @throws Re-throws errors that `isAbortError` recognises.
 */
async function fallbackVerifyScreenshotText(client, config, modelState, screenshot, assessment, stepNumber, pageFingerprint, precomputedVisionObs) {
    let verificationTarget = 'requested page';
    if (config.runMode === 'video_navigation_preflight') {
        verificationTarget = 'exact pre-recording start state';
    }
    const verifyPromptText = `Requested page: ${config.prompt.slice(0, 300)}\nAgent assessment: ${assessment.slice(0, 300)}\nIs this screenshot at the ${verificationTarget}?`;
    // Builds the "not verified" result shared by all failure exits.
    const notVerified = (reason, usage) => ({
        verified: false,
        reason,
        usage,
        blockingReason: reason,
        pageFingerprint,
        mode: 'text_fallback',
    });
    // Turns a non-empty model reply into a decision.
    const interpretReply = (reply, usage) => {
        const trimmed = reply.trim();
        if (trimmed.toUpperCase().startsWith('READY')) {
            logger.success('Qualité du screenshot vérifiée');
            return {
                verified: true,
                usage,
                matchedPageId: config.variantManifest?.currentPageId ?? null,
                pageFingerprint,
                mode: 'text_fallback',
            };
        }
        const explained = reply.match(/NOT[_\s-]?READY\s*[:\-—]\s*(.*)/i);
        // Fall back to the raw reply when no explicit reason follows the marker.
        const reason = explained?.[1]?.trim() || trimmed.slice(0, 200);
        logger.ai(`Problème de vérification : ${reason}`);
        return notVerified(reason, usage);
    };
    // Dual-model mode: reuse precomputed observation or call vision observer
    let visionObs = precomputedVisionObs;
    if (visionObs === undefined) {
        visionObs = await getVisionObservation(client, config, screenshot, verifyPromptText, stepNumber);
    }
    if (visionObs) {
        try {
            const textMessages = [
                {
                    role: 'system',
                    content: `You are a screenshot quality inspector. Reply with exactly one word: READY if the page is clean, fully loaded, and matches the ${verificationTarget}. Otherwise reply NOT_READY followed by a colon and a short reason. Example: "NOT_READY: cookie banner visible". Output nothing else.`,
                },
                { role: 'user', content: `Page observation (from vision model):\n${visionObs.observation}\n\n${verifyPromptText}` },
            ];
            const response = await client.chat.completions.create({ model: modelState.active, messages: textMessages, max_tokens: 64, stream: false, ...providerBody(modelState.active, config.providerPreferences) }, { signal: config.abortSignal });
            const primaryUsage = extractUsage(response, stepNumber, 'verification', modelState.active, 0);
            const usage = visionObs.usage ? mergeUsage(visionObs.usage, primaryUsage) : primaryUsage;
            const reply = extractAssistantText(response.choices?.[0]?.message?.content);
            if (!reply || !reply.trim()) {
                return notVerified('Verification model returned empty response', usage);
            }
            return interpretReply(reply, usage);
        }
        catch (err) {
            if (isAbortError(err)) {
                throw err;
            }
            logger.error(`Verification text fallback (dual-model) call failed: ${err.message}`);
            return notVerified('Verification text fallback call failed', null);
        }
    }
    // Mono-model mode: send image directly with vision fallback
    const screenshotUrl = await makeImageUrl(screenshot, 'image/jpeg', config.uploadImage);
    const messages = [
        {
            role: 'system',
            content: `You are a screenshot quality inspector. Reply with exactly one word: READY if the screenshot is clean, fully loaded, and matches the ${verificationTarget}. Otherwise reply NOT_READY followed by a colon and a short reason. Example: "NOT_READY: cookie banner visible". Output nothing else.`,
        },
        {
            role: 'user',
            content: [
                { type: 'image_url', image_url: { url: screenshotUrl } },
                { type: 'text', text: verifyPromptText },
            ],
        },
    ];
    try {
        const outcome = await callVisionCapableModel({
            primaryModel: modelState.active,
            fallbackModel: modelState.active === config.model ? config.fallbackModel : undefined,
            onFallbackActivated: (model, reason) => {
                modelState.active = model;
                logger.info(`Vision fallback activated for text verification: ${model} (reason: ${reason})`);
            },
            callModel: (model) => client.chat.completions.create({
                model,
                messages,
                max_tokens: 64,
                stream: false,
                ...providerBody(model, config.providerPreferences),
            }, { signal: config.abortSignal }),
        });
        modelState.active = outcome.model;
        const usage = extractUsage(outcome.result, stepNumber, 'verification', outcome.model, 1);
        const reply = extractAssistantText(outcome.result.choices?.[0]?.message?.content);
        if (!reply || !reply.trim()) {
            logger.debug('Verification text fallback returned empty response');
            return notVerified('Verification model returned empty response', usage);
        }
        return interpretReply(reply, usage);
    }
    catch (err) {
        if (isAbortError(err)) {
            throw err;
        }
        logger.error(`Verification text fallback call failed: ${err.message}`);
        return notVerified('Verification text fallback call failed', null);
    }
}
|
|
1430
|
+
/**
 * Dispatch a single agent action to the matching browser method.
 *
 * @param browser - Browser wrapper exposing the click/hover/type/scroll/... methods
 *   used below. `dismissOverlays` is optional; when absent, overlay dismissal falls
 *   back to `dismissCookiesAndWidgets(browser.currentPage)`.
 * @param action - Action name (`click`, `safe_expand`, `hover`, `type_text`,
 *   `select_option`, `scroll`, `scroll_to_element`, `press_key`, `navigate_to`,
 *   `wait`, `resize_viewport`, `dismiss_overlays`).
 * @param args - Action arguments; targets are resolved in the order
 *   index, selector, x/y coordinates.
 * @returns Resolves with `undefined` once the browser call(s) complete.
 * @throws Error when the action is unknown or a click/hover has no target.
 */
async function performBrowserAction(browser, action, args) {
    const dismissPageOverlays = async () => {
        if (typeof browser.dismissOverlays === 'function') {
            await browser.dismissOverlays();
            return;
        }
        await dismissCookiesAndWidgets(browser.currentPage);
    };
    switch (action) {
        case 'click':
        case 'safe_expand': {
            // safe_expand is always a hover; click becomes one when hover_only is set.
            const hoverOnly = args.hover_only === true || action === 'safe_expand';
            // Replay actions may need force:true to bypass pointer-event interception
            // (e.g., after dark theme switch, <html class="dark"> intercepts events briefly)
            const forceClick = args.__forceClick === true;
            if (args.index !== undefined) {
                if (hoverOnly) {
                    await browser.hoverByIndex(args.index);
                }
                else {
                    await browser.clickByIndex(args.index);
                }
            }
            else if (args.selector) {
                if (hoverOnly) {
                    await browser.hoverBySelector(args.selector);
                }
                else {
                    await browser.clickBySelector(args.selector, { force: forceClick });
                }
            }
            else if (args.x !== undefined && args.y !== undefined) {
                if (hoverOnly) {
                    await browser.hoverByCoordinates(args.x, args.y);
                }
                else {
                    await browser.clickByCoordinates(args.x, args.y);
                }
            }
            else {
                throw new Error('click requires index, selector, or x/y coordinates');
            }
            return;
        }
        case 'hover':
            if (args.index !== undefined) {
                await browser.hoverByIndex(args.index);
            }
            else if (args.selector) {
                await browser.hoverBySelector(args.selector);
            }
            else if (args.x !== undefined && args.y !== undefined) {
                await browser.hoverByCoordinates(args.x, args.y);
            }
            else {
                throw new Error('hover requires index, selector, or x/y coordinates');
            }
            return;
        case 'type_text':
            await browser.typeText(args.text, {
                index: args.index,
                selector: args.selector,
                clearFirst: args.clearFirst,
            });
            return;
        case 'select_option':
            await browser.selectOption({
                index: args.index,
                selector: args.selector,
                optionLabel: args.optionLabel,
                optionValue: args.optionValue,
                optionIndex: args.optionIndex,
            });
            return;
        case 'scroll':
            // An element index means "bring that element into view"; otherwise scroll by direction.
            if (args.index !== undefined) {
                await browser.scrollElementIntoView(args.index, {
                    align: args.align,
                    margin: args.margin,
                });
            }
            else {
                await browser.scroll(args.direction, args.amount, args.selector);
            }
            return;
        case 'scroll_to_element':
            await browser.scrollElementIntoView(args.index, {
                align: args.align,
                margin: args.margin,
            });
            return;
        case 'press_key':
            await browser.pressKey(args.key);
            return;
        case 'navigate_to':
            await browser.navigateTo(args.url);
            await dismissPageOverlays();
            return;
        case 'wait':
            // Defaults to 1000ms (also for 0) and is capped at 5000ms.
            await browser.wait(Math.min(args.milliseconds || 1000, 5000));
            return;
        case 'resize_viewport':
            await browser.resizeViewport(args.width, args.height);
            await browser.wait(500);
            await dismissPageOverlays();
            return;
        case 'dismiss_overlays':
            // Go through the shared helper so browsers without a dismissOverlays()
            // method get the same fallback as navigate_to / resize_viewport.
            await dismissPageOverlays();
            return;
        default:
            throw new Error(`Unknown action: ${action}`);
    }
}
|
|
1543
|
+
/**
 * Run one agent action against the browser and report what happened.
 *
 * Phases, each wrapped in `withAgentStepTimeout`:
 *  1. Capture a baseline observation (best effort; failure only disables phase 3).
 *  2. Execute the action via `performBrowserAction`.
 *  3. If a baseline exists, wait for the page reaction relative to it.
 *  4. If the URL changed (and the action was not `navigate_to`) and the new page
 *     shows overlay signals, dismiss overlays and fold the result into the outcome.
 *
 * @param browser - Browser wrapper used for observation, action and cleanup calls.
 * @param action - Action name, forwarded to `performBrowserAction`.
 * @param args - Action arguments.
 * @returns `{ success: true, outcome, stateChanged }` on success (`outcome` and
 *   `stateChanged` stay `undefined` when no reaction was measured), or
 *   `{ success: false, error }` when execution failed or a timeout could not be
 *   recovered.
 */
export async function executeAction(browser, action, args) {
    let before = null;
    let reaction = null;
    let outcome;
    let stateChanged;
    logActionProgress(action, 'capturing baseline observation');
    try {
        before = await withAgentStepTimeout(() => browser.captureObservation(), {
            stepLabel: `${action} baseline captureObservation`,
            timeoutMs: ACTION_OBSERVATION_TIMEOUT_MS,
        });
    }
    catch (error) {
        // The baseline is optional: both branches only log and continue without it.
        if (isAgentStepTimeoutError(error)) {
            logger.info(`${buildActionTimeoutMessage('baseline observation', action, error.timeoutMs)}; continuing without baseline.`);
            before = null;
        }
        else {
            logger.info(`Action progress: ${action} -> baseline observation unavailable (${error.message})`);
            before = null;
        }
    }
    // Reaction options are derived before the action runs, from the baseline state.
    const authSubmitAction = isLikelyAuthenticationSubmitAction(action, args, before);
    const reactionOptions = getReactionOptions(action, args, authSubmitAction);
    logActionProgress(action, 'executing browser action');
    try {
        await withAgentStepTimeout(() => performBrowserAction(browser, action, args), {
            stepLabel: `${action} browser execution`,
            timeoutMs: getActionExecutionTimeoutMs(action, args),
        });
    }
    catch (error) {
        if (isAgentStepTimeoutError(error)) {
            // A timed-out action may still have taken effect: run the diagnostic
            // recovery and report success if it says the page changed.
            const recovery = await performActionDiagnosticRecovery(browser, action, before);
            if (recovery.reaction?.changed) {
                return {
                    success: true,
                    outcome: `${buildActionTimeoutMessage('browser execution', action, error.timeoutMs)}; recovered via diagnostic observation: ${recovery.reaction.summary}`,
                    stateChanged: true,
                };
            }
            return {
                success: false,
                error: getDiagnosticRecoveryFailureMessage('browser execution', action, error.timeoutMs, { afterChanged: recovery.reaction?.changed, recoveryError: recovery.error }),
            };
        }
        // Non-timeout failures are returned with the message passed through enrichErrorMessage.
        return { success: false, error: enrichErrorMessage(error.message) };
    }
    if (before) {
        logActionProgress(action, 'waiting for page reaction');
        try {
            reaction = await withAgentStepTimeout(() => browser.waitForPageReaction(before, reactionOptions), {
                stepLabel: `${action} post-action waitForPageReaction`,
                // Outer guard is the reaction wait's own timeout plus 1200ms.
                timeoutMs: reactionOptions.timeoutMs + 1200,
            });
        }
        catch (error) {
            if (isAgentStepTimeoutError(error)) {
                // Same recovery strategy as for a timed-out execution.
                const recovery = await performActionDiagnosticRecovery(browser, action, before);
                if (recovery.reaction?.changed) {
                    return {
                        success: true,
                        outcome: `${buildActionTimeoutMessage('post-action reaction', action, error.timeoutMs)}; recovered via diagnostic observation: ${recovery.reaction.summary}`,
                        stateChanged: true,
                    };
                }
                return {
                    success: false,
                    error: getDiagnosticRecoveryFailureMessage('post-action reaction', action, error.timeoutMs, { afterChanged: recovery.reaction?.changed, recoveryError: recovery.error }),
                };
            }
            // Non-timeout reaction failures are not fatal: the action itself succeeded.
            logger.info(`Action progress: ${action} -> page reaction unavailable (${error.message})`);
            reaction = null;
        }
    }
    if (reaction) {
        outcome = reaction.summary;
        stateChanged = reaction.changed;
        // navigate_to is excluded here; performBrowserAction dismisses overlays for it itself.
        if (action !== 'navigate_to' && reaction.before.url !== reaction.after.url) {
            // Only attempt overlay dismissal when the new page signals possible overlays
            // (dialogs, loading indicators, or expanded elements). Skipping saves 1-3.5s per navigation.
            const hasOverlaySignals = reaction.after.dialogCount > 0
                || reaction.after.loadingIndicatorCount > 0
                || reaction.after.expandedCount > 0;
            if (hasOverlaySignals) {
                logActionProgress(action, 'post-navigation cleanup');
                try {
                    const cleanup = await withAgentStepTimeout(() => browser.dismissOverlays(), {
                        stepLabel: `${action} post-navigation dismissOverlays`,
                        timeoutMs: getActionExecutionTimeoutMs('dismiss_overlays', {}),
                    });
                    if (cleanup.dismissed) {
                        // Measure the effect of the cleanup; any failure here is
                        // swallowed and treated as "no cleanup reaction".
                        const cleanupReaction = await withAgentStepTimeout(() => browser.waitForPageReaction(reaction.after, {
                            timeoutMs: 1200,
                            settleMs: 200,
                        }), {
                            stepLabel: `${action} post-navigation waitForPageReaction`,
                            timeoutMs: 2400,
                        }).catch(() => null);
                        outcome = cleanupReaction
                            ? `${reaction.summary}; cleanup: ${cleanupReaction.summary}; page cleanup reapplied after navigation.`
                            : `${reaction.summary}; page cleanup reapplied after navigation.`;
                        stateChanged = reaction.changed || cleanupReaction?.changed || stateChanged;
                    }
                    else {
                        outcome = `${reaction.summary}; page cleanup checked after navigation.`;
                    }
                }
                catch (error) {
                    // Cleanup problems never fail the action; they are only noted in the outcome.
                    if (isAgentStepTimeoutError(error)) {
                        outcome = `${reaction.summary}; post-navigation cleanup timed out after ${error.timeoutMs}ms.`;
                    }
                    else {
                        outcome = `${reaction.summary}; post-navigation cleanup skipped: ${error.message}`;
                    }
                }
            }
        }
    }
    return { success: true, outcome, stateChanged };
}
|
|
1664
|
+
/**
 * Append an actionable hint to a browser error message when it matches a known
 * failure pattern.
 *
 * Matching is case-insensitive and the first matching rule wins, in the order
 * listed below. Messages that match no rule are returned unchanged.
 *
 * @param message - Raw error message. Non-string values are tolerated because
 *   callers pass `error.message`, which is `undefined` when a non-Error value was
 *   thrown: `null`/`undefined` become `'Unknown error'`, anything else is stringified.
 * @returns The message, followed by `. HINT: ...` when a rule matched.
 */
function enrichErrorMessage(message) {
    let text = message;
    if (typeof text !== 'string') {
        text = text == null ? 'Unknown error' : String(text);
    }
    const msg = text.toLowerCase();
    const has = (needle) => msg.includes(needle);
    // Ordered rule table: earlier entries take precedence over later ones.
    const rules = [
        {
            matches: () => has('timeout') && (has('element') || has('selector') || has('locator')),
            hint: 'Element not found or not interactable. Try search_text to locate it, scroll to reveal it, or use a different element/approach.',
        },
        {
            matches: () => has('not visible') || has('hidden') || has('display: none'),
            hint: 'Element is hidden. It may be inside a collapsed menu or behind an overlay. Try safe_expand on a parent trigger, or dismiss_overlays first.',
        },
        {
            matches: () => has('outside') && has('viewport'),
            hint: 'Element is outside the viewport. Use scroll_to_element to bring it into view first.',
        },
        {
            matches: () => has('navigation') && has('timeout'),
            hint: 'Page load timed out. The site may be slow. Try wait(2000) and then continue.',
        },
        {
            matches: () => has('resolved to 0') || has('no element') || has('strict mode'),
            hint: 'CSS selector matched nothing or too many elements. Use element index instead, or search_text to find the element.',
        },
        {
            matches: () => has('intercept') || has('other element would receive'),
            hint: 'Another element is covering the target (overlay, banner, or modal). Use dismiss_overlays or press_key("Escape") first.',
        },
    ];
    const rule = rules.find((candidate) => candidate.matches());
    return rule ? `${text}. HINT: ${rule.hint}` : text;
}
|
|
1686
|
+
async function verifyScreenshot(client, config, modelState, browser, assessment, stepNumber, options) {
|
|
1687
|
+
throwIfAborted(config.abortSignal, 'Agent verification cancelled.');
|
|
1688
|
+
logger.info('Vérification de la qualité du screenshot…');
|
|
1689
|
+
let verificationBundle;
|
|
1690
|
+
try {
|
|
1691
|
+
verificationBundle = await withAgentStepTimeout(() => browser.captureVerificationBundle(), {
|
|
1692
|
+
stepLabel: 'verification captureVerificationBundle',
|
|
1693
|
+
timeoutMs: VERIFICATION_BUNDLE_TIMEOUT_MS,
|
|
1694
|
+
});
|
|
1695
|
+
logger.debug(`Verification snapshot ready [${verificationBundle.coherenceKey}] @ ${verificationBundle.url}`);
|
|
1696
|
+
}
|
|
1697
|
+
catch (error) {
|
|
1698
|
+
if (isAgentStepTimeoutError(error)) {
|
|
1699
|
+
const reason = buildVerificationTimeoutMessage('captureVerificationBundle', error.timeoutMs);
|
|
1700
|
+
logger.ai(`Problème de vérification : ${reason}`);
|
|
1701
|
+
return {
|
|
1702
|
+
verified: false,
|
|
1703
|
+
reason,
|
|
1704
|
+
usage: null,
|
|
1705
|
+
blockingReason: reason,
|
|
1706
|
+
mode: 'deterministic',
|
|
1707
|
+
};
|
|
1708
|
+
}
|
|
1709
|
+
const reason = error.message;
|
|
1710
|
+
logger.ai(`Problème de vérification : ${reason}`);
|
|
1711
|
+
return {
|
|
1712
|
+
verified: false,
|
|
1713
|
+
reason,
|
|
1714
|
+
usage: null,
|
|
1715
|
+
blockingReason: reason,
|
|
1716
|
+
mode: 'deterministic',
|
|
1717
|
+
};
|
|
1718
|
+
}
|
|
1719
|
+
if (config.runMode === 'language_preflight') {
|
|
1720
|
+
try {
|
|
1721
|
+
const reason = buildLanguagePreflightVerificationFailureReason({
|
|
1722
|
+
requestedLang: config.currentLang,
|
|
1723
|
+
requestedTheme: config.currentTheme,
|
|
1724
|
+
currentUrl: verificationBundle.url,
|
|
1725
|
+
signals: verificationBundle.pageSignals,
|
|
1726
|
+
});
|
|
1727
|
+
if (!reason) {
|
|
1728
|
+
logger.success('Qualité du screenshot vérifiée');
|
|
1729
|
+
return {
|
|
1730
|
+
verified: true,
|
|
1731
|
+
usage: null,
|
|
1732
|
+
matchedPageId: config.variantManifest?.currentPageId ?? null,
|
|
1733
|
+
mode: 'deterministic',
|
|
1734
|
+
};
|
|
1735
|
+
}
|
|
1736
|
+
logger.ai(`Problème de vérification : ${reason}`);
|
|
1737
|
+
return {
|
|
1738
|
+
verified: false,
|
|
1739
|
+
reason,
|
|
1740
|
+
usage: null,
|
|
1741
|
+
blockingReason: reason,
|
|
1742
|
+
mode: 'deterministic',
|
|
1743
|
+
};
|
|
1744
|
+
}
|
|
1745
|
+
catch (error) {
|
|
1746
|
+
const reason = error.message;
|
|
1747
|
+
logger.ai(`Problème de vérification : ${reason}`);
|
|
1748
|
+
return {
|
|
1749
|
+
verified: false,
|
|
1750
|
+
reason,
|
|
1751
|
+
usage: null,
|
|
1752
|
+
blockingReason: reason,
|
|
1753
|
+
mode: 'deterministic',
|
|
1754
|
+
};
|
|
1755
|
+
}
|
|
1756
|
+
}
|
|
1757
|
+
const allowDeterministicSuccess = options?.allowDeterministicSuccess
|
|
1758
|
+
?? (!config.variantManifest?.currentPageId
|
|
1759
|
+
&& (config.variantManifest?.previousValidatedCaptures.length ?? 0) === 0);
|
|
1760
|
+
{
|
|
1761
|
+
const deterministicCheck = inferDeterministicReadyDecision({
|
|
1762
|
+
observation: verificationBundle.observation,
|
|
1763
|
+
assessment,
|
|
1764
|
+
targetUrl: config.url,
|
|
1765
|
+
skipDialogCheck: options?.skipDialogCheck,
|
|
1766
|
+
allowSuccess: allowDeterministicSuccess,
|
|
1767
|
+
});
|
|
1768
|
+
if (deterministicCheck?.verified) {
|
|
1769
|
+
// For video navigation preflight, also check that we're not on a generic dashboard.
|
|
1770
|
+
// The deterministic check may pass on /home when the assessment mentions the project,
|
|
1771
|
+
// but the agent hasn't actually navigated into it.
|
|
1772
|
+
const isGenericDashboard = config.runMode === 'video_navigation_preflight'
|
|
1773
|
+
&& verificationBundle.url
|
|
1774
|
+
&& /^\/(home|dashboard|app)?\/?$/.test(new URL(verificationBundle.url).pathname);
|
|
1775
|
+
if (isGenericDashboard) {
|
|
1776
|
+
logger.ai('Problème de vérification : la page est un dashboard générique, le projet/section demandé n\'est pas encore ouvert.');
|
|
1777
|
+
return {
|
|
1778
|
+
verified: false,
|
|
1779
|
+
reason: 'Still on generic dashboard (/home). Navigate into the specific project/section before calling ready_to_capture.',
|
|
1780
|
+
usage: null,
|
|
1781
|
+
blockingReason: 'generic_dashboard',
|
|
1782
|
+
pageFingerprint: computeScreenshotFingerprint(verificationBundle.screenshot),
|
|
1783
|
+
mode: 'deterministic',
|
|
1784
|
+
};
|
|
1785
|
+
}
|
|
1786
|
+
logger.success('Qualité du screenshot vérifiée');
|
|
1787
|
+
return {
|
|
1788
|
+
verified: true,
|
|
1789
|
+
usage: null,
|
|
1790
|
+
matchedPageId: config.variantManifest?.currentPageId ?? null,
|
|
1791
|
+
pageFingerprint: computeScreenshotFingerprint(verificationBundle.screenshot),
|
|
1792
|
+
mode: 'deterministic',
|
|
1793
|
+
};
|
|
1794
|
+
}
|
|
1795
|
+
if (deterministicCheck && !deterministicCheck.verified) {
|
|
1796
|
+
logger.ai(`Problème de vérification : ${deterministicCheck.reason}`);
|
|
1797
|
+
return {
|
|
1798
|
+
verified: false,
|
|
1799
|
+
reason: deterministicCheck.reason,
|
|
1800
|
+
usage: null,
|
|
1801
|
+
blockingReason: deterministicCheck.reason,
|
|
1802
|
+
pageFingerprint: computeScreenshotFingerprint(verificationBundle.screenshot),
|
|
1803
|
+
mode: 'deterministic',
|
|
1804
|
+
};
|
|
1805
|
+
}
|
|
1806
|
+
}
|
|
1807
|
+
const screenshotBuf = verificationBundle.screenshot;
|
|
1808
|
+
const pageFingerprint = computeScreenshotFingerprint(screenshotBuf);
|
|
1809
|
+
const duplicateOfPageId = findDuplicateVariantCapture(config.variantManifest, pageFingerprint);
|
|
1810
|
+
if (duplicateOfPageId) {
|
|
1811
|
+
const reason = `Duplicate capture detected: current page matches previously validated page "${duplicateOfPageId}". Reach a distinct state for "${config.variantManifest?.currentPageId ?? 'current'}" before capturing.`;
|
|
1812
|
+
logger.ai(`Problème de vérification : ${reason}`);
|
|
1813
|
+
return {
|
|
1814
|
+
verified: false,
|
|
1815
|
+
reason,
|
|
1816
|
+
usage: null,
|
|
1817
|
+
duplicateOfPageId,
|
|
1818
|
+
blockingReason: reason,
|
|
1819
|
+
pageFingerprint,
|
|
1820
|
+
mode: 'vision',
|
|
1821
|
+
};
|
|
1822
|
+
}
|
|
1823
|
+
const pageSignals = verificationBundle.pageSignals;
|
|
1824
|
+
const latestObservation = verificationBundle.observation;
|
|
1825
|
+
// Content-based identity checks are advisory — the LLM sees the screenshot
|
|
1826
|
+
// and can judge correctly, so we pass these as hints instead of hard rejections.
|
|
1827
|
+
const identityHint = config.runMode === 'video_navigation_preflight' && !config.variantManifest?.currentPageIdentity
|
|
1828
|
+
? null
|
|
1829
|
+
: inferVariantIdentityFailure({
|
|
1830
|
+
pageId: config.variantManifest?.currentPageId ?? null,
|
|
1831
|
+
prompt: config.prompt,
|
|
1832
|
+
currentUrl: verificationBundle.url,
|
|
1833
|
+
visibleText: [
|
|
1834
|
+
pageSignals.title,
|
|
1835
|
+
pageSignals.headings.join(' '),
|
|
1836
|
+
pageSignals.navLabels.join(' '),
|
|
1837
|
+
pageSignals.visibleText,
|
|
1838
|
+
].filter(Boolean).join(' '),
|
|
1839
|
+
dialogCount: latestObservation.dialogCount,
|
|
1840
|
+
pageIdentity: config.variantManifest?.currentPageIdentity,
|
|
1841
|
+
});
|
|
1842
|
+
if (identityHint) {
|
|
1843
|
+
logger.debug(`Identity hint for LLM verification: ${identityHint}`);
|
|
1844
|
+
}
|
|
1845
|
+
const variantMismatchHint = config.runMode === 'video_navigation_preflight'
|
|
1846
|
+
? buildLanguagePreflightVerificationFailureReason({
|
|
1847
|
+
requestedLang: config.currentLang,
|
|
1848
|
+
requestedTheme: config.currentTheme,
|
|
1849
|
+
currentUrl: verificationBundle.url,
|
|
1850
|
+
signals: verificationBundle.pageSignals,
|
|
1851
|
+
}) || null
|
|
1852
|
+
: null;
|
|
1853
|
+
if (variantMismatchHint) {
|
|
1854
|
+
logger.debug(`Variant hint for LLM verification: ${variantMismatchHint}`);
|
|
1855
|
+
}
|
|
1856
|
+
// Manifest-aware deterministic check — only trust its ACCEPT decisions.
|
|
1857
|
+
// Rejections based on content analysis are passed to the LLM as hints.
|
|
1858
|
+
const manifestReadyDecision = inferManifestReadyDecision({
|
|
1859
|
+
observation: latestObservation,
|
|
1860
|
+
assessment,
|
|
1861
|
+
config,
|
|
1862
|
+
bundle: verificationBundle,
|
|
1863
|
+
});
|
|
1864
|
+
if (manifestReadyDecision?.verified) {
|
|
1865
|
+
logger.success('Qualité du screenshot vérifiée');
|
|
1866
|
+
return {
|
|
1867
|
+
verified: true,
|
|
1868
|
+
usage: null,
|
|
1869
|
+
matchedPageId: config.variantManifest?.currentPageId ?? null,
|
|
1870
|
+
pageFingerprint,
|
|
1871
|
+
mode: 'deterministic',
|
|
1872
|
+
};
|
|
1873
|
+
}
|
|
1874
|
+
// Manifest rejection is downgraded to an advisory hint for the LLM
|
|
1875
|
+
const manifestHint = manifestReadyDecision && !manifestReadyDecision.verified
|
|
1876
|
+
? manifestReadyDecision.reason
|
|
1877
|
+
: null;
|
|
1878
|
+
if (manifestHint) {
|
|
1879
|
+
logger.debug(`Manifest hint for LLM verification: ${manifestHint}`);
|
|
1880
|
+
}
|
|
1881
|
+
// Page-signal deterministic check: combine assessment parse, identity check,
|
|
1882
|
+
// stability, variant match, and dialog absence into a single accept gate.
|
|
1883
|
+
// Never rejects — only accepts or returns null (inconclusive → LLM).
|
|
1884
|
+
const pageSignalDecision = inferPageSignalReadyDecision({
|
|
1885
|
+
observation: latestObservation,
|
|
1886
|
+
assessment,
|
|
1887
|
+
config,
|
|
1888
|
+
bundle: verificationBundle,
|
|
1889
|
+
identityFailure: identityHint,
|
|
1890
|
+
});
|
|
1891
|
+
if (pageSignalDecision?.verified) {
|
|
1892
|
+
logger.success('Qualité du screenshot vérifiée');
|
|
1893
|
+
return {
|
|
1894
|
+
verified: true,
|
|
1895
|
+
usage: null,
|
|
1896
|
+
matchedPageId: config.variantManifest?.currentPageId ?? null,
|
|
1897
|
+
pageFingerprint,
|
|
1898
|
+
mode: 'deterministic',
|
|
1899
|
+
};
|
|
1900
|
+
}
|
|
1901
|
+
// Dialog presence hint — let the LLM judge whether the dialog is the intended
|
|
1902
|
+
// capture target or an obstruction, rather than making a deterministic call.
|
|
1903
|
+
const dialogHint = latestObservation.dialogCount > 0
|
|
1904
|
+
? `A dialog/modal is currently open (${latestObservation.dialogCount}). Judge whether this dialog is the intended capture target or an unwanted overlay blocking the page.`
|
|
1905
|
+
: null;
|
|
1906
|
+
const pageContext = {
|
|
1907
|
+
currentUrl: verificationBundle.url,
|
|
1908
|
+
pageTitle: verificationBundle.title,
|
|
1909
|
+
};
|
|
1910
|
+
const verificationDiagnostics = summarizeVerificationDiagnostics(config, verificationBundle);
|
|
1911
|
+
const verificationTools = agentTools.filter(t => ['ready_to_capture', 'give_up'].includes(t.function.name));
|
|
1912
|
+
// Dual-model mode: use vision observer to describe the page, then text-only primary model
|
|
1913
|
+
const visionObs = await getVisionObservation(client, config, screenshotBuf, buildVerificationVisionContext({
|
|
1914
|
+
config,
|
|
1915
|
+
pageContext,
|
|
1916
|
+
assessment,
|
|
1917
|
+
}), stepNumber);
|
|
1918
|
+
const screenshotUrl = visionObs ? '' : await makeImageUrl(screenshotBuf, 'image/jpeg', config.uploadImage);
|
|
1919
|
+
const rawParts = buildVerificationMessage({
|
|
1920
|
+
userPrompt: config.prompt,
|
|
1921
|
+
screenshotUrl,
|
|
1922
|
+
previousAssessment: assessment,
|
|
1923
|
+
runMode: config.runMode,
|
|
1924
|
+
currentLang: config.currentLang,
|
|
1925
|
+
currentTheme: config.currentTheme,
|
|
1926
|
+
pageContext,
|
|
1927
|
+
runHints: config.runHints,
|
|
1928
|
+
variantManifest: config.variantManifest,
|
|
1929
|
+
verificationDiagnostics,
|
|
1930
|
+
identityHints: [identityHint, manifestHint, dialogHint, variantMismatchHint].filter(Boolean),
|
|
1931
|
+
});
|
|
1932
|
+
const userContent = visionObs ? replaceImagesWithObservation(rawParts, visionObs.observation) : rawParts;
|
|
1933
|
+
const systemPrompt = config.runMode === 'video_navigation_preflight'
|
|
1934
|
+
? 'You are a video navigation verification inspector. You MUST respond by calling exactly one tool — do NOT reply with text.\n'
|
|
1935
|
+
+ '- Call ready_to_capture ONLY if the screenshot shows the EXACT pre-recording start state.\n'
|
|
1936
|
+
+ '- Reject generic /home or dashboard states when the requested project, section, tab, or dialog is not actually open yet.\n'
|
|
1937
|
+
+ '- Reject if the fixed app chrome is in the wrong language or theme.\n'
|
|
1938
|
+
+ '- Reject if overlays, popups, spinners, or unrelated dialogs remain.\n'
|
|
1939
|
+
+ '- Call give_up with a short reason for any mismatch or uncertainty.\n'
|
|
1940
|
+
+ 'Pick one tool and call it now.'
|
|
1941
|
+
: 'You are a screenshot quality inspector. You MUST respond by calling exactly one tool — do NOT reply with text.\n'
|
|
1942
|
+
+ '- Call ready_to_capture if the screenshot is clean, fully loaded, free of overlays/spinners, and matches the user request.\n'
|
|
1943
|
+
+ '- Call give_up with a reason if there are issues (overlays, wrong page, loading state, etc.).\n'
|
|
1944
|
+
+ 'Pick one tool and call it now.';
|
|
1945
|
+
const messages = [
|
|
1946
|
+
{
|
|
1947
|
+
role: 'system',
|
|
1948
|
+
content: systemPrompt,
|
|
1949
|
+
},
|
|
1950
|
+
{ role: 'user', content: userContent },
|
|
1951
|
+
];
|
|
1952
|
+
try {
|
|
1953
|
+
let visionResult;
|
|
1954
|
+
if (visionObs) {
|
|
1955
|
+
// Text-only call — no vision fallback needed
|
|
1956
|
+
logger.info('Vérification de la capture…');
|
|
1957
|
+
const result = await callWithRetry(client, {
|
|
1958
|
+
model: modelState.active,
|
|
1959
|
+
messages,
|
|
1960
|
+
tools: verificationTools,
|
|
1961
|
+
tool_choice: 'required',
|
|
1962
|
+
max_tokens: 1024,
|
|
1963
|
+
}, 3, config.abortSignal, config.providerPreferences);
|
|
1964
|
+
visionResult = { result, model: modelState.active, fellBack: false };
|
|
1965
|
+
}
|
|
1966
|
+
else {
|
|
1967
|
+
// Mono-model mode: send image directly with vision fallback
|
|
1968
|
+
logger.info('Vérification de la capture…');
|
|
1969
|
+
visionResult = await callVisionCapableModel({
|
|
1970
|
+
primaryModel: modelState.active,
|
|
1971
|
+
fallbackModel: modelState.active === config.model ? config.fallbackModel : undefined,
|
|
1972
|
+
onFallbackActivated: (model, reason) => {
|
|
1973
|
+
modelState.active = model;
|
|
1974
|
+
logger.debug(`Vision fallback activated for verification: ${model} (reason: ${reason})`);
|
|
1975
|
+
},
|
|
1976
|
+
callModel: (model) => callWithRetry(client, {
|
|
1977
|
+
model,
|
|
1978
|
+
messages,
|
|
1979
|
+
tools: verificationTools,
|
|
1980
|
+
tool_choice: 'required',
|
|
1981
|
+
max_tokens: 1024,
|
|
1982
|
+
}, 3, config.abortSignal, config.providerPreferences),
|
|
1983
|
+
});
|
|
1984
|
+
}
|
|
1985
|
+
const response = visionResult.result;
|
|
1986
|
+
if (visionResult.model && visionResult.model !== modelState.active) {
|
|
1987
|
+
logger.debug(`OpenRouter model substitution detected: requested "${modelState.active}", got "${visionResult.model}"`);
|
|
1988
|
+
}
|
|
1989
|
+
modelState.active = visionResult.model;
|
|
1990
|
+
const primaryUsage = extractUsage(response, stepNumber, 'verification', visionResult.model, visionObs ? 0 : 1);
|
|
1991
|
+
// Merge vision observer usage if dual-model mode was used
|
|
1992
|
+
const usage = visionObs?.usage ? mergeUsage(visionObs.usage, primaryUsage) : primaryUsage;
|
|
1993
|
+
const toolCall = response.choices?.[0]?.message?.tool_calls?.[0];
|
|
1994
|
+
const assistantText = extractAssistantText(response.choices?.[0]?.message?.content);
|
|
1995
|
+
if (toolCall && 'function' in toolCall) {
|
|
1996
|
+
if (toolCall.function.name === 'ready_to_capture') {
|
|
1997
|
+
logger.success('Qualité du screenshot vérifiée');
|
|
1998
|
+
return {
|
|
1999
|
+
verified: true,
|
|
2000
|
+
usage,
|
|
2001
|
+
matchedPageId: config.variantManifest?.currentPageId ?? null,
|
|
2002
|
+
pageFingerprint,
|
|
2003
|
+
mode: 'vision',
|
|
2004
|
+
};
|
|
2005
|
+
}
|
|
2006
|
+
if (toolCall.function.name === 'give_up') {
|
|
2007
|
+
const args = JSON.parse(toolCall.function.arguments);
|
|
2008
|
+
logger.ai(`Problème de vérification : ${args.reason}`);
|
|
2009
|
+
return {
|
|
2010
|
+
verified: false,
|
|
2011
|
+
reason: args.reason,
|
|
2012
|
+
usage,
|
|
2013
|
+
blockingReason: args.reason,
|
|
2014
|
+
pageFingerprint,
|
|
2015
|
+
mode: 'vision',
|
|
2016
|
+
};
|
|
2017
|
+
}
|
|
2018
|
+
}
|
|
2019
|
+
// No tool call found — try to extract a verdict from the assistant's text response
|
|
2020
|
+
// before falling back to a separate JSON verification call.
|
|
2021
|
+
if (assistantText) {
|
|
2022
|
+
const textDecision = parseVerificationDecisionText(assistantText);
|
|
2023
|
+
if (textDecision?.verified) {
|
|
2024
|
+
logger.success('Qualité du screenshot vérifiée');
|
|
2025
|
+
return {
|
|
2026
|
+
verified: true,
|
|
2027
|
+
usage,
|
|
2028
|
+
matchedPageId: config.variantManifest?.currentPageId ?? null,
|
|
2029
|
+
pageFingerprint,
|
|
2030
|
+
mode: 'text_fallback',
|
|
2031
|
+
};
|
|
2032
|
+
}
|
|
2033
|
+
if (textDecision && !textDecision.verified) {
|
|
2034
|
+
const reason = textDecision.reason ?? assistantText.slice(0, 400);
|
|
2035
|
+
logger.ai(`Problème de vérification : ${reason}`);
|
|
2036
|
+
return {
|
|
2037
|
+
verified: false,
|
|
2038
|
+
reason,
|
|
2039
|
+
usage,
|
|
2040
|
+
blockingReason: reason,
|
|
2041
|
+
pageFingerprint,
|
|
2042
|
+
mode: 'text_fallback',
|
|
2043
|
+
};
|
|
2044
|
+
}
|
|
2045
|
+
}
|
|
2046
|
+
// For models that generally support tool use well, skip the JSON fallback
|
|
2047
|
+
// (which costs an extra LLM call) and go straight to the cheaper text fallback.
|
|
2048
|
+
// The JSON fallback is only useful for models that support response_format but not tool_choice.
|
|
2049
|
+
const modelLower = (modelState.active || '').toLowerCase();
|
|
2050
|
+
const hasReliableToolUse = /claude|gpt-4|gemini|sonnet|opus|haiku/.test(modelLower);
|
|
2051
|
+
if (hasReliableToolUse) {
|
|
2052
|
+
logger.debug(`Verification returned no tool call from reliable model; skipping JSON fallback, trying text fallback directly. Model text: ${(assistantText || '(empty)').slice(0, 300)}`);
|
|
2053
|
+
return fallbackVerifyScreenshotText(client, config, modelState, screenshotBuf, assessment, stepNumber, pageFingerprint, visionObs);
|
|
2054
|
+
}
|
|
2055
|
+
logger.debug(`Verification returned no structured tool call; retrying with JSON fallback. Model text: ${(assistantText || '(empty)').slice(0, 300)}`);
|
|
2056
|
+
const fallback = await fallbackVerifyScreenshotJson(client, config, modelState, screenshotBuf, assessment, pageContext, stepNumber, pageFingerprint, visionObs);
|
|
2057
|
+
return {
|
|
2058
|
+
...fallback,
|
|
2059
|
+
usage: mergeUsage(usage, fallback.usage),
|
|
2060
|
+
};
|
|
2061
|
+
}
|
|
2062
|
+
catch (err) {
|
|
2063
|
+
if (isAbortError(err)) {
|
|
2064
|
+
throw err;
|
|
2065
|
+
}
|
|
2066
|
+
logger.error(`Verification call failed: ${err.message}`);
|
|
2067
|
+
return {
|
|
2068
|
+
verified: false,
|
|
2069
|
+
reason: err.message,
|
|
2070
|
+
usage: null,
|
|
2071
|
+
fatal: err instanceof VisionModelUnsupportedError,
|
|
2072
|
+
blockingReason: err.message,
|
|
2073
|
+
mode: 'vision',
|
|
2074
|
+
};
|
|
2075
|
+
}
|
|
2076
|
+
}
|
|
2077
|
+
/**
 * Runs one screenshot verification pass with a freshly created client.
 *
 * The active model starts at `config.model`. When the caller supplies no
 * assessment or step number, a generic preflight assessment and step 1 are used.
 *
 * @param browser Browser handle forwarded to verifyScreenshot.
 * @param config Run configuration; `config.model` seeds the active model.
 * @param apiKey Key used to create the client.
 * @param options Optional `assessment` and `stepNumber` overrides.
 * @returns The result of verifyScreenshot.
 */
export async function verifyCaptureReadiness(browser, config, apiKey, options = {}) {
    const client = createClient(apiKey);
    const modelState = { active: config.model };
    const { assessment, stepNumber } = options;
    return verifyScreenshot(
        client,
        config,
        modelState,
        browser,
        assessment ?? 'Preflight readiness check for a carried-over page state.',
        stepNumber ?? 1,
    );
}
|
|
2082
|
+
// Meta-actions never interact with the browser, so they are excluded from
// stuck/no-effect detection.
const META_ACTIONS = new Set([
    'note',
    'begin_subgoal',
    'ready_to_capture',
]);
// Action names that isBootstrapStabilizationAction always treats as
// bootstrap/stabilization steps.
const BOOTSTRAP_ACTIONS = new Set([
    'dismiss_overlays',
    'wait',
]);
|
|
2085
|
+
/**
 * Tells whether a recorded action had no observable effect.
 *
 * An action counts as "no effect" when it explicitly failed, explicitly
 * reported no state change, or carries the canonical no-change outcome text.
 *
 * @param action Recorded action entry (`success`, `stateChanged`, `outcome`).
 * @returns {boolean} true when the action had no effect.
 */
function isNoEffectAction(action) {
    if (action.success === false) {
        return true;
    }
    if (action.stateChanged === false) {
        return true;
    }
    return action.outcome === 'No visible state change detected after the action.';
}
|
|
2090
|
+
/**
 * Tells whether an action is a bootstrap/stabilization step: any action listed
 * in BOOTSTRAP_ACTIONS, or a `press_key` whose key is Escape (case-insensitive).
 *
 * @param action Recorded action entry (`action`, `params`).
 * @returns {boolean} true for bootstrap/stabilization actions.
 */
function isBootstrapStabilizationAction(action) {
    const actionName = action.action;
    if (!BOOTSTRAP_ACTIONS.has(actionName)) {
        if (actionName !== 'press_key') {
            return false;
        }
        const pressedKey = String(action.params.key || '');
        return pressedKey.toLowerCase() === 'escape';
    }
    return true;
}
|
|
2095
|
+
/**
 * Tells whether the history contains at least one action that is neither a
 * meta-action nor a bootstrap/stabilization action.
 *
 * @param actionHistory Array of recorded action entries.
 * @returns {boolean} true when a meaningful browser action is present.
 */
function hasMeaningfulBrowserAction(actionHistory) {
    // "Some entry is meaningful" is the negation of "every entry is meta or bootstrap".
    const onlyHousekeeping = actionHistory.every((entry) => META_ACTIONS.has(entry.action) || isBootstrapStabilizationAction(entry));
    return !onlyHousekeeping;
}
|
|
2098
|
+
/**
 * Builds a `|`-joined signature for an action: the action name, then at most
 * one target locator (selector, else index, else url truncated to 160 chars,
 * else x/y coordinates), then at most one option (label, else value), then the key.
 *
 * @param action Recorded action entry (`action`, `params`).
 * @returns {string} The signature string.
 */
function buildRecoveryActionSignature(action) {
    const { params } = action;
    const isFilled = (value) => typeof value === 'string' && value.trim().length > 0;
    // Only the first available locator is used, in this priority order.
    const locatorSegment = () => {
        if (isFilled(params.selector)) {
            return `selector:${params.selector.trim()}`;
        }
        if (params.index !== undefined) {
            return `index:${String(params.index)}`;
        }
        if (isFilled(params.url)) {
            return `url:${params.url.trim().slice(0, 160)}`;
        }
        if (params.x !== undefined && params.y !== undefined) {
            return `xy:${String(params.x)},${String(params.y)}`;
        }
        return null;
    };
    // The option label takes precedence over the option value.
    const optionSegment = () => {
        if (isFilled(params.optionLabel)) {
            return `optionLabel:${params.optionLabel.trim()}`;
        }
        if (isFilled(params.optionValue)) {
            return `optionValue:${params.optionValue.trim()}`;
        }
        return null;
    };
    const keySegment = () => (isFilled(params.key) ? `key:${params.key.trim()}` : null);
    const segments = [action.action];
    for (const produce of [locatorSegment, optionSegment, keySegment]) {
        const segment = produce();
        if (segment !== null) {
            segments.push(segment);
        }
    }
    return segments.join('|');
}
|
|
2123
|
+
/**
 * Loosely compares two URLs.
 *
 * They match when they share an origin and the current pathname equals the
 * expected pathname or is a sub-path of it at a `/` boundary
 * (e.g. /products matches /products/123 but NOT /products-edit).
 * If either value cannot be parsed as a URL, plain string equality is used.
 *
 * @param expectedUrl URL that is expected.
 * @param currentUrl URL to check against it.
 * @returns {boolean} true when the URLs roughly match; false if either is falsy.
 */
function urlsRoughlyMatch(expectedUrl, currentUrl) {
    if (!expectedUrl || !currentUrl) {
        return false;
    }
    let expected;
    let current;
    try {
        expected = new URL(expectedUrl);
        current = new URL(currentUrl);
    }
    catch {
        // Unparseable input: fall back to an exact string comparison.
        return currentUrl === expectedUrl;
    }
    if (expected.origin !== current.origin) {
        return false;
    }
    const expectedPath = expected.pathname || '/';
    const currentPath = current.pathname || '/';
    if (currentPath === expectedPath) {
        return true;
    }
    if (!currentPath.startsWith(expectedPath)) {
        return false;
    }
    // A prefix only counts when it ends on a path-segment boundary.
    return expectedPath.endsWith('/') || currentPath.charAt(expectedPath.length) === '/';
}
|
|
2146
|
+
/**
 * Reduces a URL to `origin + pathname` without a trailing slash, dropping the
 * query string and fragment. Values that cannot be parsed as a URL are trimmed
 * and stripped of one trailing slash. Non-string or blank input yields ''.
 *
 * @param value Candidate URL.
 * @returns {string} Normalized page key.
 */
function normalizeGuardPageUrl(value) {
    const isUsable = typeof value === 'string' && value.trim().length !== 0;
    if (!isUsable) {
        return '';
    }
    const dropTrailingSlash = (text) => text.replace(/\/$/, '');
    let parsed;
    try {
        parsed = new URL(value);
    }
    catch {
        return dropTrailingSlash(value.trim());
    }
    const { origin, pathname } = parsed;
    return dropTrailingSlash(`${origin}${pathname}`) || origin;
}
|
|
2157
|
+
/**
 * Builds a `|`-joined anchor describing a click target from every available
 * hint: trimmed selector, index, lower-cased trimmed label, and normalized href.
 *
 * @param params Click parameters (`selector`, `index`, `elementLabel`, `href`).
 * @returns {string|null} The anchor, or null when no hint is present.
 */
function buildClickGuardAnchor(params) {
    // Returns the trimmed text for non-blank strings, otherwise null.
    const trimmedOrNull = (value) => (typeof value === 'string' && value.trim() ? value.trim() : null);
    const anchorParts = [];
    const selector = trimmedOrNull(params.selector);
    if (selector !== null) {
        anchorParts.push(`selector:${selector}`);
    }
    if (params.index !== undefined) {
        anchorParts.push(`index:${String(params.index)}`);
    }
    const label = trimmedOrNull(params.elementLabel);
    if (label !== null) {
        anchorParts.push(`label:${label.toLowerCase()}`);
    }
    if (trimmedOrNull(params.href) !== null) {
        anchorParts.push(`href:${normalizeGuardPageUrl(params.href)}`);
    }
    return anchorParts.length === 0 ? null : anchorParts.join('|');
}
|
|
2173
|
+
/**
 * Builds the `click|<page>|<anchor>` guard signature for a click.
 *
 * The page key is the normalized form of the first defined value among
 * `params.preActionUrl`, `params.postActionUrl`, and `currentUrl`.
 *
 * @param params Click parameters.
 * @param currentUrl Fallback URL when the params carry no pre/post action URL.
 * @returns {string|null} The signature, or null when no anchor can be built.
 */
function buildClickGuardSignature(params, currentUrl) {
    const anchor = buildClickGuardAnchor(params);
    if (anchor) {
        const sourceUrl = params.preActionUrl ?? params.postActionUrl ?? currentUrl;
        return ['click', normalizeGuardPageUrl(sourceUrl), anchor].join('|');
    }
    return null;
}
|
|
2180
|
+
/**
 * Returns the delay, in milliseconds, to apply after an action has executed.
 *
 * Failed actions get no delay. Authentication submits get the longest delays.
 * Otherwise the delay depends on the action name and, for most interactive
 * actions, on whether the execution reported a state change.
 *
 * @param action Action name.
 * @param execResult Execution result (`success`, `stateChanged`).
 * @param options Optional flags; `authSubmitAction` marks an auth submit.
 * @returns {number} Delay in milliseconds.
 */
function getPostActionDelayMs(action, execResult, options = {}) {
    if (!execResult.success) {
        return 0;
    }
    const pick = (whenChanged, whenUnchanged) => (execResult.stateChanged ? whenChanged : whenUnchanged);
    if (options.authSubmitAction) {
        return pick(1200, 700);
    }
    const noDelayActions = ['navigate_to', 'wait', 'resize_viewport', 'dismiss_overlays'];
    if (noDelayActions.includes(action)) {
        return 0;
    }
    if (action === 'scroll' || action === 'scroll_to_element') {
        return pick(90, 0);
    }
    if (action === 'type_text') {
        return 60;
    }
    const interactiveActions = ['click', 'safe_expand', 'select_option', 'press_key', 'hover'];
    if (interactiveActions.includes(action)) {
        return pick(120, 40);
    }
    return 80;
}
|
|
2207
|
+
/**
 * Picks the screenshot to show in the live preview: the clean screenshot when
 * one is set, otherwise the regular screenshot.
 *
 * @param pageState Page state (`cleanScreenshot`, `screenshot`).
 * @returns The selected screenshot value.
 */
export function getLivePreviewScreenshot(pageState) {
    const { cleanScreenshot } = pageState;
    if (cleanScreenshot === null || cleanScreenshot === undefined) {
        return pageState.screenshot;
    }
    return cleanScreenshot;
}
|
|
2210
|
+
// Whole-word login/sign-in vocabulary (English and French), case-insensitive.
const LOGIN_URL_RE = /\b(login|log-in|signin|sign-in|auth|session|connexion|connect)\b/i;
// Whole-word credential-field vocabulary: passwords, passcodes, OTP, verification codes.
const LOGIN_FIELD_RE = /\b(password|mot de passe|passcode|otp|verification.?code)\b/i;
// Union of the two vocabularies above plus email wording.
const LOGIN_CLICK_RE = /\b(login|log-in|signin|sign-in|auth|session|connexion|connect|password|mot de passe|passcode|otp|verification.?code|email|e-mail)\b/i;
// Generic submit/continue verbs (English and French).
const AUTH_SUBMIT_RE = /\b(continue|next|submit|verify|unlock|access|enter|continuer|suivant|soumettre|verifier|v[eé]rifier|acc[eé]der|entrer)\b/i;
// Matches `[data-ak-…]` attribute selectors and the bare `data-ak-interactive-index` attribute name.
const INTERNAL_AUTOMATION_SELECTOR_RE = /\[data-ak-[^\]]+\]|data-ak-interactive-index/i;
|
|
2215
|
+
/**
 * Joins the non-blank string hints of an action (selector, element label,
 * href, text, reason) into one space-separated string.
 *
 * @param args Action arguments.
 * @returns {string} The combined haystack; '' when no hint is usable.
 */
function getActionLabelHaystack(args) {
    const { selector, elementLabel, href, text, reason } = args;
    const fragments = [];
    for (const candidate of [selector, elementLabel, href, text, reason]) {
        if (typeof candidate === 'string' && candidate.trim().length > 0) {
            fragments.push(candidate);
        }
    }
    return fragments.join(' ');
}
|
|
2226
|
+
/**
 * Tells whether an observation looks like an authentication surface, by
 * testing its URL, title, and text sample against the login patterns.
 *
 * @param observation Page observation (`url`, `title`, `textSample`); may be falsy.
 * @returns {boolean} true when any login pattern matches; false for a falsy observation.
 */
function observationLooksLikeAuthSurface(observation) {
    if (!observation) {
        return false;
    }
    const { url, title, textSample } = observation;
    const haystack = [url, title, textSample].filter(Boolean).join(' ');
    const loginPatterns = [LOGIN_URL_RE, LOGIN_FIELD_RE, LOGIN_CLICK_RE];
    return loginPatterns.some((pattern) => pattern.test(haystack));
}
|
|
2234
|
+
/**
 * Tells whether an action is probably an authentication submit.
 *
 * - A click counts when its label haystack contains login wording, or when it
 *   contains a generic submit verb and the page observed before the action
 *   looks like an authentication surface.
 * - A key press counts when the key is Enter and the page observed before the
 *   action looks like an authentication surface. The key name is compared
 *   case-insensitively, matching how isBootstrapStabilizationAction handles
 *   Escape, so "enter" and "ENTER" are recognized as well.
 *
 * @param action Action name.
 * @param args Action arguments (`selector`, `elementLabel`, `href`, `text`, `reason`, `key`).
 * @param before Observation captured before the action; may be falsy.
 * @returns {boolean} true when the action is likely an authentication submit.
 */
function isLikelyAuthenticationSubmitAction(action, args, before) {
    if (action === 'click') {
        const haystack = getActionLabelHaystack(args);
        if (LOGIN_CLICK_RE.test(haystack)) {
            return true;
        }
        return AUTH_SUBMIT_RE.test(haystack) && observationLooksLikeAuthSurface(before);
    }
    if (action === 'press_key') {
        const pressedKey = String(args.key || '').toLowerCase();
        return pressedKey === 'enter' && observationLooksLikeAuthSurface(before);
    }
    return false;
}
|
|
2247
|
+
/**
 * Returns the reaction-wait options (timeout and settle time, in milliseconds)
 * to use after an action.
 *
 * Authentication submits get the longest budget plus an `idleGraceMs` value.
 * For `wait`, the timeout is the requested duration plus 400 ms, clamped to
 * the 800–2200 ms range. A missing `args` object or a non-numeric
 * `milliseconds` value falls back to the 1000 ms default, so the timeout is
 * never NaN.
 *
 * @param action Action name.
 * @param args Action arguments; only `milliseconds` is read, and only for `wait`.
 * @param authSubmitAction Truthy when the action is an authentication submit.
 * @returns {{timeoutMs: number, settleMs: number, idleGraceMs?: number}} Reaction options.
 */
function getReactionOptions(action, args, authSubmitAction) {
    if (authSubmitAction) {
        return { timeoutMs: 4200, settleMs: 550, idleGraceMs: 1400 };
    }
    switch (action) {
        case 'navigate_to':
            return { timeoutMs: 3200, settleMs: 450 };
        case 'click':
        case 'safe_expand':
        case 'select_option':
        case 'press_key':
            return { timeoutMs: 2400, settleMs: 350 };
        case 'hover':
        case 'scroll':
        case 'scroll_to_element':
        case 'resize_viewport':
        case 'dismiss_overlays':
            return { timeoutMs: 1600, settleMs: 250 };
        case 'type_text':
            return { timeoutMs: 1400, settleMs: 250 };
        case 'wait': {
            const requestedMs = Number(args?.milliseconds ?? 1000);
            // Math.max/Math.min propagate NaN, so guard before clamping.
            const waitMs = Number.isNaN(requestedMs) ? 1000 : requestedMs;
            return {
                timeoutMs: Math.min(2200, Math.max(800, waitMs + 400)),
                settleMs: 250,
            };
        }
        default:
            return { timeoutMs: 1800, settleMs: 300 };
    }
}
|
|
2276
|
+
/**
 * Tells whether a selector matches INTERNAL_AUTOMATION_SELECTOR_RE.
 *
 * @param selector Candidate selector; non-strings never match.
 * @returns {boolean} true when the selector matches the internal pattern.
 */
function containsInternalAutomationSelector(selector) {
    if (typeof selector !== 'string') {
        return false;
    }
    return INTERNAL_AUTOMATION_SELECTOR_RE.test(selector);
}
|
|
2279
|
+
/**
 * Tells whether a recorded action belongs to the set of action kinds that
 * need an element anchor before they can be replayed.
 *
 * @param action Recorded action entry (`action`).
 * @returns {boolean} true when an anchor is required.
 */
function replayActionRequiresAnchor(action) {
    switch (action.action) {
        case 'click':
        case 'type_text':
        case 'select_option':
        case 'scroll':
        case 'safe_expand':
        case 'hover':
        case 'scroll_to_element':
            return true;
        default:
            return false;
    }
}
|
|
2290
|
+
/**
 * Tells whether replay parameters carry at least one reusable anchor: a
 * non-blank selector that is not an internal automation selector, a numeric
 * index, numeric x/y coordinates, a non-blank href, or a non-blank element label.
 *
 * @param params Recorded action parameters.
 * @returns {boolean} true when a reusable anchor exists.
 */
function hasReplayAnchor(params) {
    const isNonBlank = (value) => typeof value === 'string' && value.trim().length > 0;
    const { selector, index, x, y, href, elementLabel } = params;
    return (isNonBlank(selector) && !containsInternalAutomationSelector(selector))
        || typeof index === 'number'
        || (typeof x === 'number' && typeof y === 'number')
        || isNonBlank(href)
        || isNonBlank(elementLabel);
}
|
|
2308
|
+
/**
 * Decides which recorded actions can be replayed against the current browser state.
 *
 * Steps:
 *  1. Compact the recording via compactReplayActions.
 *  2. Refuse when a dialog is open but the target identity expects a dedicated
 *     or editor route rather than a dialog.
 *  3. Drop anchor-requiring actions that have no reusable anchor; refuse
 *     outright when the very first action is such an action.
 *  4. Refuse when the first kept action's recorded starting URL does not
 *     roughly match the current URL (checked in both directions).
 *
 * @param recordedActions Previously recorded actions.
 * @param params Optional context: `currentUrl`, `targetUrl`, `currentViewport`,
 *   `isAuthenticated`, `currentDialogCount`, `pageIdentity`.
 * @returns {{replayableActions: Array, skipReason: (string|null)}} The actions
 *   to replay, or an empty list with the reason replay is skipped.
 */
export function analyzeReplayCandidate(recordedActions, params = {}) {
    const skip = (skipReason) => ({ replayableActions: [], skipReason });
    const candidates = compactReplayActions(recordedActions, {
        currentUrl: params.currentUrl,
        targetUrl: params.targetUrl,
        currentViewport: params.currentViewport,
        isAuthenticated: params.isAuthenticated,
    });
    if (candidates.length === 0) {
        return skip('no replayable actions remain after filtering bootstrap/auth steps');
    }
    const openDialogCount = params.currentDialogCount ?? 0;
    const identity = params.pageIdentity ?? null;
    const expectsUnderlyingPage = !identity?.dialogTarget
        && (identity?.dedicatedRoute || identity?.kind === 'editor_route');
    if (expectsUnderlyingPage && openDialogCount > 0) {
        return skip('a dialog/modal is still open, but the target expects the underlying page/editor route');
    }
    const kept = [];
    for (const step of candidates) {
        const reusable = !replayActionRequiresAnchor(step) || hasReplayAnchor(step.params);
        if (reusable) {
            kept.push(step);
            continue;
        }
        // An unanchored action in first position cannot be skipped safely.
        if (kept.length === 0) {
            return skip(`the first replay action "${step.action}" has no reusable selector, coordinates, href, or label anchor`);
        }
    }
    if (kept.length === 0) {
        return skip('no replayable actions remain after removing non-reusable interaction steps');
    }
    // Guard against replaying actions recorded on page A while the browser is on page B.
    const recordedStart = kept[0].params.preActionUrl;
    const startUrl = typeof recordedStart === 'string' ? recordedStart : null;
    if (startUrl && params.currentUrl) {
        const aligned = urlsRoughlyMatch(startUrl, params.currentUrl)
            || urlsRoughlyMatch(params.currentUrl, startUrl);
        if (!aligned) {
            return skip(`browser is on ${params.currentUrl} but recorded actions expect to start from ${startUrl}`);
        }
    }
    return { replayableActions: kept, skipReason: null };
}
|
|
2366
|
+
/**
 * Normalizes text for replay matching: trimmed and lower-cased.
 *
 * @param value Candidate text.
 * @returns {string} Normalized text, or '' for non-string input.
 */
function normalizeReplayText(value) {
    if (typeof value !== 'string') {
        return '';
    }
    return value.trim().toLowerCase();
}
|
|
2369
|
+
/**
 * Word-token overlap ratio between two strings.
 *
 * Each string is split on whitespace and reduced to its set of distinct tokens
 * longer than two characters, which skips short noise words. The result is the
 * number of tokens common to both sets divided by the size of the larger set.
 * Example: "filter by preset" and "filtrer par preset" share only "preset",
 * giving a non-zero overlap across translations.
 *
 * @param a First string.
 * @param b Second string.
 * @returns {number} Ratio in [0, 1]; 0 when either input is falsy or has no usable token.
 */
function computeTokenOverlap(a, b) {
    if (!a || !b) {
        return 0;
    }
    const tokenize = (text) => new Set(text.split(/\s+/).filter((word) => word.length > 2));
    const left = tokenize(a);
    const right = tokenize(b);
    if (left.size === 0 || right.size === 0) {
        return 0;
    }
    const shared = [...left].filter((word) => right.has(word)).length;
    return shared / Math.max(left.size, right.size);
}
|
|
2389
|
+
/**
 * Finds the interactive element whose bounding box contains the point (x, y),
 * edges included. When several boxes contain the point, the one with the
 * smallest area is returned.
 *
 * @param interactiveElements Elements with an optional `boundingBox` ({x, y, width, height}).
 * @param x Horizontal coordinate of the point.
 * @param y Vertical coordinate of the point.
 * @returns The smallest containing element, or null when none contains the point.
 */
function findElementForPoint(interactiveElements, x, y) {
    const containsPoint = (box) => !!box
        && x >= box.x
        && x <= box.x + box.width
        && y >= box.y
        && y <= box.y + box.height;
    const areaOf = ({ boundingBox }) => {
        const width = boundingBox?.width ?? Number.MAX_SAFE_INTEGER;
        const height = boundingBox?.height ?? Number.MAX_SAFE_INTEGER;
        return width * height;
    };
    const hits = interactiveElements.filter((element) => containsPoint(element.boundingBox));
    // Smallest area first, so the innermost box is picked.
    hits.sort((first, second) => areaOf(first) - areaOf(second));
    return hits[0] ?? null;
}
|
|
2406
|
+
/**
 * Find the interactive element on the current page that best matches a recorded action.
 *
 * Step 1: an element whose `selector` is strictly equal to the recorded selector is returned
 * immediately, provided the selector is not an internal automation selector.
 *
 * Step 2: every element receives a composite score and the highest one wins (the first
 * element wins ties):
 *   - selector equal after normalization ............ +900
 *   - href equal / roughly matching ................. +700 / +550
 *   - label equal / substring / token overlap >= 0.3  +450 / +260 / round(200 * overlap)
 *   - tag equal, and role equal when the tag matched  +50, then +60
 *   - recorded centre within 50px / 120px ........... +80 / +40
 *   - recorded click point inside the bounding box .. +60
 * A best score below 65 is rejected so that a weak signal alone (for example the +60
 * coordinate hit) never produces a match.
 *
 * Text values are normalized with `normalizeReplayText`, which is defined elsewhere in this file.
 *
 * @param {object} action - Recorded action; reads `action.params.{selector, href, elementLabel,
 *   elementTag, elementRole, elementCx, elementCy, x, y}`.
 * @param {Array<object>} interactiveElements - Elements exposing `selector`, `text`, `ariaLabel`,
 *   `href`, `tag`, `role` and `boundingBox`.
 * @returns {object|null} The best matching element, or null when nothing scores at least 65.
 */
export function matchRecordedActionToElement(action, interactiveElements) {
    const MIN_CONFIDENT_SCORE = 65;
    const { params } = action;

    // 1. Exact stable selector match (highest confidence, language-independent).
    if (typeof params.selector === 'string' && !containsInternalAutomationSelector(params.selector)) {
        const exactSelectorMatch = interactiveElements.find((element) => element.selector === params.selector);
        if (exactSelectorMatch) {
            return exactSelectorMatch;
        }
    }

    // 2. Fuzzy scoring: coordinates take part in the score instead of short-circuiting,
    // so a strong label/href match is not overridden by a stale coordinate hit.
    const desiredHref = normalizeReplayText(params.href);
    const desiredLabel = normalizeReplayText(params.elementLabel);
    const desiredSelector = normalizeReplayText(params.selector);
    const desiredTag = normalizeReplayText(params.elementTag);
    const desiredRole = normalizeReplayText(params.elementRole);
    // Loop-invariant, so evaluated once rather than per element.
    const selectorIsStable = Boolean(desiredSelector) && !containsInternalAutomationSelector(desiredSelector);
    const recordedCx = params.elementCx;
    const recordedCy = params.elementCy;
    // Only real numbers count: a null centre would otherwise coerce to 0 and reward
    // whatever happens to sit near the page origin.
    const hasRecordedCenter = Number.isFinite(recordedCx) && Number.isFinite(recordedCy);
    const hasRecordedPoint = typeof params.x === 'number' && typeof params.y === 'number';

    let best = null;
    for (const element of interactiveElements) {
        let score = 0;
        const elementText = normalizeReplayText(element.text);
        const elementAria = normalizeReplayText(element.ariaLabel);
        const elementHref = normalizeReplayText(element.href);
        const elementSelector = normalizeReplayText(element.selector);
        // Elements without a tag or role must not crash the scoring loop.
        const elementTag = typeof element.tag === 'string' ? element.tag.toLowerCase() : '';
        const elementRole = typeof element.role === 'string' ? element.role.toLowerCase() : '';
        const box = element.boundingBox;

        // --- Selector match (language-independent, very stable) ---
        if (selectorIsStable && elementSelector === desiredSelector) {
            score += 900;
        }

        // --- href match (language-independent, very stable) ---
        if (desiredHref) {
            if (elementHref === desiredHref) {
                score += 700;
            }
            else if (elementHref && urlsRoughlyMatch(elementHref, desiredHref)) {
                score += 550;
            }
        }

        // --- Label match (language-dependent, tiered confidence) ---
        if (desiredLabel) {
            const overlapsLabel = (candidate) => Boolean(candidate)
                && (candidate.includes(desiredLabel) || desiredLabel.includes(candidate));
            if (elementText === desiredLabel || elementAria === desiredLabel) {
                score += 450;
            }
            else if (overlapsLabel(elementText) || overlapsLabel(elementAria)) {
                score += 260;
            }
            else {
                // Cross-language token overlap: "filter by preset" vs "filtrer par preset" share "preset".
                const bestOverlap = Math.max(computeTokenOverlap(desiredLabel, elementText), computeTokenOverlap(desiredLabel, elementAria));
                if (bestOverlap >= 0.3) {
                    score += Math.round(200 * bestOverlap);
                }
            }
        }

        // --- Structural match: tag + role (language-independent) ---
        // Role only counts once the tag already matched; together they break ties between fuzzy labels.
        if (desiredTag && desiredTag === elementTag) {
            score += 50;
            if (desiredRole && desiredRole === elementRole) {
                score += 60;
            }
        }

        // --- Bounding box proximity (layout-dependent but language-independent) ---
        if (hasRecordedCenter && box) {
            const elCx = box.x + box.width / 2;
            const elCy = box.y + box.height / 2;
            const dist = Math.hypot(elCx - recordedCx, elCy - recordedCy);
            if (dist < 50) {
                score += 80;
            }
            else if (dist < 120) {
                score += 40;
            }
        }

        // --- Legacy coordinate match (weakest signal; layout shift can misfire) ---
        if (hasRecordedPoint && box
            && params.x >= box.x
            && params.x <= box.x + box.width
            && params.y >= box.y
            && params.y <= box.y + box.height) {
            score += 60;
        }

        if (score <= 0) {
            continue;
        }
        if (!best || score > best.score) {
            best = { element, score };
        }
    }

    // Reject matches that rest on very weak signals only (e.g. coordinates alone = 60).
    if (!best || best.score < MIN_CONFIDENT_SCORE) {
        return null;
    }
    return best.element;
}
|
|
2504
|
+
/**
 * Tell whether recorded params carry enough information to replay `action` as-is.
 *
 * A "stable selector" is a non-blank string selector that is not an internal automation
 * selector. Unknown action names are never executable.
 *
 * @param {string} action - Action name (e.g. 'click', 'type_text', 'navigate_to').
 * @param {object} params - Candidate arguments for the action.
 * @returns {boolean} True when the arguments are sufficient to execute the action.
 */
function hasExecutableReplayArgs(action, params) {
    const isNonBlankString = (value) => typeof value === 'string' && value.trim().length > 0;
    const bySelector = isNonBlankString(params.selector) && !containsInternalAutomationSelector(params.selector);
    const byIndex = typeof params.index === 'number';
    const byPoint = typeof params.x === 'number' && typeof params.y === 'number';
    const byElement = bySelector || byIndex;

    // Pointer-style actions may also fall back to raw coordinates.
    if (action === 'click' || action === 'safe_expand' || action === 'hover') {
        return byElement || byPoint;
    }
    if (action === 'type_text') {
        return typeof params.text === 'string' && byElement;
    }
    if (action === 'select_option') {
        const namesAnOption = typeof params.optionLabel === 'string'
            || typeof params.optionValue === 'string'
            || typeof params.optionIndex === 'number';
        return namesAnOption && byElement;
    }
    if (action === 'scroll') {
        return byElement || typeof params.direction === 'string';
    }
    if (action === 'scroll_to_element') {
        return byIndex;
    }
    if (action === 'navigate_to') {
        return isNonBlankString(params.url);
    }
    if (action === 'resize_viewport') {
        return typeof params.width === 'number' && typeof params.height === 'number';
    }
    if (action === 'press_key') {
        return isNonBlankString(params.key);
    }
    // These two need no arguments at all.
    return action === 'wait' || action === 'dismiss_overlays';
}
|
|
2539
|
+
/**
 * Turn a recorded action into arguments that can be executed on the current page.
 *
 * The recorded params are used unchanged when they are already executable. Otherwise the
 * action is matched against the live interactive elements and, for element-targeted
 * actions, re-anchored on the matched element's `index` (selector and x/y are dropped).
 * `action.params` itself is never mutated; a shallow copy is edited instead.
 *
 * @param {object} action - Recorded action with `action` (name) and `params`.
 * @param {Array<object>} interactiveElements - Interactive elements of the current page.
 * @returns {{args: object|null, reason: string|null}} Executable args with a null reason,
 *   or null args with a human-readable reason naming the anchors that were tried.
 */
export function resolveReplayActionArgs(action, interactiveElements) {
    const ELEMENT_TARGETED_ACTIONS = [
        'click',
        'safe_expand',
        'hover',
        'type_text',
        'select_option',
        'scroll',
        'scroll_to_element',
    ];
    const resolved = (args) => ({ args, reason: null });
    const kind = action.action;
    const replayArgs = { ...action.params };

    if (hasExecutableReplayArgs(kind, replayArgs)) {
        return resolved(replayArgs);
    }

    const matchedElement = matchRecordedActionToElement(action, interactiveElements);
    if (matchedElement && ELEMENT_TARGETED_ACTIONS.includes(kind)) {
        // Re-anchor on the live element; the recorded locators did not qualify.
        replayArgs.index = matchedElement.index;
        delete replayArgs.selector;
        delete replayArgs.x;
        delete replayArgs.y;
    }
    // Never hand over both a selector and an index at the same time.
    if (replayArgs.selector && replayArgs.index !== undefined) {
        delete replayArgs.index;
    }
    if (hasExecutableReplayArgs(kind, replayArgs)) {
        return resolved(replayArgs);
    }

    const { selector, href, elementLabel } = action.params;
    const anchors = [selector, href, elementLabel]
        .filter((value) => typeof value === 'string' && value.trim().length > 0);
    let anchorSummary = 'no replay anchor';
    if (anchors.length > 0) {
        anchorSummary = anchors.join(' / ').slice(0, 160);
    }
    return {
        args: null,
        reason: `replay action "${kind}" could not be resolved on the current page (${anchorSummary})`,
    };
}
|
|
2576
|
+
/**
 * Tell whether a recorded action is, by itself, recognisably part of a login flow.
 *
 * Recognised shapes: navigating to a URL matching LOGIN_URL_RE, typing into a field whose
 * selector matches LOGIN_FIELD_RE, typing text containing a `{{credential.` placeholder,
 * or clicking something whose selector/label/href matches LOGIN_CLICK_RE.
 *
 * @param {object} action - Recorded action with `action` (name) and `params`.
 * @returns {boolean} True when the action is an explicit login step.
 */
function isExplicitLoginAction(action) {
    const { action: kind, params } = action;
    const isString = (value) => typeof value === 'string';

    switch (kind) {
        case 'navigate_to':
            return isString(params.url) && LOGIN_URL_RE.test(params.url);
        case 'type_text': {
            if (isString(params.selector) && LOGIN_FIELD_RE.test(params.selector)) {
                return true;
            }
            return isString(params.text) && /\{\{credential\./i.test(params.text);
        }
        case 'click': {
            // Every textual anchor of the click is searched at once.
            const haystack = [params.selector, params.elementLabel, params.href]
                .filter(isString)
                .join(' ');
            return LOGIN_CLICK_RE.test(haystack);
        }
        default:
            return false;
    }
}
|
|
2600
|
+
/**
 * Check whether any of the last three preceding actions was an explicit login step.
 *
 * @param {Array<object>} previousActions - Actions that came before the one being classified.
 * @returns {boolean} True when at least one of the trailing three is an explicit login action.
 */
function hasRecentExplicitLoginAction(previousActions) {
    const trailingActions = previousActions.slice(-3);
    return trailingActions.some((previous) => isExplicitLoginAction(previous));
}
|
|
2603
|
+
/**
 * Tell whether an action belongs to a login flow, taking its predecessors into account.
 *
 * Explicit login steps always qualify. A click matching AUTH_SUBMIT_RE, or an Enter key
 * press, only qualifies when one of the recent preceding actions was an explicit login step.
 *
 * @param {object} action - Recorded action with `action` (name) and `params`.
 * @param {Array<object>} [previousActions=[]] - Actions recorded before this one.
 * @returns {boolean} True when the action should be treated as login-related.
 */
function isLoginAction(action, previousActions = []) {
    if (isExplicitLoginAction(action)) {
        return true;
    }
    const { action: kind, params } = action;
    if (kind === 'click') {
        const haystack = [params.selector, params.elementLabel, params.href]
            .filter((value) => typeof value === 'string')
            .join(' ');
        // A generic submit button only counts right after explicit login steps.
        return AUTH_SUBMIT_RE.test(haystack) && hasRecentExplicitLoginAction(previousActions);
    }
    if (kind === 'press_key') {
        return params.key === 'Enter' && hasRecentExplicitLoginAction(previousActions);
    }
    return false;
}
|
|
2626
|
+
/**
 * Reduce a recording to the actions worth replaying in the current session.
 *
 * 1. Keeps only actions listed in REPLAYABLE_ACTIONS.
 * 2. When `params.isAuthenticated` is set, removes login-related actions.
 * 3. Drops the leading run of steps that would be no-ops now: waits, overlay dismissals,
 *    a resize to the viewport that is already active, and navigation to the current or
 *    target URL. Stripping stops at the first step that is none of those.
 *
 * @param {Array<object>} recordedActions - Recorded actions, oldest first.
 * @param {object} [params={}] - Optional `isAuthenticated`, `currentViewport` ({ width, height }),
 *   `currentUrl` and `targetUrl`.
 * @returns {Array<object>} A new array with the remaining actions.
 */
export function compactReplayActions(recordedActions, params = {}) {
    let replayable = recordedActions.filter((recorded) => REPLAYABLE_ACTIONS.includes(recorded.action));

    // When the session is already authenticated, strip login-related actions.
    if (params.isAuthenticated) {
        const beforeAuthFilter = replayable;
        replayable = beforeAuthFilter.filter((candidate, position) => {
            const predecessors = beforeAuthFilter.slice(0, position);
            return !isLoginAction(candidate, predecessors);
        });
    }

    const isRedundantLeadingStep = (step) => {
        switch (step.action) {
            case 'wait':
            case 'dismiss_overlays':
                return true;
            case 'resize_viewport':
                return Boolean(params.currentViewport)
                    && Number(step.params.width) === params.currentViewport.width
                    && Number(step.params.height) === params.currentViewport.height;
            case 'navigate_to':
                return typeof step.params.url === 'string'
                    && (urlsRoughlyMatch(step.params.url, params.currentUrl)
                        || urlsRoughlyMatch(step.params.url, params.targetUrl));
            default:
                return false;
        }
    };

    const firstUsefulIndex = replayable.findIndex((step) => !isRedundantLeadingStep(step));
    return firstUsefulIndex === -1 ? [] : replayable.slice(firstUsefulIndex);
}
|
|
2662
|
+
/**
 * Count the trailing streak of no-effect actions at the end of the history.
 *
 * Meta-actions (members of META_ACTIONS) are skipped without breaking or extending the
 * streak; per the original comment, note/begin_subgoal always have stateChanged=false but
 * are not stuck indicators. The streak ends at the first non-meta action that had an effect.
 *
 * @param {Array<object>} actionHistory - Executed actions, oldest first.
 * @returns {number} Length of the trailing no-effect streak.
 */
export function countRecentNoEffectActions(actionHistory) {
    let streak = 0;
    let cursor = actionHistory.length;
    while (cursor > 0) {
        cursor -= 1;
        const entry = actionHistory[cursor];
        if (META_ACTIONS.has(entry.action)) {
            continue;
        }
        if (!isNoEffectAction(entry)) {
            break;
        }
        streak += 1;
    }
    return streak;
}
|
|
2677
|
+
/**
 * Decide whether the agent looks stuck and recovery should be triggered.
 *
 * Meta-actions are ignored. Recovery triggers when any of these holds:
 *   - the last four browser actions all had no effect and form an A-B-A-B cycle;
 *   - the last two browser actions both failed with the same signature and the same
 *     error text (first 120 characters);
 *   - the trailing no-effect streak is at least two actions long.
 * Nothing triggers with fewer than two browser actions, or when
 * `hasMeaningfulBrowserAction` rejects the history.
 *
 * @param {Array<object>} actionHistory - Executed actions, oldest first.
 * @returns {boolean} True when recovery should start.
 */
export function shouldTriggerRecovery(actionHistory) {
    // Only browser actions matter here (note/begin_subgoal style meta-actions are excluded).
    const browserActions = actionHistory.filter((entry) => !META_ACTIONS.has(entry.action));
    if (browserActions.length < 2 || !hasMeaningfulBrowserAction(browserActions)) {
        return false;
    }

    const [previous, last] = browserActions.slice(-2);
    const errorHead = (entry) => String(entry.error || '').slice(0, 120);
    const sameFailureSignature = !last.success
        && !previous.success
        && buildRecoveryActionSignature(last) === buildRecoveryActionSignature(previous)
        && errorHead(last) === errorHead(previous);

    // Detect A→B→A→B oscillation: the last 4 browser actions form a repeating 2-cycle.
    if (browserActions.length >= 4) {
        const lastFour = browserActions.slice(-4);
        const [sigA, sigB, sigC, sigD] = lastFour.map((entry) => buildRecoveryActionSignature(entry));
        const allIneffective = lastFour.every((entry) => isNoEffectAction(entry));
        if (allIneffective && sigA === sigC && sigB === sigD && sigA !== sigB) {
            return true;
        }
    }

    return sameFailureSignature || countRecentNoEffectActions(actionHistory) >= 2;
}
|
|
2706
|
+
/**
 * Keep only the actions that are neither meta-actions nor bootstrap stabilization actions.
 *
 * @param {Array<object>} actionHistory - Executed actions, oldest first.
 * @returns {Array<object>} A new array with the remaining actions, order preserved.
 */
function getMeaningfulBrowserActions(actionHistory) {
    return actionHistory.filter((entry) => {
        const isMeta = META_ACTIONS.has(entry.action);
        return !isMeta && !isBootstrapStabilizationAction(entry);
    });
}
|
|
2709
|
+
/**
 * Count how many different recovery signatures appear among the meaningful browser actions.
 *
 * @param {Array<object>} actionHistory - Executed actions, oldest first.
 * @returns {number} Number of distinct signatures.
 */
function countDistinctActionSignatures(actionHistory) {
    const distinctSignatures = new Set();
    for (const entry of getMeaningfulBrowserActions(actionHistory)) {
        distinctSignatures.add(buildRecoveryActionSignature(entry));
    }
    return distinctSignatures.size;
}
|
|
2712
|
+
// Phrases naming hard failures: HTTP errors, crashes, network/TLS problems, blank or
// endlessly loading pages. Case-insensitive. inferPrematureGiveUpCorrection tests the
// give-up reason and the last verification failure against it and lets the give-up stand on a match.
const HARD_GIVE_UP_RE = /\b(5xx|500\b|404\b|page not found|not found|http error|server error|js crash|javascript crash|browser crashed|connection refused|dns|net::|ssl|certificate|blank page|white page|no content|infinite spinner)\b/i;
|
|
2713
|
+
// Phrases naming problems treated as recoverable (verification, dialogs/overlays, wrong
// page, duplicate capture, ...). Case-insensitive. inferPrematureGiveUpCorrection uses a
// match to push back on a give-up unless another rule already allowed it.
const RECOVERABLE_GIVE_UP_RE = /\b(verification|ready_to_capture|dialog|modal|overlay|gallery|editor|route|navigation|assistant|conversation|wrong page|duplicate capture|capture target)\b/i;
|
|
2714
|
+
/**
 * Decide whether a `give_up` request is premature and, if so, return a corrective message.
 *
 * The give-up is allowed (null is returned) when:
 *   - the reason or the last verification failure matches HARD_GIVE_UP_RE;
 *   - the last six failed `ready_to_capture` attempts include at least three failures
 *     with two or three distinct error texts (validators contradicting each other);
 *   - the trailing no-effect streak is at least eight actions long;
 *   - none of the push-back rules below applies.
 * The give-up is pushed back (a message is returned) when:
 *   - the last verification failure matches RECOVERABLE_GIVE_UP_RE;
 *   - too few distinct strategies were tried and the iteration budget is not nearly spent;
 *   - the reason itself matches RECOVERABLE_GIVE_UP_RE and the budget is not nearly spent.
 *
 * @param {object} params
 * @param {string} [params.reason] - Reason given for giving up; a missing or non-string
 *   value is treated as an empty reason instead of throwing.
 * @param {string} [params.lastVerificationFailure] - Most recent verification failure text.
 * @param {Array<object>} params.actionHistory - Executed actions, oldest first.
 * @param {number} params.iteration - Current iteration number.
 * @param {number} params.maxIterations - Iteration budget.
 * @returns {string|null} Corrective message, or null when giving up is acceptable.
 */
export function inferPrematureGiveUpCorrection(params) {
    // The reason may be absent from the give_up arguments; never crash on it.
    const reason = String(params.reason ?? '').trim();
    const lastVerificationFailure = params.lastVerificationFailure?.trim();
    if (HARD_GIVE_UP_RE.test(reason) || (lastVerificationFailure && HARD_GIVE_UP_RE.test(lastVerificationFailure))) {
        return null;
    }

    // Verification contradiction: 3+ recent ready_to_capture failures with 2-3 distinct
    // reasons means the validators cycle between irreconcilable states, so allow give_up.
    const recentCaptureFailures = params.actionHistory
        .filter((entry) => entry.action === 'ready_to_capture' && !entry.success && entry.error)
        .slice(-6);
    if (recentCaptureFailures.length >= 3) {
        const distinctReasons = new Set(recentCaptureFailures.map((entry) => (entry.error || '')
            .replace(/^Verification failed:\s*/i, '')
            .trim()
            .slice(0, 120)));
        if (distinctReasons.size >= 2 && distinctReasons.size <= 3) {
            return null;
        }
    }

    // Many consecutive no-effect actions: the objective is likely unreachable, so allow
    // give_up rather than burning the remaining budget.
    if (countRecentNoEffectActions(params.actionHistory) >= 8) {
        return null;
    }

    if (lastVerificationFailure && RECOVERABLE_GIVE_UP_RE.test(lastVerificationFailure)) {
        return `Do not give up yet. The last verification failure is still recoverable: ${lastVerificationFailure}. Try a materially different navigation or repair step first.`;
    }

    const meaningfulActions = getMeaningfulBrowserActions(params.actionHistory);
    const distinctActionCount = countDistinctActionSignatures(params.actionHistory);
    const hasTriedEnoughStrategies = meaningfulActions.length >= 4 && distinctActionCount >= 3;
    const nearingBudget = params.iteration >= Math.max(6, params.maxIterations - 2);
    if (!hasTriedEnoughStrategies && !nearingBudget) {
        return 'Do not give up yet. You have not tried enough materially different actions. Change strategy before giving up.';
    }
    if (RECOVERABLE_GIVE_UP_RE.test(reason) && !nearingBudget) {
        return 'Do not give up yet. The current issue still looks recoverable. Try a different navigation, search, or repair approach first.';
    }
    return null;
}
|
|
2751
|
+
// Action names subject to the repeated-action guards in inferRepeatedActionGuard;
// any other action passes those guards unchecked.
const REPEAT_GUARD_ACTIONS = new Set(['click', 'safe_expand', 'hover', 'select_option', 'press_key']);
|
|
2758
|
+
/**
 * Inspect a candidate action against recent history and return a warning or block message
 * when it would repeat something that is not working.
 *
 * Guard 0 (navigate_to only): warns when the target is the current URL, or when one of the
 *   recent successful actions pointed at the target URL and no recent action was a no-op.
 * Guard 1: blocks when the last two browser actions both had no effect on this same target.
 * Guard 2: blocks when the last four browser actions all had no effect.
 * Guard 3 (click only): blocks when the same click target appears 3+ times among the last
 *   six clicks, whether or not those clicks had an effect.
 * Guards 1-3 only apply to actions in REPEAT_GUARD_ACTIONS and need at least two browser
 * actions in the history. Meta-actions are ignored throughout.
 *
 * @param {object} params
 * @param {string} params.action - Candidate action name.
 * @param {object} params.args - Candidate action arguments.
 * @param {string} [params.currentUrl] - URL of the page the agent is on.
 * @param {Array<object>} params.actionHistory - Executed actions, oldest first.
 * @returns {string|null} A "WARNING: ..." or "BLOCKED: ..." message, or null when nothing objects.
 */
export function inferRepeatedActionGuard(params) {
    const { action, args, currentUrl } = params;
    const withoutMetaActions = () => params.actionHistory.filter((entry) => !META_ACTIONS.has(entry.action));

    // Guard 0: navigate-back detection.
    if (action === 'navigate_to' && typeof args.url === 'string' && currentUrl) {
        const targetUrl = args.url.replace(/\/$/, '');
        if (urlsRoughlyMatch(currentUrl, targetUrl)) {
            return 'WARNING: You are already on this URL. No navigation needed. Focus on interacting with the current page instead.';
        }
        const browserHistory = withoutMetaActions();
        // EXCEPTION: with recent no-effect actions the agent is stuck and legitimately trying
        // another approach (e.g. navigating to the correct page); do not block that.
        const hasRecentNoEffect = browserHistory.slice(-4).some((entry) => isNoEffectAction(entry));
        if (!hasRecentNoEffect) {
            const recentUrls = browserHistory
                .filter((entry) => entry.success !== false)
                .slice(-5)
                .filter((entry) => typeof entry.params.href === 'string' || typeof entry.params.url === 'string')
                .slice(-3)
                .map((entry) => String(entry.params.href || entry.params.url || '').replace(/\/$/, ''));
            // The current URL is known to differ from the target here (see the early return above).
            if (recentUrls.some((url) => urlsRoughlyMatch(url, targetUrl))) {
                return `WARNING: You just navigated away from ${targetUrl} — going back suggests you are confused about the goal. Re-read the <task>/<goal> and <variant_manifest> carefully. If you need a modal, open it from the current page instead of navigating back.`;
            }
        }
    }

    if (!REPEAT_GUARD_ACTIONS.has(action)) {
        return null;
    }
    const browserHistory = withoutMetaActions();
    if (browserHistory.length < 2) {
        return null;
    }

    // Guard 1: exact same action signature repeated with no effect.
    const candidateSignature = buildRecoveryActionSignature({
        iteration: 0,
        action,
        params: args,
        success: false,
    });
    const candidateClickSignature = action === 'click'
        ? buildClickGuardSignature(args, currentUrl)
        : null;
    const targetsCandidate = (entry) => (candidateClickSignature && entry.action === 'click'
        ? buildClickGuardSignature(entry.params, currentUrl) === candidateClickSignature
        : buildRecoveryActionSignature(entry) === candidateSignature);
    if (browserHistory.slice(-2).every((entry) => isNoEffectAction(entry) && targetsCandidate(entry))) {
        return 'BLOCKED: The previous attempts on this same target had no effect. Try a different control, search_text, scrolling, or a repair step instead of repeating it.';
    }

    // Guard 2: four consecutive failed/no-effect browser actions on any targets; the agent
    // is thrashing without making progress.
    const lastFour = browserHistory.slice(-4);
    if (lastFour.length >= 4 && lastFour.every((entry) => isNoEffectAction(entry))) {
        return 'BLOCKED: The last 4 browser actions all failed or had no visible effect. You are stuck. Step back and reconsider: verify the current URL, check if you are on the right page, try navigate_to to reach the correct page, or call give_up if this capture is impossible.';
    }

    // Guard 3: same element clicked 3+ times recently, regardless of effect. Catches
    // multi-select dropdown toggling where each click "succeeds" but makes no progress.
    // candidateClickSignature is only non-null for click candidates.
    if (candidateClickSignature) {
        const hasLabel = typeof args.elementLabel === 'string' && args.elementLabel.trim();
        const candidateTarget = String(hasLabel
            ? args.elementLabel
            : args.selector ?? args.index ?? 'this target');
        const sameTargetCount = browserHistory
            .filter((entry) => entry.action === 'click')
            .slice(-6)
            .filter((entry) => buildClickGuardSignature(entry.params, currentUrl) === candidateClickSignature)
            .length;
        if (sameTargetCount >= 3) {
            return `BLOCKED: You have clicked "${candidateTarget}" ${sameTargetCount} times recently without progress. This is likely a toggle/multi-select control. Press Escape to close any open dropdown, then look for a DIFFERENT button (e.g., an edit/settings icon) to achieve your goal.`;
        }
    }
    return null;
}
|
|
2844
|
+
/**
 * Canonical form of a subgoal name for comparisons: surrounding whitespace removed, lower-cased.
 *
 * @param {string} value - Raw subgoal name.
 * @returns {string} The normalized name.
 */
function normalizeSubgoalName(value) {
    const withoutPadding = value.trim();
    return withoutPadding.toLowerCase();
}
|
|
2847
|
+
/**
 * Look up a cached workflow that can be reused for a subgoal.
 *
 * The cache is searched from newest to oldest. Entries without selectors or without a name
 * are ignored. An entry whose normalized name equals the normalized subgoal is returned as
 * soon as it is found; otherwise the newest entry whose name contains, or is contained in,
 * the subgoal name is returned.
 *
 * @param {Array<object>} workflowCache - Entries with `subgoalName` and `selectors`, oldest first.
 * @param {string} subgoalName - Subgoal to look up.
 * @returns {object|null} The matching cache entry, or null.
 */
export function findReusableWorkflow(workflowCache, subgoalName) {
    const wanted = normalizeSubgoalName(subgoalName);
    if (!wanted) {
        return null;
    }
    let newestLooseMatch = null;
    let position = workflowCache.length;
    while (position > 0) {
        position -= 1;
        const candidate = workflowCache[position];
        if (candidate.selectors.length === 0) {
            continue;
        }
        const candidateName = normalizeSubgoalName(candidate.subgoalName);
        if (!candidateName) {
            continue;
        }
        if (candidateName === wanted) {
            return candidate;
        }
        const overlaps = candidateName.includes(wanted) || wanted.includes(candidateName);
        // Remember only the first (newest) loose match, but keep scanning for an exact one.
        if (overlaps && !newestLooseMatch) {
            newestLooseMatch = candidate;
        }
    }
    return newestLooseMatch;
}
|
|
2868
|
+
/**
 * Swap the image parts of older user messages for a short text placeholder, in place.
 *
 * Messages are walked from newest to oldest and the first message (index 0) is never
 * touched. The newest `keepRecentImages` user messages that contain at least one
 * `image_url` part keep their images; in every older one each `image_url` part becomes a
 * text part. Non-image parts are left as they are.
 *
 * @param {Array<object>} messages - Conversation messages with `role` and `content`; mutated in place.
 * @param {number} [keepRecentImages=3] - How many of the newest image-bearing user messages keep their images.
 * @returns {void}
 */
function compressOldScreenshots(messages, keepRecentImages = 3) {
    const isImagePart = (part) => part.type === 'image_url';
    const toPlaceholder = (part) => (isImagePart(part)
        ? { type: 'text', text: '[screenshot removed — older context]' }
        : part);
    let imageMessagesSeen = 0;
    for (let position = messages.length - 1; position > 0; position -= 1) {
        const message = messages[position];
        const parts = message.content;
        const carriesImage = message.role === 'user' && Array.isArray(parts) && parts.some(isImagePart);
        if (!carriesImage) {
            continue;
        }
        imageMessagesSeen += 1;
        if (imageMessagesSeen > keepRecentImages) {
            message.content = parts.map(toPlaceholder);
        }
    }
}
|
|
2892
|
+
/**
 * Replace `<page_dom>...</page_dom>` blocks in older user messages with a short placeholder, in place.
 *
 * Messages are walked from newest to oldest, stopping before the first
 * `preservedPrefixMessages` entries. The newest `keepRecentWithDom` user messages that
 * contain a DOM block keep it; every older one has each block replaced. Only string content
 * and parts with a string `text` are edited; images and other parts (including null
 * entries) are left untouched.
 *
 * @param {Array<object>} messages - Conversation messages with `role` and `content`; mutated in place.
 * @param {number} [keepRecentWithDom=6] - How many of the newest DOM-bearing user messages keep their DOM.
 * @param {number} [preservedPrefixMessages=1] - Number of leading messages that are never touched.
 * @returns {void}
 */
function compressOldDomBlocks(messages, keepRecentWithDom = 6, preservedPrefixMessages = 1) {
    // Detection uses a non-global regex so `.test` carries no lastIndex state between calls;
    // the global twin is only handed to `replace`, which manages lastIndex itself.
    const HAS_PAGE_DOM_RE = /<page_dom>[\s\S]*?<\/page_dom>/;
    const PAGE_DOM_RE = /<page_dom>[\s\S]*?<\/page_dom>/g;
    const PLACEHOLDER = '<page_dom>[older context — see current iteration for latest DOM]</page_dom>';
    // `typeof null` is 'object', so null parts are excluded explicitly.
    const isTextPart = (part) => part !== null && typeof part === 'object' && typeof part.text === 'string';

    let domMessageCount = 0;
    for (let i = messages.length - 1; i >= preservedPrefixMessages; i--) {
        const msg = messages[i];
        if (msg.role !== 'user') {
            continue;
        }
        const contentArr = Array.isArray(msg.content) ? msg.content : null;
        const hasDom = contentArr
            ? contentArr.some((part) => isTextPart(part) && HAS_PAGE_DOM_RE.test(part.text))
            : typeof msg.content === 'string' && HAS_PAGE_DOM_RE.test(msg.content);
        if (!hasDom) {
            continue;
        }
        domMessageCount++;
        if (domMessageCount <= keepRecentWithDom) {
            continue;
        }
        // Replace every DOM block with the compact placeholder.
        if (contentArr) {
            for (const part of contentArr) {
                if (isTextPart(part)) {
                    part.text = part.text.replace(PAGE_DOM_RE, PLACEHOLDER);
                }
            }
        }
        else {
            msg.content = msg.content.replace(PAGE_DOM_RE, PLACEHOLDER);
        }
    }
}
|
|
2931
|
+
/**
 * Shrink the conversation thread in place to avoid context window overflow.
 *
 * First compresses older screenshots (only the newest image-bearing user message keeps its
 * images) and older `<page_dom>` blocks. Then, if the thread is longer than
 * `preservedPrefixMessages + maxMessages`, removes the messages between the preserved
 * prefix and the last `maxMessages` entries.
 *
 * @param {Array<object>} messages - Conversation messages; mutated in place.
 * @param {number} [maxMessages=48] - Number of trailing messages to keep. Zero keeps only the prefix.
 * @param {number} [preservedPrefixMessages=1] - Number of leading messages that are always kept.
 * @returns {void}
 */
export function trimConversationHistory(messages, maxMessages = 48, preservedPrefixMessages = 1) {
    // Compress old screenshots before trimming to maximise token savings.
    compressOldScreenshots(messages, 1);
    // Strip <page_dom> from older user messages; the current DOM is always available.
    compressOldDomBlocks(messages, 6, preservedPrefixMessages);

    const surplus = messages.length - maxMessages - preservedPrefixMessages;
    if (surplus <= 0) {
        return;
    }
    // Cut the middle out directly. Slicing the tail with a negated count would turn
    // maxMessages === 0 into slice(-0), which copies the whole thread instead of nothing.
    messages.splice(preservedPrefixMessages, surplus);
}
|
|
2947
|
+
/**
 * Build the tool-result text the LLM sees after an action.
 *
 * Failures read `FAILED: <error>`. Successes read the outcome text (or `ok`), prefixed with
 * `NO_EFFECT: ` when the action reported `stateChanged === false` (never for wait, note or
 * begin_subgoal), so the model can notice an ineffective action instead of retrying it.
 * For click/type_text/select_option/scroll addressed by index, the result is also prefixed
 * with `[index]` and, when the element is found, its quoted label.
 *
 * @param {string} name - Action name.
 * @param {object} args - Arguments the action was called with.
 * @param {object} execResult - Execution result with `success`, and optionally `error`, `outcome`, `stateChanged`.
 * @param {Array<object>} elements - Interactive elements used to look up the label for an index.
 * @returns {string} The formatted result line.
 */
function formatToolResult(name, args, execResult, elements) {
    if (!execResult.success) {
        return `FAILED: ${execResult.error || 'Unknown error'}`;
    }

    const exemptFromNoEffect = name === 'wait' || name === 'note' || name === 'begin_subgoal';
    const noEffectPrefix = execResult.stateChanged === false && !exemptFromNoEffect ? 'NO_EFFECT: ' : '';
    const summary = execResult.outcome || 'ok';

    const isIndexable = name === 'click' || name === 'type_text' || name === 'select_option' || name === 'scroll';
    if (!isIndexable || args.index === undefined) {
        return `${noEffectPrefix}${summary}`;
    }

    // Enrich the result with the element label when the action addressed an element by index.
    const target = elements.find((candidate) => candidate.index === args.index);
    let quotedLabel = '';
    if (target) {
        const label = target.text || target.ariaLabel || target.inputType || target.tag;
        if (label) {
            quotedLabel = ` "${label}"`;
        }
    }
    return `${noEffectPrefix}[${args.index}]${quotedLabel}: ${summary}`;
}
|
|
2972
|
+
/**
 * Ask the LLM for a short, numbered navigation plan toward `prompt`.
 *
 * Two request shapes, selected by `options.visionModel`:
 *  - Dual-model: `options.visionModel` first describes the screenshot in text, then
 *    `options.model` plans from that description with a text-only request.
 *  - Mono-model: the screenshot and goal are sent together through
 *    `callVisionCapableModel`, which receives `options.fallbackModel`.
 *
 * When `options.completedMilestones` is greater than 0 the "remaining steps"
 * system prompt is used instead of the initial-plan prompt.
 *
 * @param client Chat client; `client.chat.completions.create` is called.
 * @param prompt Goal text inserted into the planner prompt.
 * @param url Current page URL inserted into the planner prompt.
 * @param firstScreenshot Screenshot handed to `makeImageUrl` as `image/jpeg`.
 * @param options Planner context (models, summaries, abort signal, step counter, ...).
 * @returns `{ plan, usage, model }` on success. Any error raised while planning is
 *   caught: non-abort errors are logged at debug level and
 *   `{ plan: null, usage: null }` is returned.
 */
async function callPlanner(client, prompt, url, firstScreenshot, options) {
    try {
        const isReplanning = (options.completedMilestones ?? 0) > 0;
        let systemContent;
        if (isReplanning) {
            systemContent = `You are a web navigation planner. Given a screenshot of the current page and a goal, output the REMAINING steps (3–5 steps) needed to complete the workflow. ${options.completedMilestones} milestone(s) have already been captured. Focus on what still needs to be done. Never propose blocked actions such as "Continue with AI", AI/generate/create buttons, logout/sign out, billing/purchase flows, save/submit/publish actions, or other account-changing/destructive steps unless the goal explicitly is the login screen. No preamble — just the numbered steps.`;
        }
        else {
            systemContent = 'You are a web navigation planner. Given a screenshot of the starting page and a goal, output a concise numbered action plan (3–7 steps) to achieve the goal. This plan is a rough guide — the agent will adapt if the actual page differs from expectations. Be specific about what to click or navigate, but acknowledge that labels and layouts may differ. Never propose blocked actions such as "Continue with AI", AI/generate/create buttons, logout/sign out, billing/purchase flows, save/submit/publish actions, or other account-changing/destructive steps unless the goal explicitly is the login screen. No preamble — just the numbered steps.';
        }

        // Operator guidance, rendered as a numbered list that overrides earlier plans.
        let userGuidanceText = '';
        if (options.userGuidance && options.userGuidance.length > 0) {
            const numberedGuidance = options.userGuidance.map((g, i) => ` ${i + 1}. ${g}`).join('\n');
            userGuidanceText = `\n\n⚠️ USER OVERRIDE — the operator has provided explicit guidance that MUST take priority over any previous plan:\n${numberedGuidance}\nYour new plan MUST follow this guidance. If the guidance contradicts the previous plan, discard the previous plan entirely.`;
        }

        // Summary of what was already tried, so the planner does not repeat it.
        let failedAttemptsText = '';
        if (options.failedAttemptsSummary) {
            failedAttemptsText = `\n\n⚠️ PREVIOUS ATTEMPTS FAILED — the agent already tried these approaches and they did NOT work:\n${options.failedAttemptsSummary}\nDo NOT repeat these failed approaches. The agent may be on the WRONG PAGE — check if navigating to a different section/page is needed first.`;
        }

        // The user-facing goal message: each optional segment contributes either its
        // text or an empty string, and the segments are concatenated in order.
        const goalText = [
            `Goal: ${prompt}\nCurrent URL: ${url}`,
            options.lang ? `\nTarget language: ${options.lang}` : '',
            options.theme ? `\nTarget theme: ${options.theme}` : '',
            options.currentObjective ? `\nCurrent objective: ${options.currentObjective}` : '',
            options.captureCursorSummary ? `\nCapture cursor: ${options.captureCursorSummary}` : '',
            options.remainingCaptureQueue && options.remainingCaptureQueue.length > 0
                ? `\nRemaining capture queue: ${options.remainingCaptureQueue.join(', ')}`
                : '',
            options.repairTicketSummary
                ? `\nActive repair ticket: ${options.repairTicketSummary}\nUse repair steps only to unblock the current cursor, then resume the same capture.`
                : '',
            options.authState === 'authenticated'
                ? '\nAuthenticated browser state is already active. Do NOT plan login, OAuth, or sign-in steps again unless the explicit goal is the login screen or the screenshot clearly shows a login screen.'
                : '',
            options.handoffContextSummary
                ? `\nLive handoff context: ${options.handoffContextSummary}\nContinue from the carried-over page state before deciding to navigate.`
                : '',
            options.handoffNavigationHints && options.handoffNavigationHints.length > 0
                ? `\nLikely next controls from the carried-over state: ${options.handoffNavigationHints.join(' | ')}`
                : '',
            options.variantManifestSummary
                ? `\nVariant manifest: ${options.variantManifestSummary}\nThe next steps must satisfy the current page id specifically and avoid duplicating already completed pages.`
                : '',
            failedAttemptsText,
            userGuidanceText,
            '\nForbidden planning actions: Continue with AI, generate/create with AI, logout/sign out, billing/purchase, save/submit/publish, or other mutating/account-changing steps unless the explicit goal is the login screen.\nIf a blocked control is visible, plan the safe alternative route instead.\n\n',
            `Write the ${isReplanning ? 'remaining' : 'numbered action'} plan.`,
        ].join('');

        // Both modes need the screenshot as an image URL.
        const screenshotUrl = await makeImageUrl(firstScreenshot, 'image/jpeg', options.uploadImage);
        const imagePart = { type: 'image_url', image_url: { url: screenshotUrl } };

        if (options.visionModel) {
            // Dual-model mode, step 1: the vision model turns the screenshot into text.
            const observerResult = await client.chat.completions.create({
                model: options.visionModel,
                messages: [
                    { role: 'system', content: 'You are a page state observer. Describe the current page layout, visible elements, and navigation options concisely. Focus on what a planner needs to know to navigate the page.' },
                    {
                        role: 'user',
                        content: [
                            imagePart,
                            { type: 'text', text: `Describe this page for a navigation planner. URL: ${url}` },
                        ],
                    },
                ],
                max_tokens: 300,
                stream: false,
                ...providerBody(options.visionModel, options.providerPreferences),
            }, { signal: options.signal });
            const observation = observerResult.choices?.[0]?.message?.content?.trim() ?? '';
            const observerUsage = extractUsage(observerResult, options.stepCounter ?? 0, 'agent_iteration', options.visionModel, 1);

            // Step 2: the primary model plans from the textual observation only.
            const planResult = await client.chat.completions.create({
                model: options.model,
                messages: [
                    { role: 'system', content: systemContent },
                    { role: 'user', content: `Page observation:\n${observation}\n\n${goalText}` },
                ],
                max_tokens: 256,
                ...providerBody(options.model, options.providerPreferences),
            }, { signal: options.signal });
            const plan = planResult.choices?.[0]?.message?.content?.trim() ?? null;
            const planUsage = extractUsage(planResult, (options.stepCounter ?? 0) + 1, 'agent_iteration', options.model, 0);

            // Only one usage record is returned: the primary model's, or the
            // observer's when the primary one is null/undefined.
            return { plan, usage: planUsage ?? observerUsage, model: options.model };
        }

        // Mono-model mode: image and goal go to the primary model in a single request,
        // with callVisionCapableModel handling the fallback model.
        const planningMessages = [
            { role: 'system', content: systemContent },
            { role: 'user', content: [imagePart, { type: 'text', text: goalText }] },
        ];
        const plannerResult = await callVisionCapableModel({
            primaryModel: options.model,
            fallbackModel: options.fallbackModel,
            onFallbackActivated: (m, reason) => logger.debug(`Planning vision fallback activated: ${m} (reason: ${reason})`),
            callModel: (model) => client.chat.completions.create({
                model,
                messages: planningMessages,
                max_tokens: 256,
                ...providerBody(model, options.providerPreferences),
            }, { signal: options.signal }),
        });
        const plan = plannerResult.result.choices?.[0]?.message?.content?.trim() ?? null;
        const usage = extractUsage(plannerResult.result, options.stepCounter ?? 0, 'agent_iteration', plannerResult.model, 1);
        return { plan, usage, model: plannerResult.model };
    }
    catch (err) {
        // Planning is best-effort: abort errors are swallowed silently, everything
        // else is logged, and the caller always gets an empty plan back.
        if (!isAbortError(err)) {
            let modelTried = options.model;
            if (options.fallbackModel) {
                modelTried = `${options.model} → ${options.fallbackModel}`;
            }
            logger.debug(`Planning call failed (non-fatal) [${modelTried}]: ${err.message}`);
        }
        return { plan: null, usage: null };
    }
}
|
|
3048
|
+
/**
 * Derive the `origin` tag for an action-history entry from the run configuration.
 *
 * @param config Run configuration; only `runMode` and `currentObjective` are read.
 * @returns `'preflight'` for language-preflight runs, `'repair_subplan'` when either
 *   the run mode or the current objective is `'repair'`, otherwise `'main_plan'`.
 */
function resolveActionOrigin(config) {
    switch (config.runMode) {
        case 'language_preflight':
            return 'preflight';
        case 'repair':
            return 'repair_subplan';
        default:
            return config.currentObjective === 'repair' ? 'repair_subplan' : 'main_plan';
    }
}
|
|
3055
|
+
/**
 * Push `action` onto `actionHistory`, filling in bookkeeping fields it does not carry.
 *
 * - `origin` falls back to `resolveActionOrigin(config)`.
 * - `phase` falls back to `config.captureCursor?.phase`.
 * - `checkpointId` is kept whenever the action defines it (an explicit `null` is
 *   preserved); otherwise it is the cursor's `lastVerifiedCheckpointId`, or `null`.
 *
 * @param actionHistory Array mutated in place.
 * @param config Run configuration supplying the fallbacks.
 * @param action Action record to append; it is copied, not mutated.
 */
function appendActionHistory(actionHistory, config, action) {
    const origin = action.origin ?? resolveActionOrigin(config);
    const phase = action.phase ?? config.captureCursor?.phase;
    let checkpointId = action.checkpointId;
    if (checkpointId === undefined) {
        // Only `undefined` triggers the fallback, so a deliberate null survives.
        checkpointId = config.captureCursor?.lastVerifiedCheckpointId ?? null;
    }
    actionHistory.push({ ...action, origin, phase, checkpointId });
}
|
|
3065
|
+
/**
|
|
3066
|
+
* Build a compact trajectory log from action history.
|
|
3067
|
+
* Re-injected every iteration to preserve action context across conversation trimming.
|
|
3068
|
+
* Format: +* iter: action [index] "reason" (+ success, - fail, * state changed)
|
|
3069
|
+
* Capped at the last 25 actions to avoid unbounded token growth.
|
|
3070
|
+
* Meta-actions (note, begin_subgoal) are rendered compactly to reduce noise.
|
|
3071
|
+
*/
|
|
3072
|
+
/**
 * Module-level cache of per-model prompt-caching capability.
 * key = model id, value = whether the model supports cache_control.
 * Populated from the OpenRouter model list by `modelSupportsCaching`.
 */
const cachingCapabilityCache = new Map();
/**
 * Checks whether a model supports explicit prompt caching via cache_control by
 * inspecting its `pricing.input_cache_read` field in the OpenRouter models API.
 *
 * Successful lookups are memoised for the lifetime of the process, including the
 * negative answer for a model that is absent from the returned list, so an
 * unknown model id does not cause the list to be re-fetched on every call.
 * Failed lookups (network error, non-2xx response, malformed body) are not
 * memoised, so a later call can retry.
 *
 * @param {string} model Model id to look up.
 * @param {string} apiKey OpenRouter API key, sent as a Bearer token.
 * @returns {Promise<boolean>} `true` only when the model lists a positive
 *   `input_cache_read` price; `false` otherwise, including on any lookup failure.
 */
async function modelSupportsCaching(model, apiKey) {
    if (cachingCapabilityCache.has(model)) {
        return cachingCapabilityCache.get(model);
    }
    try {
        const res = await fetch('https://openrouter.ai/api/v1/models', {
            headers: { Authorization: `Bearer ${apiKey}` },
        });
        if (!res.ok) {
            throw new Error(`HTTP ${res.status}`);
        }
        const body = await res.json();
        for (const m of body?.data ?? []) {
            const supported = Number(m.pricing?.input_cache_read ?? 0) > 0;
            cachingCapabilityCache.set(m.id, supported);
        }
        // The list was fetched successfully but does not mention this model:
        // remember that, otherwise every later call would re-download the list.
        if (!cachingCapabilityCache.has(model)) {
            cachingCapabilityCache.set(model, false);
        }
        return cachingCapabilityCache.get(model);
    }
    catch (err) {
        // Best-effort detection: fall back to "no caching" rather than failing the run.
        logger.info(`Could not detect caching support for ${model}: ${err?.message ?? err}. Disabling cache_control.`);
        return false;
    }
}
|
|
3103
|
+
/**
 * Render the most recent actions (at most 25) as a compact, newline-separated log.
 *
 * Regular actions are prefixed with a success marker (`+` or `-`) and a
 * state-change marker (`*` or a space), followed by the iteration, the action
 * name, the optional `[index]` target and the optional reason truncated to 30
 * characters. `begin_subgoal` and `note` entries are rendered without markers.
 *
 * @param actionHistory Action records; only the last 25 are rendered.
 * @returns The log text, or `''` when the history is empty.
 */
function buildTrajectoryLog(actionHistory) {
    if (actionHistory.length === 0) {
        return '';
    }
    const lines = [];
    for (const entry of actionHistory.slice(-25)) {
        // Meta-actions carry no target/reason noise.
        if (entry.action === 'begin_subgoal') {
            lines.push(` ${entry.iteration}: [subgoal: ${String(entry.params.name ?? '').slice(0, 30)}]`);
            continue;
        }
        if (entry.action === 'note') {
            lines.push(` ${entry.iteration}: [note]`);
            continue;
        }
        const markers = (entry.success ? '+' : '-') + (entry.stateChanged ? '*' : ' ');
        let details = '';
        if (entry.params.index !== undefined) {
            details += ` [${entry.params.index}]`;
        }
        if (entry.params.reason) {
            details += ` "${String(entry.params.reason).slice(0, 30)}"`;
        }
        lines.push(`${markers} ${entry.iteration}: ${entry.action}${details}`);
    }
    return lines.join('\n');
}
|
|
3119
|
+
/**
 * Build a compact summary of recent failed/no-effect actions for the planner.
 * Helps the planner understand what was already tried so it doesn't repeat the same mistakes.
 *
 * Only the last 12 history entries are considered. An entry is reported when it
 * failed (`success` falsy) or explicitly had no effect (`stateChanged === false`);
 * `note`, `begin_subgoal` and `wait` entries are never reported.
 *
 * The `error` field is coerced to text before truncation, so an `Error` object
 * or a numeric code is rendered instead of raising a TypeError.
 *
 * @param actionHistory Action records, oldest first.
 * @returns One `- action target → outcome` line per reported entry joined with
 *   newlines, or `undefined` when there is nothing to report.
 */
function buildFailedAttemptsSummary(actionHistory) {
    const ignoredActions = ['note', 'begin_subgoal', 'wait'];
    const failed = actionHistory
        .slice(-12)
        .filter(a => (!a.success || a.stateChanged === false) && !ignoredActions.includes(a.action));
    if (failed.length === 0) {
        return undefined;
    }
    return failed.map(a => {
        // Prefer the human-readable label, then the element index, then the query.
        let target = '';
        if (a.params.elementLabel) {
            target = `"${String(a.params.elementLabel).slice(0, 40)}"`;
        }
        else if (a.params.index !== undefined) {
            target = `[${a.params.index}]`;
        }
        else if (a.params.query) {
            target = `"${String(a.params.query).slice(0, 40)}"`;
        }
        let outcome = 'NO EFFECT';
        if (!a.success) {
            // `error` is not guaranteed to be a string; coerce before slicing.
            const rawError = a.error instanceof Error ? a.error.message : a.error;
            outcome = `FAILED: ${String(rawError || 'unknown').slice(0, 60)}`;
        }
        return `- ${a.action} ${target} → ${outcome}`;
    }).join('\n');
}
|
|
3142
|
+
/**
 * Ask a vision model for a textual observation of a screenshot.
 *
 * The screenshot is converted to an image URL, combined with page/goal context by
 * `buildVisionObserverPrompt`, and sent to `visionModel` with a 300-token
 * completion limit.
 *
 * @param client Chat client; `client.chat.completions.create` is called.
 * @param visionModel Model id used for the observation request.
 * @param screenshot Screenshot handed to `makeImageUrl` as `image/jpeg`.
 * @param pageState Page state; only `interactiveElements` is read here.
 * @param config Run configuration (URL, prompt, language/theme, manifest, signal, ...).
 * @param stepCounter Step number forwarded to `extractUsage`.
 * @returns `{ observation, usage }`. A non-abort request failure is logged and
 *   yields `{ observation: '', usage: null }`.
 * @throws Re-throws abort errors from the request. Errors from `makeImageUrl` or
 *   the prompt builder are not caught here.
 */
async function callVisionObserver(client, visionModel, screenshot, pageState, config, stepCounter) {
    const screenshotUrl = await makeImageUrl(screenshot, 'image/jpeg', config.uploadImage);
    const systemTurn = {
        role: 'system',
        content: 'You are a page state observer for a web navigation agent. Describe the current page concisely and factually. Output ONLY the structured observation — no commentary.',
    };
    const userTurn = {
        role: 'user',
        content: buildVisionObserverPrompt({
            screenshotUrl,
            currentUrl: config.url,
            interactiveElements: pageState.interactiveElements,
            userGoal: config.prompt,
            currentLang: config.currentLang,
            currentTheme: config.currentTheme,
            currentPageId: config.variantManifest?.currentPageId ?? undefined,
            pageIdentitySummary: config.variantManifest?.currentPageIdentity?.summary ?? undefined,
            currentObjective: config.currentObjective,
        }),
    };
    const messages = [systemTurn, userTurn];
    let result;
    try {
        result = await client.chat.completions.create({
            model: visionModel,
            messages,
            max_tokens: 300,
            stream: false,
            ...providerBody(visionModel, config.providerPreferences),
        }, { signal: config.abortSignal });
        const observation = result.choices?.[0]?.message?.content?.trim() ?? '';
        return { observation, usage: extractUsage(result, stepCounter, 'agent_iteration', visionModel, 1) };
    }
    catch (err) {
        // Cancellation must propagate; any other failure degrades to "no observation".
        if (!isAbortError(err)) {
            logger.error(`Vision observer call failed: ${err.message}`);
            return { observation: '', usage: null };
        }
        throw err;
    }
}
|
|
3182
|
+
export async function runAgent(browser, config, apiKey) {
|
|
3183
|
+
const client = createClient(apiKey);
|
|
3184
|
+
const modelState = { active: config.model };
|
|
3185
|
+
const actionHistory = [];
|
|
3186
|
+
const workflowScreenshots = [];
|
|
3187
|
+
const usageLog = [];
|
|
3188
|
+
let stepCounter = 0;
|
|
3189
|
+
const hasCredentials = !!(config.credentials?.email || config.credentials?.password);
|
|
3190
|
+
let usedDeterministicRecovery = false;
|
|
3191
|
+
const userGuidanceMessages = [];
|
|
3192
|
+
let lastVerificationFailure;
|
|
3193
|
+
let lastVerificationResult;
|
|
3194
|
+
let consecutiveDialogFailures = 0;
|
|
3195
|
+
let consecutiveVerificationFailures = 0;
|
|
3196
|
+
let consecutiveTechnicalVerificationFailures = 0;
|
|
3197
|
+
let rejectedGiveUps = 0;
|
|
3198
|
+
let lastReplanIteration = 0; // Cooldown: skip replanning if we just replanned
|
|
3199
|
+
// Working memory: persistent notes stored by the agent via the `note` tool
|
|
3200
|
+
// Re-injected into every prompt to survive conversation trimming (Agent-E / HiAgent pattern)
|
|
3201
|
+
const agentNotes = [];
|
|
3202
|
+
// Hierarchical working memory (HiAgent pattern):
|
|
3203
|
+
// Subgoals organize notes into named phases; completed subgoals are archived as 1-line summaries
|
|
3204
|
+
let currentSubgoal = null;
|
|
3205
|
+
const completedSubgoals = [];
|
|
3206
|
+
// AWM within-run: cache of selectors that worked per subgoal for cross-subgoal hints
|
|
3207
|
+
const workflowCache = [];
|
|
3208
|
+
// Index in actionHistory where the current subgoal started (for scoped AWM extraction)
|
|
3209
|
+
let subgoalStartIndex = 0;
|
|
3210
|
+
// Screenshot hash history for visual loop detection
|
|
3211
|
+
// Tracks MD5 hashes of recent screenshots to detect when the agent is stuck on the same page
|
|
3212
|
+
const screenshotHashHistory = [];
|
|
3213
|
+
// Multi-turn conversation thread — persisted across all iterations
|
|
3214
|
+
// The LLM naturally sees its own history without us reconstructing context each time
|
|
3215
|
+
const cacheLayoutV2Enabled = process.env.SCREENSHOT_AGENT_CACHE_LAYOUT_V2 === '1';
|
|
3216
|
+
const promptCacheStrategy = resolvePromptCacheStrategy(config.model, {
|
|
3217
|
+
enableGeminiExplicitBreakpoints: process.env.SCREENSHOT_AGENT_GEMINI_EXPLICIT_CACHE_BREAKPOINTS === '1',
|
|
3218
|
+
});
|
|
3219
|
+
const systemPromptText = buildSystemPrompt({ reasoningLocale: config.reasoningLocale });
|
|
3220
|
+
const supportsCache = promptCacheStrategy === 'explicit_breakpoints'
|
|
3221
|
+
&& await modelSupportsCaching(config.model, apiKey);
|
|
3222
|
+
const systemMessage = supportsCache
|
|
3223
|
+
? {
|
|
3224
|
+
role: 'system',
|
|
3225
|
+
content: [{ type: 'text', text: systemPromptText, cache_control: { type: 'ephemeral' } }],
|
|
3226
|
+
}
|
|
3227
|
+
: { role: 'system', content: systemPromptText };
|
|
3228
|
+
const conversationMessages = [systemMessage];
|
|
3229
|
+
if (hasManualMultiProviderOrder(config.providerPreferences?.[config.model])) {
|
|
3230
|
+
logger.info(`[cache] ${config.model} uses a multi-provider provider.order override; OpenRouter sticky prompt caching may be reduced.`);
|
|
3231
|
+
}
|
|
3232
|
+
// Pre-loop planning: take an initial screenshot and generate a step-by-step plan
|
|
3233
|
+
// This gives the agent direction before it starts acting, reducing aimless exploration.
|
|
3234
|
+
// Skip planning when the agent is already on the target URL — the capture-first rule
|
|
3235
|
+
// will make it call ready_to_capture immediately, so planning is wasted.
|
|
3236
|
+
const currentUrl = browser.currentPage.url();
|
|
3237
|
+
const alreadyOnTarget = urlsRoughlyMatch(config.url, currentUrl);
|
|
3238
|
+
const planningScreenshot = alreadyOnTarget ? null : await browser.takeScreenshotForAI().catch(() => null);
|
|
3239
|
+
let taskPlan = null;
|
|
3240
|
+
if (planningScreenshot) {
|
|
3241
|
+
const plannerResult = await callPlanner(client, config.prompt, currentUrl, planningScreenshot, {
|
|
3242
|
+
model: modelState.active,
|
|
3243
|
+
fallbackModel: config.fallbackModel,
|
|
3244
|
+
visionModel: config.visionModel,
|
|
3245
|
+
lang: config.currentLang,
|
|
3246
|
+
theme: config.currentTheme,
|
|
3247
|
+
currentObjective: config.currentObjective,
|
|
3248
|
+
captureCursorSummary: summarizeCaptureCursorForPlanner(config),
|
|
3249
|
+
repairTicketSummary: summarizeRepairTicketForPlanner(config),
|
|
3250
|
+
remainingCaptureQueue: config.remainingCaptureQueue,
|
|
3251
|
+
authState: config.handoffContext?.authState ?? config.sessionProfile?.authState,
|
|
3252
|
+
handoffContextSummary: config.handoffContext?.summary,
|
|
3253
|
+
handoffNavigationHints: config.handoffContext?.navigationHints,
|
|
3254
|
+
variantManifestSummary: summarizeVariantManifestForPlanner(config.variantManifest),
|
|
3255
|
+
signal: config.abortSignal,
|
|
3256
|
+
uploadImage: config.uploadImage,
|
|
3257
|
+
stepCounter: ++stepCounter,
|
|
3258
|
+
providerPreferences: config.providerPreferences,
|
|
3259
|
+
});
|
|
3260
|
+
taskPlan = plannerResult.plan;
|
|
3261
|
+
if (plannerResult.usage)
|
|
3262
|
+
usageLog.push(plannerResult.usage);
|
|
3263
|
+
if (plannerResult.model && plannerResult.model !== modelState.active) {
|
|
3264
|
+
modelState.active = plannerResult.model;
|
|
3265
|
+
}
|
|
3266
|
+
if (taskPlan) {
|
|
3267
|
+
logger.info(`Task plan generated:\n${taskPlan}`);
|
|
3268
|
+
}
|
|
3269
|
+
}
|
|
3270
|
+
if (cacheLayoutV2Enabled) {
|
|
3271
|
+
const anchorPrompt = buildStableAnchorUserMessage({
|
|
3272
|
+
userPrompt: config.prompt,
|
|
3273
|
+
credentials: config.credentials,
|
|
3274
|
+
currentLang: config.currentLang,
|
|
3275
|
+
currentTheme: config.currentTheme,
|
|
3276
|
+
langInstructions: config.langInstructions,
|
|
3277
|
+
themeInstructions: config.themeInstructions,
|
|
3278
|
+
viewports: config.viewports,
|
|
3279
|
+
runHints: config.runHints,
|
|
3280
|
+
selectorMemory: config.selectorMemory,
|
|
3281
|
+
sessionProfile: config.sessionProfile,
|
|
3282
|
+
handoffContext: config.handoffContext,
|
|
3283
|
+
variantManifest: config.variantManifest,
|
|
3284
|
+
});
|
|
3285
|
+
// Mark the last content part of the anchor with cache_control so Anthropic's
|
|
3286
|
+
// prompt caching covers both the system message and the stable anchor.
|
|
3287
|
+
const anchorContent = supportsCache
|
|
3288
|
+
? anchorPrompt.content.map((part, idx, arr) => idx === arr.length - 1 && part.type === 'text'
|
|
3289
|
+
? { ...part, cache_control: { type: 'ephemeral' } }
|
|
3290
|
+
: part)
|
|
3291
|
+
: anchorPrompt.content;
|
|
3292
|
+
conversationMessages.push({ role: 'user', content: anchorContent });
|
|
3293
|
+
}
|
|
3294
|
+
// DOM fingerprint tracking: skip sending full DOM when page hasn't changed
|
|
3295
|
+
let lastDomFingerprint = null;
|
|
3296
|
+
for (let iteration = 1; iteration <= config.maxIterations; iteration++) {
|
|
3297
|
+
throwIfAborted(config.abortSignal, 'Agent run cancelled.');
|
|
3298
|
+
// Check for user guidance (pause & guide)
|
|
3299
|
+
let guidanceReceivedThisIteration = false;
|
|
3300
|
+
if (config.guidanceCallback) {
|
|
3301
|
+
const guidance = await config.guidanceCallback();
|
|
3302
|
+
if (guidance) {
|
|
3303
|
+
logger.info(`Indication reçue : ${guidance}`);
|
|
3304
|
+
userGuidanceMessages.push(guidance);
|
|
3305
|
+
guidanceReceivedThisIteration = true;
|
|
3306
|
+
}
|
|
3307
|
+
}
|
|
3308
|
+
// 1. Capture current page state
|
|
3309
|
+
// Pre-compute whether vision will likely be auto-triggered this iteration.
|
|
3310
|
+
// When vision is not needed, skip the expensive SoM annotation (sharp compositing).
|
|
3311
|
+
const preNoEffectCount = countRecentNoEffectActions(actionHistory);
|
|
3312
|
+
const likelyNeedsVision = preNoEffectCount >= 2 || (preNoEffectCount >= 1 && screenshotHashHistory.length >= 3);
|
|
3313
|
+
const pageState = await browser.getPageState({ skipAnnotation: !likelyNeedsVision });
|
|
3314
|
+
// Stream live screenshot to the web UI
|
|
3315
|
+
emitScreenshot(getLivePreviewScreenshot(pageState).toString('base64'));
|
|
3316
|
+
// Visual loop detection: track screenshot hashes to detect when the agent is stuck.
|
|
3317
|
+
// Use a larger sample (first 32KB) for more robust detection — small samples miss
|
|
3318
|
+
// subtle layout differences while being too sensitive to viewport size changes.
|
|
3319
|
+
const sampleSize = Math.min(32768, pageState.screenshot.length);
|
|
3320
|
+
const screenshotHash = createHash('md5').update(pageState.screenshot.subarray(0, sampleSize)).digest('hex');
|
|
3321
|
+
// Also track a URL-based hash to catch navigation loops where the page content
|
|
3322
|
+
// is near-identical but the viewport or minor rendering differences change the image hash.
|
|
3323
|
+
const urlHash = createHash('md5').update(browser.currentPage.url()).digest('hex').slice(0, 8);
|
|
3324
|
+
const compositeKey = `${urlHash}:${screenshotHash}`;
|
|
3325
|
+
screenshotHashHistory.push(compositeKey);
|
|
3326
|
+
if (screenshotHashHistory.length > 8)
|
|
3327
|
+
screenshotHashHistory.shift();
|
|
3328
|
+
const hashOccurrences = screenshotHashHistory.filter(h => h === compositeKey).length;
|
|
3329
|
+
// Also check URL-only loops: same URL appearing 4+ times indicates stuck navigation
|
|
3330
|
+
const urlOccurrences = screenshotHashHistory.filter(h => h.startsWith(urlHash + ':')).length;
|
|
3331
|
+
const isVisualLoop = hashOccurrences >= 3 || urlOccurrences >= 5;
|
|
3332
|
+
// Deterministic session repair when stuck (fast, no LLM cost)
|
|
3333
|
+
if (config.enableDeterministicRecovery !== false && shouldTriggerRecovery(actionHistory) && config.selectorMemory) {
|
|
3334
|
+
logger.info('Tentative de récupération automatique…');
|
|
3335
|
+
const repair = await performDeterministicSessionRepair(browser, {
|
|
3336
|
+
startUrl: config.url,
|
|
3337
|
+
requestedLang: config.currentLang,
|
|
3338
|
+
requestedTheme: config.currentTheme,
|
|
3339
|
+
credentials: config.credentials,
|
|
3340
|
+
profile: config.sessionProfile,
|
|
3341
|
+
selectorMemory: config.selectorMemory,
|
|
3342
|
+
}).catch(() => null);
|
|
3343
|
+
if (repair?.repaired) {
|
|
3344
|
+
usedDeterministicRecovery = true;
|
|
3345
|
+
const outcome = `Deterministic recovery succeeded via ${repair.pathUsed ?? 'selector memory'}.`;
|
|
3346
|
+
appendActionHistory(actionHistory, config, {
|
|
3347
|
+
iteration,
|
|
3348
|
+
action: 'wait',
|
|
3349
|
+
params: { reason: 'deterministic_repair' },
|
|
3350
|
+
success: true,
|
|
3351
|
+
outcome,
|
|
3352
|
+
stateChanged: true,
|
|
3353
|
+
origin: 'deterministic',
|
|
3354
|
+
phase: 'recover',
|
|
3355
|
+
});
|
|
3356
|
+
logger.success('Récupération automatique réussie');
|
|
3357
|
+
continue;
|
|
3358
|
+
}
|
|
3359
|
+
if (repair?.pathUsed)
|
|
3360
|
+
usedDeterministicRecovery = true;
|
|
3361
|
+
}
|
|
3362
|
+
// Dynamic replanning: triggered by stuck detection OR user guidance.
|
|
3363
|
+
// User guidance bypasses the cooldown and immediately forces a replan so the
|
|
3364
|
+
// agent does not waste iterations following an obsolete plan.
|
|
3365
|
+
const replanCooldownMet = iteration - lastReplanIteration >= 3;
|
|
3366
|
+
const isStuckEnoughToReplan = countRecentNoEffectActions(actionHistory) >= 3;
|
|
3367
|
+
const shouldReplan = guidanceReceivedThisIteration
|
|
3368
|
+
|| (replanCooldownMet && (isVisualLoop || (isStuckEnoughToReplan && iteration > 3)));
|
|
3369
|
+
if (shouldReplan) {
|
|
3370
|
+
lastReplanIteration = iteration;
|
|
3371
|
+
logger.info('Régénération du plan d\'action…');
|
|
3372
|
+
const replanScreenshot = await browser.takeScreenshot();
|
|
3373
|
+
const stuckReplan = await callPlanner(client, config.prompt, browser.currentPage.url(), replanScreenshot, {
|
|
3374
|
+
model: modelState.active,
|
|
3375
|
+
fallbackModel: config.fallbackModel,
|
|
3376
|
+
visionModel: config.visionModel,
|
|
3377
|
+
lang: config.currentLang,
|
|
3378
|
+
theme: config.currentTheme,
|
|
3379
|
+
currentObjective: config.currentObjective,
|
|
3380
|
+
captureCursorSummary: summarizeCaptureCursorForPlanner(config),
|
|
3381
|
+
repairTicketSummary: summarizeRepairTicketForPlanner(config),
|
|
3382
|
+
remainingCaptureQueue: config.remainingCaptureQueue,
|
|
3383
|
+
variantManifestSummary: summarizeVariantManifestForPlanner(config.variantManifest),
|
|
3384
|
+
completedMilestones: workflowScreenshots.length || 1,
|
|
3385
|
+
signal: config.abortSignal,
|
|
3386
|
+
uploadImage: config.uploadImage,
|
|
3387
|
+
stepCounter: ++stepCounter,
|
|
3388
|
+
providerPreferences: config.providerPreferences,
|
|
3389
|
+
userGuidance: userGuidanceMessages.length > 0 ? userGuidanceMessages : undefined,
|
|
3390
|
+
failedAttemptsSummary: buildFailedAttemptsSummary(actionHistory),
|
|
3391
|
+
}).catch(() => ({ plan: null, usage: null }));
|
|
3392
|
+
if (stuckReplan.usage)
|
|
3393
|
+
usageLog.push(stuckReplan.usage);
|
|
3394
|
+
if (stuckReplan.model && stuckReplan.model !== modelState.active) {
|
|
3395
|
+
modelState.active = stuckReplan.model;
|
|
3396
|
+
}
|
|
3397
|
+
if (stuckReplan.plan) {
|
|
3398
|
+
taskPlan = stuckReplan.plan;
|
|
3399
|
+
logger.info(`Nouveau plan d'action :\n${stuckReplan.plan}`);
|
|
3400
|
+
}
|
|
3401
|
+
}
|
|
3402
|
+
// 2. Build observation and add to conversation thread
|
|
3403
|
+
// First iteration: includes full task context (prompt, session, memory, instructions)
|
|
3404
|
+
// Subsequent iterations: compact page-state-only observation
|
|
3405
|
+
//
|
|
3406
|
+
// Dual-model mode: when visionModel is set, call the cheap vision model to extract
|
|
3407
|
+
// a text observation, then pass that to the main text model (no images in main context).
|
|
3408
|
+
// Mono-model mode: send images directly as before.
|
|
3409
|
+
// DOM-first: the main model receives the simplified DOM as primary context.
|
|
3410
|
+
// Vision analysis is available on-demand via the analyze_screenshot tool.
|
|
3411
|
+
// Auto-trigger: inject a vision observation when the agent is genuinely stuck.
|
|
3412
|
+
// Only fires on repeated no-effect actions or visual loops — NOT on every iteration
|
|
3413
|
+
// after a verification failure (lastVerificationFailure is already in the text context).
|
|
3414
|
+
// Skip vision when a replan or user guidance just happened — the agent has fresh direction.
|
|
3415
|
+
let visionObservationText;
|
|
3416
|
+
const noEffectCount = countRecentNoEffectActions(actionHistory);
|
|
3417
|
+
const justReplanned = shouldReplan; // replan happened this iteration — let the agent try the new plan first
|
|
3418
|
+
const shouldAutoTriggerVision = !justReplanned
|
|
3419
|
+
&& !guidanceReceivedThisIteration
|
|
3420
|
+
&& (noEffectCount >= 2 || (isVisualLoop && noEffectCount >= 1));
|
|
3421
|
+
if (shouldAutoTriggerVision) {
|
|
3422
|
+
const visionModel = config.visionModel || modelState.active;
|
|
3423
|
+
logger.info('Analyse visuelle de la page…');
|
|
3424
|
+
const visionResult = await callVisionObserver(client, visionModel, pageState.screenshot, pageState, config, ++stepCounter);
|
|
3425
|
+
if (visionResult.usage)
|
|
3426
|
+
usageLog.push(visionResult.usage);
|
|
3427
|
+
visionObservationText = visionResult.observation || undefined;
|
|
3428
|
+
if (visionObservationText) {
|
|
3429
|
+
logger.debug(`Auto-triggered vision (stuck=${noEffectCount}, visualLoop=${isVisualLoop}, verifyFail=${!!lastVerificationFailure}): ${visionObservationText.slice(0, 200)}`);
|
|
3430
|
+
// If vision says the page matches the target and we're stuck, add a strong capture hint
|
|
3431
|
+
const matchLine = visionObservationText.split('\n').find(l => /^MATCH:/i.test(l.trim()));
|
|
3432
|
+
if (matchLine && /\b(yes|oui)\b/i.test(matchLine) && (noEffectCount >= 2 || isVisualLoop)) {
|
|
3433
|
+
visionObservationText += config.runMode === 'video_navigation_preflight'
|
|
3434
|
+
? '\n\n⚠️ VISION SUGGESTS THE PAGE IS CLOSE TO THE GOAL. Before calling ready_to_capture, confirm that the EXACT pre-recording start state is visible now: the requested project/section/dialog must already be open, and the recorded interaction must not be done yet.'
|
|
3435
|
+
: '\n\n⚠️ VISION CONFIRMS PAGE MATCHES TARGET. You are stuck clicking without progress. Call ready_to_capture NOW — the current state is likely what you need to capture.';
|
|
3436
|
+
}
|
|
3437
|
+
}
|
|
3438
|
+
}
|
|
3439
|
+
// DOM-first: skip images in the main iteration prompt.
|
|
3440
|
+
// The simplified DOM + interactive elements provide page context.
|
|
3441
|
+
// Vision is available on-demand via analyze_screenshot tool.
|
|
3442
|
+
const screenshotUrl = undefined;
|
|
3443
|
+
const cleanScreenshotUrl = undefined;
|
|
3444
|
+
// DOM skip: compute triple fingerprint (DOM + element count + current URL).
|
|
3445
|
+
// If identical to last iteration, send a compact placeholder instead of full DOM.
|
|
3446
|
+
// Safety net: always send full DOM on iteration 1, every 4th iteration, and when stuck.
|
|
3447
|
+
const currentDomFingerprint = createHash('sha1')
|
|
3448
|
+
.update(pageState.simplifiedDOM || '')
|
|
3449
|
+
.update(String(pageState.interactiveElements.length))
|
|
3450
|
+
.update(browser.currentPage.url())
|
|
3451
|
+
.digest('hex');
|
|
3452
|
+
const noEffectCountForDom = countRecentNoEffectActions(actionHistory);
|
|
3453
|
+
const domUnchanged = iteration > 1
|
|
3454
|
+
&& iteration % 4 !== 0
|
|
3455
|
+
&& noEffectCountForDom === 0
|
|
3456
|
+
&& lastDomFingerprint !== null
|
|
3457
|
+
&& currentDomFingerprint === lastDomFingerprint;
|
|
3458
|
+
lastDomFingerprint = currentDomFingerprint;
|
|
3459
|
+
const iterationPrompt = buildIterationUserMessage({
|
|
3460
|
+
userPrompt: config.prompt,
|
|
3461
|
+
cleanScreenshotUrl,
|
|
3462
|
+
screenshotUrl: screenshotUrl || '',
|
|
3463
|
+
visionObservation: visionObservationText,
|
|
3464
|
+
simplifiedDOM: domUnchanged ? undefined : (pageState.simplifiedDOM || undefined),
|
|
3465
|
+
domUnchanged,
|
|
3466
|
+
accessibilityTree: pageState.accessibilityTree,
|
|
3467
|
+
interactiveElements: pageState.interactiveElements,
|
|
3468
|
+
screenshotsTaken: workflowScreenshots,
|
|
3469
|
+
iteration,
|
|
3470
|
+
maxIterations: config.maxIterations,
|
|
3471
|
+
credentials: config.credentials,
|
|
3472
|
+
currentLang: config.currentLang,
|
|
3473
|
+
currentTheme: config.currentTheme,
|
|
3474
|
+
langInstructions: config.langInstructions,
|
|
3475
|
+
themeInstructions: config.themeInstructions,
|
|
3476
|
+
viewports: config.viewports,
|
|
3477
|
+
runHints: config.runHints,
|
|
3478
|
+
selectorMemory: config.selectorMemory,
|
|
3479
|
+
sessionProfile: config.sessionProfile,
|
|
3480
|
+
hasCredentials,
|
|
3481
|
+
salienceCompressionEnabled: config.enableSalienceCompression !== false,
|
|
3482
|
+
viewport: browser.currentPage.viewportSize() ?? config.viewport,
|
|
3483
|
+
currentUrl: browser.currentPage.url(),
|
|
3484
|
+
scrollInfo: pageState.scrollInfo,
|
|
3485
|
+
agentNotes: agentNotes.length > 0 ? [...agentNotes] : undefined,
|
|
3486
|
+
currentSubgoal: currentSubgoal ?? undefined,
|
|
3487
|
+
completedSubgoals: completedSubgoals.length > 0 ? [...completedSubgoals] : undefined,
|
|
3488
|
+
trajectoryLog: buildTrajectoryLog(actionHistory) || undefined,
|
|
3489
|
+
handoffContext: iteration <= 3 ? config.handoffContext : undefined,
|
|
3490
|
+
variantManifest: config.variantManifest,
|
|
3491
|
+
variantReference: config.variantReference,
|
|
3492
|
+
currentObjective: config.currentObjective,
|
|
3493
|
+
captureCursor: config.captureCursor,
|
|
3494
|
+
activeRepairTicket: config.activeRepairTicket,
|
|
3495
|
+
remainingCaptureQueue: config.remainingCaptureQueue,
|
|
3496
|
+
stuckLoopWarning: (shouldTriggerRecovery(actionHistory) || isVisualLoop)
|
|
3497
|
+
? (() => {
|
|
3498
|
+
const currentPageUrl = browser.currentPage.url();
|
|
3499
|
+
const targetUrl = config.url;
|
|
3500
|
+
// Detect wrong-page: if current URL path is completely different from target,
|
|
3501
|
+
// the agent is likely on the wrong page and needs to navigate first.
|
|
3502
|
+
let wrongPageHint = '';
|
|
3503
|
+
try {
|
|
3504
|
+
const currentPath = new URL(currentPageUrl).pathname;
|
|
3505
|
+
const targetPath = new URL(targetUrl).pathname;
|
|
3506
|
+
// If paths share no common segments beyond '/', flag it
|
|
3507
|
+
const currentSegments = currentPath.split('/').filter(Boolean);
|
|
3508
|
+
const targetSegments = targetPath.split('/').filter(Boolean);
|
|
3509
|
+
const commonSegments = currentSegments.filter(s => targetSegments.includes(s));
|
|
3510
|
+
if (currentSegments.length > 0 && targetSegments.length > 0 && commonSegments.length === 0) {
|
|
3511
|
+
wrongPageHint = ` ⚠️ WRONG PAGE: you are on ${currentPath} but the target is ${targetPath}. Navigate to the correct page FIRST before trying to interact with elements.`;
|
|
3512
|
+
}
|
|
3513
|
+
}
|
|
3514
|
+
catch { /* ignore URL parse errors */ }
|
|
3515
|
+
if (config.runMode === 'video_navigation_preflight') {
|
|
3516
|
+
return `STUCK: recent actions were ineffective${isVisualLoop ? ' (page appears visually unchanged)' : ''}.${wrongPageHint} BEFORE trying more actions, CHECK: is the browser on the EXACT pre-recording start state? The requested project/section/dialog must already be open, and the recorded interaction must not have been executed yet. Only then call ready_to_capture. Otherwise, continue the missing preparation step.`;
|
|
3517
|
+
}
|
|
3518
|
+
return `STUCK: recent actions were ineffective${isVisualLoop ? ' (page appears visually unchanged)' : ''}.${wrongPageHint} BEFORE trying more actions, CHECK: does the current page state ALREADY match the capture target? If a modal/panel/overlay is open that matches what you need to capture, call ready_to_capture NOW instead of clicking more things. If not, close any open dropdown/popover (press Escape) and try a completely different approach.`;
|
|
3519
|
+
})()
|
|
3520
|
+
: undefined,
|
|
3521
|
+
taskPlan: taskPlan ?? undefined,
|
|
3522
|
+
lastVerificationFailure,
|
|
3523
|
+
userGuidance: userGuidanceMessages.length > 0 ? [...userGuidanceMessages] : undefined,
|
|
3524
|
+
expansionLevel: Math.min(3, Math.max(0, countRecentNoEffectActions(actionHistory) - 1)),
|
|
3525
|
+
isFirstIteration: iteration === 1,
|
|
3526
|
+
cacheLayoutV2: cacheLayoutV2Enabled,
|
|
3527
|
+
failedAttemptsSummary: buildFailedAttemptsSummary(actionHistory),
|
|
3528
|
+
});
|
|
3529
|
+
conversationMessages.push({ role: 'user', content: iterationPrompt.content });
|
|
3530
|
+
// Trim thread to avoid context window overflow
|
|
3531
|
+
trimConversationHistory(conversationMessages, 24, cacheLayoutV2Enabled ? 2 : 1);
|
|
3532
|
+
// 3. Call OpenRouter with full conversation thread
|
|
3533
|
+
let response;
|
|
3534
|
+
try {
|
|
3535
|
+
throwIfAborted(config.abortSignal, 'Agent run cancelled.');
|
|
3536
|
+
const visionResult = await callVisionCapableModel({
|
|
3537
|
+
primaryModel: modelState.active,
|
|
3538
|
+
fallbackModel: modelState.active === config.model ? config.fallbackModel : undefined,
|
|
3539
|
+
onFallbackActivated: (model, reason) => {
|
|
3540
|
+
modelState.active = model;
|
|
3541
|
+
logger.debug(`Vision fallback activated: ${model} (reason: ${reason})`);
|
|
3542
|
+
},
|
|
3543
|
+
callModel: (model) => callWithRetry(client, {
|
|
3544
|
+
model,
|
|
3545
|
+
messages: conversationMessages,
|
|
3546
|
+
tools: agentTools,
|
|
3547
|
+
tool_choice: 'required',
|
|
3548
|
+
max_tokens: 2048,
|
|
3549
|
+
}, 3, config.abortSignal, config.providerPreferences, config.reasoningEffort),
|
|
3550
|
+
});
|
|
3551
|
+
response = visionResult.result;
|
|
3552
|
+
if (visionResult.model && visionResult.model !== modelState.active) {
|
|
3553
|
+
logger.debug(`OpenRouter model substitution detected: requested "${modelState.active}", got "${visionResult.model}"`);
|
|
3554
|
+
}
|
|
3555
|
+
modelState.active = visionResult.model;
|
|
3556
|
+
const imagesInPrompt = iterationPrompt.content.filter((part) => 'type' in part && part.type === 'image_url').length;
|
|
3557
|
+
const stepUsage = extractUsage(response, ++stepCounter, 'agent_iteration', visionResult.model, imagesInPrompt);
|
|
3558
|
+
const systemContent = conversationMessages[0].content;
|
|
3559
|
+
stepUsage.systemPromptChars = typeof systemContent === 'string' ? systemContent.length : 0;
|
|
3560
|
+
stepUsage.toolSchemaChars = JSON.stringify(agentTools).length;
|
|
3561
|
+
const userContent = iterationPrompt.content;
|
|
3562
|
+
stepUsage.userPayloadChars = Array.isArray(userContent)
|
|
3563
|
+
? userContent.reduce((sum, part) => sum + ('text' in part && typeof part.text === 'string' ? part.text.length : 0), 0)
|
|
3564
|
+
: typeof userContent === 'string' ? userContent.length : 0;
|
|
3565
|
+
stepUsage.accessibilityChars = pageState.accessibilityTree.length;
|
|
3566
|
+
stepUsage.interactiveElementCount = pageState.interactiveElements.length;
|
|
3567
|
+
stepUsage.actionHistoryCount = actionHistory.length;
|
|
3568
|
+
stepUsage.elementsChars = iterationPrompt.metrics.elementsChars;
|
|
3569
|
+
stepUsage.sessionSummaryChars = iterationPrompt.metrics.sessionSummaryChars;
|
|
3570
|
+
stepUsage.selectorMemoryChars = iterationPrompt.metrics.selectorMemoryChars;
|
|
3571
|
+
stepUsage.agentContextChars = iterationPrompt.metrics.agentContextChars;
|
|
3572
|
+
stepUsage.profileValidationStatus = config.sessionProfile?.validationStatus;
|
|
3573
|
+
stepUsage.repairPathUsed = usedDeterministicRecovery ? 'deterministic_repair' : 'llm_full';
|
|
3574
|
+
usageLog.push(stepUsage);
|
|
3575
|
+
// Log per-iteration cache efficiency so cache behavior is visible in runtime logs.
|
|
3576
|
+
if (stepUsage.cacheReadTokens) {
|
|
3577
|
+
const hitPct = stepUsage.promptTokens
|
|
3578
|
+
? Math.round((stepUsage.cacheReadTokens / stepUsage.promptTokens) * 100)
|
|
3579
|
+
: 0;
|
|
3580
|
+
logger.debug(`[cache] iter ${iteration}: ${stepUsage.cacheReadTokens} cached / ${stepUsage.promptTokens ?? '?'} total (${hitPct}% hit rate)`);
|
|
3581
|
+
}
|
|
3582
|
+
if (stepUsage.reasoningTokens) {
|
|
3583
|
+
logger.debug(`[reasoning] iter ${iteration}: ${stepUsage.reasoningTokens} reasoning tokens`);
|
|
3584
|
+
}
|
|
3585
|
+
}
|
|
3586
|
+
catch (err) {
|
|
3587
|
+
if (isAbortError(err))
|
|
3588
|
+
throw err;
|
|
3589
|
+
logger.error(`API call failed at iteration ${iteration}: ${err.message}`);
|
|
3590
|
+
if (err instanceof VisionModelUnsupportedError) {
|
|
3591
|
+
getPostHog().capture({
|
|
3592
|
+
distinctId: config.analyticsId ?? DISTINCT_ID,
|
|
3593
|
+
event: 'agent_gave_up',
|
|
3594
|
+
properties: {
|
|
3595
|
+
url: config.url,
|
|
3596
|
+
model: modelState.active,
|
|
3597
|
+
theme: config.currentTheme,
|
|
3598
|
+
lang: config.currentLang,
|
|
3599
|
+
reason: err.message,
|
|
3600
|
+
iterations: iteration,
|
|
3601
|
+
total_tokens: usageLog.reduce((s, u) => s + (u.totalTokens ?? 0), 0),
|
|
3602
|
+
},
|
|
3603
|
+
});
|
|
3604
|
+
return {
|
|
3605
|
+
success: false,
|
|
3606
|
+
screenshotPath: null,
|
|
3607
|
+
screenshots: workflowScreenshots,
|
|
3608
|
+
iterations: iteration,
|
|
3609
|
+
actions: actionHistory,
|
|
3610
|
+
assessment: err.message,
|
|
3611
|
+
diagnostic: {
|
|
3612
|
+
screenshot: pageState.screenshot,
|
|
3613
|
+
url: browser.currentPage.url(),
|
|
3614
|
+
interactiveElements: pageState.interactiveElements,
|
|
3615
|
+
accessibilityTreeSnippet: pageState.accessibilityTree.slice(0, 5000),
|
|
3616
|
+
giveUpReason: err.message,
|
|
3617
|
+
},
|
|
3618
|
+
usage: usageLog,
|
|
3619
|
+
runtimeStrategy: 'full_llm',
|
|
3620
|
+
deterministicRecoveryUsed: usedDeterministicRecovery,
|
|
3621
|
+
evaluatorUsed: false,
|
|
3622
|
+
};
|
|
3623
|
+
}
|
|
3624
|
+
appendActionHistory(actionHistory, config, {
|
|
3625
|
+
iteration,
|
|
3626
|
+
action: 'wait',
|
|
3627
|
+
params: { reason: 'API error' },
|
|
3628
|
+
success: false,
|
|
3629
|
+
error: err.message,
|
|
3630
|
+
});
|
|
3631
|
+
continue;
|
|
3632
|
+
}
|
|
3633
|
+
// 4. Extract tool calls and add assistant message to conversation thread
|
|
3634
|
+
const message = response.choices?.[0]?.message;
|
|
3635
|
+
let toolCalls = message?.tool_calls;
|
|
3636
|
+
const assistantText = extractAssistantText(message?.content);
|
|
3637
|
+
if (assistantText) {
|
|
3638
|
+
logger.ai(assistantText);
|
|
3639
|
+
}
|
|
3640
|
+
// Always add the assistant's response to keep the conversation coherent
|
|
3641
|
+
conversationMessages.push({
|
|
3642
|
+
role: 'assistant',
|
|
3643
|
+
content: message?.content ?? null,
|
|
3644
|
+
tool_calls: toolCalls,
|
|
3645
|
+
});
|
|
3646
|
+
if (!toolCalls || toolCalls.length === 0) {
|
|
3647
|
+
const inferredAction = assistantText ? inferActionFromAssistantText(assistantText) : null;
|
|
3648
|
+
if (inferredAction) {
|
|
3649
|
+
logger.debug(`No tool calls at iteration ${iteration}; inferred ${inferredAction.name} from assistant text.`);
|
|
3650
|
+
toolCalls = [
|
|
3651
|
+
{
|
|
3652
|
+
id: `synthetic-${iteration}-0`,
|
|
3653
|
+
type: 'function',
|
|
3654
|
+
function: {
|
|
3655
|
+
name: inferredAction.name,
|
|
3656
|
+
arguments: JSON.stringify(inferredAction.args),
|
|
3657
|
+
},
|
|
3658
|
+
},
|
|
3659
|
+
];
|
|
3660
|
+
}
|
|
3661
|
+
else {
|
|
3662
|
+
logger.error(`No tool calls at iteration ${iteration}, skipping. Model: ${modelState.active}. Response: ${assistantText?.slice(0, 400) || '(empty)'}`);
|
|
3663
|
+
continue;
|
|
3664
|
+
}
|
|
3665
|
+
}
|
|
3666
|
+
// 5. Execute each tool call and add results to conversation thread
|
|
3667
|
+
for (const toolCall of toolCalls) {
|
|
3668
|
+
throwIfAborted(config.abortSignal, 'Agent run cancelled.');
|
|
3669
|
+
if (!('function' in toolCall))
|
|
3670
|
+
continue;
|
|
3671
|
+
const name = toolCall.function.name;
|
|
3672
|
+
let args;
|
|
3673
|
+
try {
|
|
3674
|
+
args = JSON.parse(toolCall.function.arguments);
|
|
3675
|
+
}
|
|
3676
|
+
catch {
|
|
3677
|
+
logger.error(`Invalid JSON in tool arguments: ${toolCall.function.arguments}`);
|
|
3678
|
+
conversationMessages.push({
|
|
3679
|
+
role: 'tool',
|
|
3680
|
+
tool_call_id: toolCall.id,
|
|
3681
|
+
content: 'ERROR: Invalid JSON in tool arguments.',
|
|
3682
|
+
});
|
|
3683
|
+
continue;
|
|
3684
|
+
}
|
|
3685
|
+
args = resolveActionCredentialArgs(name, args, config.credentials);
|
|
3686
|
+
if (name === 'type_text'
|
|
3687
|
+
&& config.credentials?.password
|
|
3688
|
+
&& typeof args.text === 'string'
|
|
3689
|
+
&& GENERIC_PASSWORD_INPUT_RE.test(args.text.trim())
|
|
3690
|
+
&& isPasswordFieldTarget(args, pageState.interactiveElements)) {
|
|
3691
|
+
args.text = config.credentials.password;
|
|
3692
|
+
}
|
|
3693
|
+
// Log the agent's reasoning before executing
|
|
3694
|
+
const reasoning = args.reason || args.reasoning || args.assessment;
|
|
3695
|
+
if (reasoning)
|
|
3696
|
+
logger.ai(reasoning);
|
|
3697
|
+
// Enrich log args with element label
|
|
3698
|
+
const logArgs = sanitizeCredentialParams({ ...args }, config.credentials);
|
|
3699
|
+
delete logArgs.reason;
|
|
3700
|
+
if ((name === 'click' || name === 'type_text' || name === 'select_option' || name === 'scroll') && args.index !== undefined) {
|
|
3701
|
+
const el = pageState.interactiveElements.find(e => e.index === args.index);
|
|
3702
|
+
if (el) {
|
|
3703
|
+
const label = el.text || el.ariaLabel || el.inputType || el.tag;
|
|
3704
|
+
if (label)
|
|
3705
|
+
logArgs.elementLabel = label;
|
|
3706
|
+
}
|
|
3707
|
+
}
|
|
3708
|
+
logger.action(iteration, config.maxIterations, name, logArgs, { lang: config.currentLang, theme: config.currentTheme });
|
|
3709
|
+
// --- begin_subgoal (hierarchical working memory + AWM within-run) ---
|
|
3710
|
+
if (name === 'begin_subgoal') {
|
|
3711
|
+
const subgoalName = String(args.name ?? 'step').slice(0, 40);
|
|
3712
|
+
const progressSummary = String(args.progress_summary ?? '').slice(0, 120);
|
|
3713
|
+
// AWM: extract selectors from the CURRENT subgoal only (scoped by subgoalStartIndex)
|
|
3714
|
+
if (currentSubgoal) {
|
|
3715
|
+
const successfulSelectors = Array.from(new Set(actionHistory
|
|
3716
|
+
.slice(subgoalStartIndex)
|
|
3717
|
+
.filter(a => a.success && a.stateChanged && typeof a.params.selector === 'string')
|
|
3718
|
+
.map(a => String(a.params.selector)))).slice(0, 8);
|
|
3719
|
+
if (successfulSelectors.length > 0) {
|
|
3720
|
+
workflowCache.push({ subgoalName: currentSubgoal, actionSummary: progressSummary, selectors: successfulSelectors });
|
|
3721
|
+
}
|
|
3722
|
+
// Archive current subgoal with its summary
|
|
3723
|
+
if (agentNotes.length > 0 || progressSummary) {
|
|
3724
|
+
completedSubgoals.push({ name: currentSubgoal, summary: progressSummary || `${agentNotes.length} note(s)` });
|
|
3725
|
+
}
|
|
3726
|
+
}
|
|
3727
|
+
// Start new subgoal: clear working notes and advance scope index
|
|
3728
|
+
currentSubgoal = subgoalName;
|
|
3729
|
+
agentNotes.length = 0;
|
|
3730
|
+
subgoalStartIndex = actionHistory.length; // scope future AWM extraction to this subgoal
|
|
3731
|
+
// AWM: inject selector hints from a similar past subgoal into the new subgoal's notes
|
|
3732
|
+
const similarWorkflow = findReusableWorkflow(workflowCache, subgoalName);
|
|
3733
|
+
if (similarWorkflow && similarWorkflow.selectors.length > 0) {
|
|
3734
|
+
agentNotes.push(`Selectors that worked for previous subgoal "${similarWorkflow.subgoalName}": ${similarWorkflow.selectors.slice(0, 4).join(', ')}`);
|
|
3735
|
+
}
|
|
3736
|
+
appendActionHistory(actionHistory, config, { iteration, action: 'begin_subgoal', params: args, success: true, outcome: `subgoal_started:${subgoalName}`, stateChanged: false });
|
|
3737
|
+
conversationMessages.push({ role: 'tool', tool_call_id: toolCall.id, content: `Subgoal "${subgoalName}" started. Working memory cleared.` });
|
|
3738
|
+
continue;
|
|
3739
|
+
}
|
|
3740
|
+
// --- note (working memory) ---
|
|
3741
|
+
if (name === 'note') {
|
|
3742
|
+
const content = String(args.content ?? '').slice(0, 120);
|
|
3743
|
+
agentNotes.push(content);
|
|
3744
|
+
appendActionHistory(actionHistory, config, { iteration, action: 'note', params: args, success: true, outcome: 'note_recorded', stateChanged: false });
|
|
3745
|
+
conversationMessages.push({ role: 'tool', tool_call_id: toolCall.id, content: 'Note recorded.' });
|
|
3746
|
+
continue;
|
|
3747
|
+
}
|
|
3748
|
+
// Element index validation: catch hallucinated indices before browser execution
|
|
3749
|
+
if (['click', 'type_text', 'select_option', 'scroll'].includes(name)
|
|
3750
|
+
&& args.index !== undefined
|
|
3751
|
+
&& args.selector === undefined) {
|
|
3752
|
+
const idx = args.index;
|
|
3753
|
+
const exists = pageState.interactiveElements.some(el => el.index === idx);
|
|
3754
|
+
if (!exists) {
|
|
3755
|
+
const visibleIds = pageState.interactiveElements
|
|
3756
|
+
.filter(el => el.visible)
|
|
3757
|
+
.slice(0, 12)
|
|
3758
|
+
.map(el => `[${el.index}]`)
|
|
3759
|
+
.join(' ');
|
|
3760
|
+
const msg = `Element [${idx}] not found in current page. Visible: ${visibleIds}. Use search_text to locate by text, or scroll to reach off-screen elements.`;
|
|
3761
|
+
logger.error(msg);
|
|
3762
|
+
appendActionHistory(actionHistory, config, { iteration, action: name, params: args, success: false, error: `index_not_found:${idx}`, stateChanged: false });
|
|
3763
|
+
conversationMessages.push({ role: 'tool', tool_call_id: toolCall.id, content: msg });
|
|
3764
|
+
continue;
|
|
3765
|
+
}
|
|
3766
|
+
}
|
|
3767
|
+
// Security check
|
|
3768
|
+
const securityDecision = evaluateActionSecurity(name, args, {
|
|
3769
|
+
rootUrl: config.url,
|
|
3770
|
+
currentUrl: browser.currentPage.url(),
|
|
3771
|
+
credentials: config.credentials,
|
|
3772
|
+
interactiveElements: pageState.interactiveElements,
|
|
3773
|
+
currentLang: config.currentLang,
|
|
3774
|
+
currentTheme: config.currentTheme,
|
|
3775
|
+
runMode: config.runMode,
|
|
3776
|
+
currentObjective: config.currentObjective,
|
|
3777
|
+
activeRepairCause: config.activeRepairTicket?.cause ?? null,
|
|
3778
|
+
});
|
|
3779
|
+
if (!securityDecision.allowed) {
|
|
3780
|
+
const targetLabel = describeSecurityTarget(securityDecision.target);
|
|
3781
|
+
const blockMsg = `BLOCKED: ${securityDecision.reason} (${targetLabel})`;
|
|
3782
|
+
logger.error(blockMsg);
|
|
3783
|
+
appendActionHistory(actionHistory, config, { iteration, action: name, params: args, success: false, error: securityDecision.reason });
|
|
3784
|
+
conversationMessages.push({ role: 'tool', tool_call_id: toolCall.id, content: blockMsg });
|
|
3785
|
+
break;
|
|
3786
|
+
}
|
|
3787
|
+
// --- search_text ---
|
|
3788
|
+
if (name === 'search_text') {
|
|
3789
|
+
const query = args.query || '';
|
|
3790
|
+
logger.info(`Recherche de "${query}" sur la page`);
|
|
3791
|
+
const results = await browser.searchText(query);
|
|
3792
|
+
const viewport = browser.currentPage.viewportSize();
|
|
3793
|
+
// Cross-reference search results with interactive elements to provide clickable indices.
|
|
3794
|
+
// When a search result overlaps spatially with an interactive element, include its index
|
|
3795
|
+
// so the agent can `click index=N` instead of using stale coordinates.
|
|
3796
|
+
const interactiveElements = pageState.interactiveElements;
|
|
3797
|
+
const resultText = results.length > 0
|
|
3798
|
+
? results.map((r, i) => {
|
|
3799
|
+
const cx = Math.round(r.boundingBox.x + r.boundingBox.width / 2);
|
|
3800
|
+
const cy = Math.round(r.boundingBox.y + r.boundingBox.height / 2);
|
|
3801
|
+
// Find matching interactive element: same selector, or bounding box overlap
|
|
3802
|
+
const matchingElement = interactiveElements.find(el => {
|
|
3803
|
+
if (el.selector && r.selector && el.selector === r.selector)
|
|
3804
|
+
return true;
|
|
3805
|
+
if (r.container?.selector && el.selector === r.container.selector)
|
|
3806
|
+
return true;
|
|
3807
|
+
// Spatial overlap: search result center is inside interactive element bounds
|
|
3808
|
+
const bb = el.boundingBox;
|
|
3809
|
+
if (bb) {
|
|
3810
|
+
return cx >= bb.x && cx <= bb.x + bb.width && cy >= bb.y && cy <= bb.y + bb.height;
|
|
3811
|
+
}
|
|
3812
|
+
return false;
|
|
3813
|
+
});
|
|
3814
|
+
const indexHint = matchingElement ? ` ⇒ click index=${matchingElement.index}` : '';
|
|
3815
|
+
const flags = [
|
|
3816
|
+
r.clickable ? 'clickable' : 'not-clickable',
|
|
3817
|
+
r.visibilityState === 'full' ? 'fully-visible' : r.visibilityState === 'partial' ? 'partially-visible' : 'off-screen',
|
|
3818
|
+
].join(', ');
|
|
3819
|
+
const hrefInfo = r.href ? ` href="${r.href}"` : '';
|
|
3820
|
+
const selectorInfo = r.selector ? ` sel="${r.selector}"` : '';
|
|
3821
|
+
// For off-screen elements, coordinates will be stale after scrolling.
|
|
3822
|
+
// Guide the agent to scroll first, then use the selector or re-search.
|
|
3823
|
+
const isOffScreen = r.visibilityState !== 'full' && r.visibilityState !== 'partial';
|
|
3824
|
+
let scrollHint = '';
|
|
3825
|
+
if (viewport && r.visibilityState !== 'full') {
|
|
3826
|
+
if (r.boundingBox.y < 0) {
|
|
3827
|
+
scrollHint = ` scroll up about ${Math.abs(r.boundingBox.y) + 24}px first`;
|
|
3828
|
+
}
|
|
3829
|
+
else if (r.boundingBox.y + r.boundingBox.height > viewport.height) {
|
|
3830
|
+
const delta = r.boundingBox.y + r.boundingBox.height - Math.max(48, viewport.height - 96);
|
|
3831
|
+
scrollHint = ` scroll down about ${Math.max(80, Math.round(delta))}px first`;
|
|
3832
|
+
}
|
|
3833
|
+
}
|
|
3834
|
+
const clickTarget = isOffScreen
|
|
3835
|
+
? `(${flags})${scrollHint} — after scrolling, use selector="${r.selector}" or re-search to get updated coordinates`
|
|
3836
|
+
: `→ click x=${cx} y=${cy} (${flags})`;
|
|
3837
|
+
const containerInfo = r.container
|
|
3838
|
+
? `\n ↳ container: <${r.container.tag}> ${r.container.boundingBox.width}x${r.container.boundingBox.height} (${r.container.reason}) sel="${r.container.selector}"`
|
|
3839
|
+
: '';
|
|
3840
|
+
return ` ${i}. <${r.tag}> role="${r.role}" "${r.text}"${hrefInfo}${selectorInfo} ${clickTarget}${indexHint}${containerInfo}`;
|
|
3841
|
+
}).join('\n')
|
|
3842
|
+
: ' (no matches found)';
|
|
3843
|
+
logger.debug(`Search results:\n${resultText}`);
|
|
3844
|
+
const searchResult = results.length === 0
|
|
3845
|
+
? `No elements found matching "${query}". Try a different search term, or scroll to reveal more content.`
|
|
3846
|
+
: `Found ${results.length} match(es). For visible elements: click by x/y coordinates or by selector. For off-screen elements: scroll first, then click by selector or re-search.\n${resultText}`;
|
|
3847
|
+
appendActionHistory(actionHistory, config, { iteration, action: 'search_text', params: { query }, success: results.length > 0, error: searchResult });
|
|
3848
|
+
conversationMessages.push({ role: 'tool', tool_call_id: toolCall.id, content: searchResult });
|
|
3849
|
+
break; // Next iteration will show updated page state
|
|
3850
|
+
}
|
|
3851
|
+
if (name === 'analyze_screenshot') {
|
|
3852
|
+
const reason = args.reason || 'visual check';
|
|
3853
|
+
logger.info('Analyse visuelle de la page…');
|
|
3854
|
+
const freshScreenshot = await browser.takeScreenshotForAI();
|
|
3855
|
+
const visionModel = config.visionModel || modelState.active;
|
|
3856
|
+
const visionResult = await callVisionObserver(client, visionModel, freshScreenshot, pageState, config, ++stepCounter);
|
|
3857
|
+
if (visionResult.usage)
|
|
3858
|
+
usageLog.push(visionResult.usage);
|
|
3859
|
+
const observation = visionResult.observation || 'Unable to analyze screenshot.';
|
|
3860
|
+
logger.debug(`Visual analysis: ${observation.slice(0, 200)}`);
|
|
3861
|
+
appendActionHistory(actionHistory, config, {
|
|
3862
|
+
iteration, action: 'analyze_screenshot', params: { reason },
|
|
3863
|
+
success: !!visionResult.observation, outcome: observation.slice(0, 120),
|
|
3864
|
+
});
|
|
3865
|
+
conversationMessages.push({
|
|
3866
|
+
role: 'tool', tool_call_id: toolCall.id,
|
|
3867
|
+
content: `Visual analysis:\n${observation}`,
|
|
3868
|
+
});
|
|
3869
|
+
break;
|
|
3870
|
+
}
|
|
3871
|
+
// --- take_screenshot ---
|
|
3872
|
+
if (name === 'take_screenshot') {
|
|
3873
|
+
throwIfAborted(config.abortSignal, 'Agent run cancelled.');
|
|
3874
|
+
const label = args.label || `Screenshot ${workflowScreenshots.length + 1}`;
|
|
3875
|
+
const assessment = args.assessment || '';
|
|
3876
|
+
logger.info(`Prise du screenshot : "${label}"`);
|
|
3877
|
+
const screenshotBuffer = await browser.takeScreenshot();
|
|
3878
|
+
workflowScreenshots.push({ index: workflowScreenshots.length, iteration, label, buffer: screenshotBuffer, path: null });
|
|
3879
|
+
appendActionHistory(actionHistory, config, { iteration, action: 'take_screenshot', params: { label, assessment }, success: true });
|
|
3880
|
+
logger.success(`Screenshot #${workflowScreenshots.length} capturé : "${label}"`);
|
|
3881
|
+
// Dynamic replanning: after each milestone, refine the plan for remaining steps
|
|
3882
|
+
// This is the "soft Planner/Executor split" — the planner re-focuses on what's left
|
|
3883
|
+
const replanResult = await callPlanner(client, config.prompt, browser.currentPage.url(), screenshotBuffer, {
|
|
3884
|
+
model: modelState.active,
|
|
3885
|
+
fallbackModel: config.fallbackModel,
|
|
3886
|
+
visionModel: config.visionModel,
|
|
3887
|
+
lang: config.currentLang,
|
|
3888
|
+
theme: config.currentTheme,
|
|
3889
|
+
currentObjective: config.currentObjective,
|
|
3890
|
+
captureCursorSummary: summarizeCaptureCursorForPlanner(config),
|
|
3891
|
+
repairTicketSummary: summarizeRepairTicketForPlanner(config),
|
|
3892
|
+
remainingCaptureQueue: config.remainingCaptureQueue,
|
|
3893
|
+
variantManifestSummary: summarizeVariantManifestForPlanner(config.variantManifest),
|
|
3894
|
+
completedMilestones: workflowScreenshots.length,
|
|
3895
|
+
signal: config.abortSignal,
|
|
3896
|
+
uploadImage: config.uploadImage,
|
|
3897
|
+
stepCounter: ++stepCounter,
|
|
3898
|
+
providerPreferences: config.providerPreferences,
|
|
3899
|
+
}).catch(() => ({ plan: null, usage: null }));
|
|
3900
|
+
if (replanResult.usage)
|
|
3901
|
+
usageLog.push(replanResult.usage);
|
|
3902
|
+
if (replanResult.model && replanResult.model !== modelState.active) {
|
|
3903
|
+
modelState.active = replanResult.model;
|
|
3904
|
+
}
|
|
3905
|
+
if (replanResult.plan) {
|
|
3906
|
+
taskPlan = replanResult.plan;
|
|
3907
|
+
logger.info(`Plan affiné après le screenshot #${workflowScreenshots.length} :\n${replanResult.plan}`);
|
|
3908
|
+
}
|
|
3909
|
+
conversationMessages.push({ role: 'tool', tool_call_id: toolCall.id, content: `Workflow screenshot "${label}" captured successfully. Continue with the next step.` });
|
|
3910
|
+
break;
|
|
3911
|
+
}
|
|
3912
|
+
// --- ready_to_capture ---
|
|
3913
|
+
if (name === 'ready_to_capture') {
|
|
3914
|
+
throwIfAborted(config.abortSignal, 'Agent run cancelled.');
|
|
3915
|
+
const assessment = args.assessment || 'Ready';
|
|
3916
|
+
const forceFlag = args.force === true;
|
|
3917
|
+
const expectsDialogTarget = config.variantManifest?.currentPageIdentity?.dialogTarget === true;
|
|
3918
|
+
const expectsGalleryDetail = config.variantManifest?.currentPageIdentity?.kind === 'gallery'
|
|
3919
|
+
&& GALLERY_DETAIL_PROMPT_RE.test(`${config.variantManifest?.currentPageId || ''} ${config.prompt}`.toLowerCase());
|
|
3920
|
+
// Skip the deterministic dialog check when:
|
|
3921
|
+
// 1. The agent explicitly sets force=true (dialog is the intended target)
|
|
3922
|
+
// 2. The dialog check has already failed 2+ times consecutively (auto-bypass to vision)
|
|
3923
|
+
// 3. The current capture intentionally targets a dialog/modal
|
|
3924
|
+
// 4. The gallery prompt expects a detail view (which opens as a dialog)
|
|
3925
|
+
// 5. The run is a language-only preflight
|
|
3926
|
+
const skipDialogCheck = forceFlag
|
|
3927
|
+
|| consecutiveDialogFailures >= 2
|
|
3928
|
+
|| expectsDialogTarget
|
|
3929
|
+
|| expectsGalleryDetail
|
|
3930
|
+
|| config.runMode === 'language_preflight';
|
|
3931
|
+
const verification = await verifyScreenshot(client, config, modelState, browser, assessment, ++stepCounter, {
|
|
3932
|
+
skipDialogCheck,
|
|
3933
|
+
allowDeterministicSuccess: !forceFlag && !config.variantManifest?.currentPageId,
|
|
3934
|
+
});
|
|
3935
|
+
const { verified, reason, usage: verifyUsage, fatal } = verification;
|
|
3936
|
+
lastVerificationResult = verification;
|
|
3937
|
+
if (verifyUsage)
|
|
3938
|
+
usageLog.push(verifyUsage);
|
|
3939
|
+
if (fatal) {
|
|
3940
|
+
conversationMessages.push({ role: 'tool', tool_call_id: toolCall.id, content: `Verification error: ${reason || 'Fatal error'}` });
|
|
3941
|
+
return {
|
|
3942
|
+
success: false, screenshotPath: null, screenshots: workflowScreenshots, iterations: iteration,
|
|
3943
|
+
actions: actionHistory, assessment: reason || 'Verification failed',
|
|
3944
|
+
diagnostic: { screenshot: pageState.screenshot, url: browser.currentPage.url(), interactiveElements: pageState.interactiveElements, accessibilityTreeSnippet: pageState.accessibilityTree.slice(0, 5000), giveUpReason: reason || 'Verification failed' },
|
|
3945
|
+
usage: usageLog, runtimeStrategy: 'full_llm', deterministicRecoveryUsed: usedDeterministicRecovery, evaluatorUsed: false,
|
|
3946
|
+
verification,
|
|
3947
|
+
};
|
|
3948
|
+
}
|
|
3949
|
+
if (verified) {
|
|
3950
|
+
consecutiveDialogFailures = 0;
|
|
3951
|
+
consecutiveVerificationFailures = 0;
|
|
3952
|
+
consecutiveTechnicalVerificationFailures = 0;
|
|
3953
|
+
conversationMessages.push({ role: 'tool', tool_call_id: toolCall.id, content: 'Verification passed. Workflow complete.' });
|
|
3954
|
+
getPostHog().capture({
|
|
3955
|
+
distinctId: config.analyticsId ?? DISTINCT_ID,
|
|
3956
|
+
event: 'agent_run_succeeded',
|
|
3957
|
+
properties: {
|
|
3958
|
+
url: config.url, model: modelState.active, theme: config.currentTheme, lang: config.currentLang,
|
|
3959
|
+
iterations: iteration, workflow_screenshots: workflowScreenshots.length,
|
|
3960
|
+
total_tokens: usageLog.reduce((s, u) => s + (u.totalTokens ?? 0), 0),
|
|
3961
|
+
cache_read_tokens: usageLog.reduce((s, u) => s + (u.cacheReadTokens ?? 0), 0),
|
|
3962
|
+
cache_hit_rate: (() => {
|
|
3963
|
+
const prompt = usageLog.reduce((s, u) => s + (u.promptTokens ?? 0), 0);
|
|
3964
|
+
const cached = usageLog.reduce((s, u) => s + (u.cacheReadTokens ?? 0), 0);
|
|
3965
|
+
return prompt > 0 ? Math.round((cached / prompt) * 100) : 0;
|
|
3966
|
+
})(),
|
|
3967
|
+
},
|
|
3968
|
+
});
|
|
3969
|
+
return {
|
|
3970
|
+
success: true, screenshotPath: null, screenshots: workflowScreenshots, iterations: iteration,
|
|
3971
|
+
actions: actionHistory, assessment, usage: usageLog, runtimeStrategy: 'full_llm',
|
|
3972
|
+
deterministicRecoveryUsed: usedDeterministicRecovery, evaluatorUsed: false,
|
|
3973
|
+
verification,
|
|
3974
|
+
};
|
|
3975
|
+
}
|
|
3976
|
+
lastVerificationFailure = reason || 'Verification failed';
|
|
3977
|
+
consecutiveVerificationFailures++;
|
|
3978
|
+
if (isTechnicalVerificationFailureReason(lastVerificationFailure)) {
|
|
3979
|
+
consecutiveTechnicalVerificationFailures++;
|
|
3980
|
+
}
|
|
3981
|
+
else {
|
|
3982
|
+
consecutiveTechnicalVerificationFailures = 0;
|
|
3983
|
+
}
|
|
3984
|
+
// Bail-out: if verification has failed for technical reasons too many times in a row, auto-accept the capture.
|
|
3985
|
+
// This prevents infinite loops when the verification model consistently fails to return
|
|
3986
|
+
// structured decisions despite the page being visually correct.
|
|
3987
|
+
const VERIFICATION_BAILOUT_THRESHOLD = 3;
|
|
3988
|
+
if (consecutiveTechnicalVerificationFailures >= VERIFICATION_BAILOUT_THRESHOLD) {
|
|
3989
|
+
logger.info('Vérification ignorée après plusieurs échecs techniques — capture acceptée automatiquement.');
|
|
3990
|
+
consecutiveVerificationFailures = 0;
|
|
3991
|
+
consecutiveTechnicalVerificationFailures = 0;
|
|
3992
|
+
consecutiveDialogFailures = 0;
|
|
3993
|
+
appendActionHistory(actionHistory, config, { iteration, action: 'ready_to_capture', params: args, success: true, outcome: `Auto-accepted after ${VERIFICATION_BAILOUT_THRESHOLD} consecutive verification failures` });
|
|
3994
|
+
conversationMessages.push({ role: 'tool', tool_call_id: toolCall.id, content: 'Verification passed (bail-out after repeated failures). Workflow complete.' });
|
|
3995
|
+
return {
|
|
3996
|
+
success: true, screenshotPath: null, screenshots: workflowScreenshots, iterations: iteration,
|
|
3997
|
+
actions: actionHistory, assessment, usage: usageLog, runtimeStrategy: 'full_llm',
|
|
3998
|
+
deterministicRecoveryUsed: usedDeterministicRecovery, evaluatorUsed: false,
|
|
3999
|
+
verification: { ...verification, verified: true, mode: 'bailout' },
|
|
4000
|
+
};
|
|
4001
|
+
}
|
|
4002
|
+
// Track consecutive dialog-related failures so we auto-bypass on the next attempt
|
|
4003
|
+
if (reason && reason.includes('Blocking dialog')) {
|
|
4004
|
+
consecutiveDialogFailures++;
|
|
4005
|
+
}
|
|
4006
|
+
else {
|
|
4007
|
+
consecutiveDialogFailures = 0;
|
|
4008
|
+
}
|
|
4009
|
+
appendActionHistory(actionHistory, config, { iteration, action: 'ready_to_capture', params: args, success: false, error: `Verification failed: ${lastVerificationFailure}` });
|
|
4010
|
+
const hint = reason && reason.includes('Blocking dialog')
|
|
4011
|
+
? ` A dialog/modal is blocking the page. Close it first by pressing Escape or clicking outside. If the dialog/modal IS the intended capture target mentioned in the task, call ready_to_capture again with force=true.`
|
|
4012
|
+
: verification.duplicateOfPageId
|
|
4013
|
+
? ` The current screen duplicates page "${verification.duplicateOfPageId}". Reach the distinct page/state for "${config.variantManifest?.currentPageId ?? 'current'}" before capturing.`
|
|
4014
|
+
: '';
|
|
4015
|
+
// After 2+ consecutive verification failures, regenerate the plan from current state
|
|
4016
|
+
// so the agent gets fresh direction instead of retrying the same approach.
|
|
4017
|
+
if (consecutiveVerificationFailures === 2 && iteration - lastReplanIteration >= 3) {
|
|
4018
|
+
lastReplanIteration = iteration;
|
|
4019
|
+
const replanScreenshot = await browser.takeScreenshot();
|
|
4020
|
+
const verificationReplan = await callPlanner(client, config.prompt, browser.currentPage.url(), replanScreenshot, {
|
|
4021
|
+
model: modelState.active,
|
|
4022
|
+
fallbackModel: config.fallbackModel,
|
|
4023
|
+
visionModel: config.visionModel,
|
|
4024
|
+
lang: config.currentLang,
|
|
4025
|
+
theme: config.currentTheme,
|
|
4026
|
+
currentObjective: config.currentObjective,
|
|
4027
|
+
captureCursorSummary: summarizeCaptureCursorForPlanner(config),
|
|
4028
|
+
remainingCaptureQueue: config.remainingCaptureQueue,
|
|
4029
|
+
variantManifestSummary: summarizeVariantManifestForPlanner(config.variantManifest),
|
|
4030
|
+
completedMilestones: workflowScreenshots.length || 1,
|
|
4031
|
+
signal: config.abortSignal,
|
|
4032
|
+
uploadImage: config.uploadImage,
|
|
4033
|
+
stepCounter: ++stepCounter,
|
|
4034
|
+
providerPreferences: config.providerPreferences,
|
|
4035
|
+
}).catch(() => ({ plan: null, usage: null }));
|
|
4036
|
+
if (verificationReplan.usage)
|
|
4037
|
+
usageLog.push(verificationReplan.usage);
|
|
4038
|
+
if (verificationReplan.model && verificationReplan.model !== modelState.active) {
|
|
4039
|
+
modelState.active = verificationReplan.model;
|
|
4040
|
+
}
|
|
4041
|
+
if (verificationReplan.plan) {
|
|
4042
|
+
taskPlan = verificationReplan.plan;
|
|
4043
|
+
logger.info(`Nouveau plan d'action après échec de vérification :\n${verificationReplan.plan}`);
|
|
4044
|
+
}
|
|
4045
|
+
}
|
|
4046
|
+
const loopHint = consecutiveTechnicalVerificationFailures >= 2
|
|
4047
|
+
? ' IMPORTANT: Verification has failed multiple times due to technical validator issues. If the page is visually correct, you may retry ready_to_capture with force=true.'
|
|
4048
|
+
: '';
|
|
4049
|
+
conversationMessages.push({ role: 'tool', tool_call_id: toolCall.id, content: `Verification failed: ${lastVerificationFailure}.${hint}${loopHint} Fix the issues (overlays, spinners, wrong page) and call ready_to_capture again.` });
|
|
4050
|
+
break;
|
|
4051
|
+
}
|
|
4052
|
+
// --- give_up ---
|
|
4053
|
+
if (name === 'give_up') {
|
|
4054
|
+
const reason = args.reason || 'Unknown reason';
|
|
4055
|
+
if (config.credentials?.password && HIDDEN_PASSWORD_GIVE_UP_RE.test(reason)) {
|
|
4056
|
+
const correction = 'The password is available in credentials but intentionally hidden from the prompt. Continue with the email/password login flow.';
|
|
4057
|
+
logger.debug(`Rejecting incorrect give_up: ${correction}`);
|
|
4058
|
+
appendActionHistory(actionHistory, config, { iteration, action: 'wait', params: { reason: 'hidden_password_available' }, success: false, error: `INVALID_GIVE_UP: ${reason}. ${correction}` });
|
|
4059
|
+
conversationMessages.push({ role: 'tool', tool_call_id: toolCall.id, content: `REJECTED: ${correction}` });
|
|
4060
|
+
break;
|
|
4061
|
+
}
|
|
4062
|
+
const prematureGiveUpCorrection = rejectedGiveUps < 2
|
|
4063
|
+
? inferPrematureGiveUpCorrection({
|
|
4064
|
+
reason,
|
|
4065
|
+
actionHistory,
|
|
4066
|
+
lastVerificationFailure,
|
|
4067
|
+
iteration,
|
|
4068
|
+
maxIterations: config.maxIterations,
|
|
4069
|
+
})
|
|
4070
|
+
: null;
|
|
4071
|
+
if (prematureGiveUpCorrection) {
|
|
4072
|
+
rejectedGiveUps += 1;
|
|
4073
|
+
logger.debug(`Rejecting premature give_up: ${prematureGiveUpCorrection}`);
|
|
4074
|
+
appendActionHistory(actionHistory, config, {
|
|
4075
|
+
iteration,
|
|
4076
|
+
action: 'wait',
|
|
4077
|
+
params: { reason: 'premature_give_up_rejected' },
|
|
4078
|
+
success: false,
|
|
4079
|
+
error: `INVALID_GIVE_UP: ${reason}. ${prematureGiveUpCorrection}`,
|
|
4080
|
+
});
|
|
4081
|
+
conversationMessages.push({ role: 'tool', tool_call_id: toolCall.id, content: `REJECTED: ${prematureGiveUpCorrection}` });
|
|
4082
|
+
break;
|
|
4083
|
+
}
|
|
4084
|
+
logger.error(`L'agent a abandonné : ${reason}`);
|
|
4085
|
+
let diagnostic;
|
|
4086
|
+
try {
|
|
4087
|
+
const diagScreenshot = await browser.takeScreenshot();
|
|
4088
|
+
diagnostic = { screenshot: diagScreenshot, url: browser.currentPage.url(), interactiveElements: pageState.interactiveElements, accessibilityTreeSnippet: pageState.accessibilityTree.slice(0, 5000), giveUpReason: reason };
|
|
4089
|
+
}
|
|
4090
|
+
catch {
|
|
4091
|
+
logger.error('Failed to capture diagnostic state');
|
|
4092
|
+
}
|
|
4093
|
+
conversationMessages.push({ role: 'tool', tool_call_id: toolCall.id, content: 'Acknowledged.' });
|
|
4094
|
+
getPostHog().capture({
|
|
4095
|
+
distinctId: config.analyticsId ?? DISTINCT_ID,
|
|
4096
|
+
event: 'agent_gave_up',
|
|
4097
|
+
properties: {
|
|
4098
|
+
url: config.url, model: modelState.active, theme: config.currentTheme, lang: config.currentLang,
|
|
4099
|
+
reason, iterations: iteration,
|
|
4100
|
+
total_tokens: usageLog.reduce((s, u) => s + (u.totalTokens ?? 0), 0),
|
|
4101
|
+
cache_read_tokens: usageLog.reduce((s, u) => s + (u.cacheReadTokens ?? 0), 0),
|
|
4102
|
+
cache_hit_rate: (() => {
|
|
4103
|
+
const prompt = usageLog.reduce((s, u) => s + (u.promptTokens ?? 0), 0);
|
|
4104
|
+
const cached = usageLog.reduce((s, u) => s + (u.cacheReadTokens ?? 0), 0);
|
|
4105
|
+
return prompt > 0 ? Math.round((cached / prompt) * 100) : 0;
|
|
4106
|
+
})(),
|
|
4107
|
+
},
|
|
4108
|
+
});
|
|
4109
|
+
return {
|
|
4110
|
+
success: false, screenshotPath: null, screenshots: workflowScreenshots, iterations: iteration,
|
|
4111
|
+
actions: actionHistory, assessment: reason, diagnostic, usage: usageLog, runtimeStrategy: 'full_llm',
|
|
4112
|
+
deterministicRecoveryUsed: usedDeterministicRecovery, evaluatorUsed: false,
|
|
4113
|
+
verification: lastVerificationResult,
|
|
4114
|
+
};
|
|
4115
|
+
}
|
|
4116
|
+
// --- OAuth guard ---
|
|
4117
|
+
if (name === 'click' && hasCredentials) {
|
|
4118
|
+
let blockedText = null;
|
|
4119
|
+
if (args.index !== undefined) {
|
|
4120
|
+
const targetEl = pageState.interactiveElements.find(el => el.index === args.index);
|
|
4121
|
+
if (targetEl && isOAuthElement(targetEl))
|
|
4122
|
+
blockedText = targetEl.text;
|
|
4123
|
+
}
|
|
4124
|
+
if (!blockedText && args.selector) {
|
|
4125
|
+
const sel = args.selector.toLowerCase();
|
|
4126
|
+
const oauthEl = pageState.interactiveElements.find(el => isOAuthElement(el) && el.selector.toLowerCase() === sel);
|
|
4127
|
+
if (oauthEl)
|
|
4128
|
+
blockedText = oauthEl.text;
|
|
4129
|
+
}
|
|
4130
|
+
if (!blockedText && args.x !== undefined && args.y !== undefined) {
|
|
4131
|
+
const x = args.x;
|
|
4132
|
+
const y = args.y;
|
|
4133
|
+
const oauthEl = pageState.interactiveElements.find(el => {
|
|
4134
|
+
if (!isOAuthElement(el) || !el.boundingBox)
|
|
4135
|
+
return false;
|
|
4136
|
+
const bb = el.boundingBox;
|
|
4137
|
+
return x >= bb.x && x <= bb.x + bb.width && y >= bb.y && y <= bb.y + bb.height;
|
|
4138
|
+
});
|
|
4139
|
+
if (oauthEl)
|
|
4140
|
+
blockedText = oauthEl.text;
|
|
4141
|
+
}
|
|
4142
|
+
if (blockedText) {
|
|
4143
|
+
const blockMsg = `BLOCKED: OAuth element "${blockedText}". Look for the email/password login option instead.`;
|
|
4144
|
+
logger.error(`BLOCKED: click on OAuth element "${blockedText}" — use email login instead`);
|
|
4145
|
+
appendActionHistory(actionHistory, config, { iteration, action: 'click', params: args, success: false, error: blockMsg });
|
|
4146
|
+
conversationMessages.push({ role: 'tool', tool_call_id: toolCall.id, content: blockMsg });
|
|
4147
|
+
break;
|
|
4148
|
+
}
|
|
4149
|
+
}
|
|
4150
|
+
// --- Regular browser actions ---
|
|
4151
|
+
const repeatedActionGuard = inferRepeatedActionGuard({
|
|
4152
|
+
actionHistory,
|
|
4153
|
+
action: name,
|
|
4154
|
+
args,
|
|
4155
|
+
currentUrl: browser.currentPage.url(),
|
|
4156
|
+
});
|
|
4157
|
+
if (repeatedActionGuard) {
|
|
4158
|
+
logger.debug(`Blocking repeated ineffective action: ${name}`);
|
|
4159
|
+
appendActionHistory(actionHistory, config, {
|
|
4160
|
+
iteration,
|
|
4161
|
+
action: name,
|
|
4162
|
+
params: args,
|
|
4163
|
+
success: false,
|
|
4164
|
+
error: repeatedActionGuard,
|
|
4165
|
+
stateChanged: false,
|
|
4166
|
+
});
|
|
4167
|
+
conversationMessages.push({ role: 'tool', tool_call_id: toolCall.id, content: repeatedActionGuard });
|
|
4168
|
+
break;
|
|
4169
|
+
}
|
|
4170
|
+
const execResult = await executeAction(browser, name, args);
|
|
4171
|
+
throwIfAborted(config.abortSignal, 'Agent run cancelled.');
|
|
4172
|
+
// Store in action history (for replay and telemetry)
|
|
4173
|
+
const storedParams = sanitizeCredentialParams({ ...args }, config.credentials);
|
|
4174
|
+
delete storedParams.reason;
|
|
4175
|
+
if (name === 'click' || name === 'type_text' || name === 'select_option' || name === 'scroll') {
|
|
4176
|
+
const resolvedElement = args.index !== undefined
|
|
4177
|
+
? pageState.interactiveElements.find((element) => element.index === args.index)
|
|
4178
|
+
: (args.x !== undefined && args.y !== undefined)
|
|
4179
|
+
? findElementForPoint(pageState.interactiveElements, args.x, args.y)
|
|
4180
|
+
: null;
|
|
4181
|
+
if (resolvedElement) {
|
|
4182
|
+
if (!containsInternalAutomationSelector(resolvedElement.selector)) {
|
|
4183
|
+
storedParams.selector = resolvedElement.selector;
|
|
4184
|
+
}
|
|
4185
|
+
else {
|
|
4186
|
+
delete storedParams.selector;
|
|
4187
|
+
}
|
|
4188
|
+
if (resolvedElement.href)
|
|
4189
|
+
storedParams.href = resolvedElement.href;
|
|
4190
|
+
const label = resolvedElement.text || resolvedElement.ariaLabel || resolvedElement.inputType || resolvedElement.tag;
|
|
4191
|
+
if (label)
|
|
4192
|
+
storedParams.elementLabel = label;
|
|
4193
|
+
// Store structural metadata for cross-language replay matching
|
|
4194
|
+
storedParams.elementTag = resolvedElement.tag;
|
|
4195
|
+
storedParams.elementRole = resolvedElement.role;
|
|
4196
|
+
if (resolvedElement.boundingBox) {
|
|
4197
|
+
storedParams.elementCx = Math.round(resolvedElement.boundingBox.x + resolvedElement.boundingBox.width / 2);
|
|
4198
|
+
storedParams.elementCy = Math.round(resolvedElement.boundingBox.y + resolvedElement.boundingBox.height / 2);
|
|
4199
|
+
}
|
|
4200
|
+
delete storedParams.index;
|
|
4201
|
+
delete storedParams.x;
|
|
4202
|
+
delete storedParams.y;
|
|
4203
|
+
}
|
|
4204
|
+
}
|
|
4205
|
+
// Record URLs for replay validation — allows detecting divergence
|
|
4206
|
+
// when the same actions are replayed in a different language/variant.
|
|
4207
|
+
storedParams.preActionUrl = browser.currentPage.url();
|
|
4208
|
+
if (execResult.success && execResult.stateChanged && (name === 'click' || name === 'navigate_to')) {
|
|
4209
|
+
storedParams.postActionUrl = browser.currentPage.url();
|
|
4210
|
+
}
|
|
4211
|
+
appendActionHistory(actionHistory, config, { iteration, action: name, params: storedParams, success: execResult.success, error: execResult.error, outcome: execResult.outcome, stateChanged: execResult.stateChanged });
|
|
4212
|
+
// Add tool result to conversation thread — this is how the LLM learns what happened
|
|
4213
|
+
const toolResultContent = formatToolResult(name, args, execResult, pageState.interactiveElements);
|
|
4214
|
+
conversationMessages.push({ role: 'tool', tool_call_id: toolCall.id, content: toolResultContent });
|
|
4215
|
+
if (!execResult.success) {
|
|
4216
|
+
logger.error(`Échec de l'action : ${execResult.error}`);
|
|
4217
|
+
}
|
|
4218
|
+
const postActionDelayMs = getPostActionDelayMs(name, execResult, {
|
|
4219
|
+
authSubmitAction: isLikelyAuthenticationSubmitAction(name, args),
|
|
4220
|
+
});
|
|
4221
|
+
if (postActionDelayMs > 0) {
|
|
4222
|
+
await browser.wait(postActionDelayMs);
|
|
4223
|
+
}
|
|
4224
|
+
}
|
|
4225
|
+
}
|
|
4226
|
+
// Max iterations exhausted
|
|
4227
|
+
logger.error('Max iterations reached');
|
|
4228
|
+
getPostHog().capture({
|
|
4229
|
+
distinctId: config.analyticsId ?? DISTINCT_ID,
|
|
4230
|
+
event: 'agent_max_iterations_reached',
|
|
4231
|
+
properties: {
|
|
4232
|
+
url: config.url,
|
|
4233
|
+
model: modelState.active,
|
|
4234
|
+
theme: config.currentTheme,
|
|
4235
|
+
lang: config.currentLang,
|
|
4236
|
+
max_iterations: config.maxIterations,
|
|
4237
|
+
total_tokens: usageLog.reduce((s, u) => s + (u.totalTokens ?? 0), 0),
|
|
4238
|
+
cache_read_tokens: usageLog.reduce((s, u) => s + (u.cacheReadTokens ?? 0), 0),
|
|
4239
|
+
cache_hit_rate: (() => {
|
|
4240
|
+
const prompt = usageLog.reduce((s, u) => s + (u.promptTokens ?? 0), 0);
|
|
4241
|
+
const cached = usageLog.reduce((s, u) => s + (u.cacheReadTokens ?? 0), 0);
|
|
4242
|
+
return prompt > 0 ? Math.round((cached / prompt) * 100) : 0;
|
|
4243
|
+
})(),
|
|
4244
|
+
},
|
|
4245
|
+
});
|
|
4246
|
+
return {
|
|
4247
|
+
success: false,
|
|
4248
|
+
screenshotPath: null,
|
|
4249
|
+
screenshots: workflowScreenshots,
|
|
4250
|
+
iterations: config.maxIterations,
|
|
4251
|
+
actions: actionHistory,
|
|
4252
|
+
assessment: 'Max iterations reached without completing the task.',
|
|
4253
|
+
usage: usageLog,
|
|
4254
|
+
runtimeStrategy: 'full_llm',
|
|
4255
|
+
deterministicRecoveryUsed: usedDeterministicRecovery,
|
|
4256
|
+
evaluatorUsed: false,
|
|
4257
|
+
verification: lastVerificationResult,
|
|
4258
|
+
};
|
|
4259
|
+
}
|
|
4260
|
+
/**
 * Names of recorded actions that may be re-executed during an action replay.
 * Terminal and meta actions are intentionally absent from this list.
 */
const REPLAYABLE_ACTIONS = [
    // Current action vocabulary
    'navigate_to',
    'click',
    'type_text',
    'select_option',
    'scroll',
    'press_key',
    'wait',
    'resize_viewport',
    'dismiss_overlays',
    // Legacy aliases, still accepted so that older recordings keep replaying
    'hover',
    'safe_expand',
    'scroll_to_element',
];
|
|
4267
|
+
/**
 * Replay a previous successful capture's recorded actions instead of running the full AI agent.
 * Falls back to a full runAgent if replay fails or verification doesn't pass.
 * Much cheaper than a full agent run: only 1 verification call vs. N agent iterations.
 *
 * Flow:
 *  1. `analyzeReplayCandidate` decides whether the recording can be replayed from the
 *     current browser state; a `skipReason` short-circuits to failure/fallback.
 *  2. Each replayable action is executed in order, with optional pre-/post-action URL checks
 *     driven by `params.preActionUrl` / `params.postActionUrl` stored in the recording.
 *  3. A single `verifyScreenshot` call confirms the resulting page.
 *
 * @param {object} browser - Browser wrapper; this function uses `captureObservation`,
 *   `currentPage.url()`, `currentPage.viewportSize()`, `takeScreenshot`, `navigateTo`,
 *   `getInteractiveElements` and `wait`.
 * @param {object} config - Run configuration; reads `model`, `url`, `prompt`, `credentials`,
 *   `abortSignal`, `analyticsId`, `currentTheme`, `currentLang`, `sessionProfile` and
 *   `variantManifest`.
 * @param {string} apiKey - Key passed to `createClient` and to the fallback `runAgent`.
 * @param {Array<{action: string, params: object}>} recordedActions - Previously recorded actions.
 * @param {{allowFullAgentFallback?: boolean}} [options] - When `allowFullAgentFallback` is
 *   exactly `false`, failures are returned as a replay-only failure result instead of
 *   falling back to `runAgent`. Any other value enables the fallback.
 * @returns {Promise<object>} Result object with `runtimeStrategy` set to `'action_replay'`
 *   (replay success or replay-only failure) or `'action_replay_fallback'` (full agent run).
 * @throws Re-throws any error for which `isAbortError` returns true; `throwIfAborted` is
 *   presumably the source of those when `config.abortSignal` fires — confirm against its definition.
 */
export async function replayAgent(browser, config, apiKey, recordedActions, options = {}) {
    const client = createClient(apiKey);
    const modelState = { active: config.model };
    const usageLog = [];
    const allowFullAgentFallback = options.allowFullAgentFallback !== false;
    // Actions that work regardless of URL (press_key, scroll, wait, dismiss_overlays, ...).
    // They are often used between page transitions, so the pre-action URL check is skipped
    // for them. Built once here rather than on every loop iteration.
    const URL_INDEPENDENT_ACTIONS = new Set(['press_key', 'scroll', 'scroll_to_element', 'wait', 'dismiss_overlays', 'resize_viewport']);

    // Build the replay-only failure result. `extra` carries the optional trailing keys
    // (currently only `verification`) so the key is absent when no verification ran.
    const buildReplayFailure = async (assessment, extra = {}) => ({
        success: false,
        screenshotPath: null,
        screenshots: [],
        iterations: 0,
        actions: recordedActions,
        assessment,
        diagnostic: {
            // Best-effort screenshot: an empty buffer stands in when capture fails.
            screenshot: await browser.takeScreenshot().catch(() => Buffer.alloc(0)),
            url: browser.currentPage.url(),
            interactiveElements: [],
            accessibilityTreeSnippet: '',
            giveUpReason: assessment,
        },
        usage: usageLog,
        runtimeStrategy: 'action_replay',
        deterministicRecoveryUsed: false,
        evaluatorUsed: false,
        ...extra,
    });

    // Return to the target URL if needed, then hand over to the full agent.
    // NOTE(review): the fallback result's `usage` is returned as-is; anything already pushed
    // to this function's `usageLog` (e.g. replay verification usage) is not merged in —
    // confirm whether that is intended.
    const fallBackToFullAgent = async () => {
        if (!urlsRoughlyMatch(config.url, browser.currentPage.url())) {
            await browser.navigateTo(config.url);
        }
        const fallback = await runAgent(browser, config, apiKey);
        return {
            ...fallback,
            runtimeStrategy: 'action_replay_fallback',
        };
    };

    // Observation is optional input for the analysis; a failure degrades to `null`.
    const replayObservation = await browser.captureObservation().catch(() => null);
    const replayAnalysis = analyzeReplayCandidate(recordedActions, {
        currentUrl: browser.currentPage.url(),
        targetUrl: config.url,
        currentViewport: browser.currentPage.viewportSize(),
        isAuthenticated: config.sessionProfile?.authState === 'authenticated',
        currentDialogCount: replayObservation?.dialogCount ?? null,
        pageIdentity: config.variantManifest?.currentPageIdentity ?? null,
    });
    const replayable = replayAnalysis.replayableActions;
    if (replayAnalysis.skipReason) {
        if (!allowFullAgentFallback) {
            logger.error(`Replay skipped (${replayAnalysis.skipReason}), staying in replay-only mode`);
            return buildReplayFailure(`Action replay skipped: ${replayAnalysis.skipReason}`);
        }
        logger.info(`Action replay skipped (${replayAnalysis.skipReason}), falling back to full agent run`);
        return fallBackToFullAgent();
    }
    logger.info(`Action replay: replaying ${replayable.length} actions...`);
    try {
        for (const recorded of replayable) {
            throwIfAborted(config.abortSignal, 'Replay cancelled.');
            // Pre-action URL validation: if the recorded action stored the URL it was
            // executed from, verify the browser is on the same page before attempting it.
            const expectedPreUrl = typeof recorded.params.preActionUrl === 'string' ? recorded.params.preActionUrl : null;
            if (expectedPreUrl && !URL_INDEPENDENT_ACTIONS.has(recorded.action)) {
                const currentUrl = browser.currentPage.url();
                // Checked in both argument orders; a mismatch is reported only when neither matches.
                if (!urlsRoughlyMatch(expectedPreUrl, currentUrl) && !urlsRoughlyMatch(currentUrl, expectedPreUrl)) {
                    throw new Error(`Replay starting URL mismatch for "${recorded.action}": expected ${expectedPreUrl}, browser is on ${currentUrl}`);
                }
            }
            // Only query the page for interactive elements when the action needs an anchor.
            const interactiveElements = replayActionRequiresAnchor(recorded)
                ? await browser.getInteractiveElements().catch(() => [])
                : [];
            const resolvedReplayAction = resolveReplayActionArgs(recorded, interactiveElements);
            if (!resolvedReplayAction.args) {
                throw new Error(resolvedReplayAction.reason ?? `Action "${recorded.action}" cannot be replayed on the current page`);
            }
            const resolvedReplayArgs = resolveActionCredentialArgs(recorded.action, resolvedReplayAction.args, config.credentials);
            // Force clicks during replay to bypass pointer-event interception
            // (e.g., after dark theme switch, <html class="dark"> may briefly intercept events)
            if (recorded.action === 'click' || recorded.action === 'safe_expand') {
                resolvedReplayArgs.__forceClick = true;
            }
            const result = await executeAction(browser, recorded.action, resolvedReplayArgs);
            throwIfAborted(config.abortSignal, 'Replay cancelled.');
            if (!result.success) {
                throw new Error(`Action "${recorded.action}" failed: ${result.error}`);
            }
            // Replay needs longer delays than live agent — the live agent adapts to
            // page reactions but replay fires actions blindly. State-changing actions
            // get at least 300 ms to let async operations (page transitions, modals) settle.
            const baseDelay = getPostActionDelayMs(recorded.action, result, {
                authSubmitAction: isLikelyAuthenticationSubmitAction(recorded.action, recorded.params),
            });
            const postActionDelayMs = result.stateChanged ? Math.max(baseDelay, 300) : baseDelay;
            if (postActionDelayMs > 0) {
                await browser.wait(postActionDelayMs);
            }
            // Validate intermediate URL: if the recorded action caused a navigation,
            // verify the replay landed on a matching URL. This catches divergence early
            // (e.g., a translated button click that navigates to a different page).
            const expectedUrl = typeof recorded.params.postActionUrl === 'string' ? recorded.params.postActionUrl : null;
            if (expectedUrl && result.stateChanged) {
                const actualUrl = browser.currentPage.url();
                if (!urlsRoughlyMatch(expectedUrl, actualUrl) && !urlsRoughlyMatch(actualUrl, expectedUrl)) {
                    throw new Error(`Replay URL divergence after "${recorded.action}": expected ${expectedUrl}, got ${actualUrl}`);
                }
            }
        }
    }
    catch (err) {
        // Cancellation must propagate to the caller; it is never treated as a replay failure.
        if (isAbortError(err)) {
            throw err;
        }
        // A thrown value is not guaranteed to be an Error: use `message` when it is a string,
        // otherwise stringify the value so the log/assessment never reads "undefined".
        const replayError = typeof err?.message === 'string' ? err.message : String(err);
        if (!allowFullAgentFallback) {
            logger.error(`Replay failed (${replayError}), staying in replay-only mode`);
            return buildReplayFailure(`Action replay failed: ${replayError}`);
        }
        logger.error(`Replay failed (${replayError}), falling back to full agent run`);
        return fallBackToFullAgent();
    }
    // One verification call to confirm the page state looks correct
    logger.info('Replay done, verifying...');
    throwIfAborted(config.abortSignal, 'Replay cancelled.');
    const replayExpectsGalleryDetail = config.variantManifest?.currentPageIdentity?.kind === 'gallery'
        && GALLERY_DETAIL_PROMPT_RE.test(`${config.variantManifest?.currentPageId || ''} ${config.prompt}`.toLowerCase());
    const verification = await verifyScreenshot(client, config, modelState, browser, 'Page prepared via action replay', 1, {
        // The dialog check is skipped when the page identity marks a dialog target,
        // or when a gallery page is expected to show a detail view.
        skipDialogCheck: config.variantManifest?.currentPageIdentity?.dialogTarget
            || replayExpectsGalleryDetail,
    });
    const { verified, reason, usage: verifyUsage, fatal } = verification;
    if (verifyUsage)
        usageLog.push(verifyUsage);
    if (fatal) {
        // Fatal verification outcomes never trigger the full-agent fallback.
        return buildReplayFailure(reason || 'Replay verification failed', { verification });
    }
    if (verified) {
        logger.success('Action replay verified');
        getPostHog().capture({
            distinctId: config.analyticsId ?? DISTINCT_ID,
            event: 'agent_run_succeeded',
            properties: {
                url: config.url,
                model: modelState.active,
                theme: config.currentTheme,
                lang: config.currentLang,
                iterations: 0,
                replay: true,
                total_tokens: usageLog.reduce((s, u) => s + (u.totalTokens ?? 0), 0),
            },
        });
        return {
            success: true,
            screenshotPath: null,
            screenshots: [],
            iterations: 0,
            actions: recordedActions,
            assessment: 'Captured via action replay',
            usage: usageLog,
            runtimeStrategy: 'action_replay',
            deterministicRecoveryUsed: false,
            evaluatorUsed: false,
            verification,
        };
    }
    // Verification failed — reset and fall back to full agent run
    if (!allowFullAgentFallback) {
        logger.error(`Replay verification failed (${reason}), staying in replay-only mode`);
        return buildReplayFailure(reason || 'Replay verification failed', { verification });
    }
    logger.error(`Replay verification failed (${reason}), falling back to full agent run`);
    return fallBackToFullAgent();
}
|
|
4511
|
+
//# sourceMappingURL=agent.js.map
|