npm - autokap - Versions diffs - 1.0.5 → 1.0.7 - Mend

autokap 1.0.5 → 1.0.7

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (130)

package/assets/chrome/ios-statusbar-comparison-reference.jpg +0 -0
package/assets/chrome/ios-statusbar-dark-reference.jpg +0 -0
package/assets/chrome/ios-statusbar-light-reference.jpg +0 -0
package/assets/devices/ipad-pro-11-m4.json +52 -0
package/assets/devices/iphone-16-pro.json +53 -0
package/assets/devices/macbook-air-13.json +45 -0
package/assets/frames/MacBook Air 13.svg +242 -0
package/assets/frames/Status bar - iPhone.png +0 -0
Menu bar- iPad.png +0 -0
package/assets/frames/iPad Pro M4 11_.png +0 -0
package/assets/frames/iPhone 16 Pro.png +0 -0
package/assets/icons/Cellular Connection.svg +3 -0
package/assets/icons/Union.svg +6 -0
package/assets/icons/Wifi.svg +3 -0
package/assets/icons/battery.svg +5 -0
package/assets/icons/battery_charging.svg +8 -0
package/dist/abort.d.ts +5 -0
package/dist/abort.js +44 -0
package/dist/agent.d.ts +142 -0
package/dist/agent.js +4511 -0
package/dist/billing-operation-logging.d.ts +38 -0
package/dist/billing-operation-logging.js +248 -0
package/dist/browser-bar.d.ts +40 -0
package/dist/browser-bar.js +147 -0
package/dist/browser.d.ts +25 -0
package/dist/browser.js +177 -9
package/dist/capture-alt-text.d.ts +12 -0
package/dist/capture-alt-text.js +51 -0
package/dist/capture-encryption.d.ts +10 -0
package/dist/capture-encryption.js +41 -0
package/dist/capture-language-preflight.d.ts +41 -0
package/dist/capture-language-preflight.js +286 -0
package/dist/capture-llm-page-identity.d.ts +15 -0
package/dist/capture-llm-page-identity.js +116 -0
package/dist/capture-model-resolution.d.ts +9 -0
package/dist/capture-model-resolution.js +21 -0
package/dist/capture-page-identity.d.ts +9 -0
package/dist/capture-page-identity.js +219 -0
package/dist/capture-preset-credentials.d.ts +12 -0
package/dist/capture-preset-credentials.js +57 -0
package/dist/capture-request-plan.d.ts +58 -0
package/dist/capture-request-plan.js +216 -0
package/dist/capture-run-optimizer.d.ts +139 -0
package/dist/capture-run-optimizer.js +848 -0
package/dist/capture-selector-memory.d.ts +26 -0
package/dist/capture-selector-memory.js +327 -0
package/dist/capture-session-profile-encryption.d.ts +2 -0
package/dist/capture-session-profile-encryption.js +22 -0
package/dist/capture-step-timeout.d.ts +10 -0
package/dist/capture-step-timeout.js +30 -0
package/dist/capture-studio-sync.d.ts +22 -0
package/dist/capture-studio-sync.js +166 -0
package/dist/capture-variant-state.d.ts +54 -0
package/dist/capture-variant-state.js +156 -0
package/dist/cli.js +21 -0
package/dist/clip-orchestrator.d.ts +148 -0
package/dist/clip-orchestrator.js +950 -0
package/dist/clip-postprocess.d.ts +42 -0
package/dist/clip-postprocess.js +192 -0
package/dist/cost-logging.d.ts +27 -0
package/dist/cost-logging.js +128 -0
package/dist/credential-templates.d.ts +5 -0
package/dist/credential-templates.js +60 -0
package/dist/element-capture.d.ts +53 -0
package/dist/element-capture.js +766 -0
package/dist/hybrid-navigator.d.ts +138 -0
package/dist/hybrid-navigator.js +468 -0
package/dist/index.d.ts +15 -0
package/dist/index.js +11 -0
package/dist/llm-usage.d.ts +17 -0
package/dist/llm-usage.js +45 -0
package/dist/mockup-html.d.ts +119 -0
package/dist/mockup-html.js +253 -0
package/dist/mockup.d.ts +94 -0
package/dist/mockup.js +608 -0
package/dist/mouse-animation.d.ts +46 -0
package/dist/mouse-animation.js +100 -0
package/dist/overlay-utils.d.ts +14 -0
package/dist/overlay-utils.js +13 -0
package/dist/posthog.d.ts +4 -0
package/dist/posthog.js +26 -0
package/dist/prompt-cache.d.ts +10 -0
package/dist/prompt-cache.js +24 -0
package/dist/prompts.d.ts +167 -0
package/dist/prompts.js +1165 -0
package/dist/remote-browser.d.ts +191 -0
package/dist/remote-browser.js +305 -0
package/dist/security.d.ts +20 -0
package/dist/security.js +569 -0
package/dist/server-capture-runtime.d.ts +123 -0
package/dist/server-capture-runtime.js +638 -0
package/dist/server-credit-usage.d.ts +12 -0
package/dist/server-credit-usage.js +41 -0
package/dist/server-posthog.d.ts +2 -0
package/dist/server-posthog.js +16 -0
package/dist/server-project-webhooks.d.ts +45 -0
package/dist/server-project-webhooks.js +97 -0
package/dist/server-screenshot-watermark.d.ts +7 -0
package/dist/server-screenshot-watermark.js +38 -0
package/dist/session-profile.d.ts +86 -0
package/dist/session-profile.js +1373 -0
package/dist/sf-pro-fonts.d.ts +4 -0
package/dist/sf-pro-fonts.js +7 -0
package/dist/status-bar-l10n.d.ts +14 -0
package/dist/status-bar-l10n.js +177 -0
package/dist/status-bar.d.ts +44 -0
package/dist/status-bar.js +336 -0
package/dist/tools.d.ts +4 -0
package/dist/tools.js +578 -0
package/dist/video-agent.d.ts +143 -0
package/dist/video-agent.js +4783 -0
package/dist/video-observation.d.ts +36 -0
package/dist/video-observation.js +192 -0
package/dist/video-planner.d.ts +12 -0
package/dist/video-planner.js +500 -0
package/dist/video-prompts.d.ts +37 -0
package/dist/video-prompts.js +554 -0
package/dist/video-tools.d.ts +3 -0
package/dist/video-tools.js +59 -0
package/dist/video-variant-state.d.ts +29 -0
package/dist/video-variant-state.js +80 -0
package/dist/vision-model.d.ts +17 -0
package/dist/vision-model.js +74 -0
package/dist/ws-auth.d.ts +20 -0
package/dist/ws-auth.js +67 -0
package/dist/ws-handler.d.ts +10 -0
package/dist/ws-handler.js +1663 -0
package/dist/ws-server.d.ts +9 -0
package/dist/ws-server.js +52 -0
package/package.json +93 -39

package/dist/video-prompts.js ADDED Viewed

@@ -0,0 +1,554 @@
+import fs from 'node:fs';
+/**
+ * Renders a trimmed-down observation snapshot as pretty-printed JSON
+ * (2-space indent).
+ *
+ * Only the first 12 interactive elements are kept, each reduced to eight
+ * descriptive fields, and the page-signal lists are capped (headings 6,
+ * nav labels 8, breadcrumb labels 6, locale hints 8). `coherenceKey` and
+ * `pageIdentity` are emitted as `null` when nullish.
+ *
+ * @param snapshot - Observation snapshot; a falsy value yields `''`.
+ * @returns The JSON text, or an empty string when no snapshot is given.
+ */
+function serializeObservationSnapshot(snapshot) {
+    if (!snapshot) {
+        return '';
+    }
+    const { interactiveElements, pageSignals } = snapshot;
+    // Reduce every element to the fields that are emitted in the JSON.
+    const trimmedElements = interactiveElements
+        .slice(0, 12)
+        .map(({ index, tag, role, text, ariaLabel, href, selector, visible }) => ({
+            index,
+            tag,
+            role,
+            text,
+            ariaLabel,
+            href,
+            selector,
+            visible,
+        }));
+    const payload = {
+        coherenceKey: snapshot.coherenceKey ?? null,
+        pageIdentity: snapshot.pageIdentity ?? null,
+        pageSignals: {
+            url: pageSignals.url,
+            title: pageSignals.title,
+            htmlLang: pageSignals.htmlLang,
+            headings: pageSignals.headings.slice(0, 6),
+            navLabels: pageSignals.navLabels.slice(0, 8),
+            breadcrumbLabels: pageSignals.breadcrumbLabels.slice(0, 6),
+            localeHints: pageSignals.localeHints.slice(0, 8),
+            detectedTheme: pageSignals.detectedTheme,
+        },
+        interactiveElements: trimmedElements,
+    };
+    return JSON.stringify(payload, null, 2);
+}
+/**
+ * Assembles the content parts for a prompt: one text part and, when
+ * `params.imageUrl` is truthy, one low-detail `image_url` part.
+ *
+ * @param params - Reads `text`, `imageUrl` and `cacheLayoutV2`.
+ * @returns `[text]` without an image; otherwise `[text, image]` when
+ *   `cacheLayoutV2` is truthy and `[image, text]` when it is not.
+ */
+export function buildVideoPromptContentParts(params) {
+    const text = { type: 'text', text: params.text };
+    if (!params.imageUrl) {
+        return [text];
+    }
+    const image = {
+        type: 'image_url',
+        image_url: { url: params.imageUrl, detail: 'low' },
+    };
+    if (params.cacheLayoutV2) {
+        return [text, image];
+    }
+    return [image, text];
+}
+// ── Step fixer prompts ────────────────────────────────────────────────
+/**
+ * Builds the system prompt for the step fixer: instructions for turning one
+ * failed demo step into one or more replacement steps.
+ *
+ * The "Each step schema" line lists every field the allowed step types can
+ * carry. Besides the interaction fields it names `url` (needed by
+ * `navigate`), `recordingIntent` (which the prompt asks the model to
+ * preserve) and the assertion fields (`urlPattern`, `scopeSelector`,
+ * `matchMode`, `state`, `pageExpectation`, `timeoutMs`), so the advertised
+ * schema agrees with the allowed step types.
+ *
+ * @param videoScript - Overall demo goal, interpolated verbatim under
+ *   "## Overall demo goal".
+ * @returns The complete prompt text.
+ */
+export function buildStepFixerSystemPrompt(videoScript) {
+    return `You are a browser automation step fixer. A step in a video demo plan failed. Your job is to produce a sequence of replacement steps that achieves the same goal.
+## Overall demo goal
+${videoScript}
+## Your task
+Given the failed step, the failure reason, a runtime page observation summary, and a screenshot of the current page, output a JSON object with a \`steps\` array containing one or more replacement steps.
+## Grounding rules — CRITICAL
+- The runtime page observation summary is built from the live DOM/accessibility tree after the failure.
+- The runtime page observation summary is captured from the same verification snapshot as the screenshot unless explicitly noted otherwise.
+- Treat the observation as ground truth for what labels, links, headings, controls, locale hints, and routes are actually present.
+- Reuse hrefs, nav labels, button labels, breadcrumbs, and selectors supported by that observation whenever possible.
+- Do NOT hallucinate a selector or route that is not supported by either the screenshot or the observation summary.
+- NEVER output internal automation selectors such as \`[data-ak-*]\` or \`data-ak-interactive-index\`. Those are observation-only artifacts, not stable runtime selectors.
+## Fix priority — try strategies in this order
+### 0. Remove obstructive overlays first
+If a cookie banner, modal, sticky feedback widget, newsletter popup, or consent wall is blocking the intended interaction or making the frame unusable:
+- Add a \`dismiss_overlays\` step first
+- Then retry the intended action or continue with the next recovery strategy
+- Do NOT hide the product's own chat/assistant widget if the overall demo goal is to show that widget
+### 1. Use direct navigation ONLY for technical preparation steps
+Direct \`navigate\` is allowed only when at least one of these is true:
+- the original failed step was already a \`navigate\`
+- the step is explicitly marked \`recordingIntent: "prepare_only"\`
+- there is no user-visible navigation requirement and the overall goal is just to land on the initial page before the video begins
+If the original step is a visible click/hover/highlight in the actual demo:
+- DO NOT replace it with a direct URL jump
+- instead fix the selector, scroll to reveal the element, open the correct menu, or target a more stable nav element
+- If the goal is a SPECIFIC product/page (for example "iPhone 17e"), the replacement must target that exact product/page. Do NOT downgrade it to a family selector like \`[href*="/iphone/"]\` or a generic "iPhone" nav click unless you also add the missing intermediate step(s).
+### 2. Fix the selector (single step, same type)
+The element exists on the page but the selector was wrong. Look at the screenshot and use a more specific selector:
+- \`:has-text('Exact visible label')\` — most reliable
+- \`[aria-label*='keyword']\`, \`[href*='keyword']\`
+- Fallback chain: \`selector1, selector2, selector3\`
+### 3. Scroll to find the element, then act (two steps)
+The element may be below the fold. Replace the failed step with:
+- A \`scroll\` step (direction "down", amount 400–800px)
+- Then the original action with a corrected selector
+Use this when the demo script mentions scrolling or the target section is not visible in the screenshot.
+### 4. Wait longer then retry (two steps)
+The page was still animating. Replace with:
+- A \`wait\` step (waitMs 2000–4000)
+- Then the original action with the same or corrected selector
+## Key name rule — CRITICAL
+For \`key\` steps, Playwright key names are **case-sensitive**. Always use exact capitalization:
+- ✅ "Enter", "Tab", "Escape", "Backspace", "ArrowDown", "ArrowUp", "Control+A"
+- ❌ "enter", "tab", "escape" — these will throw a runtime error
+## Output format
+Respond with ONLY this JSON:
+{ "steps": [ { ...step1 }, { ...step2 }, ... ] }
+Each step schema: { id, type, description, recordingIntent?, url?, target?, selector?, coordinates?, toTarget?, toSelector?, toCoordinates?, text?, optionLabel?, optionValue?, optionIndex?, direction?, amount?, key?, durationMs?, waitMs?, postStepWaitMs?, expectedPageAfter?, urlPattern?, scopeSelector?, matchMode?, state?, pageExpectation?, timeoutMs? }
+Allowed step types: \`navigate\`, \`dismiss_overlays\`, \`click\`, \`type\`, \`select_option\`, \`scroll\`, \`wait\`, \`hover\`, \`drag\`, \`key\`, \`highlight\`, \`assert_url\`, \`assert_text\`, \`assert_element\`, \`assert_page\`
+Keep the original step \`id\` for the first replacement step. Preserve \`recordingIntent\` and any specific destination contract (\`expectedPageAfter\`) unless you are explicitly replacing it with an equivalent stronger contract. Name additional steps \`id + "-b"\`, \`id + "-c"\`, etc.`;
+}
+/**
+ * Builds the user message that accompanies a failed step.
+ *
+ * The message contains the step as pretty-printed JSON, its recording intent
+ * (`'visible'` when `step.recordingIntent` is nullish), the failure reason,
+ * and - each only when truthy - the verifier suggestion, the runtime page
+ * observation summary and the serialized structured snapshot.
+ *
+ * @param step - The failed step object.
+ * @param failureReason - Why the step failed.
+ * @param suggestion - Optional verifier suggestion.
+ * @param observationSummary - Optional textual page observation.
+ * @param observationSnapshot - Optional snapshot, rendered with
+ *   `serializeObservationSnapshot`.
+ * @returns The assembled message text.
+ */
+export function buildStepFixerUserMessage(step, failureReason, suggestion, observationSummary, observationSnapshot) {
+    // Lines are built in reading order so values are evaluated in the same
+    // sequence as they appear in the message.
+    const lines = [
+        'Failed step:',
+        `${JSON.stringify(step, null, 2)}`,
+        `Recording intent: ${step.recordingIntent ?? 'visible'}`,
+        `Failure reason: ${failureReason}${suggestion ? `\nVerifier suggestion: ${suggestion}` : ''}`,
+    ];
+    const summarySection = observationSummary
+        ? `Runtime page observation:\n${observationSummary}\n`
+        : '';
+    const snapshotSection = observationSnapshot
+        ? `Structured observation snapshot:\n${serializeObservationSnapshot(observationSnapshot)}\n`
+        : '';
+    lines.push(`${summarySection}${snapshotSection}`);
+    lines.push('The screenshot shows the page state after the failure. Based on what you see, output a corrected step JSON.');
+    return lines.join('\n');
+}
+// ── Cursor overlay script injected into every recorded page ──────────
+// SVG files are the single source of truth — edit web/public/cursors/*.svg directly.
+/**
+ * Reads one cursor SVG from disk and returns its markup with surrounding
+ * whitespace trimmed.
+ *
+ * The file is resolved as `../web/public/cursors/<name>.svg` relative to this
+ * module's URL and read synchronously as UTF-8; read errors propagate to the
+ * caller.
+ * NOTE(review): confirm the `web/public/cursors` directory is present
+ * wherever this module is deployed - a missing file would throw here.
+ *
+ * @param name - Base file name of the cursor, without the `.svg` extension.
+ * @returns The trimmed SVG markup.
+ */
+function loadCursorSvg(name) {
+    const svgLocation = new URL(`../web/public/cursors/${name}.svg`, import.meta.url);
+    const markup = fs.readFileSync(svgLocation, 'utf-8');
+    return markup.trim();
+}
+// SVG markup per themed cursor, keyed by theme name ('macos', 'windows').
+// Both files are read once, when this module is evaluated.
+const CURSOR_THEME_SVGS = Object.fromEntries(
+    ['macos', 'windows'].map((themeName) => [themeName, loadCursorSvg(themeName)]),
+);
+/**
+ * JavaScript injected via `context.addInitScript()` to show a visible animated
+ * cursor in Playwright video recordings (the native OS cursor is invisible).
+ *
+ * The generated script adds a fixed-position `#__ak_cursor__` element and a
+ * `#__ak_cursor_click_pulse__` ring, hides the page's own cursor through CSS,
+ * and moves both elements to the pointer position on mouse events. It is
+ * idempotent per document: it returns early when `#__ak_cursor__` exists.
+ *
+ * All mouse listeners are attached to `window` in the capture phase, so a
+ * page handler that calls `stopPropagation()` cannot stop the overlay from
+ * following the pointer.
+ *
+ * @param theme - Cursor theme. `'minimal'` (default) draws the plain round
+ *   cursor; any other value is looked up in `CURSOR_THEME_SVGS` and, when
+ *   markup is found, that SVG is placed inside the cursor element.
+ * @returns The script source text.
+ */
+export function buildCursorOverlayScript(theme = 'minimal') {
+    const cursorSvg = theme === 'minimal' ? null : CURSOR_THEME_SVGS[theme];
+    return `
+(function() {
+  function injectCursor() {
+    if (document.getElementById('__ak_cursor__')) return;
+    const cursorTheme = ${JSON.stringify(theme)};
+    const cursorSvg = ${JSON.stringify(cursorSvg)};
+    const style = document.createElement('style');
+    style.textContent = [
+      '*, *::before, *::after { cursor: none !important; }',
+      '#__ak_cursor__ { mix-blend-mode: normal; }',
+      '#__ak_cursor__ svg { width: 100%; height: 100%; display: block; overflow: visible; }',
+      '#__ak_cursor__[data-theme="macos"], #__ak_cursor__[data-theme="windows"] { width: 28px !important; height: 28px !important; background: transparent !important; border: 0 !important; border-radius: 0 !important; box-shadow: none !important; }',
+      '#__ak_cursor__.__ak_pressed { transform: translate(-50%, -50%) scale(0.72) !important; box-shadow: 0 0 0 8px rgba(37, 99, 235, 0.18), 0 4px 18px rgba(0,0,0,0.28) !important; background: rgba(255,255,255,0.98) !important; border-color: rgba(30,41,59,0.92) !important; }',
+      '#__ak_cursor__[data-theme="macos"].__ak_pressed, #__ak_cursor__[data-theme="windows"].__ak_pressed { background: transparent !important; border-color: transparent !important; box-shadow: none !important; filter: drop-shadow(0 0 0 rgba(37, 99, 235, 0)) drop-shadow(0 0 0 rgba(37, 99, 235, 0)) !important; }',
+      '#__ak_cursor__[data-theme="macos"].__ak_pressed svg, #__ak_cursor__[data-theme="windows"].__ak_pressed svg { filter: drop-shadow(0 0 0.5px rgba(255,255,255,0.9)) drop-shadow(0 0 10px rgba(37, 99, 235, 0.45)); }',
+      '#__ak_cursor_click_pulse__ { position: fixed; width: 20px; height: 20px; border-radius: 9999px; pointer-events: none; z-index: 2147483646; border: 3px solid rgba(37, 99, 235, 0.75); box-shadow: 0 0 0 1px rgba(255,255,255,0.45) inset; transform: translate(-50%, -50%) scale(0.55); opacity: 0; }',
+      '#__ak_cursor_click_pulse__.__ak_active { animation: __ak_cursor_pulse__ 340ms cubic-bezier(0.16, 1, 0.3, 1) forwards; }',
+      '@keyframes __ak_cursor_pulse__ { 0% { opacity: 0.95; transform: translate(-50%, -50%) scale(0.55); } 100% { opacity: 0; transform: translate(-50%, -50%) scale(2.6); } }',
+    ].join('\\n');
+    document.head.appendChild(style);
+    const cursor = document.createElement('div');
+    cursor.id = '__ak_cursor__';
+    cursor.dataset.theme = cursorTheme;
+    cursor.style.cssText = [
+      'position: fixed',
+      'top: -100px',
+      'left: -100px',
+      'width: 20px',
+      'height: 20px',
+      'background: rgba(255, 255, 255, 0.95)',
+      'border: 2.5px solid rgba(0, 0, 0, 0.8)',
+      'border-radius: 50%',
+      'pointer-events: none',
+      'z-index: 2147483647',
+      'transform: translate(-50%, -50%)',
+      'box-shadow: 0 2px 8px rgba(0,0,0,0.4)',
+      'transition: transform 0.09s ease, box-shadow 0.12s ease, background 0.12s ease, border-color 0.12s ease',
+      'will-change: left, top',
+    ].join(';');
+    if (cursorSvg) cursor.innerHTML = cursorSvg;
+    const pulse = document.createElement('div');
+    pulse.id = '__ak_cursor_click_pulse__';
+    document.body.appendChild(cursor);
+    document.body.appendChild(pulse);
+    let pulseResetTimer = null;
+    function setCursorPosition(x, y) {
+      cursor.style.left = x + 'px';
+      cursor.style.top = y + 'px';
+      pulse.style.left = x + 'px';
+      pulse.style.top = y + 'px';
+    }
+    function triggerPulse() {
+      pulse.classList.remove('__ak_active');
+      void pulse.offsetWidth;
+      pulse.classList.add('__ak_active');
+      if (pulseResetTimer) clearTimeout(pulseResetTimer);
+      pulseResetTimer = setTimeout(function() {
+        pulse.classList.remove('__ak_active');
+      }, 380);
+    }
+    // Track mouse position via mousemove (fired by Playwright's page.mouse.move).
+    // Listen on window in the capture phase, like the handlers below, so a page
+    // handler that calls stopPropagation() cannot freeze the overlay.
+    window.addEventListener('mousemove', function(e) {
+      setCursorPosition(e.clientX, e.clientY);
+    }, { passive: true, capture: true });
+    window.addEventListener('mousedown', function(e) {
+      setCursorPosition(e.clientX, e.clientY);
+      cursor.classList.add('__ak_pressed');
+      triggerPulse();
+    }, true);
+    window.addEventListener('mouseup', function(e) {
+      setCursorPosition(e.clientX, e.clientY);
+      cursor.classList.remove('__ak_pressed');
+    }, true);
+    window.addEventListener('click', function(e) {
+      setCursorPosition(e.clientX, e.clientY);
+      triggerPulse();
+    }, true);
+  }
+  // addInitScript runs before DOM is parsed — wait for body to be ready
+  if (document.readyState === 'loading') {
+    document.addEventListener('DOMContentLoaded', injectCursor);
+  } else {
+    injectCursor();
+  }
+})();
+`;
+}
+// ── Video planner prompts ────────────────────────────────────────────
+export function buildVideoPlannerSystemPrompt(options = {}) {
+    const mode = options.mode ?? 'full';
+    const modeInstructions = mode === 'variant_prefix'
+        ? `## Planner mode
+This call is in VARIANT PREFIX mode.
+- Output ONLY the minimal steps required to activate the requested language/theme variant.
+- Do NOT execute the main demo flow.
+- Do NOT open the target product/page/section described by the business script in this mode. That belongs to the base plan, not the prefix plan.
+- End with assertions proving language/theme are active.
+- If the observed live page does NOT already match the requested language/theme, you MUST include at least one concrete activation step before any assertion.
+- Valid activation steps are things like \`navigate\`, \`click\`, \`select_option\`, \`type\`, or \`key\`.
+- A plan made only of \`wait\`, \`scroll\`, \`hover\`, \`highlight\`, \`dismiss_overlays\`, or assertions is INVALID when the requested variant is not yet active.
+- Because variant prefix runs before capture, a direct \`navigate\` to the correct locale/theme URL is allowed and preferred when the observation exposes a reliable localized route (for example via \`hreflang\`, canonical links, locale links, or a stable locale path like \`/fr/\`).`
+        : mode === 'clip'
+            ? `## Planner mode
+This call is in CLIP mode.
+You are producing a plan for a MICRO-CLIP: a very short, looping animation (2–8 seconds) that showcases a single UI interaction.
+STRICT CONSTRAINTS:
+- Maximum 4 steps (excluding dismiss_overlays and assertions). Fewer is better.
+- Target duration: 2–8 seconds total.
+- DO NOT include navigation steps unless the clip specifically needs to start from a different page than the current one.
+- ALWAYS start with a \`dismiss_overlays\` step. Cookie banners, consent walls, and popups ruin clips. This step does NOT count toward the 4-step limit.
+- Prefer postStepWaitMs (500–1500ms) after the main interaction to let animations/transitions play out before the recording ends.
+- All the "Clean-video rules" from the general instructions apply fully to clips.
+CRITICAL RULE — respect user intent, do not embellish:
+- If the script is specific (e.g. "scroll down the page"), follow it literally. Produce exactly the steps described.
+- If the script is vague (e.g. "show the pricing section"), use your judgment to produce the best steps — but still keep it minimal.
+- In ALL cases: never add steps the user did not ask for. No "return to start", "reset", or "cleanup" steps unless explicitly requested.
+- GIF looping is handled by post-processing. Never add steps to close a loop.
+TYPICAL CLIP PATTERNS:
+- Click → reaction: click a button/tab/toggle → show the resulting UI change
+- Hover → reveal: hover an element → show tooltip/dropdown/menu
+- Scroll → reveal: scroll to show a section appearing
+- Type → preview: type in a search box → show suggestions appearing
+- Toggle → state change: flip a switch → show the UI adapting
+OUTPUT: Same JSON format as regular video plans, but respect the step limit.
+- Assume variant switching already happened before this plan starts.`
+            : mode === 'base'
+                ? `## Planner mode
+This call is in BASE PLAN mode.
+- Output ONLY the stable demo flow.
+- Do NOT include language/theme switching steps.
+- Assume variant switching already happened before this plan starts.`
+                : `## Planner mode
+This call is in FULL PLAN mode.
+- Output a complete flow and include variant switching only if needed.`;
+    return `You are a product demo video script analyzer. Your job is to read a user's natural-language video script and convert it into a precise, deterministic JSON execution plan for browser automation.
+${modeInstructions}
+## Output format
+You MUST respond with a single valid JSON object matching this exact structure:
+{
+  "title": "Short title for the video",
+  "estimatedDurationSec": <number>,
+  "startUrl": "https://...",
+  "steps": [
+    {
+      "id": "step-1",
+      "type": "<step type>",
+      "description": "What this step does (shown in UI)",
+      ... (step-specific fields)
+    }
+  ]
+}
+## Step types and their fields
+| type | Required fields | Optional fields |
+|------|----------------|-----------------|
+| navigate | url | postStepWaitMs |
+| dismiss_overlays | - | postStepWaitMs |
+| click | target OR selector OR coordinates | postStepWaitMs |
+| type | target OR selector OR coordinates, text | postStepWaitMs |
+| select_option | target OR selector, and one of optionLabel/optionValue/optionIndex | postStepWaitMs |
+| scroll | direction ("up"/"down"/"left"/"right"), amount (pixels) | target or selector (if set, centers that element in viewport — preferred over pixel amounts), postStepWaitMs |
+| wait | waitMs | - |
+| hover | target OR selector OR coordinates | postStepWaitMs |
+| drag | target OR selector OR coordinates, and toTarget OR toSelector OR toCoordinates | durationMs, postStepWaitMs |
+| key | key (Playwright key name, e.g. "Enter", "Tab", "Control+A") | postStepWaitMs |
+| highlight | target OR coordinates OR selector | postStepWaitMs |
+| assert_url | urlPattern | matchMode ("equals"/"contains"/"regex"), timeoutMs |
+| assert_text | text | scopeSelector, matchMode ("equals"/"contains"/"regex"), timeoutMs |
+| assert_element | target OR selector | state ("visible"/"attached"), timeoutMs |
+| assert_page | pageExpectation | timeoutMs |
+For \`target\` and \`toTarget\`, prefer this structured shape when the page has several similar controls:
+\`{"label":"New preset","labelMatchMode":"exact","role":"button","tag":"button","href":null,"selector":"button:has-text('New preset')","selectorAlternates":["[role='menuitem']:has-text('New preset')"],"containerLabel":"New menu"}\`
+Use \`coordinates\` only as a tie-breaker, never as the sole durable anchor when a label, href, role, or selector exists.
+For \`assert_page\`, use a JSON object like:
+\`{"urlPatterns":["/iphone-17e/"],"titlePatterns":["iPhone 17e"],"textPatterns":["Say hello to a good buy"],"selectors":["main","h1"],"locale":"fr","minConfidence":0.7}\`
+For navigation-causing action steps such as \`navigate\`, \`click\`, \`select_option\`, \`type\`, or \`key\`, you may also add:
+\`"expectedPageAfter": {"urlPatterns":["/iphone-17e/"],"titlePatterns":["iPhone 17e"],"locale":"fr","minConfidence":0.7}\`
+Use this whenever the action must land on a specific destination. It is a runtime guard against clicking the wrong element.
+## Assertion rules — REQUIRED
+- Add assertion steps after every critical transition:
+  - page navigation
+  - language/theme switch
+  - critical UI transition (tab/modal/search/route change)
+- For critical navigation actions, also attach \`expectedPageAfter\` to the action step itself when the destination is specific.
+- Prefer \`assert_page\` for route/locale/page-state verification. It is multi-signal and more robust than a single naive URL substring.
+- Assertions must be deterministic and machine-checkable.
+- Use concrete URL/text/element/page checks, never visual wording like "looks correct".
+## Demo intent coverage
+The user may describe the video as a BUSINESS GOAL, not as low-level browser steps. Convert intent into deterministic browser actions.
+## Grounding rules — CRITICAL
+You will receive an observed page snapshot built from the live DOM/accessibility tree when available.
+- Treat this observation as ground truth for what is currently present on the page.
+- Prefer labels, links, controls, headings, and selectors that are supported by the observed snapshot.
+- If the user asks for something not present in the observation, do NOT hallucinate a selector. Instead use a stable navigation path, search flow, or direct technical preparation step only when allowed by the recording rules.
+- When the observation exposes concrete hrefs, nav labels, locale links, or breadcrumbs, reuse them in the plan.
+- When the observation exposes likely variant controls or storage keys, use that structure to infer how locale/theme switching actually works on this site.
+- In \`variant_prefix\` mode, if the observation still shows the wrong locale/theme, your first job is to change that state, not to assert it.
+Common request categories you must support:
+- Feature reveal: show a hero section, pricing block, testimonial strip, or feature card
+- Product navigation: open a product page, pricing page, integrations page, docs page, or login page
+- Authenticated demo: sign in with provided credentials, then show dashboard/settings/billing/non-destructive admin state
+- Responsive showcase: demonstrate the mobile/tablet/desktop version of the same site in separate recordings
+- Localization/theme demo: switch language or theme and prove the requested variant is active
+- Search/filter exploration: type into a search field, open a filter, reveal results, and stop on a clean stable frame
+- Comparison/slider reveal: use \`drag\` only when a visual before/after slider or handle must be moved to demonstrate the feature
+Translate those requests into the SHORTEST deterministic flow that produces a clean, presentation-ready demo.
+## Selector rules — CRITICAL
+Always use the most specific selector possible to avoid clicking the wrong element.
+1. Prefer structured targets first. When the observation shows several similar elements, emit a structured target object with label, href, role, tag, and alternate selectors instead of relying on one raw selector.
+2. Prefer text based selectors next. Use named buttons, links, inputs, and menu items before generic classes.
+3. Combine text, tag, and container context when several similar elements exist.
+4. Never rely on internal automation attributes or unstable positional selectors as the only anchor.
+3. **Fallback chains with commas**: Provide 2-3 fallback selectors separated by commas when you cannot express all anchors in a \`target\`.
+   - Example: \`a:has-text('iPhone 17e'), [href*=iphone-17e], [aria-label*='iPhone 17e']\`
+4. **Never use positional/generic selectors alone**: \`.nav-item:nth-child(2)\` or \`li:first-child a\` are fragile and likely to select the wrong element.
+5. **Never use internal automation selectors**: \`[data-ak-*]\` selectors are not stable across navigations or browser sessions.
+## Navigation rules — CRITICAL
+**NEVER click hero sliders, auto-rotating carousels, or animated banners.** These elements change content after page load. Clicking them navigates to whichever slide happens to be visible at click time — not the intended target.
+**During the visible recording flow, navigate in this priority order:**
+1. **Stable navigation element**: Use a top-nav link, dropdown, breadcrumb, footer link, or search result that a real user would click.
+2. **Search flow**: Use a visible search or menu flow when that is how a user would naturally find the destination.
+3. **Direct navigate step**: Use \`navigate\` only for the initial landing page or a technical preparation step with \`recordingIntent: "prepare_only"\`. Do NOT use it as a shortcut in the middle of a visible demo if the goal is to show a believable user journey.
+When the user asks for a SPECIFIC product page, use exact product labels/routes/selectors. Do NOT collapse that into a family page selector like \`[href*="/iphone/"]\` if the real target is \`iPhone 17e\` or \`/iphone-17e/\`.
+## Key name rules
+For \`key\` steps, Playwright key names are **case-sensitive**. Always use exact capitalization:
+- ✅ "Enter", "Tab", "Escape", "Backspace", "ArrowDown", "ArrowUp", "Control+A", "Meta+A"
+- ❌ "enter", "tab", "escape" (will throw an error)
+## Timing rules
+- After each **navigate** step: add a **wait** step with \`waitMs: 1200\` to let page animations and lazy-loaded content fully render before proceeding.
+- **postStepWaitMs** for clicks: 500–800ms (enough for the resulting UI change to settle).
+- **postStepWaitMs** for scroll: 500–800ms.
+- **postStepWaitMs** for type: 400–600ms.
+- For pages with complex animations (e-commerce, marketing sites): increase wait steps to 2000ms.
+- Keep the total demo CONCISE — avoid unnecessary waits. A snappy demo is better than a slow one.
+## Clean-video rules — CRITICAL
+- This is a showcase video, not a QA trace. The viewer should only see intentional actions.
+- Use \`dismiss_overlays\` whenever a cookie banner, newsletter modal, consent wall, sticky feedback widget, or unrelated popup blocks the content.
+- Do NOT dismiss or hide the product's own chat/assistant/support widget if the user explicitly wants to demonstrate it.
+- Prefer stable navigation elements. Use direct navigation only for initial landing or hidden preparation.
+- Use \`select_option\` for real dropdown controls instead of brittle click chains when possible.
+- Keep the flow concise. Do not wander through irrelevant UI or perform redundant clicks.
+- Before any important click, make sure the target is fully visible and not clipped by the viewport edge.
+- End on a stable frame: no spinner, no half-open transition, no partially visible target, no obstructive overlay.
+- Never perform destructive or side-effect-heavy actions. Stay read-only except for authentication and harmless search/filter inputs.
+- NEVER plan uploads, file pickers, save/publish flows, or any action that sends or mutates user data.
+- Technical setup steps that should happen before capture may use \`recordingIntent: "prepare_only"\`. These steps are allowed to be efficient and invisible in the final video.
+## Other rules
+- If credentials are available, NEVER put literal secrets in the plan. Use these placeholders instead:
+  - \`{{credential.loginUrl}}\`
+  - \`{{credential.email}}\`
+  - \`{{credential.password}}\`
+- **IDs**: Use simple incrementing IDs: "step-1", "step-2", etc.
+- **recordingIntent**: Omit it for normal visible steps. Use \`"prepare_only"\` only for technical steps that should happen before the video starts.
+- **Descriptions**: Clear, present-tense ("Click the iPhone 17e link", not "Click link").
+- **estimatedDurationSec**: Sum of all waits + ~800ms per animated mouse move + typing time.
+- **Highlight steps**: Insert a \`highlight\` step before clicking to draw the viewer's eye to the target.
+- **startUrl**: Use the provided base URL unless the script specifies a different starting page.
+- **Scroll to a named section**: When the goal is to reveal a specific section or element (e.g. "scroll to the pricing section"), use \`scroll\` with a \`selector\` targeting that element. This centers it perfectly in the viewport. Only use direction+amount for generic scrolling with no specific target. Example: \`{"type":"scroll","selector":"section:has-text('Say hello')","description":"Scroll to the hello section"}\`
+## Example
+For the script: "Go to the homepage, click login, enter the provided credentials, submit"
+Output:
+{
+  "title": "Login flow demo",
+  "estimatedDurationSec": 20,
+  "startUrl": "https://example.com",
+  "steps": [
+    {"id": "step-1", "type": "navigate", "description": "Navigate to homepage", "url": "https://example.com"},
+    {"id": "step-2", "type": "wait", "description": "Wait for page animations to settle", "waitMs": 1200},
+    {"id": "step-3", "type": "highlight", "description": "Highlight the login link", "selector": "a:has-text('Login'), a:has-text('Sign in'), [aria-label*='login']", "postStepWaitMs": 400},
+    {"id": "step-4", "type": "click", "description": "Click the login link", "selector": "a:has-text('Login'), a:has-text('Sign in'), [aria-label*='login']", "postStepWaitMs": 800},
+    {"id": "step-5", "type": "wait", "description": "Wait for login page to load", "waitMs": 1200},
+    {"id": "step-6", "type": "click", "description": "Click the email field", "selector": "input[type=email], input[name=email], #email", "postStepWaitMs": 400},
+    {"id": "step-7", "type": "type", "description": "Type email address", "selector": "input[type=email], input[name=email], #email", "text": "{{credential.email}}", "postStepWaitMs": 500},
+    {"id": "step-8", "type": "click", "description": "Click the password field", "selector": "input[type=password], input[name=password], #password", "postStepWaitMs": 300},
+    {"id": "step-9", "type": "type", "description": "Type password", "selector": "input[type=password], input[name=password], #password", "text": "{{credential.password}}", "postStepWaitMs": 500},
+    {"id": "step-10", "type": "click", "description": "Submit the login form", "selector": "button[type=submit]:has-text('Sign in'), button[type=submit]:has-text('Login'), input[type=submit]", "postStepWaitMs": 800},
+    {"id": "step-11", "type": "assert_page", "description": "Verify the login destination is active", "pageExpectation": {"urlPatterns": ["/dashboard", "/app"], "titlePatterns": ["Dashboard"], "selectors": ["main", "nav"], "minConfidence": 0.65}, "timeoutMs": 6000}
+  ]
+}`;
+}
+/**
+ * Builds the user message sent to the video planner model.
+ *
+ * The message contains, in order: the base URL, the planner mode, the video
+ * script between two "---" delimiter lines, an optional "Variant context"
+ * section, an optional "Observed live page context" section, an optional
+ * "Structured observation snapshot" section, and a closing instruction to
+ * output only the JSON plan.
+ *
+ * @param script Video script text; interpolated verbatim between the delimiters.
+ * @param url Value printed after "Base URL:".
+ * @param options Optional settings; defaults to an empty object.
+ *   - mode: printed after "Planner mode:"; falls back to "full" when null or undefined.
+ *   - variant: optional lang, theme, langInstructions and themeInstructions;
+ *     each truthy field adds one entry to the variant section (instructions are trimmed).
+ *   - credentials: loginUrl, email and password are only tested for truthiness.
+ *   - observationSummary: included as-is when truthy.
+ *   - observationSnapshot: when truthy, rendered through serializeObservationSnapshot.
+ * @returns The assembled prompt string.
+ */export function buildVideoPlannerUserMessage(script, url, options = {}) {
+    const mode = options.mode ?? 'full'; // only null/undefined fall back to the default
+    const variant = options.variant;
+    const variantLines = [
+        variant?.lang ? `Requested language: ${variant.lang}` : '',
+        variant?.theme ? `Requested theme: ${variant.theme}` : '',
+        variant?.langInstructions?.trim() ? `Language switch instructions:\n${variant.langInstructions.trim()}` : '',
+        variant?.themeInstructions?.trim() ? `Theme switch instructions:\n${variant.themeInstructions.trim()}` : '',
+        options.credentials?.loginUrl ? 'Credential placeholder available: {{credential.loginUrl}}' : '', // advertises the placeholder name only; the credential value is never interpolated
+        options.credentials?.email ? 'Credential placeholder available: {{credential.email}}' : '',
+        options.credentials?.password ? 'Credential placeholder available: {{credential.password}}' : '',
+    ]
+        .filter(Boolean) // drop the empty strings produced by absent fields
+        .join('\n\n'); // entries are separated by a blank line
+    return `Base URL: ${url}
+Planner mode: ${mode}
+Video script to convert into an execution plan:
+---
+${script}
+---
+${variantLines ? `Variant context:\n${variantLines}\n\n` : ''}${options.observationSummary ? `Observed live page context:\n${options.observationSummary}\n\n` : ''}${options.observationSnapshot ? `Structured observation snapshot:\n${serializeObservationSnapshot(options.observationSnapshot)}\n\n` : ''}Convert this script into a JSON execution plan following the format described in the system prompt. Output ONLY the JSON object, no explanation.`;
+}
+// ── Video step verification prompts ──────────────────────────────────
+/**
+ * Builds the system prompt for the step verifier model.
+ *
+ * The prompt lists the five inputs the verifier receives, embeds the overall
+ * video script under the "Overall video goal" heading, and spells out when to
+ * call each of the three outcomes named in the text: step_ok, step_failed and
+ * give_up. It closes with a reminder that both content and presentation
+ * quality count.
+ *
+ * @param videoScript Overall video script; interpolated verbatim, with no
+ *   escaping or truncation.
+ * @returns The complete system prompt as a single string.
+ */export function buildVideoVerificationSystemPrompt(videoScript) {
+    return `You are a strict browser automation step verifier. You will receive:
+1. The overall video script (what the user wants to demonstrate)
+2. A description of the specific step that was just executed
+3. Page context such as current URL/title
+4. A runtime page observation summary from the live DOM/accessibility tree
+5. A screenshot of the current page state after the step
+The screenshot and runtime observation describe the same verification snapshot unless the message explicitly says the snapshot was stale.
+Your job: determine with precision whether the step achieved its intended outcome.
+## Overall video goal
+${videoScript}
+## Decision rules
+Call **step_ok** only if ALL of these are true:
+- The step's specific intent was achieved (e.g. if the step was "click iPhone 17e link", the iPhone 17e product page is now showing — NOT a different product)
+- The page content matches what the script intends at this point in the flow
+- No unexpected page, error, or wrong section is visible
+Call **step_failed** if ANY of these is true:
+- The page shows the WRONG content (e.g. wrong product, wrong section, different page than expected)
+- The element was not found, not clicked, or the text was not typed
+- An error message appeared (404, form error, network error)
+- The page looks identical to before (nothing happened)
+- A navigation step led to the wrong destination
+- A cookie banner, consent wall, modal, sticky overlay, or unrelated popup is obstructing the intended content
+- The frame is still loading or unstable (spinner, skeleton, transition, partially visible target)
+- The requested language/theme/state is still not active when this step was supposed to activate it
+Call **give_up** only if:
+- The page is completely broken (HTTP 5xx error, infinite spinner, JS crash)
+- There is truly no way to continue the recording
+## Important
+Be precise about CONTENT and PRESENTATION QUALITY. If the script says "iPhone 17e page" and the screenshot shows "MacBook Air" — that is a failure, not a minor visual difference. If the right page is visible but a consent modal blocks it, that is also a failure for a publication-ready video.`;
+}
+/**
+ * Builds the user message asking the verifier model to judge one executed step.
+ *
+ * The first line is "Step X of Y" followed by the quoted step description.
+ * It is followed by one detail line per populated field, then the optional
+ * runtime observation and structured snapshot sections, then the closing
+ * verification question.
+ *
+ * @param step Executed step; reads type, target, selector, text, url,
+ *   direction, amount and description. target is serialized with JSON.stringify.
+ * @param stepIndex Zero-based index of the step; shown to the model as stepIndex + 1.
+ * @param totalSteps Total number of steps, shown after "of".
+ * @param pageContext Optional object; currentUrl and pageTitle are read when present.
+ * @param observationSummary Optional text included under "Runtime page observation".
+ * @param observationSnapshot Optional value; when truthy it is rendered through
+ *   serializeObservationSnapshot.
+ * @returns The assembled message string.
+ */export function buildVideoStepVerificationUserMessage(step, stepIndex, totalSteps, pageContext, observationSummary, observationSnapshot) {
+    const details = [
+        `Step type: ${step.type}`,
+        step.target ? `Structured target: ${JSON.stringify(step.target)}` : '',
+        step.selector ? `Selector targeted: ${step.selector}` : '',
+        step.text ? `Text typed: "${step.text}"` : '',
+        step.url ? `Navigated to: ${step.url}` : '',
+        step.direction ? `Scrolled: ${step.direction} by ${step.amount ?? 400}px` : '', // 400 is the figure reported when step.amount is null/undefined; NOTE(review): confirm the executor uses the same default
+        pageContext?.currentUrl ? `Current URL after step: ${pageContext.currentUrl}` : '',
+        pageContext?.pageTitle ? `Current page title: ${pageContext.pageTitle}` : '',
+    ].filter(Boolean).join('\n'); // omit lines for fields the step does not set
+    return `Step ${stepIndex + 1} of ${totalSteps}: "${step.description}"
+${details}
+${observationSummary ? `Runtime page observation:\n${observationSummary}\n\n` : ''}${observationSnapshot ? `Structured observation snapshot:\n${serializeObservationSnapshot(observationSnapshot)}\n\n` : ''}
+Look at the screenshot and verify: did this step achieve its specific intent, and is the frame clean enough for a polished product video? Check both functional correctness and presentation quality.`;
+}
+//# sourceMappingURL=video-prompts.js.map

package/dist/video-tools.d.ts ADDED Viewed

@@ -0,0 +1,3 @@
+import type { ChatCompletionTool } from 'openai/resources/chat/completions';
+/**
+ * Tools used by the video step verification LLM call.
+ * Typed as an array of ChatCompletionTool, imported from the openai chat
+ * completions types.
+ */
+export declare const videoVerificationTools: ChatCompletionTool[];

package/dist/video-tools.js ADDED Viewed

@@ -0,0 +1,59 @@
+/**
+ * Tools used by the video step verification LLM call.
+ *
+ * Three function tools are defined, each with an object-typed parameter schema:
+ *   - step_ok: requires an "observation" string.
+ *   - step_failed: requires a "reason" string and accepts an optional "suggestion" string.
+ *   - give_up: requires a "reason" string.
+ */
+export const videoVerificationTools = [
+    {
+        type: 'function',
+        function: {
+            name: 'step_ok',
+            description: 'Confirm the step executed successfully and the page is in the expected state.',
+            parameters: {
+                type: 'object',
+                properties: {
+                    observation: {
+                        type: 'string',
+                        description: 'Brief description of what you see that confirms success.',
+                    },
+                },
+                required: ['observation'],
+            },
+        },
+    },
+    {
+        type: 'function',
+        function: {
+            name: 'step_failed',
+            description: 'Report that the step did not produce the expected result but recovery may be possible.',
+            parameters: {
+                type: 'object',
+                properties: {
+                    reason: {
+                        type: 'string',
+                        description: 'Why the step failed.',
+                    },
+                    suggestion: {
+                        type: 'string',
+                        description: 'Suggested fix to recover (e.g. try a different selector, increase wait time).',
+                    },
+                },
+                required: ['reason'], // suggestion is deliberately left out, so it stays optional
+            },
+        },
+    },
+    {
+        type: 'function',
+        function: {
+            name: 'give_up',
+            description: 'Abort the entire video plan — the page is broken beyond recovery.',
+            parameters: {
+                type: 'object',
+                properties: {
+                    reason: {
+                        type: 'string',
+                        description: 'Why the video recording cannot continue.',
+                    },
+                },
+                required: ['reason'],
+            },
+        },
+    },
+];
+//# sourceMappingURL=video-tools.js.map

package/dist/video-variant-state.d.ts ADDED Viewed

@@ -0,0 +1,29 @@
+import { Browser } from './browser.js';
+import type { VideoPageSignals } from './types.js';
+/**
+ * Result shape resolved by detectVariantStateDeterministic.
+ *
+ * It groups a language section, a theme section and the page signals. Both
+ * sections carry the same four fields: requested, detected, active and
+ * ambiguous.
+ *
+ * NOTE(review): this declaration does not show what makes a state "active" or
+ * "ambiguous"; confirm the exact rules in the implementation.
+ */export interface VariantStateDetection {
+    lang: {
+        requested: string | undefined; // presumably mirrors the requestedLang argument; confirm in implementation
+        detected: string | null; // null is permitted; presumably means no language was detected (TODO confirm)
+        active: boolean;
+        ambiguous: boolean;
+    };
+    theme: {
+        requested: 'light' | 'dark' | undefined; // presumably mirrors the requestedTheme argument; confirm in implementation
+        detected: 'light' | 'dark' | null; // null is permitted; presumably means no theme was detected (TODO confirm)
+        active: boolean;
+        ambiguous: boolean;
+    };
+    pageSignals: VideoPageSignals;
+}
+/**
+ * Declaration only; the implementation is in the compiled module.
+ *
+ * @param signals Page signals to evaluate.
+ * @param requestedLang Optional requested language.
+ * @returns An object with a numeric score, a list of reason strings and an
+ *   ambiguous flag.
+ *
+ * NOTE(review): the score range and the rule that sets the ambiguous flag are
+ * not visible in this declaration; confirm them in the implementation.
+ */export declare function scoreLocaleSignals(signals: VideoPageSignals, requestedLang?: string): {
+    score: number;
+    reasons: string[];
+    ambiguous: boolean;
+};
+/**
+ * Declaration only; the implementation is in the compiled module.
+ *
+ * @param signals Page signals to evaluate.
+ * @param requestedTheme Optional requested theme, either "light" or "dark".
+ * @returns An object with the detected theme ("light", "dark" or null), an
+ *   active flag, an ambiguous flag and a reason string.
+ *
+ * NOTE(review): how detected, active and ambiguous are derived from the
+ * signals is not visible in this declaration; confirm in the implementation.
+ */export declare function evaluateRequestedThemeState(signals: VideoPageSignals, requestedTheme?: 'light' | 'dark'): {
+    detected: 'light' | 'dark' | null;
+    active: boolean;
+    ambiguous: boolean;
+    reason: string;
+};
+/**
+ * Declaration only; the implementation is in the compiled module.
+ *
+ * @param browser Browser instance imported from ./browser.js.
+ * @param requestedLang Optional requested language.
+ * @param requestedTheme Optional requested theme, either "light" or "dark".
+ * @returns A promise resolving to a VariantStateDetection.
+ *
+ * NOTE(review): the name suggests detection without an LLM call; that is an
+ * inference from the identifier only, so verify against the implementation.
+ */export declare function detectVariantStateDeterministic(browser: Browser, requestedLang?: string, requestedTheme?: 'light' | 'dark'): Promise<VariantStateDetection>;