autokap 1.1.8 → 1.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  # AutoKap Opcode Reference
2
2
 
3
- Detailed parameter documentation for all 23 opcodes. For workflow, rules, and examples, see [SKILL.md](SKILL.md).
3
+ Detailed parameter documentation for all 24 opcodes. For workflow, rules, and examples, see [SKILL.md](SKILL.md).
4
4
 
5
5
  ## Common Fields (all opcodes)
6
6
 
@@ -150,6 +150,23 @@ Wait for an element to appear.
150
150
  { "kind": "WAIT_FOR", "selector": "[data-ak=\"dashboard\"]", "state": "visible", "postcondition": { "type": "element_visible", "selector": "[data-ak=\"dashboard\"]", "waitMs": 10000 } }
151
151
  ```
152
152
 
153
+ ## SLEEP
154
+
155
+ Pause execution for a fixed duration. For demo videos, use SLEEP as an explicit narration anchor with `stepId`, `narrationTextByLocale`, and `durationMs: 1`. `autokap run` generates TTS once per app-locale voice row, measures the audio, and rewrites `durationMs` to the maximum duration for that `stepId` before executing the opcodes, so one runtime program works for every language/theme variant.
156
+
157
+ | Param | Type | Required | Default | Description |
158
+ |-------|------|----------|---------|-------------|
159
+ | `durationMs` | integer (1..60000) | yes | — | How long to sleep, in milliseconds |
160
+ | `narrationTextByLocale` | object `{ [locale]: string }` | video narration | — | Spoken text for every TTS locale used by `narration_by_app_locale`. Current TTS catalogue supports `en` and `fr`. |
161
+ | `narrationText` | string | legacy single-row only | — | Backward-compatible fallback when only one TTS row is requested. Prefer `narrationTextByLocale`. |
162
+
163
+ **Can:** Hold the runtime for a precise wall-clock duration without DOM polling.
164
+ **Cannot:** React to a selector / postcondition — use `WAIT_FOR` for that.
165
+
166
+ ```json
167
+ { "kind": "SLEEP", "description": "Hold for narration on 'open-pricing'", "stepId": "open-pricing", "durationMs": 1, "narrationTextByLocale": { "en": "Let's open pricing and look at the plan structure.", "fr": "On ouvre les tarifs et on regarde la structure des offres." }, "postcondition": { "type": "always" }, "recovery": { "retries": 0, "useSelectorMemory": false, "useAltInteraction": false, "allowReload": false, "allowHealer": false }, "timeoutMs": 65000, "maxFailures": 1 }
168
+ ```
169
+
153
170
  ## SCROLL
154
171
 
155
172
  Scroll the page or scroll an element into view.
@@ -90,7 +90,7 @@ For every element you interact with in the opcodes, add a `data-ak="descriptive-
90
90
  interface ExecutionProgram {
91
91
  presetId: string; // Unique slug (e.g. "homepage-hero")
92
92
  programVersion: number; // Always 1 for new programs
93
- mediaMode: 'screenshot' | 'clip';
93
+ mediaMode: 'screenshot' | 'clip' | 'video';
94
94
  baseUrl: string; // Root URL of the application
95
95
  variants: VariantSpec[]; // Viewport/locale/theme combinations
96
96
  preconditions: {
@@ -104,11 +104,17 @@ interface ExecutionProgram {
104
104
  };
105
105
  steps: ExecutionOpcode[];
106
106
  artifactPlan: {
107
- mediaMode: 'screenshot' | 'clip';
108
- cursorTheme?: 'minimal' | 'macos' | 'windows'; // Clip only. Default: 'minimal'
107
+ mediaMode: 'screenshot' | 'clip' | 'video';
108
+ cursorTheme?: 'minimal' | 'macos' | 'windows'; // Clip/video recordings. Default: 'minimal'
109
109
  format?: {
110
110
  clipFormat?: 'gif' | 'mp4' | 'both'; // Default: 'gif'
111
111
  screenshotFormat?: 'png' | 'jpeg'; // Default: 'png'
112
+ // Video-only — required when mediaMode='video'.
113
+ // captureResolution must be 1920×1080; legacy 2560×1440 is normalized by AutoKap.
114
+ // captureFps defaults to 30; deliveryResolution defaults to 1920×1080.
115
+ captureResolution?: { width: number; height: number };
116
+ captureFps?: number;
117
+ deliveryResolution?: { width: number; height: number };
112
118
  };
113
119
  applyMockup?: boolean; // Apply device frame mockup
114
120
  applyStatusBar?: boolean; // Add status bar to mockup
@@ -133,7 +139,7 @@ interface VariantSpec {
133
139
 
134
140
  ## Opcode Quick Reference
135
141
 
136
- 23 opcodes available. For full parameter documentation, see [OPCODE-REFERENCE.md](OPCODE-REFERENCE.md).
142
+ 24 opcodes available. For full parameter documentation, see [OPCODE-REFERENCE.md](OPCODE-REFERENCE.md).
137
143
 
138
144
  | Kind | Selector? | Key Params | Typical Postcondition | Notes |
139
145
  |------|-----------|-----------|----------------------|-------|
@@ -143,6 +149,7 @@ interface VariantSpec {
143
149
  | `TYPE` | yes | `text`, `clearFirst` | `any_change` | `{{email}}` / `{{password}}` for creds |
144
150
  | `PRESS_KEY` | no | `key` | `any_change` | `"Enter"`, `"Escape"`, `"Tab"`, etc. |
145
151
  | `WAIT_FOR` | yes* | `state` | `element_visible` | `"visible"` or `"attached"` (DOM only) |
152
+ | `SLEEP` | no | `durationMs` (1..60000), `narrationTextByLocale?` | `always` | Pause N ms. Reserved for video narration anchors. Use `durationMs: 1` as a placeholder; AutoKap rewrites it during `autokap run` after generating TTS. |
146
153
  | `SCROLL` | no | `direction`, `targetSelector?`, `amount?` | `element_visible` | Use `targetSelector` for precise scroll |
147
154
  | `HOVER` | yes | — | `element_visible` | Capture immediately after for hover state |
148
155
  | `SELECT_OPTION` | yes | `optionLabel` / `optionValue` / `optionIndex` | `text_contains` | **Native `<select>` only.** Custom dropdowns: use CLICK sequence |
@@ -217,7 +224,7 @@ interface VariantSpec {
217
224
  | Tell the user to save a local `program.json` file | Persist the preset via CLI; only output full JSON as a fallback if the CLI/server write fails |
218
225
  | Guess CSS selectors | Add `data-ak` attributes to the code first |
219
226
  | Skip WAIT_FOR after page transitions | Add WAIT_FOR with `waitMs: 10000` after login, navigation, modal open |
220
- | Use NAVIGATE or SCROLL inside a clip to reach a clickable target | CLICK the link/button — cursor animates to it naturally |
227
+ | Use NAVIGATE inside a clip (between BEGIN_CLIP and END_CLIP) | Always CLICK the in-app element (sidebar Link, modal trigger button) or PRESS_KEY Escape to close modals. NAVIGATE is `page.goto` = full reload + white flash + cursor lost. See "Designing human-like clip programs" below for the patterns |
221
228
  | Skip `cursorTheme` in clip `artifactPlan` | Set `cursorTheme: "macos"` or `"windows"` for polished recordings |
222
229
 
223
230
  ## Clip Workflow
@@ -243,17 +250,53 @@ Because the cursor is visible during recording, **how** you reach a target matte
243
250
  | Do | Don't | Why |
244
251
  |---|---|---|
245
252
  | **CLICK** on a nav link / anchor to scroll to a section | SCROLL blindly then capture | The cursor animates to the link before clicking — the viewer sees intentional navigation |
246
- | **CLICK** on an in-page link to change route | NAVIGATE to the new URL mid-clip | NAVIGATE is an instant browser jump with no cursor motion — it looks like a cut, not a flow |
253
+ | **CLICK** on an in-page `<Link>` (sidebar, breadcrumb, card) to change route | NAVIGATE to the new URL mid-clip | NAVIGATE always calls Playwright's `page.goto` — a full HTTP navigation that destroys the document, wipes the cursor overlay, and shows a white flash. Even SPAs are forced into a hard reload. CLICK lets Next.js / React Router intercept and use prefetched `router.push` instead — no flash, cursor preserved |
254
+ | **CLICK** on the modal's trigger button (e.g. `[data-ak-interact="new-preset-btn"]`) to open a modal | NAVIGATE to the modal's deep-link route | Most modals open via local React state, not URL navigation. The deep-link route exists for direct entry but forces a real reload — using the trigger button keeps the recording in one fluid take |
255
+ | **PRESS_KEY** `"Escape"` to close a modal (or CLICK its close button) | NAVIGATE back to the previous URL | Radix/shadcn dialogs (the AutoKap dashboard convention) close on Escape and run their `onClose` handler — which usually does the SPA route change internally. NAVIGATE back triggers another full reload |
247
256
  | **HOVER** → pause → **CLICK** for important interactions | CLICK without HOVER | HOVER gives the viewer time to see where the cursor is heading and creates anticipation |
248
257
  | Keep the clip short and focused (5–15 interactions) | Record an entire user journey in one clip | Long clips lose the viewer. Split into multiple clips if needed |
249
258
  | Place WAIT_FOR after CLICK that triggers a route change | Immediately interact after navigation | Gives the page time to render and the viewer time to register the new state |
250
259
  | Add a `screenshot_stable` WAIT_FOR before BEGIN_CLIP when DISMISS_OVERLAYS precedes it | Start recording immediately after DISMISS_OVERLAYS | Overlay dismiss animations (fade-out, slide) bleed into the first frames of the clip |
251
260
 
252
- **Rule of thumb:** inside a clip, never use NAVIGATE or SCROLL to reach content that the user would normally reach by clicking a link or button. Use CLICK on that element instead — the cursor animation makes the transition visible and human-like.
261
+ **Rule of thumb — NEVER `NAVIGATE` inside a clip.** NAVIGATE = `page.goto()` = full document reload, white flash, cursor overlay destroyed. There is no exception, even when the destination is on the same origin or the framework supports SPA routing — Playwright doesn't use the framework router. Reach every destination through the actual UI element a real user would interact with:
253
262
 
254
- SCROLL is still appropriate for:
255
- - Scrolling down a long page to reveal below-the-fold content when there is no clickable anchor
256
- - Scrolling inside a container or list
263
+ - **Page change** → `CLICK` on a `<Link>` (sidebar item, breadcrumb, card). If no Link points where you need to go, ask the user where the user-facing entry point is — don't invent a NAVIGATE workaround.
264
+ - **Open a modal** → `CLICK` on its trigger button (annotate it with `data-ak-interact="..."`). Even if the modal also has a deep-link URL, the trigger is the smooth path.
265
+ - **Close a modal** → `PRESS_KEY "Escape"` (or CLICK the close button if present). The modal's `onClose` handler does any required SPA route change for you.
266
+ - **Reveal off-screen content** → `SCROLL` is still fine when there's no clickable anchor (long content sections, lists inside a container).
267
+
268
+ If you find yourself wanting to write `NAVIGATE` between `BEGIN_CLIP` and `END_CLIP`, stop and reconsider — there is almost always an annotated UI element to CLICK instead. Add a `data-ak-interact` to the source if it's missing.
269
+
270
+ ## Demo Video Workflow
271
+
272
+ For narrated demo videos (`mediaMode: "video"`), AutoKap captures a language/theme matrix:
273
+
274
+ 1. Use **one desktop target** at 1920x1080. Do not multiply demo videos by device.
275
+ 2. Generate one variant for every app locale x theme pair. Variant ids must be stable, for example `desktop-1920x1080-fr-dark`.
276
+ 3. Add `SET_LOCALE` and `SET_THEME` with `"$variant"` during setup when the app supports locale/theme switching.
277
+ 4. Put all recorded content inside `BEGIN_CLIP` / `END_CLIP`.
278
+ 5. Every narrated pause is a `SLEEP` with `stepId`, `durationMs: 1`, and `narrationTextByLocale`.
279
+
280
+ Voice-over settings are row-based: each app locale maps to a spoken TTS locale and a voice in `narration_by_app_locale`. `narrationTextByLocale` is the source of truth for TTS text. It must include one entry for every spoken TTS locale used by those rows. AutoKap currently supports TTS for `en` and `fr`; unsupported TTS locales are rejected explicitly. Keep `narrationText` only as a legacy fallback for single-row demos.
281
+
282
+ ```json
283
+ {
284
+ "kind": "SLEEP",
285
+ "stepId": "opening-hello",
286
+ "durationMs": 1,
287
+ "narrationTextByLocale": {
288
+ "en": "Hey, welcome — I'll show you the dashboard quickly.",
289
+ "fr": "Salut, bienvenue — je te montre rapidement le dashboard."
290
+ },
291
+ "description": "Narration anchor: opening greeting",
292
+ "postcondition": { "type": "always" },
293
+ "recovery": { "retries": 0, "useSelectorMemory": false, "useAltInteraction": false, "allowReload": false, "allowHealer": false },
294
+ "timeoutMs": 65000,
295
+ "maxFailures": 1
296
+ }
297
+ ```
298
+
299
+ The server synthesizes TTS once per app-locale voice row, not once per theme. It rewrites every narrated `SLEEP.durationMs` to the maximum audio duration for that `stepId` across rows so the same runtime program can capture every variant.
257
300
 
258
301
  ### Clean start: wait for visual stability before recording
259
302
 
@@ -291,7 +334,7 @@ When an opcode fails, AutoKap tries 5 recovery strategies in order:
291
334
  4. **Targeted reload** — Reloads the page and retries. **Loses UI state** (open modals, form data).
292
335
  5. **LLM Healer** — AI analyzes the page screenshot and AKTree, then rewrites the failing opcode. Max 3 invocations per run.
293
336
 
294
- **Guidance:** Keep `allowReload: false` for most steps (it loses state). Set `allowReload: true` only for the first NAVIGATE or after full page transitions where UI state is expendable.
337
+ **Guidance:** Keep `allowReload: false` for most steps (it loses state). Set `allowReload: true` only for the first NAVIGATE or after full page transitions where UI state is expendable. Never rely on reload recovery inside a `BEGIN_CLIP` / `END_CLIP` recording window.
295
338
 
296
339
  ## SET_LOCALE / SET_THEME: Method Selection
297
340
 
package/dist/browser.js CHANGED
@@ -96,7 +96,7 @@ function resolveEffectivePadding(config, bbox) {
96
96
  left = config.paddingLeft;
97
97
  return { top, right, bottom, left };
98
98
  }
99
- import { dismissCookiesAndWidgets, ensureCaptureHideStyles } from './cookie-dismiss.js';
99
+ import { CAPTURE_HIDE_STYLE_ID, dismissCookiesAndWidgets, ensureCaptureHideStyles, getCaptureHideCSS, } from './cookie-dismiss.js';
100
100
  import { CHROMIUM_ARGS, browserPool } from './browser-pool.js';
101
101
  import { isDebugEnabled, logger } from './logger.js';
102
102
  async function withHelperTimeout(label, timeoutMs, work) {
@@ -841,6 +841,28 @@ export class Browser {
841
841
  });
842
842
  // Inject cursor overlay at context level — survives all navigations in this session
843
843
  await instance.context.addInitScript(cursorScript);
844
+ // Also hide dev/prototype chrome on every document, including navigations
845
+ // that happen after cookie dismissal ran.
846
+ await instance.context.addInitScript(({ styleId, css }) => {
847
+ const install = () => {
848
+ const parent = document.head ?? document.documentElement;
849
+ if (!parent)
850
+ return;
851
+ let style = document.getElementById(styleId);
852
+ if (!style) {
853
+ style = document.createElement('style');
854
+ style.id = styleId;
855
+ parent.appendChild(style);
856
+ }
857
+ if (style.textContent !== css) {
858
+ style.textContent = css;
859
+ }
860
+ };
861
+ install();
862
+ if (document.readyState === 'loading') {
863
+ document.addEventListener('DOMContentLoaded', install, { once: true });
864
+ }
865
+ }, { styleId: CAPTURE_HIDE_STYLE_ID, css: getCaptureHideCSS() });
844
866
  instance.page = await instance.context.newPage();
845
867
  return instance;
846
868
  }
@@ -27,4 +27,18 @@ export declare class ClipStrategy implements CaptureStrategy {
27
27
  capture(adapter: RuntimeAdapter, _spec: ArtifactSpec): Promise<ArtifactResult>;
28
28
  postProcess(artifact: ArtifactResult, spec: ArtifactSpec): Promise<ArtifactResult>;
29
29
  }
30
+ /**
31
+ * Long-form MP4 capture for `mediaMode='video'`. Shares the clip pipeline
32
+ * (frame loop + ffmpeg assembly) but pins capture resolution to the delivery
33
+ * frame (1920×1080 by default) at 30fps and removes the clip duration cap.
34
+ * The bifurcation actually lives in
35
+ * `WebPlaywrightLocal.beginRecording()` — this strategy is the typed surface
36
+ * the orphan capture-strategy factory exposes.
37
+ */
38
+ export declare class VideoStrategy implements CaptureStrategy {
39
+ readonly mediaMode: "video";
40
+ prepare(_adapter: RuntimeAdapter, _spec: ArtifactSpec): Promise<void>;
41
+ capture(adapter: RuntimeAdapter, _spec: ArtifactSpec): Promise<ArtifactResult>;
42
+ postProcess(artifact: ArtifactResult, _spec: ArtifactSpec): Promise<ArtifactResult>;
43
+ }
30
44
  export declare function createCaptureStrategy(mediaMode: MediaMode): CaptureStrategy;
@@ -58,11 +58,39 @@ export class ClipStrategy {
58
58
  return artifact;
59
59
  }
60
60
  }
61
+ // ── Video strategy ──────────────────────────────────────────────────
62
+ /**
63
+ * Long-form MP4 capture for `mediaMode='video'`. Shares the clip pipeline
64
+ * (frame loop + ffmpeg assembly) but pins capture resolution to the delivery
65
+ * frame (1920×1080 by default) at 30fps and removes the clip duration cap.
66
+ * The bifurcation actually lives in
67
+ * `WebPlaywrightLocal.beginRecording()` — this strategy is the typed surface
68
+ * the orphan capture-strategy factory exposes.
69
+ */
70
+ export class VideoStrategy {
71
+ mediaMode = 'video';
72
+ async prepare(_adapter, _spec) {
73
+ // Resolution + fps are forced inside adapter.beginRecording when the
74
+ // BEGIN_CLIP opcode runs in video mode. Nothing to do here.
75
+ }
76
+ async capture(adapter, _spec) {
77
+ const recording = await adapter.endRecording();
78
+ return {
79
+ mediaMode: 'video',
80
+ buffer: recording.buffer,
81
+ mimeType: recording.mimeType,
82
+ };
83
+ }
84
+ async postProcess(artifact, _spec) {
85
+ return artifact;
86
+ }
87
+ }
61
88
  // ── Factory ─────────────────────────────────────────────────────────
62
89
  export function createCaptureStrategy(mediaMode) {
63
90
  switch (mediaMode) {
64
91
  case 'screenshot': return new ScreenshotStrategy();
65
92
  case 'clip': return new ClipStrategy();
93
+ case 'video': return new VideoStrategy();
66
94
  }
67
95
  }
68
96
  //# sourceMappingURL=capture-strategy.js.map
@@ -18,3 +18,64 @@ export declare function buildCliRunCommand(presetId: string, options?: {
18
18
  }): string;
19
19
  export declare function buildCliInstalledSetupCommand(cliKey: string): string;
20
20
  export declare const CLI_PUBLIC_COMMANDS: CliPublicCommandDescriptor[];
21
+ /**
22
+ * Per-clip metadata uploaded by the CLI alongside the raw MP4. The compositor
23
+ * worker consumes these to drive the merge step in clipId order. Opcode
24
+ * timings are retained for diagnostics and future compositor effects.
25
+ */
26
+ export interface VideoClipMetadata {
27
+ /** Variant that produced this raw clip. Required for video-demo matrices. */
28
+ variantId: string;
29
+ lang: string;
30
+ theme: 'light' | 'dark';
31
+ clipId: string;
32
+ /** Storage path within the `videos` bucket (e.g. `raw/{video_id}/{clip_id}.mp4`). */
33
+ mp4StoragePath: string;
34
+ /** Measured duration of the captured MP4 in milliseconds. */
35
+ durationMs: number;
36
+ /**
37
+ * Per-opcode timing entries that landed within this clip, taken verbatim
38
+ * from `RunResult.opcodeTimings` (see execution-types). The compositor
39
+ * filters by `clipId === clip.clipId`.
40
+ */
41
+ opcodeTimings: Array<{
42
+ stepIndex: number;
43
+ stepId?: string;
44
+ opcodeKind: string;
45
+ timecodeStartMs: number;
46
+ timecodeEndMs: number;
47
+ bbox?: {
48
+ x: number;
49
+ y: number;
50
+ width: number;
51
+ height: number;
52
+ } | null;
53
+ }>;
54
+ }
55
+ export interface VideoAudioAsset {
56
+ stepId: string;
57
+ url: string;
58
+ duration_ms: number;
59
+ word_timings?: Array<{
60
+ word: string;
61
+ start_ms: number;
62
+ end_ms: number;
63
+ }>;
64
+ }
65
+ export type VideoAudioAssetsByLocale = Record<string, VideoAudioAsset[]>;
66
+ /** Body of POST /api/cli/video-complete. */
67
+ export interface VideoCompletePayload {
68
+ videoId: string;
69
+ /** `runId` the CLI generated for this run (already used for telemetry). */
70
+ runId: string;
71
+ clips: VideoClipMetadata[];
72
+ /** Per-run TTS assets generated by /api/cli/video-prepare. */
73
+ audioAssets?: VideoAudioAsset[];
74
+ /** Per-locale TTS assets generated by /api/cli/video-prepare. */
75
+ audioAssetsByLocale?: VideoAudioAssetsByLocale;
76
+ }
77
+ /** Server response. */
78
+ export interface VideoCompleteResponse {
79
+ composeRunId: string;
80
+ composeRunIds?: string[];
81
+ }
@@ -11,7 +11,15 @@
11
11
  * 6. Upload artifacts + telemetry to server
12
12
  */
13
13
  import { type ProgressEvent } from './opcode-runner.js';
14
- import type { ExecutionProgram, RunResult } from './execution-types.js';
14
+ import { type VideoClipMetadata } from './cli-contract.js';
15
+ import type { ExecutionProgram, VariantSpec, RunResult } from './execution-types.js';
16
+ export interface RecordableBrowserSettings {
17
+ viewport: VariantSpec['viewport'];
18
+ requestedDeviceScaleFactor: number;
19
+ runtimeDeviceScaleFactor: number;
20
+ }
21
+ export declare function resolveRecordableBrowserSettings(program: ExecutionProgram, variant: VariantSpec): RecordableBrowserSettings;
22
+ export declare function normalizeVideoCaptureProgram(program: ExecutionProgram): ExecutionProgram;
15
23
  export interface CLIRunnerOptions {
16
24
  /** Preset ID to run */
17
25
  presetId: string;
@@ -36,3 +44,4 @@ export interface CLIRunResult {
36
44
  error?: string;
37
45
  }
38
46
  export declare function runCapture(options: CLIRunnerOptions): Promise<CLIRunResult>;
47
+ export declare function buildVideoClipMetadata(videoId: string, result: RunResult, program?: ExecutionProgram, runId?: string): VideoClipMetadata[];