autokap 1.5.1 → 1.5.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -52,6 +52,12 @@ export interface VideoClipMetadata {
52
52
  width: number;
53
53
  height: number;
54
54
  } | null;
55
+ /**
56
+ * For TYPE opcodes captured in clipCursor mode: clip-relative ms timestamp
57
+ * of every keystroke produced by `humanType`. Drives per-keystroke keyboard
58
+ * SFX in the video compositor.
59
+ */
60
+ keystrokeOffsetsMs?: number[];
55
61
  }>;
56
62
  }
57
63
  export interface VideoAudioAsset {
@@ -748,6 +748,9 @@ export function buildVideoClipMetadata(videoId, result, program, runId) {
748
748
  timecodeStartMs: t.timecodeStartMs,
749
749
  timecodeEndMs: t.timecodeEndMs,
750
750
  bbox: t.bbox ?? null,
751
+ ...(t.keystrokeOffsetsMs && t.keystrokeOffsetsMs.length > 0
752
+ ? { keystrokeOffsetsMs: t.keystrokeOffsetsMs }
753
+ : {}),
751
754
  }));
752
755
  clipsByKey.set(`${variantId}:${artifact.clipId}`, {
753
756
  variantId,
@@ -693,6 +693,13 @@ export interface OpcodeTiming {
693
693
  width: number;
694
694
  height: number;
695
695
  } | null;
696
+ /**
697
+ * For TYPE opcodes captured in clipCursor mode: timestamp (ms relative to the
698
+ * active clip start) of each individual keystroke produced by `humanType`.
699
+ * Drives keyboard SFX per-keystroke in the video compositor. Empty/undefined
700
+ * for non-TYPE opcodes and for typing paths that bypass humanType.
701
+ */
702
+ keystrokeOffsetsMs?: number[];
696
703
  }
697
704
  export interface RunResult {
698
705
  programId: string;
@@ -749,13 +756,23 @@ export interface RecordingResult {
749
756
  mimeType: string;
750
757
  trimStartMs?: number;
751
758
  }
759
+ export interface TypeOptions {
760
+ /**
761
+ * Invoked synchronously (not awaited) once per character typed by `humanType`,
762
+ * right after that character is sent, with the absolute wall-clock time
763
+ * (`Date.now()`, epoch ms). The runner converts these to clip-relative offsets
764
+ * on `OpcodeTiming.keystrokeOffsetsMs` (clamped to >= 0) for per-keystroke SFX.
765
+ * Only fires in clipCursor mode (the only path producing visible per-key animation).
766
+ */
767
+ onKeystroke?: (timestampMs: number) => void;
768
+ }
752
769
  export interface RuntimeAdapter {
753
770
  navigate(url: string): Promise<void>;
754
771
  getCurrentUrl(): Promise<string>;
755
772
  getAKTree(): Promise<AKTree>;
756
773
  getPageSignals(): Promise<VideoPageSignals>;
757
774
  click(selector: string, options?: ClickOptions): Promise<void>;
758
- type(selector: string, text: string, clearFirst?: boolean): Promise<void>;
775
+ type(selector: string, text: string, clearFirst?: boolean, opts?: TypeOptions): Promise<void>;
759
776
  pressKey(key: string): Promise<void>;
760
777
  scroll(direction: 'up' | 'down' | 'left' | 'right', amount?: number): Promise<void>;
761
778
  scrollIntoView(selector: string): Promise<void>;
@@ -808,7 +825,7 @@ export interface RuntimeAdapter {
808
825
  selector?: string;
809
826
  target?: SemanticTarget;
810
827
  selectorAlternates?: string[];
811
- }, text: string, clearFirst?: boolean): Promise<void>;
828
+ }, text: string, clearFirst?: boolean, typeOpts?: TypeOptions): Promise<void>;
812
829
  /** Wait for an element by semantic target. */
813
830
  waitForTarget?(opts: {
814
831
  selector?: string;
@@ -52,8 +52,14 @@ export declare function animatedHover(page: Page, target: {
52
52
  /**
53
53
  * Type text into the currently focused element at a human-like typing speed.
54
54
  * Assumes the field is already focused (via a preceding click).
55
+ *
56
+ * `onKeystroke` fires after each character with the absolute wall-clock
57
+ * timestamp (`Date.now()`) of the keystroke. The video pipeline converts
58
+ * these to clip-relative offsets so keyboard SFX fire in lock-step with the
59
+ * visible typing.
55
60
  */
56
61
  export declare function humanType(page: Page, text: string, options?: {
57
62
  minDelayMs?: number;
58
63
  maxDelayMs?: number;
64
+ onKeystroke?: (timestampMs: number) => void;
59
65
  }): Promise<void>;
@@ -132,12 +132,20 @@ export async function animatedHover(page, target, fromCurrent, options = {}) {
132
132
  /**
133
133
  * Type text into the currently focused element at a human-like typing speed.
134
134
  * Assumes the field is already focused (via a preceding click).
135
+ *
136
+ * `onKeystroke` fires after each character with the absolute wall-clock
137
+ * timestamp (`Date.now()`) of the keystroke. The video pipeline converts
138
+ * these to clip-relative offsets so keyboard SFX fire in lock-step with the
139
+ * visible typing.
135
140
  */
136
141
  export async function humanType(page, text, options = {}) {
137
142
  const minDelay = Math.max(0, options.minDelayMs ?? 60);
138
143
  const maxDelay = Math.max(minDelay, options.maxDelayMs ?? 140);
139
144
  for (const char of text) {
140
145
  await page.keyboard.type(char);
146
+ if (options.onKeystroke) {
147
+ options.onKeystroke(Date.now());
148
+ }
141
149
  // Pause between characters: the 60–140 ms defaults ≈ 85–200 WPM (5 chars per word), excluding key-dispatch time
142
150
  const delay = minDelay + Math.random() * (maxDelay - minDelay);
143
151
  if (delay > 0) {
@@ -40,5 +40,11 @@ export declare function findUnresolvedCredentialPlaceholders(text: string, crede
40
40
  export interface OpcodeActionResult {
41
41
  success: boolean;
42
42
  error?: string;
43
+ /**
44
+ * For TYPE opcodes: absolute wall-clock timestamps (`Date.now()`, epoch ms) of each
45
+ * keystroke reported by `humanType`, in typing order (may be empty). The runner converts
46
+ * these to clip-relative offsets so the video compositor can fire per-keystroke SFX.
+ * NOTE(review): the selector attempt and its `typeByTarget` fallback share one collector,
+ * so a partially typed first attempt may leave extra entries here — confirm.
47
+ */
48
+ keystrokeTimestampsMs?: number[];
43
49
  }
44
50
  export declare function executeOpcodeCoreAction(opcode: ExecutionOpcode, adapter: RuntimeAdapter, context?: OpcodeActionContext): Promise<OpcodeActionResult>;
@@ -81,8 +81,12 @@ export async function executeOpcodeCoreAction(opcode, adapter, context = {}) {
81
81
  ? opcode.textByLocale[context.currentVariant.locale] ?? opcode.text
82
82
  : opcode.text);
83
83
  const text = substituteCredentialPlaceholders(rawText, context.credentials);
84
+ const keystrokeTimestampsMs = [];
85
+ const onKeystroke = (timestampMs) => {
86
+ keystrokeTimestampsMs.push(timestampMs);
87
+ };
84
88
  try {
85
- await adapter.type(opcode.selector, text, opcode.clearFirst);
89
+ await adapter.type(opcode.selector, text, opcode.clearFirst, { onKeystroke });
86
90
  }
87
91
  catch (error) {
88
92
  if (!opcode.target || !adapter.typeByTarget)
@@ -91,9 +95,9 @@ export async function executeOpcodeCoreAction(opcode, adapter, context = {}) {
91
95
  selector: opcode.selector,
92
96
  target: opcode.target,
93
97
  selectorAlternates: opcode.selectorAlternates,
94
- }, text, opcode.clearFirst);
98
+ }, text, opcode.clearFirst, { onKeystroke });
95
99
  }
96
- break;
100
+ return { success: true, keystrokeTimestampsMs };
97
101
  }
98
102
  case 'PRESS_KEY':
99
103
  await adapter.pressKey(opcode.key);
@@ -287,6 +287,9 @@ async function executeOpcode(opcode, index, adapter, verifier, breaker, recovery
287
287
  const result = await withTimeout(() => executeOpcodeAction(opcode, index, adapter, artifacts, telemetry, currentVariant, executionState, artifactPlan, mockDataGroups, options, credentials), actionBudgetMs);
288
288
  logger.debug(`[opcode ${index}] action exec end — took ${Date.now() - actionStart}ms, success=${result.success}${result.error ? `, error=${result.error}` : ''}`);
289
289
  if (preTiming) {
290
+ const keystrokeOffsetsMs = result.keystrokeTimestampsMs && result.keystrokeTimestampsMs.length > 0
291
+ ? result.keystrokeTimestampsMs.map((t) => Math.max(0, t - preTiming.clipStartedAt))
292
+ : undefined;
290
293
  opcodeTimings.push({
291
294
  stepIndex: index,
292
295
  stepId: opcode.stepId,
@@ -296,6 +299,7 @@ async function executeOpcode(opcode, index, adapter, verifier, breaker, recovery
296
299
  timecodeStartMs: preTiming.timecodeStartMs,
297
300
  timecodeEndMs: Math.max(0, Date.now() - preTiming.clipStartedAt),
298
301
  bbox: preTiming.bbox,
302
+ ...(keystrokeOffsetsMs ? { keystrokeOffsetsMs } : {}),
299
303
  });
300
304
  }
301
305
  if (!result.success) {
@@ -38,7 +38,9 @@ export declare class WebPlaywrightLocal implements RuntimeAdapter {
38
38
  /**
39
39
  * Type into an element using semantic target resolution.
40
40
  */
41
- typeByTarget(opts: ResolveOptions, text: string, clearFirst?: boolean): Promise<void>;
41
+ typeByTarget(opts: ResolveOptions, text: string, clearFirst?: boolean, typeOpts?: {
42
+ onKeystroke?: (timestampMs: number) => void;
43
+ }): Promise<void>;
42
44
  /**
43
45
  * Wait for an element using semantic target resolution.
44
46
  */
@@ -47,7 +49,9 @@ export declare class WebPlaywrightLocal implements RuntimeAdapter {
47
49
  * Scroll an element into view using semantic target resolution.
48
50
  */
49
51
  scrollIntoViewByTarget(opts: ResolveOptions): Promise<void>;
50
- type(selector: string, text: string, clearFirst?: boolean): Promise<void>;
52
+ type(selector: string, text: string, clearFirst?: boolean, opts?: {
53
+ onKeystroke?: (timestampMs: number) => void;
54
+ }): Promise<void>;
51
55
  pressKey(key: string): Promise<void>;
52
56
  scroll(direction: 'up' | 'down' | 'left' | 'right', amount?: number): Promise<void>;
53
57
  scrollIntoView(selector: string): Promise<void>;
@@ -152,14 +152,14 @@ export class WebPlaywrightLocal {
152
152
  /**
153
153
  * Type into an element using semantic target resolution.
154
154
  */
155
- async typeByTarget(opts, text, clearFirst = true) {
155
+ async typeByTarget(opts, text, clearFirst = true, typeOpts) {
156
156
  const page = await this.browser.currentPage;
157
157
  const resolved = await resolveTarget(page, opts);
158
158
  if (!resolved) {
159
159
  throw new Error(`cannot find target for typing: ${describeResolveOptions(opts)}`);
160
160
  }
161
161
  if (this.clipCursor) {
162
- await this.typeIntoLocator(resolved.locator, text, clearFirst);
162
+ await this.typeIntoLocator(resolved.locator, text, clearFirst, typeOpts?.onKeystroke);
163
163
  return;
164
164
  }
165
165
  if (clearFirst) {
@@ -196,10 +196,10 @@ export class WebPlaywrightLocal {
196
196
  }
197
197
  await resolved.locator.scrollIntoViewIfNeeded({ timeout: 5000 });
198
198
  }
199
- async type(selector, text, clearFirst = true) {
199
+ async type(selector, text, clearFirst = true, opts) {
200
200
  if (this.clipCursor) {
201
201
  const page = await this.browser.currentPage;
202
- await this.typeIntoLocator(page.locator(selector).first(), text, clearFirst);
202
+ await this.typeIntoLocator(page.locator(selector).first(), text, clearFirst, opts?.onKeystroke);
203
203
  return;
204
204
  }
205
205
  await this.browser.typeText(text, { selector, clearFirst });
@@ -878,7 +878,7 @@ export class WebPlaywrightLocal {
878
878
  async close() {
879
879
  await this.browser.close();
880
880
  }
881
- async typeIntoLocator(locator, text, clearFirst) {
881
+ async typeIntoLocator(locator, text, clearFirst, onKeystroke) {
882
882
  const page = await this.browser.currentPage;
883
883
  await locator.waitFor({ state: 'visible', timeout: 5000 });
884
884
  await locator.scrollIntoViewIfNeeded({ timeout: 5000 }).catch(() => undefined);
@@ -895,8 +895,8 @@ export class WebPlaywrightLocal {
895
895
  }
896
896
  await page.waitForTimeout(70);
897
897
  await humanType(page, text, this.clipCursor
898
- ? { minDelayMs: 20, maxDelayMs: 45 }
899
- : undefined);
898
+ ? { minDelayMs: 20, maxDelayMs: 45, onKeystroke }
899
+ : { onKeystroke });
900
900
  }
901
901
  async seedClipCursor(position) {
902
902
  if (!this.clipCursor)
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "autokap",
3
- "version": "1.5.1",
3
+ "version": "1.5.2",
4
4
  "description": "AI-powered CLI tool for capturing clean screenshots of websites",
5
5
  "type": "module",
6
6
  "main": "./dist/index.js",