autokap 1.2.0 → 1.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -52,12 +52,19 @@ export class NoOpRecoveryChain {
52
52
  }
53
53
  }
54
54
  const MIN_CLIP_FINALIZATION_TIMEOUT_MS = 30000;
55
+ const DEFAULT_VIDEO_RECORDING_RESOLUTION = { width: 1920, height: 1080 };
55
56
  function resolveOpcodeTimeoutMs(opcode) {
56
57
  if (opcode.kind === 'END_CLIP') {
57
58
  return Math.max(opcode.timeoutMs, MIN_CLIP_FINALIZATION_TIMEOUT_MS);
58
59
  }
59
60
  return opcode.timeoutMs;
60
61
  }
62
/**
 * Picks the capture resolution passed to the recorder.
 *
 * Plans with `mediaMode === 'video'` always get
 * DEFAULT_VIDEO_RECORDING_RESOLUTION; any other plan gets
 * `artifactPlan.format?.captureResolution`, which may be undefined.
 *
 * @param {{ mediaMode?: string, format?: { captureResolution?: object } }} artifactPlan
 * @returns {object | undefined}
 */
function resolveRecordingCaptureResolution(artifactPlan) {
    const isVideoPlan = artifactPlan.mediaMode === 'video';
    return isVideoPlan
        ? DEFAULT_VIDEO_RECORDING_RESOLUTION
        : artifactPlan.format?.captureResolution;
}
61
68
  // ── Main execution function ─────────────────────────────────────────
62
69
  export async function executeProgram(program, createAdapter, options = {}) {
63
70
  const recoveryChain = options.recoveryChain ?? new NoOpRecoveryChain();
@@ -65,6 +72,7 @@ export async function executeProgram(program, createAdapter, options = {}) {
65
72
  const startTime = Date.now();
66
73
  const variantResults = new Array(program.variants.length);
67
74
  const healerPatches = [];
75
+ const opcodeTimings = [];
68
76
  const telemetry = {
69
77
  llmCallCount: 0,
70
78
  llmCostEur: 0,
@@ -90,7 +98,7 @@ export async function executeProgram(program, createAdapter, options = {}) {
90
98
  variantId: variant.id,
91
99
  message: `starting variant ${variant.id}`,
92
100
  });
93
- const variantResult = await executeVariant(program, variant, createAdapter, recoveryChain, telemetry, healerPatches, options);
101
+ const variantResult = await executeVariant(program, variant, createAdapter, recoveryChain, telemetry, healerPatches, opcodeTimings, options);
94
102
  variantResults[currentIndex] = variantResult;
95
103
  options.onProgress?.({
96
104
  type: 'variant_end',
@@ -112,12 +120,13 @@ export async function executeProgram(program, createAdapter, options = {}) {
112
120
  variantResults: completedVariantResults,
113
121
  telemetry,
114
122
  healerPatches: success ? healerPatches : [], // Only propagate patches on success
123
+ opcodeTimings,
115
124
  totalDurationMs: Date.now() - startTime,
116
125
  error: aborted ? 'aborted' : (success ? undefined : completedVariantResults.find(v => !v.success)?.error),
117
126
  };
118
127
  }
119
128
  // ── Variant execution ───────────────────────────────────────────────
120
- async function executeVariant(program, variant, createAdapter, recoveryChain, telemetry, healerPatches, options) {
129
+ async function executeVariant(program, variant, createAdapter, recoveryChain, telemetry, healerPatches, opcodeTimings, options) {
121
130
  const startTime = Date.now();
122
131
  const breaker = new CircuitBreaker();
123
132
  const verifier = new ActionVerifier();
@@ -165,7 +174,7 @@ async function executeVariant(program, variant, createAdapter, recoveryChain, te
165
174
  opcodeKind: opcode.kind,
166
175
  message: opcode.description,
167
176
  });
168
- const result = await executeOpcode(opcode, i, adapter, verifier, breaker, recoveryChain, telemetry, healerPatches, artifacts, options, variant.id, executionState, program.artifactPlan, program.mockDataGroups, variant, program.preconditions.credentials);
177
+ const result = await executeOpcode(opcode, i, adapter, verifier, breaker, recoveryChain, telemetry, healerPatches, opcodeTimings, artifacts, options, variant.id, executionState, program.artifactPlan, program.mockDataGroups, variant, program.preconditions.credentials);
169
178
  opcodeResults.push(result);
170
179
  telemetry.totalOpcodes++;
171
180
  if (result.status === 'recovered')
@@ -241,7 +250,7 @@ function softSkipResult(opcode, index, startTime, reason, telemetry) {
241
250
  error: reason,
242
251
  };
243
252
  }
244
- async function executeOpcode(opcode, index, adapter, verifier, breaker, recoveryChain, telemetry, healerPatches, artifacts, options, variantId, executionState, artifactPlan, mockDataGroups, currentVariant, credentials) {
253
+ async function executeOpcode(opcode, index, adapter, verifier, breaker, recoveryChain, telemetry, healerPatches, opcodeTimings, artifacts, options, variantId, executionState, artifactPlan, mockDataGroups, currentVariant, credentials) {
245
254
  const startTime = Date.now();
246
255
  const effectiveTimeoutMs = resolveOpcodeTimeoutMs(opcode);
247
256
  const deadlineMs = startTime + effectiveTimeoutMs;
@@ -267,17 +276,33 @@ async function executeOpcode(opcode, index, adapter, verifier, breaker, recovery
267
276
  logger.debug(`[opcode ${index}] no budget left after captureBeforeState (deadline=${deadlineMs}, now=${Date.now()})`);
268
277
  if (isSoft)
269
278
  return softSkipResult(opcode, index, startTime, reason, telemetry);
270
- return handleFailure(opcode, index, adapter, verifier, isInteraction, breaker, recoveryChain, telemetry, healerPatches, options, variantId, currentVariant, startTime, deadlineMs, effectiveTimeoutMs, reason);
279
+ return handleFailure(opcode, index, adapter, verifier, isInteraction, breaker, recoveryChain, telemetry, healerPatches, options, executionState, variantId, currentVariant, startTime, deadlineMs, effectiveTimeoutMs, reason);
271
280
  }
281
+ // For mediaMode='video', capture pre-action timing + bbox metadata inside
282
+ // the active clip window only. Opcodes outside a clip are not part of the
283
+ // video output.
284
+ const preTiming = await capturePreActionTiming(opcode, adapter, executionState.activeClip, artifactPlan);
272
285
  logger.debug(`[opcode ${index}] action exec start — actionBudget ${actionBudgetMs}ms`);
273
286
  const actionStart = Date.now();
274
287
  const result = await withTimeout(() => executeOpcodeAction(opcode, index, adapter, artifacts, telemetry, currentVariant, executionState, artifactPlan, mockDataGroups, options, credentials), actionBudgetMs);
275
288
  logger.debug(`[opcode ${index}] action exec end — took ${Date.now() - actionStart}ms, success=${result.success}${result.error ? `, error=${result.error}` : ''}`);
289
+ if (preTiming) {
290
+ opcodeTimings.push({
291
+ stepIndex: index,
292
+ stepId: opcode.stepId,
293
+ opcodeKind: opcode.kind,
294
+ variantId,
295
+ clipId: preTiming.clipId,
296
+ timecodeStartMs: preTiming.timecodeStartMs,
297
+ timecodeEndMs: Math.max(0, Date.now() - preTiming.clipStartedAt),
298
+ bbox: preTiming.bbox,
299
+ });
300
+ }
276
301
  if (!result.success) {
277
302
  const reason = result.error ?? 'action failed';
278
303
  if (isSoft)
279
304
  return softSkipResult(opcode, index, startTime, reason, telemetry);
280
- return handleFailure(opcode, index, adapter, verifier, isInteraction, breaker, recoveryChain, telemetry, healerPatches, options, variantId, currentVariant, startTime, deadlineMs, effectiveTimeoutMs, reason);
305
+ return handleFailure(opcode, index, adapter, verifier, isInteraction, breaker, recoveryChain, telemetry, healerPatches, options, executionState, variantId, currentVariant, startTime, deadlineMs, effectiveTimeoutMs, reason);
281
306
  }
282
307
  // Verify postcondition
283
308
  const postconditionBudgetMs = getRemainingTimeMs(deadlineMs);
@@ -286,7 +311,7 @@ async function executeOpcode(opcode, index, adapter, verifier, breaker, recovery
286
311
  logger.debug(`[opcode ${index}] no budget left for postcondition check`);
287
312
  if (isSoft)
288
313
  return softSkipResult(opcode, index, startTime, reason, telemetry);
289
- return handleFailure(opcode, index, adapter, verifier, isInteraction, breaker, recoveryChain, telemetry, healerPatches, options, variantId, currentVariant, startTime, deadlineMs, effectiveTimeoutMs, reason);
314
+ return handleFailure(opcode, index, adapter, verifier, isInteraction, breaker, recoveryChain, telemetry, healerPatches, options, executionState, variantId, currentVariant, startTime, deadlineMs, effectiveTimeoutMs, reason);
290
315
  }
291
316
  const postStart = Date.now();
292
317
  const postcondition = await evaluatePostcondition(adapter, withClampedPostconditionTimeout(opcode.postcondition, postconditionBudgetMs));
@@ -295,13 +320,13 @@ async function executeOpcode(opcode, index, adapter, verifier, breaker, recovery
295
320
  const reason = `postcondition failed: ${postcondition.reason}`;
296
321
  if (isSoft)
297
322
  return softSkipResult(opcode, index, startTime, reason, telemetry);
298
- return handleFailure(opcode, index, adapter, verifier, isInteraction, breaker, recoveryChain, telemetry, healerPatches, options, variantId, currentVariant, startTime, deadlineMs, effectiveTimeoutMs, reason);
323
+ return handleFailure(opcode, index, adapter, verifier, isInteraction, breaker, recoveryChain, telemetry, healerPatches, options, executionState, variantId, currentVariant, startTime, deadlineMs, effectiveTimeoutMs, reason);
299
324
  }
300
325
  // Verify action had effect (for interaction opcodes)
301
326
  if (isInteraction) {
302
327
  const verification = await verifier.verifyAfterAction(adapter);
303
328
  if (!verification.hadEffect && opcode.postcondition.type !== 'always' && opcode.postcondition.type !== 'any_change') {
304
- return handleFailure(opcode, index, adapter, verifier, isInteraction, breaker, recoveryChain, telemetry, healerPatches, options, variantId, currentVariant, startTime, deadlineMs, effectiveTimeoutMs, `action had no effect: ${verification.summary}`);
329
+ return handleFailure(opcode, index, adapter, verifier, isInteraction, breaker, recoveryChain, telemetry, healerPatches, options, executionState, variantId, currentVariant, startTime, deadlineMs, effectiveTimeoutMs, `action had no effect: ${verification.summary}`);
305
330
  }
306
331
  }
307
332
  // Record successful mock data application
@@ -310,6 +335,17 @@ async function executeOpcode(opcode, index, adapter, verifier, breaker, recovery
310
335
  telemetry.mockDataGroupResults = {};
311
336
  telemetry.mockDataGroupResults[opcode.groupName] = 'applied';
312
337
  }
338
+ // AUT-57 — for `mediaMode='video'`, every visible interaction (CLICK,
339
+ // SCROLL, NAVIGATE, etc.) gets a deterministic post-action settle. This
340
+ // gives the page time to finish smooth-scroll / route transitions /
341
+ // animations BEFORE the next opcode starts (typically a SLEEP narration
342
+ // anchor). The recorded video then has a natural breath after each
343
+ // action and the voice always lands on a settled visual.
344
+ if (artifactPlan.mediaMode === 'video'
345
+ && executionState.activeClip
346
+ && isVideoVisibleInteraction(opcode)) {
347
+ await sleep(VIDEO_POST_ACTION_SETTLE_MS);
348
+ }
313
349
  breaker.recordSuccess(index);
314
350
  return {
315
351
  opcodeIndex: index,
@@ -323,11 +359,33 @@ async function executeOpcode(opcode, index, adapter, verifier, breaker, recovery
323
359
  const errorMsg = err instanceof Error ? err.message : String(err);
324
360
  if (isSoft)
325
361
  return softSkipResult(opcode, index, startTime, errorMsg, telemetry);
326
- return handleFailure(opcode, index, adapter, verifier, isInteraction, breaker, recoveryChain, telemetry, healerPatches, options, variantId, currentVariant, startTime, deadlineMs, effectiveTimeoutMs, errorMsg);
362
+ return handleFailure(opcode, index, adapter, verifier, isInteraction, breaker, recoveryChain, telemetry, healerPatches, options, executionState, variantId, currentVariant, startTime, deadlineMs, effectiveTimeoutMs, errorMsg);
327
363
  }
328
364
  }
365
/**
 * Pause (ms) inserted after each visible interaction while a
 * `mediaMode='video'` clip is active. See `executeOpcode` for context.
 */
const VIDEO_POST_ACTION_SETTLE_MS = 500;
/** Clip id used by video-mode BEGIN_CLIP opcodes that do not supply one. */
const DEFAULT_VIDEO_CLIP_ID = 'main';
/** Opcode kinds treated as visible interactions inside a video clip. */
const VIDEO_VISIBLE_INTERACTIONS = new Set([
    'NAVIGATE', 'CLICK', 'DOUBLE_CLICK', 'TYPE', 'HOVER',
    'SCROLL', 'PRESS_KEY', 'DRAG', 'SELECT_OPTION', 'CHECK',
]);
/**
 * Reports whether the opcode's kind is listed in VIDEO_VISIBLE_INTERACTIONS.
 *
 * @param {{ kind: string }} opcode
 * @returns {boolean}
 */
function isVideoVisibleInteraction(opcode) {
    const { kind } = opcode;
    return VIDEO_VISIBLE_INTERACTIONS.has(kind);
}
384
/**
 * Returns a promise that settles (with no value) once a `setTimeout` of
 * `ms` milliseconds fires.
 *
 * @param {number} ms delay in milliseconds
 * @returns {Promise<void>}
 */
function sleep(ms) {
    return new Promise((done) => {
        setTimeout(done, ms);
    });
}
329
387
  // ── Failure handling with recovery ──────────────────────────────────
330
- async function handleFailure(opcode, index, adapter, verifier, isInteraction, breaker, recoveryChain, telemetry, healerPatches, options, variantId, currentVariant, startTime, deadlineMs, effectiveTimeoutMs, errorMsg) {
388
+ async function handleFailure(opcode, index, adapter, verifier, isInteraction, breaker, recoveryChain, telemetry, healerPatches, options, executionState, variantId, currentVariant, startTime, deadlineMs, effectiveTimeoutMs, errorMsg) {
331
389
  const breakerState = breaker.recordFailure(index, opcode.maxFailures);
332
390
  if (breakerState.tripped) {
333
391
  telemetry.circuitBreakerTrips++;
@@ -363,6 +421,8 @@ async function handleFailure(opcode, index, adapter, verifier, isInteraction, br
363
421
  remainingTimeMs,
364
422
  maxDeterministicRetries: Math.max(0, opcode.maxFailures - breakerState.opcodeFailures),
365
423
  currentVariant,
424
+ allowPageReload: !executionState.activeClip,
425
+ suppressPageReloads: Boolean(executionState.activeClip),
366
426
  });
367
427
  if (recovery.llmResult) {
368
428
  telemetry.llmCallCount++;
@@ -423,6 +483,7 @@ async function executeOpcodeAction(opcode, opcodeIndex, adapter, artifacts, tele
423
483
  case 'TYPE':
424
484
  case 'PRESS_KEY':
425
485
  case 'WAIT_FOR':
486
+ case 'SLEEP':
426
487
  case 'SET_LOCALE':
427
488
  case 'SET_THEME':
428
489
  case 'SCROLL':
@@ -435,7 +496,12 @@ async function executeOpcodeAction(opcode, opcodeIndex, adapter, artifacts, tele
435
496
  case 'INJECT_MOCK_DATA':
436
497
  case 'REMOVE_ELEMENT':
437
498
  case 'SET_ATTRIBUTE':
438
- return executeOpcodeCoreAction(opcode, adapter, { currentVariant, mockDataGroups, credentials });
499
+ return executeOpcodeCoreAction(opcode, adapter, {
500
+ currentVariant,
501
+ mockDataGroups,
502
+ credentials,
503
+ suppressPageReloads: Boolean(executionState.activeClip),
504
+ });
439
505
  case 'ASSERT_ROUTE':
440
506
  return evaluateImmediateAssertion(await evaluatePostcondition(adapter, {
441
507
  type: 'route_matches',
@@ -555,22 +621,33 @@ async function executeOpcodeAction(opcode, opcodeIndex, adapter, artifacts, tele
555
621
  });
556
622
  break;
557
623
  }
558
- case 'BEGIN_CLIP':
624
+ case 'BEGIN_CLIP': {
559
625
  if (executionState.activeClip) {
560
626
  return { success: false, error: 'cannot start a new clip before the previous one ends' };
561
627
  }
628
+ const recordingMode = artifactPlan.mediaMode === 'video' ? 'video' : 'clip';
562
629
  executionState.activeClip = {
563
- clipId: opcode.clipId,
630
+ clipId: opcode.clipId ?? (recordingMode === 'video' ? DEFAULT_VIDEO_CLIP_ID : undefined),
564
631
  clipName: opcode.clipName,
632
+ startedAt: Date.now(),
565
633
  };
566
- await adapter.beginRecording({ mediaMode: 'clip' });
634
+ await adapter.beginRecording({
635
+ mediaMode: recordingMode,
636
+ captureResolution: resolveRecordingCaptureResolution(artifactPlan),
637
+ captureFps: artifactPlan.format?.captureFps,
638
+ });
567
639
  break;
640
+ }
568
641
  case 'END_CLIP': {
569
642
  const clipIdentity = resolveClipIdentity(executionState.activeClip, opcode);
570
643
  const recording = await adapter.endRecording();
571
644
  executionState.activeClip = undefined;
645
+ // Match the artifact's mediaMode to the program's so the upload route
646
+ // routes opcode-video clips to the AUT-57 storage path
647
+ // (`raw/{videoId}/{clipId}.mp4`) instead of the legacy clip flow.
648
+ const artifactMediaMode = artifactPlan.mediaMode === 'video' ? 'video' : 'clip';
572
649
  artifacts.push({
573
- mediaMode: 'clip',
650
+ mediaMode: artifactMediaMode,
574
651
  buffer: recording.buffer,
575
652
  mimeType: recording.mimeType,
576
653
  durationMs: recording.durationMs,
@@ -600,7 +677,52 @@ async function executeOpcodeAction(opcode, opcodeIndex, adapter, artifacts, tele
600
677
  };
601
678
  }
602
679
  }
603
- // ── Helpers ─────────────────────────────────────────────────────────
680
/**
 * Takes the pre-action snapshot used for per-opcode video timing metadata.
 *
 * Yields `null` unless the plan is `mediaMode === 'video'` and there is an
 * active clip with a recorded `startedAt`. Otherwise returns the clip id,
 * the clip start timestamp, the offset (ms, never negative) of "now" from
 * the clip start, and the bounding box of the opcode's target element.
 *
 * The bounding box is looked up only when `getZoomTargetSelector` returns a
 * selector and the adapter implements `getElementBoundingBox`. A failing
 * lookup is deliberately swallowed and reported as `bbox: null`.
 *
 * @returns {Promise<null | { clipId: *, clipStartedAt: number, timecodeStartMs: number, bbox: * }>}
 */
async function capturePreActionTiming(opcode, adapter, activeClip, artifactPlan) {
    const isVideoPlan = artifactPlan.mediaMode === 'video';
    if (!isVideoPlan) {
        return null;
    }
    const hasClipClock = Boolean(activeClip) && activeClip.startedAt !== undefined;
    if (!hasClipClock) {
        return null;
    }
    // Sample the clock before any awaited work so the timecode marks the
    // moment just before the action starts.
    const timecodeStartMs = Math.max(0, Date.now() - activeClip.startedAt);
    const targetSelector = getZoomTargetSelector(opcode);
    let bbox = null;
    if (targetSelector && adapter.getElementBoundingBox) {
        try {
            bbox = await adapter.getElementBoundingBox(targetSelector);
        }
        catch {
            // Best effort: timing is still emitted without a box.
            bbox = null;
        }
    }
    return {
        clipId: activeClip.clipId,
        clipStartedAt: activeClip.startedAt,
        timecodeStartMs,
        bbox,
    };
}
710
/**
 * Returns the selector that video timing metadata should anchor on.
 *
 * Only CLICK, DOUBLE_CLICK, TYPE and HOVER opcodes have such a target; for
 * those the opcode's own `selector` is returned. Every other kind yields
 * `undefined`.
 *
 * @param {{ kind: string, selector?: string }} opcode
 * @returns {string | undefined}
 */
function getZoomTargetSelector(opcode) {
    const { kind } = opcode;
    const hasTrackedTarget = kind === 'CLICK'
        || kind === 'DOUBLE_CLICK'
        || kind === 'TYPE'
        || kind === 'HOVER';
    return hasTrackedTarget ? opcode.selector : undefined;
}
604
726
  async function withTimeout(fn, timeoutMs) {
605
727
  return new Promise((resolve, reject) => {
606
728
  const timer = setTimeout(() => reject(new Error(`timeout after ${timeoutMs}ms`)), timeoutMs);
@@ -0,0 +1,74 @@
1
+ /**
2
+ * Output formats accepted by OpenRouter's `/audio/speech` endpoint
3
+ * (April 2026 docs). Only `mp3` and `pcm` are supported — narrower than
4
+ * OpenAI's direct API which also exposes opus/aac/flac/wav.
5
+ */
6
+ export type TtsResponseFormat = 'mp3' | 'pcm';
7
+ export interface TtsClientConfig {
8
+ apiKey: string; // Sent as the Bearer token in the Authorization header.
9
+ /** Override base URL — useful for tests or pointing at OpenAI directly. A single trailing slash is stripped. Default: OpenRouter (`https://openrouter.ai/api/v1`). */
10
+ baseUrl?: string;
11
+ /** Default model for this client, used when the request has no `model`. Default: `openai/gpt-4o-mini-tts-2025-12-15`. */
12
+ defaultModel?: string;
13
+ /** Per-request (per-attempt) timeout in milliseconds. Default: 30000 (30s). */
14
+ timeoutMs?: number;
15
+ /** Number of retry attempts on transient failures (HTTP 5xx, HTTP 429, network errors); other 4xx responses are never retried. Negative values are treated as 0. Default: 2. */
16
+ maxRetries?: number;
17
+ }
18
+ export interface TtsRequest {
19
+ /** Text to synthesize. Empty or whitespace-only text is rejected with `TtsError` before any request is sent; keep chunks under ~4k chars. */
20
+ text: string;
21
+ /** Voice handle, e.g. 'nova', 'alloy', 'echo'. Provider-specific. */
22
+ voice: string;
23
+ /** Optional model override. Falls back to the client config's `defaultModel`, then to the built-in default model. */
24
+ model?: string;
25
+ /** Output container/codec. Default: 'mp3'. */
26
+ format?: TtsResponseFormat;
27
+ /** Speaking-rate multiplier (0.25..4.0). Provider may clamp. Only sent when it is a finite number; otherwise omitted so the provider's own default (presumably 1.0) applies. */
28
+ speed?: number;
29
+ /** Optional abort signal for cancellation; aborting it aborts the in-flight HTTP request. */
30
+ signal?: AbortSignal;
31
+ }
32
+ export interface TtsResponse {
33
+ /** Raw audio bytes in the requested format. */
34
+ audioBuffer: Buffer;
35
+ /** MIME type for the requested format: `audio/mpeg` for mp3, `audio/pcm` for pcm. */
36
+ mimeType: string;
37
+ /** Audio duration in ms, obtained by probing the produced buffer through `ttsTestHooks.probeAudioDurationMs`. */
38
+ durationMs: number;
39
+ /** Effective model used (request override, else client default, else built-in default). */
40
+ model: string;
41
+ /** Voice used, echoed from the request. */
42
+ voice: string;
43
+ }
44
+ export declare class TtsError extends Error {
45
+ readonly status?: number | undefined;
46
+ readonly providerBody?: string | undefined;
47
+ constructor(message: string, status?: number | undefined, providerBody?: string | undefined);
48
+ }
49
+ /**
50
+ * POST a single TTS chunk to the configured provider, retry on 5xx / network
51
+ * errors up to `maxRetries`, then probe the resulting buffer for duration.
52
+ *
53
+ * Throws `TtsError` (with HTTP status when available) on permanent failure.
54
+ */
55
+ export declare function generateTtsChunk(config: TtsClientConfig, request: TtsRequest): Promise<TtsResponse>;
56
+ /**
57
+ * Mutable indirection used by `generateTtsChunk` to find its dependencies.
58
+ * Tests reach into this object to swap `probeAudioDurationMs` for a fake
59
+ * (ESM exports are immutable bindings, so a plain `vi.spyOn` on the named
60
+ * export does not affect the in-module reference).
61
+ */
62
+ export declare const ttsTestHooks: {
63
+ probeAudioDurationMs: (audioBuffer: Buffer, format: TtsResponseFormat) => Promise<number>;
64
+ };
65
+ /**
66
+ * Write the buffer to a tempfile, run ffprobe to get the format duration,
67
+ * clean up. Errors propagate (caller can wrap as TtsError if needed).
68
+ *
69
+ * Exported under both the legacy name (`probeAudioDurationMs`) and via the
70
+ * `ttsTestHooks` indirection so unit tests can override it.
71
+ */
72
+ export declare const probeAudioDurationMs: typeof defaultProbeAudioDurationMs;
73
+ declare function defaultProbeAudioDurationMs(audioBuffer: Buffer, format: TtsResponseFormat): Promise<number>;
74
+ export {};
@@ -0,0 +1,218 @@
1
+ /**
2
+ * AUT-57 — OpenRouter TTS client.
3
+ *
4
+ * Posts a single chunk of narration text to OpenRouter's `/audio/speech`
5
+ * endpoint (mirrors the OpenAI shape) and returns the audio buffer + measured
6
+ * duration. Word-level timestamps are NOT returned by this provider; the
7
+ * caller derives per-word timings separately for animated subtitles.
8
+ *
9
+ * The default model is `openai/gpt-4o-mini-tts` (chosen in the AUT-57 plan as
10
+ * the V1 baseline — good EN voices, OpenRouter-billed). Override via the
11
+ * caller if a benchmark identifies a better trade-off.
12
+ *
13
+ * **Pure module** — no Node-only imports beyond `node:fs` / `node:os` /
14
+ * `node:path` / `node:child_process` (used only for duration probing). Safe
15
+ * to import from both the CLI and the Next.js server runtime.
16
+ */
17
+ import { execFile } from 'node:child_process';
18
+ import { promises as fs } from 'node:fs';
19
+ import os from 'node:os';
20
+ import path from 'node:path';
21
+ import { promisify } from 'node:util';
22
+ const execFileAsync = promisify(execFile);
23
+ const OPENROUTER_BASE_URL = 'https://openrouter.ai/api/v1';
24
+ /**
25
+ * Default TTS model. Matches the exact slug listed under
26
+ * `GET /models?output_modalities=speech` (April 2026):
27
+ * - openai/gpt-4o-mini-tts-2025-12-15
28
+ * - mistralai/voxtral-mini-tts-2603
29
+ * - google/gemini-3.1-flash-tts-preview
30
+ * - hexgrad/kokoro-82m, sesame/csm-1b, zyphra/zonos-*, canopylabs/orpheus-*
31
+ *
32
+ * gpt-4o-mini-tts ranks well on EN voice quality and is the AUT-57 baseline.
33
+ * The PR #3 spec explicitly calls it out — pivot per benchmark in PR #4 if
34
+ * a cheaper / better fit emerges.
35
+ */
36
+ const DEFAULT_TTS_MODEL = 'openai/gpt-4o-mini-tts-2025-12-15';
37
+ const DEFAULT_RESPONSE_FORMAT = 'mp3';
38
+ const DEFAULT_TIMEOUT_MS = 30_000;
39
+ const DEFAULT_MAX_RETRIES = 2;
40
+ const FORMAT_MIME_TYPES = {
41
+ mp3: 'audio/mpeg',
42
+ pcm: 'audio/pcm',
43
+ };
44
+ const FORMAT_FILE_EXTENSIONS = {
45
+ mp3: 'mp3',
46
+ pcm: 'pcm',
47
+ };
48
/**
 * Error raised by the TTS client.
 *
 * Carries the HTTP status and a copy of the provider's response body when
 * the failure came from an HTTP response; both are `undefined` otherwise.
 */
export class TtsError extends Error {
    /**
     * @param {string} message human-readable description
     * @param {number} [status] HTTP status of the failed response, if any
     * @param {string} [providerBody] response body text from the provider, if any
     */
    constructor(message, status, providerBody) {
        super(message);
        // Assigned in this order so own-property order stays
        // status, providerBody, name.
        Object.assign(this, { status, providerBody, name: 'TtsError' });
    }
}
58
/**
 * Synthesizes one chunk of narration.
 *
 * Resolves defaults (base URL, model, format, timeout, retry count), fetches
 * the audio through `fetchAudioWithRetry`, then measures its duration via
 * `ttsTestHooks.probeAudioDurationMs`.
 *
 * @param config client settings (`apiKey`, optional `baseUrl`, `defaultModel`,
 *   `timeoutMs`, `maxRetries`; negative `maxRetries` is treated as 0)
 * @param request what to synthesize (`text`, `voice`, optional `model`,
 *   `format`, `speed`, `signal`)
 * @returns audio bytes, MIME type, duration in ms, and the model/voice used
 * @throws {TtsError} when `text` is empty or whitespace-only; errors from the
 *   fetch or the duration probe propagate unchanged
 */
export async function generateTtsChunk(config, request) {
    const { text, voice } = request;
    if (text.trim() === '') {
        throw new TtsError('TTS request rejected: empty text');
    }
    const model = request.model ?? config.defaultModel ?? DEFAULT_TTS_MODEL;
    const format = request.format ?? DEFAULT_RESPONSE_FORMAT;
    const audioBuffer = await fetchAudioWithRetry({
        baseUrl: config.baseUrl ?? OPENROUTER_BASE_URL,
        apiKey: config.apiKey,
        model,
        voice,
        text,
        format,
        speed: request.speed,
        timeoutMs: config.timeoutMs ?? DEFAULT_TIMEOUT_MS,
        // A negative retry count makes no sense; clamp it to "no retries".
        maxRetries: Math.max(0, config.maxRetries ?? DEFAULT_MAX_RETRIES),
        abortSignal: request.signal,
    });
    const durationMs = await ttsTestHooks.probeAudioDurationMs(audioBuffer, format);
    const mimeType = FORMAT_MIME_TYPES[format];
    return { audioBuffer, mimeType, durationMs, model, voice };
}
94
/**
 * Calls `fetchAudio` until it succeeds or the retry budget is exhausted.
 *
 * Retry policy:
 *  - once `params.abortSignal` is aborted the error is rethrown immediately;
 *    a cancelled request is never retried and never waits out a backoff;
 *  - a `TtsError` with a 4xx status other than 429 is permanent and is
 *    rethrown immediately;
 *  - anything else (5xx, 429, network errors, per-attempt timeouts) is
 *    retried after `backoffMs(attempt)` up to `params.maxRetries` times.
 *
 * @param params request parameters forwarded unchanged to `fetchAudio`;
 *   `maxRetries` and the optional `abortSignal` are also read here
 * @returns the audio buffer produced by `fetchAudio`
 * @throws the last error seen, or a generic `TtsError` if no attempt ran
 */
async function fetchAudioWithRetry(params) {
    let lastErr = null;
    for (let attempt = 0; attempt <= params.maxRetries; attempt++) {
        try {
            return await fetchAudio(params);
        }
        catch (err) {
            lastErr = err;
            if (params.abortSignal?.aborted) {
                // The caller cancelled: every further attempt would abort
                // straight away, so stop instead of sleeping and retrying.
                throw err;
            }
            if (err instanceof TtsError && err.status && err.status < 500 && err.status !== 429) {
                // 4xx (other than 429) is permanent — don't retry.
                throw err;
            }
            if (attempt < params.maxRetries) {
                await sleep(backoffMs(attempt));
            }
        }
    }
    if (lastErr instanceof Error)
        throw lastErr;
    throw new TtsError('TTS request failed after retries');
}
116
/**
 * Performs a single POST to `<baseUrl>/audio/speech` and returns the audio.
 *
 * The request is aborted when `params.timeoutMs` elapses or when the
 * optional `params.abortSignal` fires (or was already aborted). The timer
 * and the abort listener are always released, whatever the outcome.
 *
 * @param params `baseUrl`, `apiKey`, `model`, `text`, `voice`, `format`,
 *   optional `speed`, `timeoutMs`, optional `abortSignal`
 * @returns {Promise<Buffer>} the response body
 * @throws {TtsError} on a non-2xx response (with status and a copy of the
 *   body) or when the response body is empty; errors thrown by `fetch`
 *   itself (including aborts) propagate unchanged
 */
async function fetchAudio(params) {
    // Tolerate a single trailing slash on the configured base URL.
    const endpoint = `${params.baseUrl.replace(/\/$/, '')}/audio/speech`;
    const controller = new AbortController();
    const abortRequest = () => controller.abort();
    const timer = setTimeout(abortRequest, params.timeoutMs);
    // Mirror the caller's signal onto our own controller.
    const callerSignal = params.abortSignal;
    if (callerSignal) {
        if (callerSignal.aborted) {
            abortRequest();
        }
        else {
            callerSignal.addEventListener('abort', abortRequest, { once: true });
        }
    }
    try {
        // `speed` is only sent when it is a real, finite number.
        const payload = {
            model: params.model,
            input: params.text,
            voice: params.voice,
            response_format: params.format,
            ...(Number.isFinite(params.speed) ? { speed: params.speed } : {}),
        };
        const headers = {
            Authorization: `Bearer ${params.apiKey}`,
            'Content-Type': 'application/json',
            Accept: FORMAT_MIME_TYPES[params.format],
        };
        const response = await fetch(endpoint, {
            method: 'POST',
            headers,
            body: JSON.stringify(payload),
            signal: controller.signal,
        });
        if (!response.ok) {
            const providerBody = await safeReadBody(response);
            throw new TtsError(`TTS request failed: ${response.status} ${response.statusText}`, response.status, providerBody);
        }
        const bytes = await response.arrayBuffer();
        if (bytes.byteLength === 0) {
            throw new TtsError('TTS provider returned empty body');
        }
        return Buffer.from(bytes);
    }
    finally {
        clearTimeout(timer);
        if (callerSignal) {
            callerSignal.removeEventListener('abort', abortRequest);
        }
    }
}
165
/**
 * Best-effort read of a response body for error reporting.
 *
 * @param response object exposing an async `text()` method
 * @returns {Promise<string | undefined>} at most the first 2000 characters
 *   of the body, or `undefined` when the body cannot be read
 */
async function safeReadBody(response) {
    let bodyText;
    try {
        bodyText = await response.text();
        return bodyText.slice(0, 2000);
    }
    catch {
        // Reading the body is optional context; never let it mask the
        // original HTTP failure.
        return undefined;
    }
}
173
/**
 * Returns a promise that settles (with no value) once a `setTimeout` of
 * `ms` milliseconds fires. Used for retry backoff.
 *
 * @param {number} ms delay in milliseconds
 * @returns {Promise<void>}
 */
function sleep(ms) {
    return new Promise((done) => {
        setTimeout(done, ms);
    });
}
176
/**
 * Exponential backoff delay for a zero-based retry attempt:
 * 250ms, 500ms, 1s, 2s, 4s, then capped at 5s.
 *
 * @param {number} attempt zero-based attempt index
 * @returns {number} delay in milliseconds
 */
function backoffMs(attempt) {
    const uncapped = 250 * 2 ** attempt;
    return Math.min(uncapped, 5000);
}
180
+ /**
181
+ * Mutable indirection used by `generateTtsChunk` to find its dependencies.
182
+ * Tests reach into this object to swap `probeAudioDurationMs` for a fake
183
+ * (ESM exports are immutable bindings, so a plain `vi.spyOn` on the named
184
+ * export does not affect the in-module reference).
185
+ */
186
+ export const ttsTestHooks = {
187
+ probeAudioDurationMs: defaultProbeAudioDurationMs,
188
+ };
189
+ /**
190
+ * Write the buffer to a tempfile, run ffprobe to get the format duration,
191
+ * clean up. Errors propagate (caller can wrap as TtsError if needed).
192
+ *
193
+ * Exported under both the legacy name (`probeAudioDurationMs`) and via the
194
+ * `ttsTestHooks` indirection so unit tests can override it.
195
+ */
196
+ export const probeAudioDurationMs = defaultProbeAudioDurationMs;
197
/**
 * Measures the duration of a synthesized audio buffer, in milliseconds.
 *
 * - `pcm`: the payload is headerless raw samples, so a file named
 *   `chunk.pcm` gives ffprobe nothing to identify the format or sample rate
 *   from. The duration is therefore derived from the byte length.
 *   NOTE(review): this assumes 24 kHz, 16-bit, mono samples, which is what
 *   the OpenAI speech API documents for `pcm` — confirm for other providers
 *   or models routed through OpenRouter.
 * - any other format: the buffer is written to a temp directory, probed
 *   with `ffprobe`, and the directory is removed again whatever the outcome.
 *
 * @param {Buffer} audioBuffer encoded audio bytes
 * @param {'mp3' | 'pcm'} format format the bytes are encoded in
 * @returns {Promise<number>} duration in milliseconds, rounded
 * @throws {TtsError} when the duration cannot be determined; errors from
 *   the filesystem or from spawning ffprobe propagate unchanged
 */
async function defaultProbeAudioDurationMs(audioBuffer, format) {
    if (format === 'pcm') {
        const PCM_SAMPLE_RATE_HZ = 24000;
        const PCM_BYTES_PER_SAMPLE = 2; // 16-bit mono
        const sampleCount = Math.floor(audioBuffer.byteLength / PCM_BYTES_PER_SAMPLE);
        if (sampleCount <= 0) {
            throw new TtsError('could not measure audio duration: PCM buffer holds no samples');
        }
        return Math.round((sampleCount / PCM_SAMPLE_RATE_HZ) * 1000);
    }
    const tmpDir = await fs.mkdtemp(path.join(os.tmpdir(), 'autokap-tts-'));
    const audioPath = path.join(tmpDir, `chunk.${FORMAT_FILE_EXTENSIONS[format]}`);
    try {
        await fs.writeFile(audioPath, audioBuffer);
        const { stdout } = await execFileAsync('ffprobe', [
            '-v', 'error',
            '-show_entries', 'format=duration',
            '-of', 'default=noprint_wrappers=1:nokey=1',
            audioPath,
        ]);
        const seconds = parseFloat(stdout.trim());
        if (!Number.isFinite(seconds) || seconds <= 0) {
            throw new TtsError(`ffprobe could not measure audio duration (got "${stdout.trim()}")`);
        }
        return Math.round(seconds * 1000);
    }
    finally {
        // Cleanup is best effort; a leftover temp dir must not mask the result.
        await fs.rm(tmpDir, { recursive: true, force: true }).catch(() => { });
    }
}
218
+ //# sourceMappingURL=openrouter-tts.js.map
@@ -60,32 +60,7 @@ async function checkRouteMatches(adapter, pattern) {
60
60
  try {
61
61
  const { pathname, search } = new URL(url);
62
62
  const fullPath = pathname + search;
63
- // Support glob-like patterns: ** matches anything (incl. slashes / empty),
64
- // * matches a single path segment, ? matches one non-slash char.
65
- // Tokenize in one pass so the `*` rewrite doesn't clobber the `*` produced
66
- // by the `**` rewrite (e.g. `/home**` must compile to `^/home.*$`, not
67
- // `^/home.[^/]*$` which would reject `/home` itself).
68
- let regexStr = '';
69
- for (let i = 0; i < pattern.length; i++) {
70
- const ch = pattern[i];
71
- if (ch === '*' && pattern[i + 1] === '*') {
72
- regexStr += '.*';
73
- i++;
74
- }
75
- else if (ch === '*') {
76
- regexStr += '[^/]*';
77
- }
78
- else if (ch === '?') {
79
- regexStr += '[^/]';
80
- }
81
- else if (/[.+^${}()|[\]\\]/.test(ch)) {
82
- regexStr += `\\${ch}`;
83
- }
84
- else {
85
- regexStr += ch;
86
- }
87
- }
88
- const regex = new RegExp(`^${regexStr}$`);
63
+ const regex = compileRoutePattern(pattern);
89
64
  if (regex.test(fullPath) || regex.test(pathname)) {
90
65
  return { passed: true, reason: `URL "${fullPath}" matches pattern "${pattern}"` };
91
66
  }
@@ -95,6 +70,41 @@ async function checkRouteMatches(adapter, pattern) {
95
70
  return { passed: false, reason: `invalid URL "${url}" or pattern "${pattern}"` };
96
71
  }
97
72
  }
73
/**
 * Compiles a `route_matches` pattern into a RegExp.
 *
 * Two syntaxes are accepted:
 *  - Anchored regex: a pattern that starts with `^` or ends with `$` is
 *    handed to `RegExp` as-is (generated programs and ASSERT_ROUTE docs may
 *    use forms such as `^/$`). Only anchored patterns take this path, so
 *    common globs like `/pricing*` keep their meaning.
 *  - Glob: `**` matches anything (including slashes and nothing), `*`
 *    matches within one path segment, `?` matches one non-slash character,
 *    and every regex metacharacter is taken literally. The result is
 *    anchored at both ends.
 *
 * @param {string} pattern glob or anchored regex source
 * @returns {RegExp}
 * @throws {SyntaxError} when an anchored pattern is not a valid regex
 */
function compileRoutePattern(pattern) {
    const isAnchoredRegex = pattern.startsWith('^') || pattern.endsWith('$');
    if (isAnchoredRegex) {
        return new RegExp(pattern);
    }
    // Translate in a single left-to-right pass with `**` tried before `*`,
    // so `/home**` becomes `^/home.*$` and still accepts `/home` itself.
    const globSource = pattern.replace(/\*\*|\*|\?|[.+^${}()|[\]\\]/g, (token) => {
        switch (token) {
            case '**':
                return '.*';
            case '*':
                return '[^/]*';
            case '?':
                return '[^/]';
            default:
                // Any other match is a regex metacharacter: escape it.
                return `\\${token}`;
        }
    });
    return new RegExp(`^${globSource}$`);
}
98
108
  async function checkElementVisible(adapter, selector) {
99
109
  // Primary check: use Playwright waitFor (fast, reliable)
100
110
  try {