@lightcone-ai/daemon 0.15.71 → 0.15.72

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@lightcone-ai/daemon",
3
- "version": "0.15.71",
3
+ "version": "0.15.72",
4
4
  "type": "module",
5
5
  "main": "src/index.js",
6
6
  "bin": {
@@ -9,7 +9,12 @@ const DEFAULT_WIDTH = 1080;
9
9
  const DEFAULT_HEIGHT = 1920;
10
10
  const DEFAULT_FPS = 30;
11
11
  const TRANSITION_DURATION = 0.5;
12
- const SUBTITLE_FONT = 'PingFang SC,Microsoft YaHei,Arial';
12
+ // ASS `Fontname` is a single family name, not a CSS-style fallback list — a
13
+ // comma here shifts every subsequent field in the `Style:` line, corrupting the
14
+ // whole style so libass renders nothing (i.e. burned-in subtitles look missing).
15
+ // Use one installed family; libass + fontconfig handle glyph fallback. Override
16
+ // via SUBTITLE_FONT env if the deployment ships a different CJK font.
17
+ const SUBTITLE_FONT = (process.env.SUBTITLE_FONT || 'Noto Sans CJK SC').split(',')[0].trim() || 'Noto Sans CJK SC';
13
18
  const SUBTITLE_FONT_SIZE = 72;
14
19
  const SUBTITLE_MARGIN_V = 120;
15
20
 
@@ -23,6 +28,28 @@ function msToAssTimestamp(ms) {
23
28
  return `${hr}:${String(min).padStart(2, '0')}:${String(sec).padStart(2, '0')}.${String(cs).padStart(2, '0')}`;
24
29
  }
25
30
 
31
/**
 * Split a subtitle block into display-sized sentence units.
 *
 * The text is cut after sentence-ending punctuation (ideographic full stop,
 * ASCII and fullwidth `!` `?` `;`) and after newlines. Fragments shorter than
 * six code points are glued onto a neighbour so a one- or two-character line
 * never flashes on screen: a short fragment absorbs the piece that follows it,
 * and a short fragment at the very end is folded back into the piece before it.
 *
 * @param {unknown} text - Subtitle text; `null`/`undefined` are treated as empty.
 * @returns {string[]} Sentence units in reading order; `[]` for blank input.
 */
function splitSubtitleSentences(text) {
  const MIN_UNIT_CHARS = 6;
  const raw = String(text ?? '').trim();
  if (!raw) return [];

  // Count code points, not UTF-16 units, so astral characters count once.
  const visibleLength = (value) => Array.from(value.trim()).length;

  // Re-join two pieces. Whitespace that separated Latin text is restored as a
  // single space ("Wow! Nice"); CJK text is joined tightly, as it is written.
  const joinPieces = (left, right) => {
    const head = left.trimEnd();
    const tail = right.trimStart();
    const hadGap = head.length !== left.length || tail.length !== right.length;
    const latinBoundary = /[\x21-\x7E]$/.test(head) && /^[\x21-\x7E]/.test(tail);
    return hadGap && latinBoundary ? `${head} ${tail}` : `${head}${tail}`;
  };

  // Escapes are used for the non-ASCII punctuation so the pattern survives any
  // re-encoding of this file: U+3002 (。) U+FF01 (!) U+FF1F (?) U+FF1B (;).
  // Edge whitespace is kept on each piece until the end so joinPieces can tell
  // whether the original text had a gap at the boundary.
  const pieces = raw
    .split(/(?<=[\u3002!?;\uFF01\uFF1F\uFF1B\n])/u)
    .map((piece) => piece.replace(/\s+/g, ' '))
    .filter((piece) => piece.trim() !== '');
  if (pieces.length <= 1) return [(pieces[0] ?? raw).trim()];

  const merged = [];
  for (const piece of pieces) {
    const lastIndex = merged.length - 1;
    if (lastIndex >= 0 && visibleLength(merged[lastIndex]) < MIN_UNIT_CHARS) {
      merged[lastIndex] = joinPieces(merged[lastIndex], piece);
    } else {
      merged.push(piece);
    }
  }

  // A short fragment at the end has nothing after it to absorb; fold it back.
  if (merged.length > 1 && visibleLength(merged[merged.length - 1]) < MIN_UNIT_CHARS) {
    const tail = merged.pop();
    merged[merged.length - 1] = joinPieces(merged[merged.length - 1], tail);
  }

  return merged.map((unit) => unit.trim());
}
52
+
26
53
  function wrapSubtitleText(text, maxChars = 14) {
27
54
  const chars = Array.from(String(text ?? ''));
28
55
  if (chars.length <= maxChars) return chars.join('');
@@ -311,15 +338,29 @@ export async function composeVideoV2({
311
338
  }
312
339
  }
313
340
 
314
- // Build subtitle entries with cumulative timeline timestamps
341
+ // Build subtitle entries with cumulative timeline timestamps. When a clip's
342
+ // subtitle text spans several sentences, split it into one event per sentence
343
+ // and spread them across the clip in proportion to their length, so a long
344
+ // beat reads as sequential lines roughly tracking the narration instead of one
345
+ // static wall of text.
315
346
  let cursorMs = 0;
316
347
  const subtitleEntries = [];
317
348
  for (const clip of readyClips) {
318
349
  if (clip.subtitleText) {
319
- subtitleEntries.push({
320
- text: clip.subtitleText,
321
- start_ms: cursorMs,
322
- end_ms: cursorMs + Math.round(clip.duration * 1000),
350
+ const clipMs = Math.round(clip.duration * 1000);
351
+ const sentences = splitSubtitleSentences(clip.subtitleText);
352
+ const totalLen = sentences.reduce((sum, s) => sum + Array.from(s).length, 0) || 1;
353
+ let offsetMs = 0;
354
+ sentences.forEach((sentence, idx) => {
355
+ const share = Array.from(sentence).length / totalLen;
356
+ const isLast = idx === sentences.length - 1;
357
+ const spanMs = isLast ? clipMs - offsetMs : Math.max(1, Math.round(clipMs * share));
358
+ subtitleEntries.push({
359
+ text: sentence,
360
+ start_ms: cursorMs + offsetMs,
361
+ end_ms: cursorMs + offsetMs + spanMs,
362
+ });
363
+ offsetMs += spanMs;
323
364
  });
324
365
  }
325
366
  cursorMs += Math.round(clip.duration * 1000);
@@ -1,17 +1,12 @@
1
1
  import { spawn } from 'node:child_process';
2
2
  import { mkdirSync } from 'node:fs';
3
- import { stat, writeFile } from 'node:fs/promises';
3
+ import { mkdtemp, rm, stat, writeFile } from 'node:fs/promises';
4
+ import os from 'node:os';
4
5
  import path from 'node:path';
5
6
 
6
7
  import { launchChromiumMobile, openPageAndSettle } from './chromium-driver.js';
7
8
  import { defaultDisplayPool } from './display-pool.js';
8
- import {
9
- createUnexpectedExitWatcher,
10
- startFfmpegCapture,
11
- stopFfmpegCapture,
12
- waitForProcessExit,
13
- } from './ffmpeg-runner.js';
14
- import { estimatePlanDurationMs } from './plan-estimator.js';
9
+ import { createUnexpectedExitWatcher, waitForProcessExit } from './ffmpeg-runner.js';
15
10
  import { executePlanPhases, normalizePlanPhases } from './plan-executor.js';
16
11
 
17
12
  const DEFAULT_VIEWPORT = Object.freeze({ width: 1080, height: 1920 });
@@ -177,6 +172,49 @@ function scalePhaseY(phase, zoom) {
177
172
  };
178
173
  }
179
174
 
175
/**
 * Re-encode a page recording (webm) into an H.264 mp4, optionally dropping a
 * head segment so the clip starts at the first plan phase rather than at page
 * load + settle.
 *
 * @param {object} [options]
 * @param {string} options.webmPath - Input recording path.
 * @param {string} options.outputPath - Destination mp4 path (overwritten, `-y`).
 * @param {number} [options.startMs=0] - Milliseconds to skip from the start of
 *   the input; non-numeric or negative values are treated as 0.
 * @param {number} [options.fps=DEFAULT_FPS] - Output frame rate; `-r` is omitted
 *   when this is not a positive finite number.
 * @param {string} [options.ffmpegBin='ffmpeg'] - ffmpeg executable to spawn.
 * @returns {Promise<void>} Resolves when ffmpeg exits with code 0.
 * @throws {Error} `code === 'FFMPEG_SPAWN_FAILED'` when the process cannot be
 *   started; `code === 'FFMPEG_TRANSCODE_FAILED'` on a non-zero exit or signal
 *   (the message carries the last 2000 characters of stderr).
 */
async function transcodeWebmToMp4({
  webmPath,
  outputPath,
  startMs = 0,
  fps = DEFAULT_FPS,
  ffmpegBin = 'ffmpeg',
} = {}) {
  const STDERR_TAIL_CHARS = 2000;
  const seekSec = Math.max(0, Number(startMs) || 0) / 1000;
  const fpsValue = Number(fps);
  const args = [
    '-y',
    // Placed before -i, so ffmpeg seeks on the input.
    ...(seekSec > 0 ? ['-ss', seekSec.toFixed(3)] : []),
    '-i', webmPath,
    '-an',
    '-c:v', 'libx264',
    '-preset', 'veryfast',
    '-pix_fmt', 'yuv420p',
    ...(Number.isFinite(fpsValue) && fpsValue > 0 ? ['-r', String(fps)] : []),
    '-movflags', '+faststart',
    outputPath,
  ];

  await new Promise((resolve, reject) => {
    // stdout is never read here, so it must not be a pipe: an undrained pipe
    // blocks the child once its buffer fills. Only stderr is captured.
    const proc = spawn(ffmpegBin, args, { stdio: ['ignore', 'ignore', 'pipe'] });

    // Keep only the tail of stderr; ffmpeg emits progress for the whole encode
    // and the error message only ever uses the last STDERR_TAIL_CHARS.
    let stderrTail = '';
    proc.stderr?.setEncoding('utf8');
    proc.stderr?.on('data', (chunk) => {
      stderrTail = `${stderrTail}${chunk}`.slice(-STDERR_TAIL_CHARS);
    });

    // 'error' and 'close' can both fire for one child; settle exactly once.
    let settled = false;
    const fail = (message, code) => {
      if (settled) return;
      settled = true;
      const wrapped = new Error(message);
      wrapped.code = code;
      reject(wrapped);
    };

    proc.once('error', (err) => {
      fail(`ffmpeg_spawn_failed:${err.message}`, 'FFMPEG_SPAWN_FAILED');
    });
    proc.once('close', (code, signal) => {
      if (code === 0) {
        if (!settled) {
          settled = true;
          resolve();
        }
        return;
      }
      const signalNote = signal ? ` signal=${signal}` : '';
      fail(
        `ffmpeg_transcode_failed:code=${code}${signalNote}: ${stderrTail}`,
        'FFMPEG_TRANSCODE_FAILED',
      );
    });
  });
}
217
+
180
218
  export async function recordUrlNarration({
181
219
  plan,
182
220
  output_path,
@@ -189,11 +227,14 @@ export async function recordUrlNarration({
189
227
  settle_ms = 4000,
190
228
  page_zoom = 1.1,
191
229
  displayPool = defaultDisplayPool,
192
- ffmpegDurationBufferSec = 8,
193
230
  startupProbeMs = 1200,
194
- ffmpegStopTimeoutMs = 10000,
195
231
  xvfbStopTimeoutMs = 5000,
196
232
  postPlanTailMs = 600,
233
+ recordingDir = null,
234
+ launchChromiumFn = launchChromiumMobile,
235
+ openPageFn = openPageAndSettle,
236
+ transcodeFn = transcodeWebmToMp4,
237
+ nowMs = () => Date.now(),
197
238
  } = {}) {
198
239
  const zoom = Number.isFinite(Number(page_zoom)) && Number(page_zoom) > 0 ? Number(page_zoom) : 1.1;
199
240
  const rawPhases = normalizePlanPhases(plan);
@@ -212,14 +253,14 @@ export async function recordUrlNarration({
212
253
  mkdirSync(path.dirname(resolvedOutputPath), { recursive: true });
213
254
  mkdirSync(path.dirname(resolvedEventsPath), { recursive: true });
214
255
 
256
+ const ownTempDir = !recordingDir;
257
+ const recVideoDir = recordingDir || await mkdtemp(path.join(os.tmpdir(), 'lc-recvid-'));
258
+
215
259
  let displayLease;
216
260
  let xvfb;
217
- let ffmpeg;
218
- let browserSession;
219
261
  let xvfbWatcher;
220
- let ffmpegWatcher;
262
+ let browserSession = null;
221
263
  let primaryError = null;
222
-
223
264
  const cleanupErrors = [];
224
265
 
225
266
  try {
@@ -234,11 +275,26 @@ export async function recordUrlNarration({
234
275
  });
235
276
  xvfbWatcher = createUnexpectedExitWatcher(xvfb.child, 'xvfb');
236
277
 
237
- browserSession = await launchChromiumMobile({
278
+ // The page recording captures the page viewport only (no browser chrome),
279
+ // regardless of the on-screen window. recordVideo starts when the page is
280
+ // created, so the webm includes goto + settle; we measure that head and trim
281
+ // it off in transcodeFn.
282
+ const recordStartedAt = nowMs();
283
+ browserSession = await launchChromiumFn({
238
284
  display,
239
285
  viewport: normalizedViewport,
286
+ contextOptions: {
287
+ recordVideo: {
288
+ dir: recVideoDir,
289
+ size: { width: normalizedViewport.width, height: normalizedViewport.height },
290
+ },
291
+ },
240
292
  });
241
- await openPageAndSettle(browserSession.page, {
293
+ const videoHandle = typeof browserSession.page.video === 'function'
294
+ ? browserSession.page.video()
295
+ : null;
296
+
297
+ await openPageFn(browserSession.page, {
242
298
  url: resolvedUrl,
243
299
  settleMs: settle_ms,
244
300
  });
@@ -250,42 +306,53 @@ export async function recordUrlNarration({
250
306
  await browserSession.page.waitForTimeout(300);
251
307
  }
252
308
 
253
- const estimatedDurationMs = estimatePlanDurationMs(executablePlan);
254
- const estimatedDurationSec = Math.max(
255
- 5,
256
- Math.ceil(estimatedDurationMs / 1000) + Math.max(0, Number(ffmpegDurationBufferSec) || 0)
257
- );
258
-
259
- ffmpeg = await startFfmpegCapture({
260
- display,
261
- outputPath: resolvedOutputPath,
262
- width: normalizedViewport.width,
263
- height: normalizedViewport.height,
264
- fps: normalizedFps,
265
- durationSec: estimatedDurationSec,
266
- startupProbeMs,
267
- });
268
- ffmpegWatcher = createUnexpectedExitWatcher(ffmpeg.child, 'ffmpeg');
269
-
270
309
  await scrollToTop(browserSession.page);
271
310
  await browserSession.page.waitForTimeout(350);
272
311
 
312
+ const headTrimMs = Math.max(0, nowMs() - recordStartedAt);
313
+
273
314
  const eventsLog = await Promise.race([
274
315
  executePlanPhases(browserSession.page, executablePlan),
275
316
  xvfbWatcher.promise,
276
- ffmpegWatcher.promise,
277
317
  ]);
278
318
 
279
319
  await browserSession.page.waitForTimeout(Math.max(0, Number(postPlanTailMs) || 0));
280
320
 
281
- ffmpegWatcher.deactivate();
282
- await stopFfmpegCapture(ffmpeg, {
283
- timeoutMs: ffmpegStopTimeoutMs,
284
- });
285
-
286
321
  xvfbWatcher.deactivate();
287
322
 
288
- await writeFile(resolvedEventsPath, JSON.stringify(eventsLog, null, 2), 'utf8');
323
+ // Flush the recording: video is written when the context closes.
324
+ let webmPath = null;
325
+ try {
326
+ await browserSession.context.close();
327
+ } catch (closeError) {
328
+ cleanupErrors.push(`context_close_failed:${closeError.message}`);
329
+ }
330
+ if (videoHandle) {
331
+ try {
332
+ webmPath = await videoHandle.path();
333
+ } catch (pathError) {
334
+ cleanupErrors.push(`video_path_failed:${pathError.message}`);
335
+ }
336
+ }
337
+ try {
338
+ await browserSession.browser.close();
339
+ } catch (closeError) {
340
+ cleanupErrors.push(`browser_close_failed:${closeError.message}`);
341
+ }
342
+ browserSession = null;
343
+
344
+ if (!webmPath) {
345
+ const error = new Error('record_video_not_produced');
346
+ error.code = 'RECORD_VIDEO_NOT_PRODUCED';
347
+ throw error;
348
+ }
349
+
350
+ await transcodeFn({
351
+ webmPath,
352
+ outputPath: resolvedOutputPath,
353
+ startMs: headTrimMs,
354
+ fps: normalizedFps,
355
+ });
289
356
 
290
357
  const videoStat = await stat(resolvedOutputPath);
291
358
  if (!videoStat.isFile() || videoStat.size <= 0) {
@@ -294,42 +361,36 @@ export async function recordUrlNarration({
294
361
  throw error;
295
362
  }
296
363
 
364
+ await writeFile(resolvedEventsPath, JSON.stringify(eventsLog, null, 2), 'utf8');
365
+
366
+ const lastTms = Array.isArray(eventsLog)
367
+ ? eventsLog.reduce((max, ev) => Math.max(max, Number(ev?.t_ms) || 0), 0)
368
+ : 0;
369
+
297
370
  return {
298
371
  video_path: resolvedOutputPath,
299
372
  events_path: resolvedEventsPath,
300
373
  events_log: eventsLog,
374
+ duration_ms: lastTms > 0 ? lastTms : null,
301
375
  display,
302
376
  };
303
377
  } catch (error) {
304
378
  primaryError = error;
305
379
  throw error;
306
380
  } finally {
307
- ffmpegWatcher?.deactivate();
308
381
  xvfbWatcher?.deactivate();
309
382
 
310
383
  if (browserSession) {
311
384
  try {
312
- await browserSession.close();
385
+ await browserSession.browser.close();
313
386
  } catch (closeError) {
314
387
  cleanupErrors.push(`browser_close_failed:${closeError.message}`);
315
388
  }
316
389
  }
317
390
 
318
- if (ffmpeg) {
319
- try {
320
- await stopFfmpegCapture(ffmpeg, {
321
- timeoutMs: ffmpegStopTimeoutMs,
322
- });
323
- } catch (stopError) {
324
- cleanupErrors.push(`ffmpeg_stop_failed:${stopError.message}`);
325
- }
326
- }
327
-
328
391
  if (xvfb) {
329
392
  try {
330
- await stopXvfb(xvfb, {
331
- timeoutMs: xvfbStopTimeoutMs,
332
- });
393
+ await stopXvfb(xvfb, { timeoutMs: xvfbStopTimeoutMs });
333
394
  } catch (stopError) {
334
395
  cleanupErrors.push(`xvfb_stop_failed:${stopError.message}`);
335
396
  }
@@ -339,6 +400,10 @@ export async function recordUrlNarration({
339
400
  displayLease.release();
340
401
  }
341
402
 
403
+ if (ownTempDir) {
404
+ await rm(recVideoDir, { recursive: true, force: true }).catch(() => {});
405
+ }
406
+
342
407
  if (cleanupErrors.length > 0) {
343
408
  if (primaryError) {
344
409
  primaryError.cleanupErrors = cleanupErrors;
@@ -21,6 +21,61 @@ function normalizeRange(value) {
21
21
  return [low, high];
22
22
  }
23
23
 
24
/**
 * Canonical visual actions the recorder can execute. Scrolling is always
 * targeted: there is deliberately no "scroll a bit" action, so every scroll
 * phase has to state where it lands.
 */
export const SUPPORTED_PHASE_ACTIONS = Object.freeze([
  'hold',
  'smooth_scroll', 'fast_scroll', 'linear_scroll_during',
  'scroll_to_dwell', 'scroll_back',
  'cursor_focus',
]);
35
+
36
/**
 * Alias -> canonical action lookup for the spellings plan authors commonly
 * reach for. `scroll_down` / `scroll_up` are intentionally absent: there is no
 * blind scroll, so an unrecognised action surfaces as phase_action_unsupported
 * and the plan gets fixed instead of being silently degraded.
 *
 * Declared grouped by canonical action; entries are inserted in declaration
 * order, one [alias, canonical] pair per alias.
 */
const PHASE_ACTION_ALIASES = new Map(
  Object.entries({
    scroll_to_dwell: [
      'scroll_to', 'scrollto', 'scroll', 'scroll_to_region', 'scroll_to_y', 'dwell', 'focus_hold',
    ],
    linear_scroll_during: [
      'pan', 'narrated_pan', 'linear_scroll', 'scroll_during', 'scroll_while_narrating',
    ],
    scroll_back: ['return', 'return_anchor', 'back', 'scroll_to_top'],
    hold: ['wait', 'pause', 'stay'],
    cursor_focus: ['focus', 'highlight'],
  }).flatMap(([canonical, aliases]) => aliases.map((alias) => [alias, canonical])),
);
63
+
64
/**
 * Map an author-supplied action name onto its canonical form.
 *
 * The value is passed through normalizeText and lower-cased. Canonical names
 * are returned as-is, known aliases are translated, and anything else is
 * returned unchanged (lower-cased) so the caller can report it as unsupported.
 *
 * @param {unknown} rawValue - Action name as written in the plan.
 * @returns {string} Canonical action, the unrecognised name, or '' when blank.
 */
function normalizeActionName(rawValue) {
  const key = normalizeText(rawValue).toLowerCase();
  if (key === '') return '';

  const isCanonical = SUPPORTED_PHASE_ACTIONS.includes(key);
  if (isCanonical) return key;

  const aliased = PHASE_ACTION_ALIASES.get(key);
  return aliased ? aliased : key;
}
70
+
71
/**
 * Read `section.visual_action` in object form.
 *
 * A non-null object is returned as-is (same reference). A non-blank string is
 * taken to be the action name and wrapped as `{ type }` after trimming.
 * Anything else (missing, null, blank string, other primitives) yields `{}`.
 *
 * @param {object} [section] - Plan section or phase.
 * @returns {object} The visual action object, possibly empty.
 */
function visualActionObject(section = {}) {
  const candidate = section?.visual_action;
  switch (typeof candidate) {
    case 'object':
      // typeof null is 'object'; treat it like a missing value.
      return candidate === null ? {} : candidate;
    case 'string': {
      const type = candidate.trim();
      return type ? { type } : {};
    }
    default:
      return {};
  }
}
78
+
24
79
  function inferActionFromCameraMotion(phase = {}) {
25
80
  const motion = normalizeText(phase.camera_motion ?? phase.cameraMotion).toLowerCase();
26
81
  if (motion === 'narrated_pan') return 'linear_scroll_during';
@@ -30,19 +85,32 @@ function inferActionFromCameraMotion(phase = {}) {
30
85
  return '';
31
86
  }
32
87
 
88
/**
 * Return the first candidate that is a usable number, rounded to an integer.
 *
 * Only real numbers and non-blank numeric strings count. Blank strings,
 * booleans, arrays and other values are skipped rather than coerced, because
 * `Number('')`, `Number(false)` and `Number([])` are all 0 and would turn a
 * missing coordinate into "y = 0" instead of "not specified".
 *
 * @param {...unknown} values - Candidates in priority order.
 * @returns {number|null} Rounded first finite number, or null when none qualifies.
 */
function pickFirstNumber(...values) {
  for (const value of values) {
    let parsed;
    if (typeof value === 'number') {
      parsed = value;
    } else if (typeof value === 'string' && value.trim() !== '') {
      parsed = Number(value);
    } else {
      continue;
    }
    // Rejects NaN (unparseable strings) and +/-Infinity.
    if (Number.isFinite(parsed)) return Math.round(parsed);
  }
  return null;
}
96
+
33
97
  function normalizeSectionAsPhase(section = {}, index = 0) {
34
- const phaseId = normalizeText(section.id ?? section.phase_id) || `phase_${index + 1}`;
35
- const visualAction = section.visual_action && typeof section.visual_action === 'object'
36
- ? section.visual_action
37
- : {};
98
+ const phaseId = normalizeText(section.id ?? section.phase_id ?? section.name) || `phase_${index + 1}`;
99
+ const visualAction = visualActionObject(section);
38
100
  const focusRegion = normalizeRange(
39
101
  section.focus_region
40
102
  ?? section.focusRegion
41
103
  ?? visualAction.focus_region
42
104
  ?? visualAction.focusRegion
43
105
  );
44
- const explicitAction = normalizeText(section.action ?? visualAction.type).toLowerCase();
45
- const action = explicitAction || inferActionFromCameraMotion(section) || 'scroll_to_dwell';
106
+ const explicitAction = normalizeActionName(section.action ?? visualAction.type ?? visualAction.action);
107
+ const inferred = explicitAction || inferActionFromCameraMotion(section);
108
+ const targetY = pickFirstNumber(
109
+ section.target_y, section.to_y, section.y, section.scroll_y,
110
+ visualAction.target_y, visualAction.to_y, visualAction.y, visualAction.scroll_y,
111
+ );
112
+ const hasTarget = focusRegion != null || targetY != null;
113
+ const action = inferred || (hasTarget ? 'scroll_to_dwell' : 'hold');
46
114
 
47
115
  return {
48
116
  ...section,
@@ -51,21 +119,42 @@ function normalizeSectionAsPhase(section = {}, index = 0) {
51
119
  action,
52
120
  focus_region: focusRegion ?? null,
53
121
  visual_action: visualAction,
54
- target_y: section.target_y ?? visualAction.target_y ?? visualAction.to_y ?? null,
55
- from_y: section.from_y ?? visualAction.from_y ?? null,
56
- to_y: section.to_y ?? visualAction.to_y ?? null,
57
- transition_ms: section.transition_ms ?? visualAction.transition_ms ?? null,
58
- duration_ms: section.duration_ms ?? section.dwell_ms ?? null,
122
+ target_y: targetY,
123
+ from_y: pickFirstNumber(section.from_y, visualAction.from_y),
124
+ to_y: pickFirstNumber(section.to_y, visualAction.to_y, section.y, visualAction.y),
125
+ transition_ms: section.transition_ms ?? visualAction.transition_ms ?? visualAction.duration_ms ?? null,
126
+ duration_ms: section.duration_ms ?? section.dwell_ms ?? section.audio_duration_ms
127
+ ?? (section.presentation && Number.isFinite(Number(section.presentation.duration))
128
+ ? Math.round(Number(section.presentation.duration) * 1000)
129
+ : null),
59
130
  };
60
131
  }
61
132
 
62
133
  export function normalizePlanPhases(plan = {}) {
63
- const phases = Array.isArray(plan?.phases) ? plan.phases : [];
64
- if (phases.length > 0) return phases;
134
+ const topLevelPhases = Array.isArray(plan?.phases) ? plan.phases : [];
135
+ if (topLevelPhases.length > 0) {
136
+ return topLevelPhases.map((phase, index) => normalizeSectionAsPhase(phase, index));
137
+ }
65
138
 
66
139
  const sections = Array.isArray(plan?.sections) ? plan.sections : [];
67
140
  if (sections.length > 0) {
68
- return sections.map((section, index) => normalizeSectionAsPhase(section, index));
141
+ const flattened = [];
142
+ sections.forEach((section, sectionIndex) => {
143
+ const nested = Array.isArray(section?.phases) ? section.phases : null;
144
+ if (nested && nested.length > 0) {
145
+ const prefix = normalizeText(section.id ?? section.phase_id ?? section.name) || `s${sectionIndex + 1}`;
146
+ nested.forEach((subPhase, subIndex) => {
147
+ const merged = {
148
+ ...subPhase,
149
+ id: subPhase.id ?? subPhase.phase_id ?? subPhase.name ?? `${prefix}_${subIndex + 1}`,
150
+ };
151
+ flattened.push(normalizeSectionAsPhase(merged, flattened.length));
152
+ });
153
+ } else {
154
+ flattened.push(normalizeSectionAsPhase(section, flattened.length));
155
+ }
156
+ });
157
+ return flattened;
69
158
  }
70
159
 
71
160
  const error = new Error('plan_phases_required');
@@ -74,13 +163,15 @@ export function normalizePlanPhases(plan = {}) {
74
163
  }
75
164
 
76
165
  function resolvePhaseAction(phase = {}) {
77
- const explicit = normalizeText(phase.action ?? phase.visual_action?.type).toLowerCase();
166
+ const explicit = normalizeActionName(
167
+ phase.action ?? phase.visual_action?.type ?? phase.visual_action?.action
168
+ );
78
169
  if (explicit) return explicit;
79
170
  return inferActionFromCameraMotion(phase);
80
171
  }
81
172
 
82
173
  function resolvePhaseId(phase = {}, index = 0) {
83
- return normalizeText(phase.id ?? phase.phase_id) || `phase_${index + 1}`;
174
+ return normalizeText(phase.id ?? phase.phase_id ?? phase.name) || `phase_${index + 1}`;
84
175
  }
85
176
 
86
177
  function nowMs(getNowMs) {
@@ -94,9 +185,12 @@ function resolveTransitionMs(phase, fallback) {
94
185
  }
95
186
 
96
187
  function resolveTargetY(phase, fallback = null) {
97
- const raw = phase?.target_y ?? phase?.to_y ?? phase?.visual_action?.target_y ?? phase?.visual_action?.to_y;
98
- const parsed = Number(raw);
99
- if (Number.isFinite(parsed)) return Math.round(parsed);
188
+ const explicit = pickFirstNumber(
189
+ phase?.target_y, phase?.to_y, phase?.y, phase?.scroll_y,
190
+ phase?.visual_action?.target_y, phase?.visual_action?.to_y,
191
+ phase?.visual_action?.y, phase?.visual_action?.scroll_y,
192
+ );
193
+ if (explicit != null) return explicit;
100
194
 
101
195
  const focusRegion = normalizeRange(
102
196
  phase?.focus_region
@@ -113,6 +207,20 @@ function resolveTargetY(phase, fallback = null) {
113
207
  return fallback;
114
208
  }
115
209
 
210
/**
 * Resolve the scroll destination for a phase, refusing to guess.
 *
 * Delegates to resolveTargetY with no fallback; when that yields nothing the
 * phase is rejected, since every scroll action must say where it lands.
 *
 * @param {object} phase - Plan phase being executed.
 * @param {string} action - Action name, used only in the error message.
 * @returns {number} The resolved target Y.
 * @throws {Error} With `code === 'PHASE_TARGET_Y_REQUIRED'` when no target can
 *   be resolved.
 */
function requireTargetY(phase, action) {
  const landingY = resolveTargetY(phase, null);
  if (landingY != null) return landingY;

  const message = [
    `phase_target_y_required: phase "${resolvePhaseId(phase)}" uses "${action}" but has no`,
    'target_y / to_y / y or focus_region — every scroll phase must say where it lands',
    '(there is no blind scroll)',
  ].join(' ');
  throw Object.assign(new Error(message), { code: 'PHASE_TARGET_Y_REQUIRED' });
}
223
+
116
224
  function resolveFromY(phase, fallback = null) {
117
225
  const raw = phase?.from_y ?? phase?.visual_action?.from_y;
118
226
  const parsed = Number(raw);
@@ -192,8 +300,8 @@ async function executeHold(page, phase) {
192
300
  return { anchorY: null };
193
301
  }
194
302
 
195
- async function executeSmoothScroll(page, phase, { fallbackTargetY = null } = {}) {
196
- const targetY = resolveTargetY(phase, fallbackTargetY);
303
+ async function executeSmoothScroll(page, phase) {
304
+ const targetY = requireTargetY(phase, 'smooth_scroll');
197
305
  const transitionMs = resolveTransitionMs(phase, 900);
198
306
  await animateScroll(page, {
199
307
  targetY,
@@ -205,8 +313,8 @@ async function executeSmoothScroll(page, phase, { fallbackTargetY = null } = {})
205
313
  return { anchorY: targetY };
206
314
  }
207
315
 
208
- async function executeFastScroll(page, phase, { fallbackTargetY = null } = {}) {
209
- const targetY = resolveTargetY(phase, fallbackTargetY);
316
+ async function executeFastScroll(page, phase) {
317
+ const targetY = requireTargetY(phase, 'fast_scroll');
210
318
  const transitionMs = resolveTransitionMs(phase, 420);
211
319
  await animateScroll(page, {
212
320
  targetY,
@@ -218,12 +326,9 @@ async function executeFastScroll(page, phase, { fallbackTargetY = null } = {}) {
218
326
  return { anchorY: targetY };
219
327
  }
220
328
 
221
- async function executeLinearScrollDuring(page, phase, {
222
- fallbackFromY = null,
223
- fallbackTargetY = null,
224
- } = {}) {
329
+ async function executeLinearScrollDuring(page, phase, { fallbackFromY = null } = {}) {
225
330
  const fromY = resolveFromY(phase, fallbackFromY);
226
- const toY = resolveTargetY(phase, fallbackTargetY);
331
+ const toY = requireTargetY(phase, 'linear_scroll_during');
227
332
  const durationMs = resolveDurationMs(phase, null);
228
333
  if (!Number.isFinite(Number(durationMs)) || Number(durationMs) <= 0) {
229
334
  const error = new Error('linear_scroll_duration_required');
@@ -247,8 +352,8 @@ async function executeLinearScrollDuring(page, phase, {
247
352
  return { anchorY: toY };
248
353
  }
249
354
 
250
- async function executeScrollToDwell(page, phase, { fallbackTargetY = null } = {}) {
251
- const targetY = resolveTargetY(phase, fallbackTargetY);
355
+ async function executeScrollToDwell(page, phase) {
356
+ const targetY = requireTargetY(phase, 'scroll_to_dwell');
252
357
  const transitionMs = resolveTransitionMs(phase, 820);
253
358
  await animateScroll(page, {
254
359
  targetY,
@@ -286,8 +391,8 @@ async function executeScrollBack(page, phase, { fallbackTargetY = 0 } = {}) {
286
391
  return { anchorY: targetY };
287
392
  }
288
393
 
289
- async function executeCursorFocus(page, phase, { fallbackTargetY = null } = {}) {
290
- const targetY = resolveTargetY(phase, fallbackTargetY);
394
+ async function executeCursorFocus(page, phase) {
395
+ const targetY = requireTargetY(phase, 'cursor_focus');
291
396
  const transitionMs = resolveTransitionMs(phase, 650);
292
397
  await animateScroll(page, {
293
398
  targetY,
@@ -313,34 +418,34 @@ async function executePhase(page, phase, {
313
418
  initialAnchorY = 0,
314
419
  } = {}) {
315
420
  const action = resolvePhaseAction(phase);
316
- const fallbackY = lastAnchorY ?? initialAnchorY;
421
+ const fallbackFromY = lastAnchorY ?? initialAnchorY;
317
422
 
318
423
  if (action === 'hold') {
319
424
  return executeHold(page, phase);
320
425
  }
321
426
  if (action === 'smooth_scroll') {
322
- return executeSmoothScroll(page, phase, { fallbackTargetY: fallbackY });
427
+ return executeSmoothScroll(page, phase);
323
428
  }
324
429
  if (action === 'fast_scroll') {
325
- return executeFastScroll(page, phase, { fallbackTargetY: fallbackY });
430
+ return executeFastScroll(page, phase);
326
431
  }
327
432
  if (action === 'linear_scroll_during') {
328
- return executeLinearScrollDuring(page, phase, {
329
- fallbackFromY: fallbackY,
330
- fallbackTargetY: fallbackY,
331
- });
433
+ return executeLinearScrollDuring(page, phase, { fallbackFromY });
332
434
  }
333
435
  if (action === 'scroll_to_dwell') {
334
- return executeScrollToDwell(page, phase, { fallbackTargetY: fallbackY });
436
+ return executeScrollToDwell(page, phase);
335
437
  }
336
438
  if (action === 'scroll_back') {
337
439
  return executeScrollBack(page, phase, { fallbackTargetY: 0 });
338
440
  }
339
441
  if (action === 'cursor_focus') {
340
- return executeCursorFocus(page, phase, { fallbackTargetY: fallbackY });
442
+ return executeCursorFocus(page, phase);
341
443
  }
342
444
 
343
- const error = new Error(`phase_action_unsupported:${action || 'empty'}`);
445
+ const error = new Error(
446
+ `phase_action_unsupported:${action || 'empty'} — supported actions: ${SUPPORTED_PHASE_ACTIONS.join(', ')}`
447
+ + ' (there is no blind scroll_down/scroll_up; use scroll_to_dwell with target_y or focus_region)',
448
+ );
344
449
  error.code = 'PHASE_ACTION_UNSUPPORTED';
345
450
  throw error;
346
451
  }
@@ -87,11 +87,19 @@ export async function runPlanVideoSegmentsTool({ segments, workspace_id, voice_i
87
87
  presentation = { duration, ...(kind === 'scroll' ? { style: 'scroll' } : {}) };
88
88
  }
89
89
 
90
+ // dwell_ms lets the same segment double as a record_url_narration plan phase
91
+ // (the recorder reads dwell_ms / duration_ms for how long to hold each beat).
92
+ // Prefer the real measured audio length; fall back to the planned visual duration.
93
+ const dwellMs = audioDurationMs > 0
94
+ ? audioDurationMs
95
+ : Math.round((presentation.duration ?? presentation.per_card_duration ?? 4) * 1000);
96
+
90
97
  const planned_seg = {
91
98
  ...seg,
92
99
  ...(audioResult?.audio_path ? { audio_path: audioResult.audio_path } : {}),
93
100
  ...(text ? { subtitle_text: text } : {}),
94
101
  presentation: { ...presentation, ...(seg.presentation ?? {}) },
102
+ dwell_ms: seg.dwell_ms ?? dwellMs,
95
103
  };
96
104
  if (audioResult?.audio_duration_ms) {
97
105
  planned_seg.audio_duration_ms = audioResult.audio_duration_ms;