@lightcone-ai/daemon 0.20.0 → 0.22.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -272,16 +272,15 @@ server.tool(
272
272
  // tails, and re-records (Task #25/#26 trial).
273
273
  server.tool(
274
274
  'compose_video_v2',
275
- 'Compose a video from a list of segments using ffmpeg. Each segment has a visual source (image / scroll / '
276
- + 'carousel / video / gif), optional audio, and optional subtitle text. Subtitles are burned in when '
277
- + 'subtitle_text is provided AND burn_subtitles is not false. Segments are concatenated in order; outro clips are appended after.\n\n'
275
+ 'Compose video(s) from a list of segments using ffmpeg. Each segment has a visual source (image / scroll / '
276
+ + 'carousel / video / gif), optional audio, and optional subtitle text. Segments are concatenated in order; '
277
+ + 'outro clips are appended after.\n\n'
278
278
  + 'When any segment has audio_path, MUST be preceded by plan_video_segments in the same session '
279
- + '(plan_video_segments fills duration/subtitle_text/audio_path mechanically; manual alignment is rejected). '
280
- + 'Returns a local mp4 path + size_bytes.\n\n'
281
- + 'Dual / multi-version delivery (e.g. subtitled+voiced + clean silent): pass the variants[] array one call '
282
- + 'runs the heavy per-segment ffmpeg work ONCE and only diverges at audio mux + concat + subtitle burn per '
283
- + 'variant. That is ~1.2-1.4× single-version time vs ~2× when calling this tool twice. Each variant chooses '
284
- + 'its own burn_subtitles and include_audio independently.',
279
+ + '(plan_video_segments fills duration/subtitle_text/audio_path mechanically; manual alignment is rejected).\n\n'
280
+ 'Outputs are controlled by variants[] — ALWAYS required. Single output is variants:[{output_path:"..."}]. '
281
+ + 'Multi-output (e.g. 字幕+配音 + 无声无字幕 clean) is variants:[{output_path:"sub.mp4"}, {output_path:"clean.mp4", burn_subtitles:false, include_audio:false}]. '
282
+ + 'The heavy per-segment ffmpeg work runs ONCE across all variants — only audio mux + concat + subtitle burn '
283
+ + 'repeat per variant. Two-variant delivery is ~1.2-1.4× single-variant time, not 2×.',
285
284
  {
286
285
  segments: z.array(z.object({
287
286
  visual_path: z.string().optional().describe('Absolute path to a single image / video / gif.'),
@@ -296,20 +295,15 @@ server.tool(
296
295
  subtitle_text: z.string().optional().describe('Narration text to burn as subtitle. Displayed for the full segment duration.'),
297
296
  transition: z.enum(['cut', 'fade', 'crossfade']).optional().describe('Transition to next segment. Default cut.'),
298
297
  })).describe('Ordered list of video segments.'),
299
- outro_paths: z.array(z.string()).optional().describe('Absolute paths to outro video clips appended at end.'),
298
+ outro_paths: z.array(z.string()).optional().describe('Absolute paths to outro video clips appended at end (shared across all variants).'),
300
299
  resolution: z.string().optional().describe('Output resolution WxH. Default "1080x1920".'),
301
- output_path: z.string().optional().describe('Absolute output path (single-output mode). Auto-generated if omitted. Ignored when variants[] is provided.'),
302
- burn_subtitles: z.boolean().optional().describe('Single-output mode only: whether to burn subtitle_text. Default true. '
303
- + 'For producing multiple variants in one call, use variants[] instead.'),
304
300
  variants: z.array(z.object({
305
301
  output_path: z.string().describe('Absolute output path for this variant. Each variant must use a unique path.'),
306
302
  burn_subtitles: z.boolean().optional().describe('Whether to burn subtitle_text into THIS variant. Default true.'),
307
303
  include_audio: z.boolean().optional().describe('Whether to mux segment.audio_path into THIS variant. Default true. '
308
- + 'Pass false for a fully silent copy (skips audio mux entirely; segment.audio_path is ignored for this variant).'),
309
- })).optional().describe('Multi-output mode: one call produces all variants. '
310
- + 'Visual segment processing (the heavy work) runs once; each variant only repeats audio mux + concat + optional subtitle burn. '
311
- + 'Typical use: [{output_path:"with-sub.mp4"}, {output_path:"clean.mp4", burn_subtitles:false, include_audio:false}] '
312
- + 'to deliver a subtitled+voiced version and a silent clean version together.'),
304
+ + 'Pass false for a fully silent copy (segment.audio_path ignored for this variant).'),
305
+ })).min(1).describe('Required: one entry per output file. Single output = 1-element array. '
306
+ + 'Multi-output dual delivery example: [{output_path:"with-sub-voice.mp4"}, {output_path:"clean.mp4", burn_subtitles:false, include_audio:false}].'),
313
307
  },
314
308
  async (args) => {
315
309
  const segments = Array.isArray(args?.segments) ? args.segments : [];
@@ -348,28 +342,24 @@ server.tool(
348
342
  // audio in production runs (Tasks #20/#25/#26), forcing re-records.
349
343
  server.tool(
350
344
  'record_url_narration',
351
- 'Record a silent video of a URL by driving Chromium on an Xvfb display and capturing it with Playwright recordVideo, driven by a video plan; ffmpeg then transcodes the recording to mp4. Outputs a silent mp4 that can be passed to compose_video_v2 as a video-kind segment with an audio_path for narration.\n\nUse this as the canonical recording step for URL-narration videos. Falls back: if the page needs interactions outside the visual_action vocabulary (clicks, waits, OCR loops), use Monitor (Bash) with custom Playwright instead.\n\nMUST be preceded by plan_video_segments in the same session — feed plan_video_segments\'s `segments` array as `plan.sections` so dwell_ms aligns mechanically with TTS audio_duration_ms (hand-written dwell_ms has drifted and forced re-records in production).\n\nMULTI-SECTION OUTPUT (recommended for any URL with ≥2 sections): pass `output_paths` as an array with one path per plan.sections entry. The tool records the URL ONCE continuously (one browser session, one scrollTop, natural scroll flow through all sections), then slices the recording at section boundaries via ffmpeg. This avoids the per-segment scroll-back-to-top reset that happens when the agent splits N sections into N separate record_url_narration calls — that pattern reopens the browser and re-navigates for each segment, which looks visually disjointed even though the per-segment timing is correct.\n\nRuntime requirements: this tool only works on a Linux daemon machine with Xvfb + Chromium + ffmpeg installed (ffmpeg is used to transcode the recording to mp4; no x11grab device support needed). macOS / Windows daemons will fail at startup.',
345
+ 'Record silent mp4s of a URL by driving Chromium on an Xvfb display and capturing it with Playwright recordVideo, then ffmpeg-transcoding. Each output mp4 can be passed to compose_video_v2 as a video-kind segment with an audio_path for narration.\n\nUse this as the canonical recording step for URL-narration videos. Falls back: if the page needs interactions outside the visual_action vocabulary (clicks, waits, OCR loops), use Monitor (Bash) with custom Playwright instead.\n\nMUST be preceded by plan_video_segments in the same session — feed plan_video_segments\'s `segments` array as `plan.sections` so dwell_ms aligns mechanically with TTS audio_duration_ms (hand-written dwell_ms has drifted and forced re-records in production).\n\nALWAYS pass output_paths as an array with one mp4 path per plan.sections entry (single-section recording is a 1-element array). The tool records the URL ONCE continuously (one browser session, one scrollTop, natural scroll flow through all sections), then slices the recording at section boundaries via ffmpeg. There is NO mode that records N sections in N separate calls — that pattern reopened the browser and re-scrolled-from-top for each segment, which looked visually disjointed. One URL = one call.\n\nRuntime requirements: this tool only works on a Linux daemon machine with Xvfb + Chromium + ffmpeg installed (ffmpeg is used to transcode the recording to mp4; no x11grab device support needed). macOS / Windows daemons will fail at startup.',
352
346
  {
353
347
  url: z.string().describe('Page URL to record'),
354
348
  plan: z.record(z.any()).describe(
355
- 'A video plan: an object with `phases` (or `sections`), each a "visual beat" with '
356
- + '`action` (scroll_to_dwell / linear_scroll_during / scroll_back / hold / ...), a '
357
- + 'target (`target_y` or `focus_region:[y1,y2]`) for scroll-type actions, and '
358
- + '`dwell_ms` (how long to hold that beat should match the segment\'s TTS duration).\n\n'
359
- + 'Standard chain: pass plan_video_segments\'s `segments` array directly as `plan.sections` — '
360
- + 'each segment\'s `dwell_ms` is already set to its `audio_duration_ms`.\n\n'
361
- + 'For RECRUITMENT URLs (mp.weixin.qq.com / 校招 / 实习 / 岗位 content), each section MUST '
362
- + 'also declare `target_y_content_label` a short Chinese label describing what content '
363
- + 'sits at that pixel y position on the page (e.g. "标题区" / "岗位信息卡片" / "公司介绍" / '
364
- + '"届别说明"). Labels matching forbidden regions ("二维码" / "扫码" / "投递入口" / "投递方式" / '
365
- + '"联系方式" / "微信号" / "QR" / "阅读原文" / "外链") will cause the tool to refuse the '
366
- + 'recording — recruitment content must NOT dwell on these areas (see fragments.md '
367
- + 'frag.short.recruitment_url_mode_policy). Pick a different target_y in the 标题/岗位 '
368
- + 'information area and rewrite that section.'
349
+ 'A video plan: an object with `phases` (or `sections`), each a "visual beat".\n\n'
350
+ + 'ACTION VOCABULARY = atoms + macros. Pick by content type:\n'
351
+ + ' - scroll_to_dwell (default for most sections): fast transition + dwell with subtle micro-motion at target_y. Use for titles, content cards, single focal areas.\n'
352
+ + ' - narrated_pan: continuous linear scroll over the full section duration. Use ONLY when the speech actually narrates a long visible list (e.g. reading every job title in order). Was called linear_scroll_during; that name still works as an alias.\n'
353
+ + ' - focal_arc: NO scroll; cursor moves between N visual focal points. Use for SHORT pages where consecutive sections share basically the same target_y (within ~150px) — scrolling would be invisible; the cursor carries the rhythm. Requires `points: [{x,y}, ...]` instead of target_y.\n'
354
+ + ' - hold: pure pause, no motion. Rare.\n\n'
355
+ + 'ATOMS (for power use via phase.beats[]): scroll_to / hold / micro_oscillate / cursor_focus. Any custom sequence the macros do not cover can be written as a beats array.\n\n'
356
+ + 'Each section needs: action (or beats[]), target (`target_y` / `focus_region:[y1,y2]` / `points`), and `dwell_ms` (= section total duration; for narrated content this should match the segment\'s TTS audio_duration_ms).\n\n'
357
+ + 'Standard chain: pass plan_video_segments\'s `segments` array directly as `plan.sections` — each segment\'s `dwell_ms` is already its `audio_duration_ms`.\n\n'
358
+ + 'For RECRUITMENT URLs (mp.weixin.qq.com / 校招 / 实习 / 岗位 content), each section MUST also declare `target_y_content_label` — a short Chinese label describing what content sits at that pixel y position on the page (e.g. "标题区" / "岗位信息卡片" / "公司介绍" / "届别说明"). Labels matching forbidden regions ("二维码" / "扫码" / "投递入口" / "投递方式" / "联系方式" / "微信号" / "QR" / "阅读原文" / "外链") will cause the tool to refuse the recording — recruitment content must NOT dwell on these areas (see fragments.md frag.short.recruitment_url_mode_policy). Pick a different target_y in the 标题/岗位 information area and rewrite that section.'
369
359
  ),
370
- output_path: z.string().optional().describe('Workspace-relative output mp4 path for the CONSOLIDATED master recording. Default tmp/wx3_video/recorded-{ts}.mp4. When output_paths is also provided, this still receives the full continuous recording for verification/debugging.'),
371
- output_paths: z.array(z.string()).optional().describe('Multi-section output mode. Pass an array of N workspace-relative paths matching plan.sections length. The tool records ONCE continuously then slices the result into N mp4s at section boundaries (derived from phase_start / phase_end events). RECOMMENDED whenever a URL has ≥2 sections keeps visual flow natural between sections instead of reopening the browser per segment.'),
372
- events_path: z.string().optional().describe('Workspace-relative events.json path. Default ${output_path}.events.json'),
360
+ output_paths: z.array(z.string()).min(1).describe('REQUIRED. Workspace-relative mp4 paths, one per plan.sections entry (single-section is a 1-element array). The tool records ONCE continuously and slices the result at section boundaries (derived from phase_start / phase_end events) — each section produces exactly one of these mp4s.'),
361
+ output_path: z.string().optional().describe('Optional debug-only path for the CONSOLIDATED master recording (the full continuous webm transcoded). Auto-generated under tmp/ if omitted. Agents normally do not need to set this — they consume output_paths.'),
362
+ events_path: z.string().optional().describe('Workspace-relative events.json path. Default ${master}.events.json'),
373
363
  viewport: z.object({
374
364
  width: z.number().optional(),
375
365
  height: z.number().optional(),
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@lightcone-ai/daemon",
3
- "version": "0.20.0",
3
+ "version": "0.22.0",
4
4
  "type": "module",
5
5
  "main": "src/index.js",
6
6
  "bin": {
@@ -249,54 +249,41 @@ async function applyFadeTransition({ clipA, clipB, tmpDir, style = 'fade' }) {
249
249
  return outPath;
250
250
  }
251
251
 
252
- // compose_video_v2 supports two modes:
252
+ // compose_video_v2 ONE shape: caller passes variants[]. Single-output is
253
+ // just variants of length 1. Multi-output (subtitled+voiced + clean silent)
254
+ // is the same call with more variants. There is no top-level output_path or
255
+ // burn_subtitles shortcut — it added a second pattern, and agents
256
+ // consistently defaulted to the simpler one even when multi-output was
257
+ // requested, so the dual-version optimization went unused.
253
258
  //
254
- // 1. Legacy single-output: pass output_path (+ optional burn_subtitles).
255
- // Returns { path, duration_ms, size_bytes, variants: [..1 entry..] }.
256
- //
257
- // 2. Multi-variant: pass variants=[{output_path, burn_subtitles?, include_audio?}, ...].
258
- // Visual segment processing runs ONCE (the heavy part — per-segment ffmpeg
259
- // transcode/scale/scroll). Each variant then diverges only at audio mux +
260
- // concat + subtitle burn — typically a few seconds per extra variant.
261
- // Returns { variants: [{path, duration_ms, size_bytes, burn_subtitles,
262
- // include_audio}, ...] }.
263
- //
264
- // Use the multi-variant mode when shipping the same content with different
265
- // subtitle/audio combinations (e.g. subtitled+voiced + clean silent). Calling
266
- // the legacy mode twice produces correct outputs but redoes per-segment work.
259
+ // Visual segment processing runs ONCE; each variant diverges only at audio
260
+ // mux + concat + subtitle burn (~seconds per extra variant).
267
261
  export async function composeVideoV2({
268
262
  segments = [],
269
263
  outro_paths = [],
270
264
  resolution = '1080x1920',
271
- output_path,
272
- burn_subtitles = true,
273
265
  variants,
274
266
  }) {
275
267
  if (!Array.isArray(segments) || segments.length === 0) {
276
268
  throw new Error('segments must be a non-empty array');
277
269
  }
278
270
 
279
- // Normalize variants. If caller did not pass an explicit variants array,
280
- // synthesize a single variant from the legacy output_path + burn_subtitles.
281
- // include_audio defaults to true (auto-include any segment.audio_path).
282
- const normalizedVariants = (Array.isArray(variants) && variants.length > 0)
283
- ? variants.map((v, idx) => {
284
- if (!v || typeof v !== 'object') {
285
- throw new Error(`variants[${idx}]: must be an object`);
286
- }
287
- const outPath = String(v.output_path ?? '').trim();
288
- if (!outPath) throw new Error(`variants[${idx}]: output_path is required`);
289
- return {
290
- output_path: outPath,
291
- burn_subtitles: v.burn_subtitles !== false,
292
- include_audio: v.include_audio !== false,
293
- };
294
- })
295
- : [{
296
- output_path: output_path ?? path.join(os.tmpdir(), `lightcone-video-${Date.now()}.mp4`),
297
- burn_subtitles: burn_subtitles !== false,
298
- include_audio: true,
299
- }];
271
+ if (!Array.isArray(variants) || variants.length === 0) {
272
+ throw new Error('variants must be a non-empty array. Single output is variants:[{output_path:"..."}].');
273
+ }
274
+
275
+ const normalizedVariants = variants.map((v, idx) => {
276
+ if (!v || typeof v !== 'object') {
277
+ throw new Error(`variants[${idx}]: must be an object`);
278
+ }
279
+ const outPath = String(v.output_path ?? '').trim();
280
+ if (!outPath) throw new Error(`variants[${idx}]: output_path is required`);
281
+ return {
282
+ output_path: outPath,
283
+ burn_subtitles: v.burn_subtitles !== false,
284
+ include_audio: v.include_audio !== false,
285
+ };
286
+ });
300
287
 
301
288
  // Disallow two variants writing to the same file — would race on disk.
302
289
  const seenOutputs = new Set();
@@ -485,15 +472,8 @@ export async function composeVideoV2({
485
472
  });
486
473
  }
487
474
 
488
- // Legacy single-output callers (didn't pass variants) get the same flat
489
- // shape they used to get, plus the variants array for forward-compat.
490
- const first = variantOutputs[0];
491
- return {
492
- path: first.path,
493
- duration_ms: first.duration_ms,
494
- size_bytes: first.size_bytes,
495
- variants: variantOutputs,
496
- };
475
+ // Always return variants[]. Single-output callers read variants[0].
476
+ return { variants: variantOutputs };
497
477
  } finally {
498
478
  await rm(tmpDir, { recursive: true, force: true });
499
479
  }
@@ -367,21 +367,26 @@ export async function recordUrlNarration({
367
367
  const normalizedViewport = normalizeViewport(viewport);
368
368
  const normalizedFps = normalizeInteger(fps, DEFAULT_FPS);
369
369
  const resolvedOutputPaths = normalizeOutputPaths(outputPaths);
370
- // When multi-section output is requested, the count must match plan.sections
371
- // 1:1 otherwise the agent will end up with audio/visual misalignment when
372
- // it feeds these into plan_video_segments. Fail loud rather than silently
373
- // truncating or padding.
374
- if (resolvedOutputPaths && resolvedOutputPaths.length !== phases.length) {
370
+ // output_paths is REQUIRED. Single-section recordings just pass an array
371
+ // of one. Removing the optional path forces 1:1 alignment with plan.sections
372
+ // and eliminates the "default to single output_path master" pattern that
373
+ // led agents to call this tool once per section instead of once per URL.
374
+ if (!resolvedOutputPaths) {
375
+ const error = new Error(
376
+ 'output_paths is required — one entry per plan.sections (single section is a 1-element array).',
377
+ );
378
+ error.code = 'OUTPUT_PATHS_REQUIRED';
379
+ throw error;
380
+ }
381
+ if (resolvedOutputPaths.length !== phases.length) {
375
382
  const error = new Error(
376
383
  `output_paths_count_mismatch:expected=${phases.length}:got=${resolvedOutputPaths.length}`,
377
384
  );
378
385
  error.code = 'OUTPUT_PATHS_COUNT_MISMATCH';
379
386
  throw error;
380
387
  }
381
- if (resolvedOutputPaths) {
382
- for (const p of resolvedOutputPaths) {
383
- mkdirSync(path.dirname(p), { recursive: true });
384
- }
388
+ for (const p of resolvedOutputPaths) {
389
+ mkdirSync(path.dirname(p), { recursive: true });
385
390
  }
386
391
 
387
392
  mkdirSync(path.dirname(resolvedOutputPath), { recursive: true });
@@ -501,43 +506,40 @@ export async function recordUrlNarration({
501
506
  ? eventsLog.reduce((max, ev) => Math.max(max, Number(ev?.t_ms) || 0), 0)
502
507
  : 0;
503
508
 
504
- // Multi-section output: slice the consolidated mp4 at section boundaries
505
- // (derived from phase_start / phase_end events). All slices come from the
506
- // SAME continuous recording, so the visual flow between sections stays
509
+ // Slice the consolidated mp4 at section boundaries (derived from
510
+ // phase_start / phase_end events). All slices come from the SAME
511
+ // continuous recording, so the visual flow between sections stays
507
512
  // natural — no browser reload, no scroll-back-to-top per segment.
508
- let sectionOutputs = null;
509
- if (resolvedOutputPaths) {
510
- const cutPoints = deriveSectionCutPoints(eventsLog, phases.length);
511
- sectionOutputs = [];
512
- for (let i = 0; i < cutPoints.length; i += 1) {
513
- const cut = cutPoints[i];
514
- const outPath = resolvedOutputPaths[i];
515
- await cutFn({
516
- inputPath: resolvedOutputPath,
517
- outputPath: outPath,
518
- startMs: cut.start_ms,
519
- durationMs: cut.duration_ms,
520
- fps: normalizedFps,
521
- });
522
- const sliceStat = await stat(outPath);
523
- if (!sliceStat.isFile() || sliceStat.size <= 0) {
524
- const error = new Error(`section_slice_empty:${outPath}`);
525
- error.code = 'SECTION_SLICE_EMPTY';
526
- throw error;
527
- }
528
- sectionOutputs.push({
529
- phase_id: cut.phase_id,
530
- video_path: outPath,
531
- start_ms: cut.start_ms,
532
- end_ms: cut.end_ms,
533
- duration_ms: cut.duration_ms,
534
- size_bytes: Number(sliceStat.size ?? 0),
535
- });
513
+ const cutPoints = deriveSectionCutPoints(eventsLog, phases.length);
514
+ const sectionOutputs = [];
515
+ for (let i = 0; i < cutPoints.length; i += 1) {
516
+ const cut = cutPoints[i];
517
+ const outPath = resolvedOutputPaths[i];
518
+ await cutFn({
519
+ inputPath: resolvedOutputPath,
520
+ outputPath: outPath,
521
+ startMs: cut.start_ms,
522
+ durationMs: cut.duration_ms,
523
+ fps: normalizedFps,
524
+ });
525
+ const sliceStat = await stat(outPath);
526
+ if (!sliceStat.isFile() || sliceStat.size <= 0) {
527
+ const error = new Error(`section_slice_empty:${outPath}`);
528
+ error.code = 'SECTION_SLICE_EMPTY';
529
+ throw error;
536
530
  }
531
+ sectionOutputs.push({
532
+ phase_id: cut.phase_id,
533
+ video_path: outPath,
534
+ start_ms: cut.start_ms,
535
+ end_ms: cut.end_ms,
536
+ duration_ms: cut.duration_ms,
537
+ size_bytes: Number(sliceStat.size ?? 0),
538
+ });
537
539
  }
538
540
 
539
541
  return {
540
- video_path: resolvedOutputPath,
542
+ master_video_path: resolvedOutputPath,
541
543
  events_path: resolvedEventsPath,
542
544
  events_log: eventsLog,
543
545
  duration_ms: lastTms > 0 ? lastTms : null,
@@ -1,5 +1,7 @@
1
1
  import { resolveDurationMs } from './phase-duration.js';
2
- import { humanizedScroll } from '../humanized-scroll.js';
2
+ import { ATOMS, ATOM_NAMES } from './atoms.js';
3
+ import { MACROS, resolveMacroName } from './macros.js';
4
+ import { getCdpSession } from '../cdp-touch.js';
3
5
 
4
6
  function normalizeText(value) {
5
7
  if (typeof value !== 'string') return '';
@@ -22,16 +24,31 @@ function normalizeRange(value) {
22
24
  return [low, high];
23
25
  }
24
26
 
25
- // The recorder executes exactly these visual actions. There is no "scroll a bit"
26
- // blind-scroll action: every scroll phase must say where it lands.
27
+ // Canonical action vocabulary = atoms + macros. Macros are the day-to-day
28
+ // names plans should use; atoms are exposed for sequencing via phase.beats[]
29
+ // when a macro doesn't fit. The legacy V2 names (smooth_scroll, fast_scroll,
30
+ // linear_scroll_during, scroll_back) are still accepted via ACTION_ALIASES
31
+ // in macros.js so existing plans keep running, but new plans should pick
32
+ // from the canonical list.
33
+ //
34
+ // "hold" is its own canonical action (not a macro) because it's the leaf
35
+ // case — pure pause — and the agent shouldn't have to know whether it's
36
+ // implemented as macro or atom.
27
37
  export const SUPPORTED_PHASE_ACTIONS = Object.freeze([
38
+ // macros (recommended for most sections)
39
+ 'scroll_to_dwell', // fast transition + dwell with micro-motion (default)
40
+ 'narrated_pan', // continuous linear scroll (long lists)
41
+ 'focal_arc', // cursor across N focal points (short pages)
42
+ // atoms (for power use — usually inside phase.beats[])
43
+ 'scroll_to',
28
44
  'hold',
45
+ 'micro_oscillate',
46
+ 'cursor_focus',
47
+ // legacy V2 names — kept for backward compat
29
48
  'smooth_scroll',
30
49
  'fast_scroll',
31
50
  'linear_scroll_during',
32
- 'scroll_to_dwell',
33
51
  'scroll_back',
34
- 'cursor_focus',
35
52
  ]);
36
53
 
37
54
  // Common spellings authors reach for, mapped onto the canonical action above.
@@ -229,205 +246,173 @@ function resolveFromY(phase, fallback = null) {
229
246
  return Math.round(parsed);
230
247
  }
231
248
 
232
- // Delegates to humanizedScroll, which dispatches real CDP touch events so
233
- // the browser's gesture engine produces native scroll physics (rubber-band,
234
- // fling inertia, compositor-paced repaints). The old implementation drove
235
- // `root.scrollTo(...)` in a setTimeout loop inside page.evaluate visually
236
- // smooth in isolation, but bypassed the gesture pipeline entirely, which is
237
- // what made scrolls feel "robotic" on recordings (see the
238
- // `not natural` thread in docs/scenario-content-creation discussion).
239
- //
240
- // `minSteps` is no longer needed (humanizedScroll computes segments from
241
- // distance + duration). `jitterPx` is forwarded as `pixel_jitter_px`, which
242
- // humanizedScroll converts into per-touchMove vertical offset.
243
- async function animateScroll(page, {
244
- startY = null,
245
- targetY,
246
- durationMs,
247
- easing = 'easeInOutQuad',
248
- jitterPx = 0,
249
- // minSteps is accepted but unused — kept in the signature so callers don't
250
- // need updating in this refactor.
251
- minSteps: _minSteps, // eslint-disable-line no-unused-vars
252
- } = {}) {
253
- if (!Number.isFinite(Number(targetY))) {
254
- const error = new Error('phase_target_y_required');
255
- error.code = 'PHASE_TARGET_Y_REQUIRED';
256
- throw error;
249
+ // ── beat dispatch ─────────────────────────────────────────────────────────
250
+ // A "beat" is one entry in the atom sequence run during a phase. Either:
251
+ // { atom: 'scroll_to', params: {...} } — leaf, dispatched directly
252
+ // { macro: 'scroll_to_dwell', params } — expanded to atoms then run
253
+ // plan-executor stays thin: turn phase into beats, then run beats.
254
+
255
+ // Build the atom-sequence for a phase. Two modes:
256
+ // 1. phase.beats[] given — used as-is (agent composing arbitrary sequences).
257
+ // 2. phase.action given resolved to macro (incl. legacy aliases) or
258
+ // single atom, phase params get passed through.
259
+ function phaseToBeats(phase, { fallbackFromY = 0 } = {}) {
260
+ if (Array.isArray(phase?.beats) && phase.beats.length > 0) {
261
+ return { beats: phase.beats, anchorY: null };
257
262
  }
258
263
 
259
- const resolvedFromY = Number.isFinite(Number(startY))
260
- ? Number(startY)
261
- : await page.evaluate(() => {
262
- const root = document.scrollingElement || document.documentElement;
263
- return Math.round(root.scrollTop);
264
- });
264
+ const action = resolvePhaseAction(phase);
265
265
 
266
- await humanizedScroll(page, {
267
- from_y: resolvedFromY,
268
- to_y: Number(targetY),
269
- duration_ms: Math.max(0, Number(durationMs) || 0),
270
- motion_curve: easing,
271
- pixel_jitter_px: Math.max(0, Number(jitterPx) || 0),
272
- });
273
- }
266
+ if (ATOM_NAMES.includes(action)) {
267
+ const params = paramsForAtom(action, phase, fallbackFromY);
268
+ const beat = { atom: action, params };
269
+ const anchorY = action === 'scroll_to' && Number.isFinite(Number(params.target_y))
270
+ ? Number(params.target_y)
271
+ : null;
272
+ return { beats: [beat], anchorY };
273
+ }
274
274
 
275
- async function executeHold(page, phase) {
276
- const holdMs = resolveDurationMs(phase, 0);
277
- if (holdMs > 0) {
278
- await page.waitForTimeout(holdMs);
275
+ const macroName = resolveMacroName(action);
276
+ if (macroName) {
277
+ const macroFn = MACROS[macroName];
278
+ const params = paramsForMacro(macroName, phase, fallbackFromY);
279
+ const expanded = macroFn(params, { fromY: fallbackFromY });
280
+ return { beats: expanded.beats, anchorY: expanded.anchorY };
279
281
  }
280
- return { anchorY: null };
281
- }
282
282
 
283
- async function executeSmoothScroll(page, phase) {
284
- const targetY = requireTargetY(phase, 'smooth_scroll');
285
- const transitionMs = resolveTransitionMs(phase, 900);
286
- await animateScroll(page, {
287
- targetY,
288
- durationMs: transitionMs,
289
- easing: 'easeInOutQuad',
290
- jitterPx: 2,
291
- minSteps: 18,
292
- });
293
- return { anchorY: targetY };
294
- }
283
+ if (action === 'hold') {
284
+ return {
285
+ beats: [{ atom: 'hold', params: { duration_ms: resolveDurationMs(phase, 0) } }],
286
+ anchorY: null,
287
+ };
288
+ }
295
289
 
296
- async function executeFastScroll(page, phase) {
297
- const targetY = requireTargetY(phase, 'fast_scroll');
298
- const transitionMs = resolveTransitionMs(phase, 420);
299
- await animateScroll(page, {
300
- targetY,
301
- durationMs: transitionMs,
302
- easing: 'easeOutQuad',
303
- jitterPx: 3,
304
- minSteps: 10,
305
- });
306
- return { anchorY: targetY };
290
+ const error = new Error(
291
+ `phase_action_unsupported:${action || 'empty'} — supported actions: ${SUPPORTED_PHASE_ACTIONS.join(', ')}`
292
+ + ' (there is no blind scroll_down/scroll_up; pick a macro or write phase.beats[] from atoms)',
293
+ );
294
+ error.code = 'PHASE_ACTION_UNSUPPORTED';
295
+ throw error;
307
296
  }
308
297
 
309
- async function executeLinearScrollDuring(page, phase, { fallbackFromY = null } = {}) {
310
- const fromY = resolveFromY(phase, fallbackFromY);
311
- const toY = requireTargetY(phase, 'linear_scroll_during');
298
+ // Phase fields atom params. Atom-specific shape; missing fields fall back
299
+ // to the same derivations the old executors did (resolveDurationMs, etc).
300
+ function paramsForAtom(atomName, phase, fallbackFromY) {
312
301
  const durationMs = resolveDurationMs(phase, null);
313
- if (!Number.isFinite(Number(durationMs)) || Number(durationMs) <= 0) {
314
- const error = new Error('linear_scroll_duration_required');
315
- error.code = 'LINEAR_SCROLL_DURATION_REQUIRED';
316
- throw error;
302
+ if (atomName === 'scroll_to') {
303
+ return {
304
+ target_y: requireTargetY(phase, 'scroll_to'),
305
+ duration_ms: durationMs ?? resolveTransitionMs(phase, 600),
306
+ curve: phase.curve || phase.motion_curve || 'easeInOutQuad',
307
+ jitter_px: Number.isFinite(Number(phase.jitter_px)) ? Number(phase.jitter_px) : 2,
308
+ from_y: resolveFromY(phase, fallbackFromY),
309
+ };
317
310
  }
318
-
319
- await animateScroll(page, {
320
- startY: fromY,
321
- targetY: toY,
322
- durationMs,
323
- easing: 'linear',
324
- jitterPx: 0,
325
- minSteps: 12,
326
- });
327
-
328
- const dwellMs = normalizeInteger(phase?.dwell_ms, null);
329
- if (Number.isFinite(dwellMs) && dwellMs > durationMs) {
330
- await page.waitForTimeout(dwellMs - durationMs);
311
+ if (atomName === 'hold') {
312
+ return { duration_ms: durationMs ?? 0 };
331
313
  }
332
- return { anchorY: toY };
333
- }
334
-
335
- async function executeScrollToDwell(page, phase) {
336
- const targetY = requireTargetY(phase, 'scroll_to_dwell');
337
- const transitionMs = resolveTransitionMs(phase, 820);
338
- await animateScroll(page, {
339
- targetY,
340
- durationMs: transitionMs,
341
- easing: 'easeInOutQuad',
342
- jitterPx: 2,
343
- minSteps: 16,
344
- });
345
- const dwellMs = normalizeInteger(phase?.dwell_ms, null);
346
- if (Number.isFinite(dwellMs) && dwellMs > 0) {
347
- await page.waitForTimeout(dwellMs);
348
- } else {
349
- const holdMs = resolveDurationMs(phase, 0);
350
- if (holdMs > 0) {
351
- await page.waitForTimeout(holdMs);
352
- }
314
+ if (atomName === 'micro_oscillate') {
315
+ return {
316
+ amplitude_px: Number(phase.amplitude_px) || 30,
317
+ duration_ms: durationMs ?? 0,
318
+ period_ms: Number(phase.period_ms) || 1400,
319
+ };
353
320
  }
354
- return { anchorY: targetY };
321
+ if (atomName === 'cursor_focus') {
322
+ return {
323
+ x: Number(phase.x ?? phase.cursor_x),
324
+ y: Number(phase.y ?? phase.cursor_y),
325
+ duration_ms: durationMs ?? 0,
326
+ };
327
+ }
328
+ return { ...phase };
355
329
  }
356
330
 
357
- async function executeScrollBack(page, phase, { fallbackTargetY = 0 } = {}) {
358
- const targetY = resolveTargetY(phase, fallbackTargetY);
359
- const transitionMs = resolveTransitionMs(phase, 900);
360
- await animateScroll(page, {
361
- targetY,
362
- durationMs: transitionMs,
363
- easing: 'easeOutQuad',
364
- jitterPx: 1,
365
- minSteps: 14,
366
- });
367
- const dwellMs = normalizeInteger(phase?.dwell_ms, null);
368
- if (Number.isFinite(dwellMs) && dwellMs > 0) {
369
- await page.waitForTimeout(dwellMs);
331
+ // Phase fields macro params. Macros validate their own inputs.
332
+ function paramsForMacro(macroName, phase, fallbackFromY) {
333
+ const durationMs = resolveDurationMs(phase, null);
334
+ const targetY = resolveTargetY(phase, null);
335
+ const fromY = resolveFromY(phase, fallbackFromY);
336
+ const base = {
337
+ target_y: targetY,
338
+ duration_ms: durationMs,
339
+ from_y: fromY,
340
+ transition_ms: Number.isFinite(Number(phase.transition_ms)) ? Number(phase.transition_ms) : undefined,
341
+ transition_ratio: Number.isFinite(Number(phase.transition_ratio)) ? Number(phase.transition_ratio) : undefined,
342
+ transition_curve: phase.transition_curve,
343
+ amplitude_px: phase.amplitude_px,
344
+ oscillate_period_ms: phase.oscillate_period_ms,
345
+ };
346
+ if (macroName === 'focal_arc') {
347
+ return { ...base, points: Array.isArray(phase.points) ? phase.points : [] };
370
348
  }
371
- return { anchorY: targetY };
349
+ if (macroName === 'scroll_to_dwell') {
350
+ // The legacy fast_scroll alias maps to scroll_to_dwell, but with a
351
+ // shorter default transition_ratio so the snap feels closer to the
352
+ // original fast_scroll behavior.
353
+ const isFastScrollAlias = String(phase.action || '').toLowerCase() === 'fast_scroll';
354
+ if (isFastScrollAlias && base.transition_ratio === undefined) {
355
+ base.transition_ratio = 0.1;
356
+ }
357
+ }
358
+ return base;
372
359
  }
373
360
 
374
- async function executeCursorFocus(page, phase) {
375
- const targetY = requireTargetY(phase, 'cursor_focus');
376
- const transitionMs = resolveTransitionMs(phase, 650);
377
- await animateScroll(page, {
378
- targetY,
379
- durationMs: transitionMs,
380
- easing: 'easeInOutQuad',
381
- jitterPx: 1,
382
- minSteps: 12,
383
- });
384
- const dwellMs = normalizeInteger(phase?.dwell_ms, null);
385
- if (Number.isFinite(dwellMs) && dwellMs > 0) {
386
- await page.waitForTimeout(dwellMs);
387
- } else {
388
- const holdMs = resolveDurationMs(phase, 360);
389
- if (holdMs > 0) {
390
- await page.waitForTimeout(holdMs);
361
+ // Run an atom-sequence in order. Each beat is either:
362
+ // - an atom (run directly through ATOMS[name])
363
+ // - a macro reference (expand to atoms inline; one level of nesting only —
364
+ // macros calling macros breaks the "shortcut for common atom combo"
365
+ // mental model).
366
+ async function executeBeats(page, ctx, beats, { fallbackFromY = 0 } = {}) {
367
+ let anchorY = fallbackFromY;
368
+ for (const beat of beats) {
369
+ if (!beat || typeof beat !== 'object') continue;
370
+ if (beat.macro) {
371
+ const macroFn = MACROS[beat.macro] || MACROS[resolveMacroName(beat.macro)];
372
+ if (!macroFn) {
373
+ const error = new Error(`beat_macro_unsupported:${beat.macro}`);
374
+ error.code = 'BEAT_MACRO_UNSUPPORTED';
375
+ throw error;
376
+ }
377
+ const expanded = macroFn(beat.params ?? {}, { fromY: anchorY });
378
+ const sub = await executeBeats(page, ctx, expanded.beats, { fallbackFromY: anchorY });
379
+ if (sub?.anchorY != null) anchorY = sub.anchorY;
380
+ else if (expanded.anchorY != null) anchorY = expanded.anchorY;
381
+ continue;
382
+ }
383
+ const atomName = beat.atom;
384
+ const atomFn = ATOMS[atomName];
385
+ if (!atomFn) {
386
+ const error = new Error(`beat_atom_unsupported:${atomName || 'missing'}`);
387
+ error.code = 'BEAT_ATOM_UNSUPPORTED';
388
+ throw error;
391
389
  }
390
+ const params = { ...(beat.params ?? {}) };
391
+ // Pre-fill scroll_to's from_y from the running anchor so sequences can
392
+ // chain naturally without the agent recomputing positions.
393
+ if (atomName === 'scroll_to' && params.from_y == null) params.from_y = anchorY;
394
+ const result = await atomFn(page, ctx, params);
395
+ if (result?.anchorY != null) anchorY = result.anchorY;
392
396
  }
393
- return { anchorY: targetY };
397
+ return { anchorY };
394
398
  }
395
399
 
396
400
  async function executePhase(page, phase, {
397
401
  lastAnchorY = null,
398
402
  initialAnchorY = 0,
399
403
  } = {}) {
400
- const action = resolvePhaseAction(phase);
401
404
  const fallbackFromY = lastAnchorY ?? initialAnchorY;
405
+ const { beats, anchorY: macroAnchor } = phaseToBeats(phase, { fallbackFromY });
402
406
 
403
- if (action === 'hold') {
404
- return executeHold(page, phase);
405
- }
406
- if (action === 'smooth_scroll') {
407
- return executeSmoothScroll(page, phase);
408
- }
409
- if (action === 'fast_scroll') {
410
- return executeFastScroll(page, phase);
411
- }
412
- if (action === 'linear_scroll_during') {
413
- return executeLinearScrollDuring(page, phase, { fallbackFromY });
414
- }
415
- if (action === 'scroll_to_dwell') {
416
- return executeScrollToDwell(page, phase);
417
- }
418
- if (action === 'scroll_back') {
419
- return executeScrollBack(page, phase, { fallbackTargetY: 0 });
420
- }
421
- if (action === 'cursor_focus') {
422
- return executeCursorFocus(page, phase);
423
- }
407
+ // One CDP session per phase, threaded to atoms via ctx (cached on page by
408
+ // cdp-touch.js so this is cheap on repeat calls). Tests pass a mock page
409
+ // that throws here; we swallow and let atoms that need CDP fail on demand.
410
+ let cdp = null;
411
+ try { cdp = await getCdpSession(page); } catch { /* mock pages may not expose context() */ }
412
+ const ctx = { cdp, fromY: fallbackFromY };
424
413
 
425
- const error = new Error(
426
- `phase_action_unsupported:${action || 'empty'} supported actions: ${SUPPORTED_PHASE_ACTIONS.join(', ')}`
427
- + ' (there is no blind scroll_down/scroll_up; use scroll_to_dwell with target_y or focus_region)',
428
- );
429
- error.code = 'PHASE_ACTION_UNSUPPORTED';
430
- throw error;
414
+ const { anchorY } = await executeBeats(page, ctx, beats, { fallbackFromY });
415
+ return { anchorY: anchorY ?? macroAnchor ?? null };
431
416
  }
432
417
 
433
418
  function createEvent({ tMs, action, phaseId, phaseAction, detail = {} }) {
@@ -27,14 +27,31 @@ export async function runComposeVideoV2Tool({
27
27
  outro_paths,
28
28
  format,
29
29
  resolution,
30
+ variants,
31
+ // Trapping legacy params: agents that still pass these from older prompts
32
+ // need an explicit error so they migrate, not silent fallback.
30
33
  output_path,
31
34
  burn_subtitles,
32
- variants,
33
35
  workspaceDir,
34
36
  }) {
37
+ if (output_path != null || burn_subtitles != null) {
38
+ return toolError(
39
+ 'compose_video_v2: output_path and burn_subtitles are no longer accepted at the top level. '
40
+ + 'Pass variants:[{output_path, burn_subtitles?, include_audio?}] — single output is a '
41
+ + '1-element array. See frag.short.video_synthesis_tools.',
42
+ );
43
+ }
44
+
35
45
  if (!Array.isArray(segments) || segments.length === 0) {
36
46
  return toolError('segments must be a non-empty array.');
37
47
  }
48
+ if (!Array.isArray(variants) || variants.length === 0) {
49
+ return toolError(
50
+ 'compose_video_v2: variants[] is required. Single output is variants:[{output_path:"..."}]. '
51
+ + 'Multi-output dual delivery (字幕版 + 无字幕版) is variants:[{output_path:"sub.mp4"}, '
52
+ + '{output_path:"clean.mp4", burn_subtitles:false, include_audio:false}].',
53
+ );
54
+ }
38
55
 
39
56
  const imagePaths = [];
40
57
  for (let i = 0; i < segments.length; i++) {
@@ -69,34 +86,24 @@ export async function runComposeVideoV2Tool({
69
86
  }
70
87
  }
71
88
 
72
- // Normalize variants. If caller passed a variants[] array, that takes
73
- // priority — multi-output mode. Otherwise build a single-element variants
74
- // array from the legacy output_path + burn_subtitles params.
89
+ // Normalize variants. Each entry needs an output_path; flags default to
90
+ // burn_subtitles=true, include_audio=true.
75
91
  const outDir = workspaceDir
76
92
  ? path.join(workspaceDir, 'artifacts', 'video')
77
93
  : path.join(os.tmpdir(), 'lightcone-video');
78
94
 
79
- let normalizedVariants;
80
- if (Array.isArray(variants) && variants.length > 0) {
81
- normalizedVariants = variants.map((v, idx) => {
82
- if (!v || typeof v !== 'object') {
83
- return null; // surfaced below
84
- }
85
- const outPath = String(v.output_path ?? '').trim()
86
- || path.join(outDir, `composed-${Date.now()}-${idx}-${randomUUID().slice(0, 8)}.mp4`);
87
- return {
88
- output_path: outPath,
89
- burn_subtitles: v.burn_subtitles !== false,
90
- include_audio: v.include_audio !== false,
91
- };
92
- });
93
- if (normalizedVariants.some(v => v === null)) {
94
- return toolError('variants must be an array of objects, each with { output_path, burn_subtitles?, include_audio? }.');
95
- }
96
- } else {
97
- const burnSubtitles = burn_subtitles !== false;
98
- const outPath = output_path ?? path.join(outDir, `composed-${Date.now()}-${randomUUID().slice(0, 8)}.mp4`);
99
- normalizedVariants = [{ output_path: outPath, burn_subtitles: burnSubtitles, include_audio: true }];
95
+ const normalizedVariants = variants.map((v, idx) => {
96
+ if (!v || typeof v !== 'object') return null;
97
+ const outPath = String(v.output_path ?? '').trim()
98
+ || path.join(outDir, `composed-${Date.now()}-${idx}-${randomUUID().slice(0, 8)}.mp4`);
99
+ return {
100
+ output_path: outPath,
101
+ burn_subtitles: v.burn_subtitles !== false,
102
+ include_audio: v.include_audio !== false,
103
+ };
104
+ });
105
+ if (normalizedVariants.some(v => v === null)) {
106
+ return toolError('variants must be an array of objects, each with { output_path, burn_subtitles?, include_audio? }.');
100
107
  }
101
108
 
102
109
  const warnings = [];
@@ -135,31 +142,16 @@ export async function runComposeVideoV2Tool({
135
142
  variants: normalizedVariants,
136
143
  });
137
144
 
138
- const outputs = Array.isArray(result?.variants) && result.variants.length > 0
139
- ? result.variants
140
- : [{ path: result.path, duration_ms: result.duration_ms, size_bytes: result.size_bytes,
141
- burn_subtitles: normalizedVariants[0].burn_subtitles,
142
- include_audio: normalizedVariants[0].include_audio }];
143
-
144
- const lines = ['compose_video_v2 completed.'];
145
- if (outputs.length === 1) {
146
- const v = outputs[0];
145
+ const outputs = Array.isArray(result?.variants) ? result.variants : [];
146
+ const lines = ['compose_video_v2 completed.', `variants=${outputs.length}`];
147
+ outputs.forEach((v, idx) => {
148
+ lines.push(`--- variant ${idx} ---`);
147
149
  lines.push(`path=${v.path}`);
148
150
  lines.push(`duration_ms=${v.duration_ms}`);
149
151
  lines.push(`size_bytes=${v.size_bytes ?? 'unknown'}`);
150
152
  lines.push(`burn_subtitles=${v.burn_subtitles}`);
151
153
  lines.push(`include_audio=${v.include_audio}`);
152
- } else {
153
- lines.push(`variants=${outputs.length}`);
154
- outputs.forEach((v, idx) => {
155
- lines.push(`--- variant ${idx} ---`);
156
- lines.push(`path=${v.path}`);
157
- lines.push(`duration_ms=${v.duration_ms}`);
158
- lines.push(`size_bytes=${v.size_bytes ?? 'unknown'}`);
159
- lines.push(`burn_subtitles=${v.burn_subtitles}`);
160
- lines.push(`include_audio=${v.include_audio}`);
161
- });
162
- }
154
+ });
163
155
  lines.push(`segments=${segments.length}`);
164
156
  lines.push(`outro_clips=${(outro_paths ?? []).length}`);
165
157
  for (const w of warnings) lines.push(w);
@@ -257,40 +257,49 @@ export async function runRecordUrlNarrationTool({
257
257
  }
258
258
 
259
259
  try {
260
- const { resolvedOutputPath, resolvedEventsPath } = resolveRecordUrlNarrationPaths({
261
- workspaceDir,
262
- outputPath: validatedInput.output_path,
263
- eventsPath: validatedInput.events_path,
264
- nowMs,
265
- });
266
-
267
- mkdirSync(path.dirname(resolvedOutputPath), { recursive: true });
268
- mkdirSync(path.dirname(resolvedEventsPath), { recursive: true });
269
-
270
- // Multi-section mode: caller passed output_paths. Validate it 1:1 with
271
- // plan.sections so the recorder can slice the continuous recording into
272
- // per-section mp4s without ambiguity.
273
- let resolvedOutputPaths = null;
260
+ // output_paths is REQUIRED. The legacy "default output_path master file"
261
+ // mode is gone — agents kept defaulting to one-call-per-section because
262
+ // that was the lowest-friction path. Now every recording is sliced, even
263
+ // single-section ones (which are just a 1-element output_paths array).
264
+ let resolvedOutputPaths;
274
265
  try {
275
266
  resolvedOutputPaths = resolveOutputPaths(validatedInput.output_paths, { workspaceDir });
276
267
  } catch (error) {
277
268
  return toolError(`Error: ${error.message}`);
278
269
  }
279
- if (resolvedOutputPaths) {
280
- const planSectionCount = (planSegments(validatedInput.plan) ?? []).length;
281
- if (resolvedOutputPaths.length !== planSectionCount) {
282
- return toolError(
283
- `Error: output_paths length (${resolvedOutputPaths.length}) must match `
284
- + `plan.sections length (${planSectionCount}). Each section produces exactly one mp4 — `
285
- + `don't pad or truncate.`,
286
- );
287
- }
270
+ if (!resolvedOutputPaths) {
271
+ return toolError(
272
+ 'Error: output_paths is required — one workspace-relative mp4 path per plan.sections entry. '
273
+ + 'Single-section recording is a 1-element array. Multi-section recording records once '
274
+ + 'continuously (one browser session, one scrollTop) and slices the result at section '
275
+ + 'boundaries. See frag.short.video_synthesis_tools.',
276
+ );
277
+ }
278
+ const planSectionCount = (planSegments(validatedInput.plan) ?? []).length;
279
+ if (resolvedOutputPaths.length !== planSectionCount) {
280
+ return toolError(
281
+ `Error: output_paths length (${resolvedOutputPaths.length}) must match `
282
+ + `plan.sections length (${planSectionCount}). Each section produces exactly one mp4 — `
283
+ + `don't pad or truncate.`,
284
+ );
288
285
  }
289
286
 
287
+ // The master / events JSON paths are agent-optional debug artifacts.
288
+ // Default master to a tmp path next to the first output; events default
289
+ // to <master>.events.json. Agent can override either if they care.
290
+ const { resolvedOutputPath: masterPath, resolvedEventsPath } = resolveRecordUrlNarrationPaths({
291
+ workspaceDir,
292
+ outputPath: validatedInput.output_path,
293
+ eventsPath: validatedInput.events_path,
294
+ nowMs,
295
+ });
296
+ mkdirSync(path.dirname(masterPath), { recursive: true });
297
+ mkdirSync(path.dirname(resolvedEventsPath), { recursive: true });
298
+
290
299
  const recorderOutput = await recordUrlNarrationFn({
291
300
  url: validatedInput.url,
292
301
  plan: validatedInput.plan,
293
- output_path: resolvedOutputPath,
302
+ output_path: masterPath,
294
303
  events_path: resolvedEventsPath,
295
304
  output_paths: resolvedOutputPaths,
296
305
  viewport: validatedInput.viewport,
@@ -298,24 +307,11 @@ export async function runRecordUrlNarrationTool({
298
307
  settle_ms: validatedInput.settle_ms,
299
308
  });
300
309
 
301
- // Single-output mode (legacy): same one-line summary as before.
302
- if (!resolvedOutputPaths) {
303
- return toolText(
304
- `Recorded URL narration.\n`
305
- + `video_path=${resolvedOutputPath}\n`
306
- + `events_path=${resolvedEventsPath}\n`
307
- + `duration_ms=${deriveDurationMs(recorderOutput) ?? 'unknown'}\n`
308
- + `phases=${derivePhaseCount({ plan: validatedInput.plan, recorderOutput }) ?? 'n/a'}`,
309
- );
310
- }
311
-
312
- // Multi-section mode: one section block per output mp4, plus the
313
- // consolidated master mp4 path for debugging / verification.
314
310
  const sections = Array.isArray(recorderOutput?.sections) ? recorderOutput.sections : [];
315
311
  const lines = [
316
- 'Recorded URL narration (multi-section).',
317
- `master_video_path=${resolvedOutputPath}`,
312
+ 'Recorded URL narration.',
318
313
  `events_path=${resolvedEventsPath}`,
314
+ `master_video_path=${masterPath}`,
319
315
  `total_duration_ms=${deriveDurationMs(recorderOutput) ?? 'unknown'}`,
320
316
  `sections=${sections.length}`,
321
317
  ];