@lightcone-ai/daemon 0.22.1 → 0.23.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,316 @@
1
+ // V6 page_understanding schema.
2
+ //
3
+ // The output of `analyze_page` — a structured, content-aware view of a URL
4
+ // that downstream tools (record_url_narration, plan_video_segments) and the
5
+ // agent (short_video_scripter) consume. See video-synthesis-design.md §五.
6
+ //
7
+ // V5's recruitment_slots / candidate_hotspots / skip_zones / mode_hint / text_bins
8
+ // vocabulary is gone; V6 is universal across page types and grounds plan
9
+ // authoring in three things:
10
+ // - blocks[]: from-top-to-bottom content units with semantics and pacing hints
11
+ // - unsafe_regions[]: y-ranges the recorder must never dwell on
12
+ // - narrative_arc: LLM's suggested storyline (advisory, agent must cross-check)
13
+
14
// Closed vocabularies for V6 page_understanding fields. Each *_VALUES array
// is the canonical ordered list (frozen, exported); each *_SET mirrors it
// for O(1) membership checks during normalization and validation.

export const VISUAL_KIND_VALUES = Object.freeze([
  'hero', 'title', 'subtitle', 'paragraph', 'list', 'image_with_text',
  'image_only', 'table', 'callout', 'divider', 'footer',
]);

export const DENSITY_VALUES = Object.freeze(['low', 'medium', 'high']);

export const VISUAL_WEIGHT_VALUES = Object.freeze(['hero', 'primary', 'secondary', 'aux']);

export const READING_PRIORITY_VALUES = Object.freeze(['must', 'should', 'may', 'skip']);

export const UNSAFE_REASON_VALUES = Object.freeze([
  'qr_code', 'contact_info', 'application_entry', 'external_link', 'footer_promo',
]);

export const NARRATIVE_STRUCTURE_VALUES = Object.freeze([
  'linear', 'list', 'comparison', 'hero_then_detail', 'step_by_step',
]);

export const PREHEAT_STRATEGY_VALUES = Object.freeze(['none', 'full_scroll_then_top']);

// Internal Set mirrors of the arrays above.
const asSet = (values) => new Set(values);
const VISUAL_KIND_SET = asSet(VISUAL_KIND_VALUES);
const DENSITY_SET = asSet(DENSITY_VALUES);
const VISUAL_WEIGHT_SET = asSet(VISUAL_WEIGHT_VALUES);
const READING_PRIORITY_SET = asSet(READING_PRIORITY_VALUES);
const UNSAFE_REASON_SET = asSet(UNSAFE_REASON_VALUES);
const NARRATIVE_STRUCTURE_SET = asSet(NARRATIVE_STRUCTURE_VALUES);
const PREHEAT_STRATEGY_SET = asSet(PREHEAT_STRATEGY_VALUES);

// Dwell window applied when a block carries no usable pacing_hint.
const DEFAULT_PACING_HINT = Object.freeze({ dwell_ms_min: 2500, dwell_ms_max: 6000 });
74
+
75
/**
 * Coerce `value` to an integer and clamp it to [min, max].
 * Values that do not coerce to a finite number (undefined, NaN, objects,
 * non-numeric strings) yield `fallback`. Note Number(null) === 0, so null
 * clamps like 0 rather than falling back — same as the Number() coercion.
 */
function clampInt(value, min, max, fallback) {
  const numeric = Number(value);
  if (!Number.isFinite(numeric)) return fallback;
  const rounded = Math.round(numeric);
  if (rounded < min) return min;
  if (rounded > max) return max;
  return rounded;
}
80
+
81
/**
 * Return `value` trimmed when it is a string with non-whitespace content;
 * otherwise return `fallback` (also used when trimming yields '').
 */
function trimString(value, fallback = '') {
  if (typeof value !== 'string') return fallback;
  const text = value.trim();
  return text.length > 0 ? text : fallback;
}
86
+
87
/**
 * Return the trimmed `value` when it is a member of `set`, else `fallback`.
 * Non-string values are treated as the empty string before membership check.
 */
function pickEnum(value, set, fallback) {
  const candidate = typeof value === 'string' ? value.trim() : '';
  if (set.has(candidate)) return candidate;
  return fallback;
}
91
+
92
+ function normalizeKeywords(input) {
93
+ if (!Array.isArray(input)) return [];
94
+ const seen = new Set();
95
+ const out = [];
96
+ for (const raw of input) {
97
+ const word = trimString(raw);
98
+ if (!word) continue;
99
+ const lower = word.toLowerCase();
100
+ if (seen.has(lower)) continue;
101
+ seen.add(lower);
102
+ out.push(word);
103
+ if (out.length >= 12) break;
104
+ }
105
+ return out;
106
+ }
107
+
108
/**
 * Normalize a pacing hint into { dwell_ms_min, dwell_ms_max }.
 * Each bound is clamped to [500, 60000] ms, falling back per-field to the
 * provided defaults; an inverted range is repaired as [min, min + 1000].
 */
function normalizePacingHint(input, fallback = DEFAULT_PACING_HINT) {
  const isRecord = Boolean(input) && typeof input === 'object' && !Array.isArray(input);
  if (!isRecord) {
    return { dwell_ms_min: fallback.dwell_ms_min, dwell_ms_max: fallback.dwell_ms_max };
  }
  const lo = clampInt(input.dwell_ms_min, 500, 60000, fallback.dwell_ms_min);
  const hi = clampInt(input.dwell_ms_max, 500, 60000, fallback.dwell_ms_max);
  return hi < lo
    ? { dwell_ms_min: lo, dwell_ms_max: lo + 1000 }
    : { dwell_ms_min: lo, dwell_ms_max: hi };
}
117
+
118
/**
 * Normalize one raw block from the LLM into the canonical V6 block shape.
 *
 * Returns null for non-object input or a degenerate y-range
 * (y_bottom <= y_top after clamping). y coordinates are clamped to
 * [0, totalHeight]; enum fields degrade to paragraph / medium / secondary /
 * should when missing or outside their vocabulary. A missing id is derived
 * from the 0-based `index` as "b{index+1}".
 */
function normalizeBlock(raw, index, totalHeight) {
  const isRecord = Boolean(raw) && typeof raw === 'object' && !Array.isArray(raw);
  if (!isRecord) return null;

  const yTop = clampInt(raw.y_top, 0, totalHeight, 0);
  const yBottom = clampInt(raw.y_bottom, 0, totalHeight, yTop);
  if (yBottom <= yTop) return null;

  return {
    id: trimString(raw.id, `b${index + 1}`),
    y_top: yTop,
    y_bottom: yBottom,
    visual_kind: pickEnum(raw.visual_kind, VISUAL_KIND_SET, 'paragraph'),
    text: trimString(raw.text),
    summary: trimString(raw.summary),
    keywords: normalizeKeywords(raw.keywords),
    density: pickEnum(raw.density, DENSITY_SET, 'medium'),
    visual_weight: pickEnum(raw.visual_weight, VISUAL_WEIGHT_SET, 'secondary'),
    contains_image: Boolean(raw.contains_image),
    reading_priority: pickEnum(raw.reading_priority, READING_PRIORITY_SET, 'should'),
    pacing_hint: normalizePacingHint(raw.pacing_hint),
    narration_hint: trimString(raw.narration_hint),
  };
}
146
+
147
/**
 * Normalize one unsafe region to { y_top, y_bottom, reason }, clamping the
 * y-range to [0, totalHeight]. Returns null for non-objects or for ranges
 * that are empty after clamping. Unknown reasons degrade to 'footer_promo'.
 */
function normalizeUnsafeRegion(raw, totalHeight) {
  const isRecord = Boolean(raw) && typeof raw === 'object' && !Array.isArray(raw);
  if (!isRecord) return null;
  const top = clampInt(raw.y_top, 0, totalHeight, 0);
  const bottom = clampInt(raw.y_bottom, 0, totalHeight, top);
  if (bottom <= top) return null;
  return {
    y_top: top,
    y_bottom: bottom,
    reason: pickEnum(raw.reason, UNSAFE_REASON_SET, 'footer_promo'),
  };
}
155
+
156
/**
 * Normalize the advisory narrative arc.
 *
 * Returns { structure, suggested_flow }: structure falls back to 'linear';
 * suggested_flow keeps at most 12 entries. Each raw flow row may be a bare
 * string (kept as a step with no block refs) or an object whose step text
 * comes from `step` ?? `note` and whose block_ids are filtered against the
 * known `blockIds`. Rows with neither step text nor valid refs are dropped.
 *
 * Fixes vs. the previous version:
 * - `.map(trimString)` passed Array#map's (value, index, array) arguments
 *   straight through, so the array index became trimString's `fallback`
 *   and a non-string id could map to the number 0; ids are now trimmed via
 *   an explicit arrow that always falls back to ''.
 * - String rows now carry `block_ids: []` as well, so every flow entry has
 *   a consistent { step, block_ids } shape for downstream consumers.
 */
function normalizeNarrativeArc(raw, blockIds) {
  const fallback = { structure: 'linear', suggested_flow: [] };
  if (!raw || typeof raw !== 'object' || Array.isArray(raw)) return fallback;

  const structure = pickEnum(raw.structure, NARRATIVE_STRUCTURE_SET, 'linear');
  const idSet = new Set(blockIds);

  const rows = Array.isArray(raw.suggested_flow) ? raw.suggested_flow : [];
  const flow = rows
    .map((row) => {
      if (typeof row === 'string') {
        const text = row.trim();
        return text ? { step: text, block_ids: [] } : null;
      }
      if (!row || typeof row !== 'object') return null;
      const step = trimString(row.step ?? row.note);
      const blockRefs = Array.isArray(row.block_ids)
        ? row.block_ids
          .map((id) => trimString(id))
          .filter((id) => id && idSet.has(id))
        : [];
      if (!step && blockRefs.length === 0) return null;
      return { step, block_ids: blockRefs };
    })
    .filter(Boolean)
    .slice(0, 12);

  return { structure, suggested_flow: flow };
}
184
+
185
/**
 * Normalize the recording viewport. Defaults to a 1080x1920 portrait frame;
 * both dimensions are clamped to sane screen bounds.
 */
function normalizeViewport(raw) {
  return {
    width: clampInt(raw?.width, 320, 4096, 1080),
    height: clampInt(raw?.height, 480, 4096, 1920),
  };
}
190
+
191
/**
 * Normalize a raw analyze_page payload into the canonical V6
 * page_understanding shape.
 *
 * Never throws: malformed pieces degrade to defaults, invalid blocks and
 * unsafe regions are dropped, and both lists come back sorted by y_top.
 * Duplicate block ids (after sorting) are suffixed _2, _3, ... so every id
 * is unique before the narrative arc resolves its block references.
 */
export function normalizePageUnderstanding(raw = {}) {
  const totalHeight = clampInt(raw.full_height_px ?? raw.total_height, 100, 200000, 1920);

  const blocks = (Array.isArray(raw.blocks) ? raw.blocks : [])
    .map((block, index) => normalizeBlock(block, index, totalHeight))
    .filter(Boolean)
    .sort((a, b) => a.y_top - b.y_top);

  // Re-id later duplicates with the first free numeric suffix.
  const usedIds = new Set();
  for (const block of blocks) {
    if (usedIds.has(block.id)) {
      let counter = 2;
      while (usedIds.has(`${block.id}_${counter}`)) counter += 1;
      block.id = `${block.id}_${counter}`;
    }
    usedIds.add(block.id);
  }

  const unsafeRegions = (Array.isArray(raw.unsafe_regions) ? raw.unsafe_regions : [])
    .map((region) => normalizeUnsafeRegion(region, totalHeight))
    .filter(Boolean)
    .sort((a, b) => a.y_top - b.y_top);

  const hasMeta = Boolean(raw.meta) && typeof raw.meta === 'object' && !Array.isArray(raw.meta);

  return {
    url: trimString(raw.url),
    page_type: trimString(raw.page_type, 'generic_article'),
    primary_topic: trimString(raw.primary_topic),
    viewport: normalizeViewport(raw.viewport),
    preheat_strategy: pickEnum(raw.preheat_strategy, PREHEAT_STRATEGY_SET, 'full_scroll_then_top'),
    full_height_px: totalHeight,
    blocks,
    unsafe_regions: unsafeRegions,
    narrative_arc: normalizeNarrativeArc(raw.narrative_arc, blocks.map((b) => b.id)),
    meta: hasMeta ? raw.meta : {},
  };
}
238
+
239
/**
 * Structural validation of a (normalized) page_understanding payload.
 *
 * Returns { ok, errors } instead of throwing, collecting every problem so
 * callers can surface them all at once. Accepts null/undefined payloads
 * (treated as an empty object, which fails the required-field checks).
 */
export function validatePageUnderstanding(payload) {
  const model = payload ?? {};
  const errors = [];

  if (typeof model.url !== 'string' || !model.url.trim()) errors.push('url required');

  if (!Number.isFinite(model.full_height_px) || model.full_height_px <= 0) {
    errors.push('full_height_px invalid');
  }

  const viewportOk = model.viewport && typeof model.viewport === 'object'
    && Number.isFinite(model.viewport.width) && Number.isFinite(model.viewport.height);
  if (!viewportOk) errors.push('viewport invalid');

  if (!PREHEAT_STRATEGY_SET.has(model.preheat_strategy)) {
    errors.push('preheat_strategy invalid');
  }

  if (!Array.isArray(model.blocks)) {
    errors.push('blocks must be array');
  } else {
    if (model.blocks.length === 0) errors.push('blocks empty — analyze_page produced no content units');
    model.blocks.forEach((block, i) => {
      if (!block || typeof block !== 'object') {
        errors.push(`blocks[${i}] invalid`);
        return;
      }
      if (!block.id || typeof block.id !== 'string') errors.push(`blocks[${i}].id invalid`);
      const rangeOk = Number.isFinite(block.y_top) && Number.isFinite(block.y_bottom)
        && block.y_bottom > block.y_top;
      if (!rangeOk) errors.push(`blocks[${i}].y range invalid`);
      if (!VISUAL_KIND_SET.has(block.visual_kind)) errors.push(`blocks[${i}].visual_kind invalid`);
      if (!DENSITY_SET.has(block.density)) errors.push(`blocks[${i}].density invalid`);
      if (!VISUAL_WEIGHT_SET.has(block.visual_weight)) errors.push(`blocks[${i}].visual_weight invalid`);
      if (!READING_PRIORITY_SET.has(block.reading_priority)) {
        errors.push(`blocks[${i}].reading_priority invalid`);
      }
    });
  }

  if (!Array.isArray(model.unsafe_regions)) {
    errors.push('unsafe_regions must be array');
  } else {
    model.unsafe_regions.forEach((region, i) => {
      const rangeOk = region && Number.isFinite(region.y_top) && Number.isFinite(region.y_bottom)
        && region.y_bottom > region.y_top;
      if (!rangeOk) {
        errors.push(`unsafe_regions[${i}] invalid`);
        return;
      }
      if (!UNSAFE_REASON_SET.has(region.reason)) errors.push(`unsafe_regions[${i}].reason invalid`);
    });
  }

  if (!model.narrative_arc || typeof model.narrative_arc !== 'object') {
    errors.push('narrative_arc invalid');
  } else if (!NARRATIVE_STRUCTURE_SET.has(model.narrative_arc.structure)) {
    errors.push('narrative_arc.structure invalid');
  }

  return { ok: errors.length === 0, errors };
}
301
+
302
/**
 * Return the first unsafe region whose inclusive [y_top, y_bottom] range
 * contains `y`, or null when none does (or when inputs are unusable).
 *
 * Hardened vs. the previous version: array entries that are not objects
 * (null, undefined, primitives) are now skipped instead of throwing a
 * TypeError when reading `.y_top` of null.
 *
 * @param {Array<{y_top:number, y_bottom:number}>} unsafeRegions
 * @param {number|string} y - candidate y coordinate (coerced via Number).
 * @returns {object|null} the matching region object, or null.
 */
export function findOverlappingUnsafeRegion(unsafeRegions, y) {
  if (!Array.isArray(unsafeRegions)) return null;
  const target = Number(y);
  if (!Number.isFinite(target)) return null;
  for (const region of unsafeRegions) {
    if (!region || typeof region !== 'object') continue;
    if (target >= region.y_top && target <= region.y_bottom) return region;
  }
  return null;
}
312
+
313
/**
 * Look up a block by its id. Returns null when `blocks` is not an array or
 * no (truthy) block carries the given id.
 */
export function findBlockById(blocks, blockId) {
  if (!Array.isArray(blocks)) return null;
  for (const block of blocks) {
    if (block && block.id === blockId) return block;
  }
  return null;
}
@@ -1,23 +1,36 @@
1
- // plan_video_segments — pure audio/video alignment planner.
1
+ // V6 plan_video_segments — audio / operations time alignment.
2
2
  //
3
- // Takes per-segment {text, audio_path, visual_kind, ...} and returns unified
4
- // plan segments with:
5
- // - audio_duration_ms (read via ffprobe from the provided audio_path)
6
- // - subtitle_text (= text)
7
- // - presentation.duration / per_card_duration (audio_duration + buffer)
8
- // - dwell_ms (= audio_duration; lets the same segment drive record_url_narration)
3
+ // Takes per-segment input (text, audio_path, visual_kind, operations[]) and:
4
+ // 1. probes audio_path via ffprobe → audio_duration_ms
5
+ // 2. fills audio_duration_ms / subtitle_text / presentation.duration / dwell_ms
6
+ // 3. expands `"duration_ms": "fill"` on the LAST hold operation to make
7
+ //    sum(operations.duration_ms) ≈ audio_duration_ms
8
+ // 4. validates sum(operations.duration_ms) within 200ms of audio_duration_ms
9
+ // — rejects with operations_duration_mismatch otherwise
10
+ // 5. rejects V5 fields (action / target_y / target_y_content_label /
11
+ // focus_region / transition_ms / agent-written dwell_ms / phase.beats)
9
12
  //
10
- // Previously this tool ALSO synthesized TTS internally — which duplicated
11
- // the work when callers had already run synthesize_tts, and caused the
12
- // "wrong standard chain" confusion in fragments.md. TTS is now decoupled:
13
- // callers must run synthesize_tts per segment first and pass the resulting
14
- // audio_path here. See docs/scenario-content-creation/video-synthesis-design.md.
15
- //
16
- // Lives in daemon/src/tools/ as a reusable handler module; the MCP-tool
17
- // registration lives in daemon/mcp-servers/official/media-tools/index.js.
13
+ // Lives in daemon/src/tools/; MCP registration in
14
+ // daemon/mcp-servers/official/media-tools/index.js.
18
15
 
19
16
  import { spawn } from 'node:child_process';
20
17
 
18
// Max allowed drift between sum(operations.duration_ms) and the probed
// audio duration before a segment plan is rejected.
const DURATION_TOLERANCE_MS = 200;

// V5-era per-segment fields that V6 refuses outright, so stale plan shapes
// fail fast instead of being silently ignored.
const V5_SEGMENT_FIELDS = Object.freeze([
  'action', 'target_y', 'target_y_content_label', 'targetYContentLabel',
  'focus_region', 'focusRegion', 'transition_ms', 'transition_ratio',
  'beats', 'visual_action', 'camera_motion',
]);
33
+
21
34
// Wrap plain text in the MCP tool-result content shape.
function toolText(text) {
  const content = [{ type: 'text', text }];
  return { content };
}
@@ -60,25 +73,125 @@ function planDurationSec(audioDurationMs, bufferSec = 0.5) {
60
73
  return Math.ceil(raw * 2) / 2;
61
74
  }
62
75
 
76
/**
 * Reject V5-era segment fields up front.
 *
 * Throws an Error with code PHASE_V5_FIELDS_REMOVED when `seg` carries any
 * own property listed in V5_SEGMENT_FIELDS, or when the caller hand-set a
 * numeric dwell_ms (V6 derives dwell_ms from the probed audio duration).
 */
function assertNoV5Fields(seg, index) {
  const reject = (message) => {
    const error = new Error(message);
    error.code = 'PHASE_V5_FIELDS_REMOVED';
    throw error;
  };

  for (const field of V5_SEGMENT_FIELDS) {
    if (Object.prototype.hasOwnProperty.call(seg, field)) {
      reject(
        `phase_v5_fields_removed: segments[${index}] carries V5 field "${field}". `
        + 'V6 segments: { text, audio_path, visual_kind, operations?, visual_path?, visual_paths?, transition?, presentation? }.',
      );
    }
  }

  if (seg.dwell_ms != null && Number.isFinite(Number(seg.dwell_ms))) {
    reject(
      `phase_v5_fields_removed: segments[${index}].dwell_ms is set manually. `
      + 'V6 fills dwell_ms automatically from audio_duration_ms; remove dwell_ms from input.',
    );
  }
}
96
+
97
/**
 * Normalize a segment's operations[] against its audio duration.
 *
 * Rules:
 * - `duration_ms: "fill"` may appear at most once, only on the LAST atom,
 *   and only when that atom is a hold; it expands to the audio remainder.
 * - Every other duration_ms must coerce to a finite positive number.
 *
 * @param {Array<object>} operations - raw operation atoms (not mutated;
 *   entries are shallow-copied before expansion).
 * @param {number} audioDurationMs - probed TTS duration for the segment.
 * @param {number} segmentIndex - segment position, used in error messages.
 * @returns {{operations: Array<object>, durationSumMs: number}}
 * @throws {Error} with .code FILL_POSITION_INVALID or OPERATIONS_INVALID.
 *
 * Fix vs. the previous version: when "fill" resolved to 0 (the explicit
 * operations already total >= audio_duration_ms), the old code fell through
 * to the generic `must be a positive number ... (got 0)` error, blaming the
 * fill atom for a value the tool computed itself. That case now gets a
 * dedicated diagnostic naming the real cause. The duplicated validation
 * loop is also collapsed into a single pass.
 */
function processOperations(operations, audioDurationMs, segmentIndex) {
  if (!Array.isArray(operations) || operations.length === 0) {
    return { operations: [], durationSumMs: 0 };
  }

  const fail = (code, message) => {
    const error = new Error(message);
    error.code = code;
    throw error;
  };

  // Work on shallow copies so the caller's input stays untouched.
  const expanded = operations.map((op) => ({ ...op }));

  // Locate (and position-check) the single optional "fill" atom.
  let fillIndex = -1;
  for (let i = 0; i < expanded.length; i += 1) {
    const op = expanded[i];
    if (op.duration_ms !== 'fill') continue;
    if (fillIndex !== -1) {
      fail(
        'FILL_POSITION_INVALID',
        `fill_position_invalid: segments[${segmentIndex}].operations[${i}] has duration_ms="fill" but `
        + `another fill already at index ${fillIndex}. Only one "fill" allowed, and it must be the LAST hold.`,
      );
    }
    if (op.atom !== 'hold') {
      fail(
        'FILL_POSITION_INVALID',
        `fill_position_invalid: segments[${segmentIndex}].operations[${i}] has duration_ms="fill" on atom="${op.atom}". `
        + '"fill" is only allowed on the last hold atom.',
      );
    }
    if (i !== expanded.length - 1) {
      fail(
        'FILL_POSITION_INVALID',
        `fill_position_invalid: segments[${segmentIndex}].operations[${i}] has duration_ms="fill" but is not `
        + 'the last operation. "fill" must be the LAST atom.',
      );
    }
    fillIndex = i;
  }

  // Single validation pass over explicit durations, accumulating their sum.
  let sum = 0;
  for (let i = 0; i < expanded.length; i += 1) {
    if (i === fillIndex) continue;
    const n = Number(expanded[i].duration_ms);
    if (!Number.isFinite(n) || n <= 0) {
      fail(
        'OPERATIONS_INVALID',
        `operations_invalid: segments[${segmentIndex}].operations[${i}].duration_ms must be a positive number `
        + `(got ${expanded[i].duration_ms}).`,
      );
    }
    sum += n;
  }

  if (fillIndex !== -1) {
    const remainder = audioDurationMs - sum;
    if (remainder <= 0) {
      fail(
        'OPERATIONS_INVALID',
        `operations_invalid: segments[${segmentIndex}].operations[${fillIndex}] duration_ms="fill" resolves to `
        + `${Math.max(0, remainder)}ms because the other operations already total ${sum}ms, which meets or `
        + `exceeds audio_duration_ms=${audioDurationMs}ms. Shorten the explicit durations.`,
      );
    }
    expanded[fillIndex] = { ...expanded[fillIndex], duration_ms: remainder };
    sum += remainder;
  }

  return { operations: expanded, durationSumMs: Math.round(sum) };
}
170
+
63
171
  export async function runPlanVideoSegmentsTool({ segments } = {}) {
64
172
  if (!Array.isArray(segments) || segments.length === 0) {
65
173
  return toolError('segments must be a non-empty array.');
66
174
  }
67
175
 
68
- // Up-front validation — fail fast before any work.
176
+ // Up-front validation — fail fast before any ffprobe work.
69
177
  for (let i = 0; i < segments.length; i += 1) {
70
178
  const seg = segments[i] ?? {};
71
179
  if (typeof seg.audio_path !== 'string' || !seg.audio_path.trim()) {
72
180
  return toolError(
73
181
  `segments[${i}]: audio_path is required. plan_video_segments no longer synthesizes TTS — call synthesize_tts(text) `
74
- + 'first and pass the returned path as audio_path. Standard chain: synthesize_tts × N → plan_video_segments → '
75
- + 'record_url_narration + compose_video_v2 (share the same plan).'
182
+ + 'first and pass the returned path as audio_path. V6 standard chain: analyze_page → synthesize_tts × N → '
183
+ + 'plan_video_segments → record_url_narration + compose_video_v2.',
76
184
  );
77
185
  }
78
186
  const kind = String(seg.visual_kind ?? '');
79
187
  if (!kind) {
80
188
  return toolError(`segments[${i}]: visual_kind is required (image / video / gif / carousel).`);
81
189
  }
190
+ try {
191
+ assertNoV5Fields(seg, i);
192
+ } catch (err) {
193
+ return toolError(err.message);
194
+ }
82
195
  }
83
196
 
84
197
  const planned = [];
@@ -97,6 +210,25 @@ export async function runPlanVideoSegmentsTool({ segments } = {}) {
97
210
  audioDurationMs = 3000;
98
211
  }
99
212
 
213
+ let processedOps;
214
+ try {
215
+ processedOps = processOperations(seg.operations, audioDurationMs, i);
216
+ } catch (err) {
217
+ return toolError(err.message);
218
+ }
219
+
220
+ if (processedOps.operations.length > 0) {
221
+ const drift = Math.abs(processedOps.durationSumMs - audioDurationMs);
222
+ if (drift > DURATION_TOLERANCE_MS) {
223
+ return toolError(
224
+ `operations_duration_mismatch: segments[${i}].operations duration sum=${processedOps.durationSumMs}ms `
225
+ + `but audio_duration_ms=${audioDurationMs}ms (drift ${drift}ms > ${DURATION_TOLERANCE_MS}ms tolerance). `
226
+ + 'Adjust the section operations so their durations sum to TTS audio duration, or use '
227
+ + '"duration_ms": "fill" on the last hold to auto-fill the remainder.',
228
+ );
229
+ }
230
+ }
231
+
100
232
  let presentation;
101
233
  if (kind === 'carousel') {
102
234
  const numCards = Array.isArray(seg.visual_paths) ? seg.visual_paths.length : 1;
@@ -104,7 +236,6 @@ export async function runPlanVideoSegmentsTool({ segments } = {}) {
104
236
  const perCard = Math.max(2, Math.ceil((totalDuration / numCards) * 2) / 2);
105
237
  presentation = { per_card_duration: perCard };
106
238
  } else {
107
- // image / scroll / video / gif
108
239
  const duration = planDurationSec(audioDurationMs, kind === 'scroll' ? 1.0 : 0.5);
109
240
  presentation = { duration, ...(kind === 'scroll' ? { style: 'scroll' } : {}) };
110
241
  }
@@ -114,10 +245,9 @@ export async function runPlanVideoSegmentsTool({ segments } = {}) {
114
245
  audio_path: seg.audio_path,
115
246
  audio_duration_ms: audioDurationMs,
116
247
  ...(text ? { subtitle_text: text } : {}),
248
+ ...(processedOps.operations.length > 0 ? { operations: processedOps.operations } : {}),
117
249
  presentation: { ...presentation, ...(seg.presentation ?? {}) },
118
- // dwell_ms doubles as record_url_narration's per-phase hold duration so
119
- // recording naturally tracks the narration audio.
120
- dwell_ms: seg.dwell_ms ?? audioDurationMs,
250
+ dwell_ms: audioDurationMs,
121
251
  });
122
252
  }
123
253