@lightcone-ai/daemon 0.15.70 → 0.15.72
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json
CHANGED
|
@@ -9,7 +9,12 @@ const DEFAULT_WIDTH = 1080;
|
|
|
9
9
|
const DEFAULT_HEIGHT = 1920;
|
|
10
10
|
const DEFAULT_FPS = 30;
|
|
11
11
|
const TRANSITION_DURATION = 0.5;
|
|
12
|
-
|
|
12
|
+
// ASS `Fontname` is a single family name, not a CSS-style fallback list — a
|
|
13
|
+
// comma here shifts every subsequent field in the `Style:` line, corrupting the
|
|
14
|
+
// whole style so libass renders nothing (i.e. burned-in subtitles look missing).
|
|
15
|
+
// Use one installed family; libass + fontconfig handle glyph fallback. Override
|
|
16
|
+
// via SUBTITLE_FONT env if the deployment ships a different CJK font.
|
|
17
|
+
const SUBTITLE_FONT = (() => {
  // ASS `Fontname` takes exactly one family: keep only the text before the
  // first comma of the configured value, and fall back when that is blank.
  const fallbackFamily = 'Noto Sans CJK SC';
  const [firstFamily] = (process.env.SUBTITLE_FONT || fallbackFamily).split(',');
  return firstFamily.trim() || fallbackFamily;
})();
|
|
13
18
|
const SUBTITLE_FONT_SIZE = 72;
|
|
14
19
|
const SUBTITLE_MARGIN_V = 120;
|
|
15
20
|
|
|
@@ -23,6 +28,28 @@ function msToAssTimestamp(ms) {
|
|
|
23
28
|
return `${hr}:${String(min).padStart(2, '0')}:${String(sec).padStart(2, '0')}.${String(cs).padStart(2, '0')}`;
|
|
24
29
|
}
|
|
25
30
|
|
|
31
|
+
// Split a subtitle block into display-sized sentence units. Breaks on CJK/ASCII
|
|
32
|
+
// sentence punctuation and newlines; merges very short fragments forward so we
|
|
33
|
+
// don't flash one-character lines.
|
|
34
|
+
/**
 * Split a subtitle block into display-sized sentence units.
 *
 * Sentences end at CJK (`。！？；`) or ASCII (`! ? ;`) terminators and at
 * newlines; an ASCII period is deliberately not a terminator. A run of
 * terminators such as `?!` stays attached to the sentence it closes.
 * Fragments shorter than six code points are merged into a neighbour so a
 * one- or two-character line never gets its own subtitle event.
 *
 * @param {unknown} text - Raw subtitle text; null/undefined are treated as empty.
 * @returns {string[]} Sentence units in reading order. Empty input yields `[]`;
 *   text with a single sentence is returned as one trimmed element.
 */
function splitSubtitleSentences(text) {
  const raw = String(text ?? '').trim();
  if (!raw) return [];

  // Full-width terminators are written as escapes so they survive any
  // width-normalising tooling: \u3002 = 。 \uFF01 = ！ \uFF1F = ？ \uFF1B = ；
  // The lookbehind keeps the terminator on its sentence; the lookahead stops a
  // run like "?!" from being cut into a stand-alone "!" fragment.
  const boundary = /(?<=[\u3002\uFF01\uFF1F\uFF1B!?;\n])(?![\u3002\uFF01\uFF1F\uFF1B!?;\n])/u;
  const pieces = raw
    .split(boundary)
    .map((chunk) => chunk.replace(/\s+/g, ' ').trim())
    .filter(Boolean);
  if (pieces.length <= 1) return [raw];

  const MIN_UNIT_CHARS = 6;
  const isShort = (unit) => Array.from(unit).length < MIN_UNIT_CHARS;
  // Pieces were trimmed, so Latin text needs its separating space restored;
  // CJK text (non-ASCII at either edge) is joined without one.
  const glue = (left, right) => (
    /[\x21-\x7e]$/.test(left) && /^[\x21-\x7e]/.test(right)
      ? `${left} ${right}`
      : `${left}${right}`
  );

  const merged = [];
  for (const piece of pieces) {
    const lastIndex = merged.length - 1;
    if (lastIndex >= 0 && isShort(merged[lastIndex])) {
      // Previous unit is too short to stand alone: fold this piece into it.
      merged[lastIndex] = glue(merged[lastIndex], piece);
    } else {
      merged.push(piece);
    }
  }

  // A short fragment at the very end has nothing after it to merge into, so
  // attach it to the unit before it instead of leaving it on its own.
  if (merged.length > 1 && isShort(merged[merged.length - 1])) {
    const tail = merged.pop();
    merged[merged.length - 1] = glue(merged[merged.length - 1], tail);
  }
  return merged;
}
|
|
52
|
+
|
|
26
53
|
function wrapSubtitleText(text, maxChars = 14) {
|
|
27
54
|
const chars = Array.from(String(text ?? ''));
|
|
28
55
|
if (chars.length <= maxChars) return chars.join('');
|
|
@@ -292,7 +319,14 @@ export async function composeVideoV2({
|
|
|
292
319
|
finalClip = await silentClip({ videoPath: visualClip.path, duration: visualClip.duration, tmpDir });
|
|
293
320
|
}
|
|
294
321
|
|
|
295
|
-
|
|
322
|
+
// Accept `text` as an alias for `subtitle_text`: plan_video_segments takes
|
|
323
|
+
// segment narration as `text` on input, compose_video_v2's canonical name is
|
|
324
|
+
// `subtitle_text`. Either reaches the burn pass so subtitles aren't silently dropped.
|
|
325
|
+
const subtitleText = (
|
|
326
|
+
typeof seg.subtitle_text === 'string' ? seg.subtitle_text
|
|
327
|
+
: typeof seg.text === 'string' ? seg.text
|
|
328
|
+
: ''
|
|
329
|
+
).trim();
|
|
296
330
|
readyClips.push({ path: finalClip, duration: visualClip.duration, transition, subtitleText });
|
|
297
331
|
}
|
|
298
332
|
|
|
@@ -304,15 +338,29 @@ export async function composeVideoV2({
|
|
|
304
338
|
}
|
|
305
339
|
}
|
|
306
340
|
|
|
307
|
-
// Build subtitle entries with cumulative timeline timestamps
|
|
341
|
+
// Build subtitle entries with cumulative timeline timestamps. When a clip's
|
|
342
|
+
// subtitle text spans several sentences, split it into one event per sentence
|
|
343
|
+
// and spread them across the clip in proportion to their length, so a long
|
|
344
|
+
// beat reads as sequential lines roughly tracking the narration instead of one
|
|
345
|
+
// static wall of text.
|
|
308
346
|
let cursorMs = 0;
|
|
309
347
|
const subtitleEntries = [];
|
|
310
348
|
for (const clip of readyClips) {
|
|
311
349
|
if (clip.subtitleText) {
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
350
|
+
const clipMs = Math.round(clip.duration * 1000);
|
|
351
|
+
const sentences = splitSubtitleSentences(clip.subtitleText);
|
|
352
|
+
const totalLen = sentences.reduce((sum, s) => sum + Array.from(s).length, 0) || 1;
|
|
353
|
+
let offsetMs = 0;
|
|
354
|
+
sentences.forEach((sentence, idx) => {
|
|
355
|
+
const share = Array.from(sentence).length / totalLen;
|
|
356
|
+
const isLast = idx === sentences.length - 1;
|
|
357
|
+
const spanMs = isLast ? clipMs - offsetMs : Math.max(1, Math.round(clipMs * share));
|
|
358
|
+
subtitleEntries.push({
|
|
359
|
+
text: sentence,
|
|
360
|
+
start_ms: cursorMs + offsetMs,
|
|
361
|
+
end_ms: cursorMs + offsetMs + spanMs,
|
|
362
|
+
});
|
|
363
|
+
offsetMs += spanMs;
|
|
316
364
|
});
|
|
317
365
|
}
|
|
318
366
|
cursorMs += Math.round(clip.duration * 1000);
|
|
@@ -1,17 +1,12 @@
|
|
|
1
1
|
import { spawn } from 'node:child_process';
|
|
2
2
|
import { mkdirSync } from 'node:fs';
|
|
3
|
-
import { stat, writeFile } from 'node:fs/promises';
|
|
3
|
+
import { mkdtemp, rm, stat, writeFile } from 'node:fs/promises';
|
|
4
|
+
import os from 'node:os';
|
|
4
5
|
import path from 'node:path';
|
|
5
6
|
|
|
6
7
|
import { launchChromiumMobile, openPageAndSettle } from './chromium-driver.js';
|
|
7
8
|
import { defaultDisplayPool } from './display-pool.js';
|
|
8
|
-
import {
|
|
9
|
-
createUnexpectedExitWatcher,
|
|
10
|
-
startFfmpegCapture,
|
|
11
|
-
stopFfmpegCapture,
|
|
12
|
-
waitForProcessExit,
|
|
13
|
-
} from './ffmpeg-runner.js';
|
|
14
|
-
import { estimatePlanDurationMs } from './plan-estimator.js';
|
|
9
|
+
import { createUnexpectedExitWatcher, waitForProcessExit } from './ffmpeg-runner.js';
|
|
15
10
|
import { executePlanPhases, normalizePlanPhases } from './plan-executor.js';
|
|
16
11
|
|
|
17
12
|
const DEFAULT_VIEWPORT = Object.freeze({ width: 1080, height: 1920 });
|
|
@@ -177,6 +172,49 @@ function scalePhaseY(phase, zoom) {
|
|
|
177
172
|
};
|
|
178
173
|
}
|
|
179
174
|
|
|
175
|
+
// Re-encode the page recording (webm, page content only — no browser chrome) into
|
|
176
|
+
// the mp4 the rest of the pipeline expects, dropping the head segment that covers
|
|
177
|
+
// page load + settle so the clip starts at the first plan phase.
|
|
178
|
+
/**
 * Re-encode a webm recording into an H.264 mp4 by running ffmpeg as a child
 * process, optionally skipping the first `startMs` milliseconds of the input.
 *
 * @param {object} [options]
 * @param {string} options.webmPath - Input recording passed to ffmpeg `-i`.
 * @param {string} options.outputPath - Destination mp4 (overwritten, `-y`).
 * @param {number} [options.startMs=0] - Head to drop, in ms; non-numeric or
 *   negative values are treated as 0.
 * @param {number} [options.fps=DEFAULT_FPS] - Output frame rate; `-r` is only
 *   passed when this is a finite positive number.
 * @param {string} [options.ffmpegBin='ffmpeg'] - Executable to spawn.
 * @returns {Promise<void>} Resolves when ffmpeg exits with code 0.
 * @throws {Error} `code === 'FFMPEG_SPAWN_FAILED'` when the process cannot be
 *   started; `code === 'FFMPEG_TRANSCODE_FAILED'` on a non-zero exit (the
 *   message carries the last 2000 characters of stderr).
 */
async function transcodeWebmToMp4({
  webmPath,
  outputPath,
  startMs = 0,
  fps = DEFAULT_FPS,
  ffmpegBin = 'ffmpeg',
} = {}) {
  const seekSec = Math.max(0, Number(startMs) || 0) / 1000;
  const hasValidFps = Number.isFinite(Number(fps)) && Number(fps) > 0;
  const args = [
    '-y',
    // `-ss` before `-i` seeks the input, so the skipped head is not encoded.
    ...(seekSec > 0 ? ['-ss', seekSec.toFixed(3)] : []),
    '-i', webmPath,
    '-an',
    '-c:v', 'libx264',
    '-preset', 'veryfast',
    '-pix_fmt', 'yuv420p',
    ...(hasValidFps ? ['-r', String(fps)] : []),
    '-movflags', '+faststart',
    outputPath,
  ];

  // Only the last 2000 characters of stderr are reported, so keep a bounded
  // tail instead of every chunk (16 KiB covers 2000 chars of any UTF-8 width).
  const STDERR_TAIL_BYTES = 16 * 1024;

  await new Promise((resolve, reject) => {
    // stdout is never read here; ignoring it avoids leaving an undrained pipe
    // that could block the child if it ever wrote there.
    const proc = spawn(ffmpegBin, args, { stdio: ['ignore', 'ignore', 'pipe'] });
    let stderrTail = Buffer.alloc(0);
    proc.stderr?.on('data', (chunk) => {
      stderrTail = Buffer.concat([stderrTail, chunk]);
      if (stderrTail.length > STDERR_TAIL_BYTES) {
        stderrTail = stderrTail.subarray(-STDERR_TAIL_BYTES);
      }
    });
    proc.once('error', (err) => {
      const wrapped = new Error(`ffmpeg_spawn_failed:${err.message}`, { cause: err });
      wrapped.code = 'FFMPEG_SPAWN_FAILED';
      reject(wrapped);
    });
    // 'close' can follow 'error'; the second reject is then a no-op.
    proc.once('close', (code, signal) => {
      if (code === 0) return resolve();
      const wrapped = new Error(
        `ffmpeg_transcode_failed:code=${code}: ${stderrTail.toString().slice(-2000)}`
      );
      wrapped.code = 'FFMPEG_TRANSCODE_FAILED';
      wrapped.exitCode = code;
      wrapped.signal = signal ?? null;
      reject(wrapped);
    });
  });
}
|
|
217
|
+
|
|
180
218
|
export async function recordUrlNarration({
|
|
181
219
|
plan,
|
|
182
220
|
output_path,
|
|
@@ -189,11 +227,14 @@ export async function recordUrlNarration({
|
|
|
189
227
|
settle_ms = 4000,
|
|
190
228
|
page_zoom = 1.1,
|
|
191
229
|
displayPool = defaultDisplayPool,
|
|
192
|
-
ffmpegDurationBufferSec = 8,
|
|
193
230
|
startupProbeMs = 1200,
|
|
194
|
-
ffmpegStopTimeoutMs = 10000,
|
|
195
231
|
xvfbStopTimeoutMs = 5000,
|
|
196
232
|
postPlanTailMs = 600,
|
|
233
|
+
recordingDir = null,
|
|
234
|
+
launchChromiumFn = launchChromiumMobile,
|
|
235
|
+
openPageFn = openPageAndSettle,
|
|
236
|
+
transcodeFn = transcodeWebmToMp4,
|
|
237
|
+
nowMs = () => Date.now(),
|
|
197
238
|
} = {}) {
|
|
198
239
|
const zoom = Number.isFinite(Number(page_zoom)) && Number(page_zoom) > 0 ? Number(page_zoom) : 1.1;
|
|
199
240
|
const rawPhases = normalizePlanPhases(plan);
|
|
@@ -212,14 +253,14 @@ export async function recordUrlNarration({
|
|
|
212
253
|
mkdirSync(path.dirname(resolvedOutputPath), { recursive: true });
|
|
213
254
|
mkdirSync(path.dirname(resolvedEventsPath), { recursive: true });
|
|
214
255
|
|
|
256
|
+
const ownTempDir = !recordingDir;
|
|
257
|
+
const recVideoDir = recordingDir || await mkdtemp(path.join(os.tmpdir(), 'lc-recvid-'));
|
|
258
|
+
|
|
215
259
|
let displayLease;
|
|
216
260
|
let xvfb;
|
|
217
|
-
let ffmpeg;
|
|
218
|
-
let browserSession;
|
|
219
261
|
let xvfbWatcher;
|
|
220
|
-
let
|
|
262
|
+
let browserSession = null;
|
|
221
263
|
let primaryError = null;
|
|
222
|
-
|
|
223
264
|
const cleanupErrors = [];
|
|
224
265
|
|
|
225
266
|
try {
|
|
@@ -234,11 +275,26 @@ export async function recordUrlNarration({
|
|
|
234
275
|
});
|
|
235
276
|
xvfbWatcher = createUnexpectedExitWatcher(xvfb.child, 'xvfb');
|
|
236
277
|
|
|
237
|
-
|
|
278
|
+
// The page recording captures the page viewport only (no browser chrome),
|
|
279
|
+
// regardless of the on-screen window. recordVideo starts when the page is
|
|
280
|
+
// created, so the webm includes goto + settle; we measure that head and trim
|
|
281
|
+
// it off in transcodeFn.
|
|
282
|
+
const recordStartedAt = nowMs();
|
|
283
|
+
browserSession = await launchChromiumFn({
|
|
238
284
|
display,
|
|
239
285
|
viewport: normalizedViewport,
|
|
286
|
+
contextOptions: {
|
|
287
|
+
recordVideo: {
|
|
288
|
+
dir: recVideoDir,
|
|
289
|
+
size: { width: normalizedViewport.width, height: normalizedViewport.height },
|
|
290
|
+
},
|
|
291
|
+
},
|
|
240
292
|
});
|
|
241
|
-
|
|
293
|
+
const videoHandle = typeof browserSession.page.video === 'function'
|
|
294
|
+
? browserSession.page.video()
|
|
295
|
+
: null;
|
|
296
|
+
|
|
297
|
+
await openPageFn(browserSession.page, {
|
|
242
298
|
url: resolvedUrl,
|
|
243
299
|
settleMs: settle_ms,
|
|
244
300
|
});
|
|
@@ -250,42 +306,53 @@ export async function recordUrlNarration({
|
|
|
250
306
|
await browserSession.page.waitForTimeout(300);
|
|
251
307
|
}
|
|
252
308
|
|
|
253
|
-
const estimatedDurationMs = estimatePlanDurationMs(executablePlan);
|
|
254
|
-
const estimatedDurationSec = Math.max(
|
|
255
|
-
5,
|
|
256
|
-
Math.ceil(estimatedDurationMs / 1000) + Math.max(0, Number(ffmpegDurationBufferSec) || 0)
|
|
257
|
-
);
|
|
258
|
-
|
|
259
|
-
ffmpeg = await startFfmpegCapture({
|
|
260
|
-
display,
|
|
261
|
-
outputPath: resolvedOutputPath,
|
|
262
|
-
width: normalizedViewport.width,
|
|
263
|
-
height: normalizedViewport.height,
|
|
264
|
-
fps: normalizedFps,
|
|
265
|
-
durationSec: estimatedDurationSec,
|
|
266
|
-
startupProbeMs,
|
|
267
|
-
});
|
|
268
|
-
ffmpegWatcher = createUnexpectedExitWatcher(ffmpeg.child, 'ffmpeg');
|
|
269
|
-
|
|
270
309
|
await scrollToTop(browserSession.page);
|
|
271
310
|
await browserSession.page.waitForTimeout(350);
|
|
272
311
|
|
|
312
|
+
const headTrimMs = Math.max(0, nowMs() - recordStartedAt);
|
|
313
|
+
|
|
273
314
|
const eventsLog = await Promise.race([
|
|
274
315
|
executePlanPhases(browserSession.page, executablePlan),
|
|
275
316
|
xvfbWatcher.promise,
|
|
276
|
-
ffmpegWatcher.promise,
|
|
277
317
|
]);
|
|
278
318
|
|
|
279
319
|
await browserSession.page.waitForTimeout(Math.max(0, Number(postPlanTailMs) || 0));
|
|
280
320
|
|
|
281
|
-
ffmpegWatcher.deactivate();
|
|
282
|
-
await stopFfmpegCapture(ffmpeg, {
|
|
283
|
-
timeoutMs: ffmpegStopTimeoutMs,
|
|
284
|
-
});
|
|
285
|
-
|
|
286
321
|
xvfbWatcher.deactivate();
|
|
287
322
|
|
|
288
|
-
|
|
323
|
+
// Flush the recording: video is written when the context closes.
|
|
324
|
+
let webmPath = null;
|
|
325
|
+
try {
|
|
326
|
+
await browserSession.context.close();
|
|
327
|
+
} catch (closeError) {
|
|
328
|
+
cleanupErrors.push(`context_close_failed:${closeError.message}`);
|
|
329
|
+
}
|
|
330
|
+
if (videoHandle) {
|
|
331
|
+
try {
|
|
332
|
+
webmPath = await videoHandle.path();
|
|
333
|
+
} catch (pathError) {
|
|
334
|
+
cleanupErrors.push(`video_path_failed:${pathError.message}`);
|
|
335
|
+
}
|
|
336
|
+
}
|
|
337
|
+
try {
|
|
338
|
+
await browserSession.browser.close();
|
|
339
|
+
} catch (closeError) {
|
|
340
|
+
cleanupErrors.push(`browser_close_failed:${closeError.message}`);
|
|
341
|
+
}
|
|
342
|
+
browserSession = null;
|
|
343
|
+
|
|
344
|
+
if (!webmPath) {
|
|
345
|
+
const error = new Error('record_video_not_produced');
|
|
346
|
+
error.code = 'RECORD_VIDEO_NOT_PRODUCED';
|
|
347
|
+
throw error;
|
|
348
|
+
}
|
|
349
|
+
|
|
350
|
+
await transcodeFn({
|
|
351
|
+
webmPath,
|
|
352
|
+
outputPath: resolvedOutputPath,
|
|
353
|
+
startMs: headTrimMs,
|
|
354
|
+
fps: normalizedFps,
|
|
355
|
+
});
|
|
289
356
|
|
|
290
357
|
const videoStat = await stat(resolvedOutputPath);
|
|
291
358
|
if (!videoStat.isFile() || videoStat.size <= 0) {
|
|
@@ -294,42 +361,36 @@ export async function recordUrlNarration({
|
|
|
294
361
|
throw error;
|
|
295
362
|
}
|
|
296
363
|
|
|
364
|
+
await writeFile(resolvedEventsPath, JSON.stringify(eventsLog, null, 2), 'utf8');
|
|
365
|
+
|
|
366
|
+
const lastTms = Array.isArray(eventsLog)
|
|
367
|
+
? eventsLog.reduce((max, ev) => Math.max(max, Number(ev?.t_ms) || 0), 0)
|
|
368
|
+
: 0;
|
|
369
|
+
|
|
297
370
|
return {
|
|
298
371
|
video_path: resolvedOutputPath,
|
|
299
372
|
events_path: resolvedEventsPath,
|
|
300
373
|
events_log: eventsLog,
|
|
374
|
+
duration_ms: lastTms > 0 ? lastTms : null,
|
|
301
375
|
display,
|
|
302
376
|
};
|
|
303
377
|
} catch (error) {
|
|
304
378
|
primaryError = error;
|
|
305
379
|
throw error;
|
|
306
380
|
} finally {
|
|
307
|
-
ffmpegWatcher?.deactivate();
|
|
308
381
|
xvfbWatcher?.deactivate();
|
|
309
382
|
|
|
310
383
|
if (browserSession) {
|
|
311
384
|
try {
|
|
312
|
-
await browserSession.close();
|
|
385
|
+
await browserSession.browser.close();
|
|
313
386
|
} catch (closeError) {
|
|
314
387
|
cleanupErrors.push(`browser_close_failed:${closeError.message}`);
|
|
315
388
|
}
|
|
316
389
|
}
|
|
317
390
|
|
|
318
|
-
if (ffmpeg) {
|
|
319
|
-
try {
|
|
320
|
-
await stopFfmpegCapture(ffmpeg, {
|
|
321
|
-
timeoutMs: ffmpegStopTimeoutMs,
|
|
322
|
-
});
|
|
323
|
-
} catch (stopError) {
|
|
324
|
-
cleanupErrors.push(`ffmpeg_stop_failed:${stopError.message}`);
|
|
325
|
-
}
|
|
326
|
-
}
|
|
327
|
-
|
|
328
391
|
if (xvfb) {
|
|
329
392
|
try {
|
|
330
|
-
await stopXvfb(xvfb, {
|
|
331
|
-
timeoutMs: xvfbStopTimeoutMs,
|
|
332
|
-
});
|
|
393
|
+
await stopXvfb(xvfb, { timeoutMs: xvfbStopTimeoutMs });
|
|
333
394
|
} catch (stopError) {
|
|
334
395
|
cleanupErrors.push(`xvfb_stop_failed:${stopError.message}`);
|
|
335
396
|
}
|
|
@@ -339,6 +400,10 @@ export async function recordUrlNarration({
|
|
|
339
400
|
displayLease.release();
|
|
340
401
|
}
|
|
341
402
|
|
|
403
|
+
if (ownTempDir) {
|
|
404
|
+
await rm(recVideoDir, { recursive: true, force: true }).catch(() => {});
|
|
405
|
+
}
|
|
406
|
+
|
|
342
407
|
if (cleanupErrors.length > 0) {
|
|
343
408
|
if (primaryError) {
|
|
344
409
|
primaryError.cleanupErrors = cleanupErrors;
|
|
@@ -21,6 +21,61 @@ function normalizeRange(value) {
|
|
|
21
21
|
return [low, high];
|
|
22
22
|
}
|
|
23
23
|
|
|
24
|
+
// The recorder executes exactly these visual actions. There is no "scroll a bit"
|
|
25
|
+
// blind-scroll action: every scroll phase must say where it lands.
|
|
26
|
+
export const SUPPORTED_PHASE_ACTIONS = Object.freeze([
|
|
27
|
+
'hold',
|
|
28
|
+
'smooth_scroll',
|
|
29
|
+
'fast_scroll',
|
|
30
|
+
'linear_scroll_during',
|
|
31
|
+
'scroll_to_dwell',
|
|
32
|
+
'scroll_back',
|
|
33
|
+
'cursor_focus',
|
|
34
|
+
]);
|
|
35
|
+
|
|
36
|
+
// Common spellings authors reach for, mapped onto the canonical action above.
|
|
37
|
+
// Note: scroll_down / scroll_up are intentionally NOT aliased — there is no blind
|
|
38
|
+
// scroll; an unrecognised action raises phase_action_unsupported so the plan gets
|
|
39
|
+
// fixed rather than silently degraded.
|
|
40
|
+
const PHASE_ACTION_ALIASES = new Map([
|
|
41
|
+
['scroll_to', 'scroll_to_dwell'],
|
|
42
|
+
['scrollto', 'scroll_to_dwell'],
|
|
43
|
+
['scroll', 'scroll_to_dwell'],
|
|
44
|
+
['scroll_to_region', 'scroll_to_dwell'],
|
|
45
|
+
['scroll_to_y', 'scroll_to_dwell'],
|
|
46
|
+
['dwell', 'scroll_to_dwell'],
|
|
47
|
+
['focus_hold', 'scroll_to_dwell'],
|
|
48
|
+
['pan', 'linear_scroll_during'],
|
|
49
|
+
['narrated_pan', 'linear_scroll_during'],
|
|
50
|
+
['linear_scroll', 'linear_scroll_during'],
|
|
51
|
+
['scroll_during', 'linear_scroll_during'],
|
|
52
|
+
['scroll_while_narrating', 'linear_scroll_during'],
|
|
53
|
+
['return', 'scroll_back'],
|
|
54
|
+
['return_anchor', 'scroll_back'],
|
|
55
|
+
['back', 'scroll_back'],
|
|
56
|
+
['scroll_to_top', 'scroll_back'],
|
|
57
|
+
['wait', 'hold'],
|
|
58
|
+
['pause', 'hold'],
|
|
59
|
+
['stay', 'hold'],
|
|
60
|
+
['focus', 'cursor_focus'],
|
|
61
|
+
['highlight', 'cursor_focus'],
|
|
62
|
+
]);
|
|
63
|
+
|
|
64
|
+
/**
 * Map an author-supplied action name onto a canonical recorder action.
 *
 * Lookup is case-insensitive. An exact canonical name or known alias wins
 * first; otherwise camelCase, hyphenated and space-separated spellings
 * (e.g. `smoothScroll`, `smooth-scroll`, `smooth scroll`) are retried in
 * snake_case form.
 *
 * @param {unknown} rawValue - Action name as written in the plan.
 * @returns {string} The canonical action, `''` for an empty value, or the
 *   lower-cased input unchanged when nothing matches (the caller reports it
 *   as unsupported).
 */
function normalizeActionName(rawValue) {
  const text = normalizeText(rawValue);
  const name = text.toLowerCase();
  if (!name) return '';
  if (SUPPORTED_PHASE_ACTIONS.includes(name)) return name;

  const directAlias = PHASE_ACTION_ALIASES.get(name);
  if (directAlias) return directAlias;

  // Second chance: rewrite camelCase / hyphen / whitespace separators as
  // underscores and look the result up again.
  const snake = text
    .replace(/([a-z0-9])([A-Z])/g, '$1_$2')
    .toLowerCase()
    .replace(/[\s-]+/g, '_');
  if (SUPPORTED_PHASE_ACTIONS.includes(snake)) return snake;

  // Unknown names are returned as typed (lower-cased) so error messages still
  // show what the author wrote.
  return PHASE_ACTION_ALIASES.get(snake) || name;
}
|
|
70
|
+
|
|
71
|
+
// `visual_action` may be a string (the action name) or an object ({type, target_y, ...}).
|
|
72
|
+
/**
 * Return a section's `visual_action` in object form.
 *
 * `visual_action` may be written as a plain object (`{ type, target_y, ... }`)
 * or as a bare string naming the action.
 *
 * @param {object} [section] - Plan section or phase; null/undefined allowed.
 * @returns {object} The original object when one was given, `{ type }` for a
 *   non-blank string, otherwise a fresh empty object. Arrays are not a valid
 *   action shape and also yield `{}`.
 */
function visualActionObject(section = {}) {
  const va = section?.visual_action;
  if (typeof va === 'string') {
    const type = va.trim();
    return type ? { type } : {};
  }
  // `typeof [] === 'object'`, so arrays must be excluded explicitly.
  if (va !== null && typeof va === 'object' && !Array.isArray(va)) return va;
  return {};
}
|
|
78
|
+
|
|
24
79
|
function inferActionFromCameraMotion(phase = {}) {
|
|
25
80
|
const motion = normalizeText(phase.camera_motion ?? phase.cameraMotion).toLowerCase();
|
|
26
81
|
if (motion === 'narrated_pan') return 'linear_scroll_during';
|
|
@@ -30,19 +85,32 @@ function inferActionFromCameraMotion(phase = {}) {
|
|
|
30
85
|
return '';
|
|
31
86
|
}
|
|
32
87
|
|
|
88
|
+
/**
 * Return the first candidate that is a usable number, rounded to an integer.
 *
 * Only numbers and non-blank numeric strings count. Blank strings, booleans,
 * arrays and other objects are skipped rather than coerced: `Number('')` and
 * `Number([])` are 0 and `Number(true)` is 1, which would otherwise turn an
 * empty field into a real scroll position and hide a later valid candidate.
 *
 * @param {...unknown} values - Candidates in priority order.
 * @returns {number|null} The rounded first finite value, or `null` if none.
 */
function pickFirstNumber(...values) {
  for (const value of values) {
    const kind = typeof value;
    if (kind !== 'number' && kind !== 'string') continue;
    if (kind === 'string' && value.trim() === '') continue;
    const parsed = Number(value);
    if (Number.isFinite(parsed)) return Math.round(parsed);
  }
  return null;
}
|
|
96
|
+
|
|
33
97
|
function normalizeSectionAsPhase(section = {}, index = 0) {
|
|
34
|
-
const phaseId = normalizeText(section.id ?? section.phase_id) || `phase_${index + 1}`;
|
|
35
|
-
const visualAction = section
|
|
36
|
-
? section.visual_action
|
|
37
|
-
: {};
|
|
98
|
+
const phaseId = normalizeText(section.id ?? section.phase_id ?? section.name) || `phase_${index + 1}`;
|
|
99
|
+
const visualAction = visualActionObject(section);
|
|
38
100
|
const focusRegion = normalizeRange(
|
|
39
101
|
section.focus_region
|
|
40
102
|
?? section.focusRegion
|
|
41
103
|
?? visualAction.focus_region
|
|
42
104
|
?? visualAction.focusRegion
|
|
43
105
|
);
|
|
44
|
-
const explicitAction =
|
|
45
|
-
const
|
|
106
|
+
const explicitAction = normalizeActionName(section.action ?? visualAction.type ?? visualAction.action);
|
|
107
|
+
const inferred = explicitAction || inferActionFromCameraMotion(section);
|
|
108
|
+
const targetY = pickFirstNumber(
|
|
109
|
+
section.target_y, section.to_y, section.y, section.scroll_y,
|
|
110
|
+
visualAction.target_y, visualAction.to_y, visualAction.y, visualAction.scroll_y,
|
|
111
|
+
);
|
|
112
|
+
const hasTarget = focusRegion != null || targetY != null;
|
|
113
|
+
const action = inferred || (hasTarget ? 'scroll_to_dwell' : 'hold');
|
|
46
114
|
|
|
47
115
|
return {
|
|
48
116
|
...section,
|
|
@@ -51,21 +119,42 @@ function normalizeSectionAsPhase(section = {}, index = 0) {
|
|
|
51
119
|
action,
|
|
52
120
|
focus_region: focusRegion ?? null,
|
|
53
121
|
visual_action: visualAction,
|
|
54
|
-
target_y:
|
|
55
|
-
from_y: section.from_y
|
|
56
|
-
to_y: section.to_y
|
|
57
|
-
transition_ms: section.transition_ms ?? visualAction.transition_ms ?? null,
|
|
58
|
-
duration_ms: section.duration_ms ?? section.dwell_ms ??
|
|
122
|
+
target_y: targetY,
|
|
123
|
+
from_y: pickFirstNumber(section.from_y, visualAction.from_y),
|
|
124
|
+
to_y: pickFirstNumber(section.to_y, visualAction.to_y, section.y, visualAction.y),
|
|
125
|
+
transition_ms: section.transition_ms ?? visualAction.transition_ms ?? visualAction.duration_ms ?? null,
|
|
126
|
+
duration_ms: section.duration_ms ?? section.dwell_ms ?? section.audio_duration_ms
|
|
127
|
+
?? (section.presentation && Number.isFinite(Number(section.presentation.duration))
|
|
128
|
+
? Math.round(Number(section.presentation.duration) * 1000)
|
|
129
|
+
: null),
|
|
59
130
|
};
|
|
60
131
|
}
|
|
61
132
|
|
|
62
133
|
export function normalizePlanPhases(plan = {}) {
|
|
63
|
-
const
|
|
64
|
-
if (
|
|
134
|
+
const topLevelPhases = Array.isArray(plan?.phases) ? plan.phases : [];
|
|
135
|
+
if (topLevelPhases.length > 0) {
|
|
136
|
+
return topLevelPhases.map((phase, index) => normalizeSectionAsPhase(phase, index));
|
|
137
|
+
}
|
|
65
138
|
|
|
66
139
|
const sections = Array.isArray(plan?.sections) ? plan.sections : [];
|
|
67
140
|
if (sections.length > 0) {
|
|
68
|
-
|
|
141
|
+
const flattened = [];
|
|
142
|
+
sections.forEach((section, sectionIndex) => {
|
|
143
|
+
const nested = Array.isArray(section?.phases) ? section.phases : null;
|
|
144
|
+
if (nested && nested.length > 0) {
|
|
145
|
+
const prefix = normalizeText(section.id ?? section.phase_id ?? section.name) || `s${sectionIndex + 1}`;
|
|
146
|
+
nested.forEach((subPhase, subIndex) => {
|
|
147
|
+
const merged = {
|
|
148
|
+
...subPhase,
|
|
149
|
+
id: subPhase.id ?? subPhase.phase_id ?? subPhase.name ?? `${prefix}_${subIndex + 1}`,
|
|
150
|
+
};
|
|
151
|
+
flattened.push(normalizeSectionAsPhase(merged, flattened.length));
|
|
152
|
+
});
|
|
153
|
+
} else {
|
|
154
|
+
flattened.push(normalizeSectionAsPhase(section, flattened.length));
|
|
155
|
+
}
|
|
156
|
+
});
|
|
157
|
+
return flattened;
|
|
69
158
|
}
|
|
70
159
|
|
|
71
160
|
const error = new Error('plan_phases_required');
|
|
@@ -74,13 +163,15 @@ export function normalizePlanPhases(plan = {}) {
|
|
|
74
163
|
}
|
|
75
164
|
|
|
76
165
|
/**
 * Work out which action a phase runs: the explicitly declared action
 * (`action`, else `visual_action.type`, else `visual_action.action`) after
 * name normalisation, or, when that is empty, whatever the phase's camera
 * motion implies.
 */
function resolvePhaseAction(phase = {}) {
  const declared = phase.action ?? phase.visual_action?.type ?? phase.visual_action?.action;
  return normalizeActionName(declared) || inferActionFromCameraMotion(phase);
}
|
|
81
172
|
|
|
82
173
|
/**
 * Pick a phase identifier: the normalised `id`, `phase_id` or `name` (first
 * one that is not null/undefined), or a positional `phase_<n>` label with
 * `n = index + 1` when that normalises to an empty value.
 */
function resolvePhaseId(phase = {}, index = 0) {
  const declaredId = normalizeText(phase.id ?? phase.phase_id ?? phase.name);
  if (declaredId) return declaredId;
  return `phase_${index + 1}`;
}
|
|
85
176
|
|
|
86
177
|
function nowMs(getNowMs) {
|
|
@@ -94,9 +185,12 @@ function resolveTransitionMs(phase, fallback) {
|
|
|
94
185
|
}
|
|
95
186
|
|
|
96
187
|
function resolveTargetY(phase, fallback = null) {
|
|
97
|
-
const
|
|
98
|
-
|
|
99
|
-
|
|
188
|
+
const explicit = pickFirstNumber(
|
|
189
|
+
phase?.target_y, phase?.to_y, phase?.y, phase?.scroll_y,
|
|
190
|
+
phase?.visual_action?.target_y, phase?.visual_action?.to_y,
|
|
191
|
+
phase?.visual_action?.y, phase?.visual_action?.scroll_y,
|
|
192
|
+
);
|
|
193
|
+
if (explicit != null) return explicit;
|
|
100
194
|
|
|
101
195
|
const focusRegion = normalizeRange(
|
|
102
196
|
phase?.focus_region
|
|
@@ -113,6 +207,20 @@ function resolveTargetY(phase, fallback = null) {
|
|
|
113
207
|
return fallback;
|
|
114
208
|
}
|
|
115
209
|
|
|
210
|
+
/**
 * Resolve the scroll destination for a phase, failing loudly when the phase
 * does not specify one.
 *
 * @param {object} phase - Phase being executed.
 * @param {string} action - Action name, used only in the error message.
 * @returns {number} The resolved target Y.
 * @throws {Error} With `code === 'PHASE_TARGET_Y_REQUIRED'` when no target
 *   can be resolved for the phase.
 */
function requireTargetY(phase, action) {
  const resolvedY = resolveTargetY(phase, null);
  if (resolvedY != null) return resolvedY;

  const message = [
    `phase_target_y_required: phase "${resolvePhaseId(phase)}" uses "${action}" but has no `,
    'target_y / to_y / y or focus_region — every scroll phase must say where it lands ',
    '(there is no blind scroll)',
  ].join('');
  const failure = new Error(message);
  failure.code = 'PHASE_TARGET_Y_REQUIRED';
  throw failure;
}
|
|
223
|
+
|
|
116
224
|
function resolveFromY(phase, fallback = null) {
|
|
117
225
|
const raw = phase?.from_y ?? phase?.visual_action?.from_y;
|
|
118
226
|
const parsed = Number(raw);
|
|
@@ -192,8 +300,8 @@ async function executeHold(page, phase) {
|
|
|
192
300
|
return { anchorY: null };
|
|
193
301
|
}
|
|
194
302
|
|
|
195
|
-
async function executeSmoothScroll(page, phase
|
|
196
|
-
const targetY =
|
|
303
|
+
async function executeSmoothScroll(page, phase) {
|
|
304
|
+
const targetY = requireTargetY(phase, 'smooth_scroll');
|
|
197
305
|
const transitionMs = resolveTransitionMs(phase, 900);
|
|
198
306
|
await animateScroll(page, {
|
|
199
307
|
targetY,
|
|
@@ -205,8 +313,8 @@ async function executeSmoothScroll(page, phase, { fallbackTargetY = null } = {})
|
|
|
205
313
|
return { anchorY: targetY };
|
|
206
314
|
}
|
|
207
315
|
|
|
208
|
-
async function executeFastScroll(page, phase
|
|
209
|
-
const targetY =
|
|
316
|
+
async function executeFastScroll(page, phase) {
|
|
317
|
+
const targetY = requireTargetY(phase, 'fast_scroll');
|
|
210
318
|
const transitionMs = resolveTransitionMs(phase, 420);
|
|
211
319
|
await animateScroll(page, {
|
|
212
320
|
targetY,
|
|
@@ -218,12 +326,9 @@ async function executeFastScroll(page, phase, { fallbackTargetY = null } = {}) {
|
|
|
218
326
|
return { anchorY: targetY };
|
|
219
327
|
}
|
|
220
328
|
|
|
221
|
-
async function executeLinearScrollDuring(page, phase, {
|
|
222
|
-
fallbackFromY = null,
|
|
223
|
-
fallbackTargetY = null,
|
|
224
|
-
} = {}) {
|
|
329
|
+
async function executeLinearScrollDuring(page, phase, { fallbackFromY = null } = {}) {
|
|
225
330
|
const fromY = resolveFromY(phase, fallbackFromY);
|
|
226
|
-
const toY =
|
|
331
|
+
const toY = requireTargetY(phase, 'linear_scroll_during');
|
|
227
332
|
const durationMs = resolveDurationMs(phase, null);
|
|
228
333
|
if (!Number.isFinite(Number(durationMs)) || Number(durationMs) <= 0) {
|
|
229
334
|
const error = new Error('linear_scroll_duration_required');
|
|
@@ -247,8 +352,8 @@ async function executeLinearScrollDuring(page, phase, {
|
|
|
247
352
|
return { anchorY: toY };
|
|
248
353
|
}
|
|
249
354
|
|
|
250
|
-
async function executeScrollToDwell(page, phase
|
|
251
|
-
const targetY =
|
|
355
|
+
async function executeScrollToDwell(page, phase) {
|
|
356
|
+
const targetY = requireTargetY(phase, 'scroll_to_dwell');
|
|
252
357
|
const transitionMs = resolveTransitionMs(phase, 820);
|
|
253
358
|
await animateScroll(page, {
|
|
254
359
|
targetY,
|
|
@@ -286,8 +391,8 @@ async function executeScrollBack(page, phase, { fallbackTargetY = 0 } = {}) {
|
|
|
286
391
|
return { anchorY: targetY };
|
|
287
392
|
}
|
|
288
393
|
|
|
289
|
-
async function executeCursorFocus(page, phase
|
|
290
|
-
const targetY =
|
|
394
|
+
async function executeCursorFocus(page, phase) {
|
|
395
|
+
const targetY = requireTargetY(phase, 'cursor_focus');
|
|
291
396
|
const transitionMs = resolveTransitionMs(phase, 650);
|
|
292
397
|
await animateScroll(page, {
|
|
293
398
|
targetY,
|
|
@@ -313,34 +418,34 @@ async function executePhase(page, phase, {
|
|
|
313
418
|
initialAnchorY = 0,
|
|
314
419
|
} = {}) {
|
|
315
420
|
const action = resolvePhaseAction(phase);
|
|
316
|
-
const
|
|
421
|
+
const fallbackFromY = lastAnchorY ?? initialAnchorY;
|
|
317
422
|
|
|
318
423
|
if (action === 'hold') {
|
|
319
424
|
return executeHold(page, phase);
|
|
320
425
|
}
|
|
321
426
|
if (action === 'smooth_scroll') {
|
|
322
|
-
return executeSmoothScroll(page, phase
|
|
427
|
+
return executeSmoothScroll(page, phase);
|
|
323
428
|
}
|
|
324
429
|
if (action === 'fast_scroll') {
|
|
325
|
-
return executeFastScroll(page, phase
|
|
430
|
+
return executeFastScroll(page, phase);
|
|
326
431
|
}
|
|
327
432
|
if (action === 'linear_scroll_during') {
|
|
328
|
-
return executeLinearScrollDuring(page, phase, {
|
|
329
|
-
fallbackFromY: fallbackY,
|
|
330
|
-
fallbackTargetY: fallbackY,
|
|
331
|
-
});
|
|
433
|
+
return executeLinearScrollDuring(page, phase, { fallbackFromY });
|
|
332
434
|
}
|
|
333
435
|
if (action === 'scroll_to_dwell') {
|
|
334
|
-
return executeScrollToDwell(page, phase
|
|
436
|
+
return executeScrollToDwell(page, phase);
|
|
335
437
|
}
|
|
336
438
|
if (action === 'scroll_back') {
|
|
337
439
|
return executeScrollBack(page, phase, { fallbackTargetY: 0 });
|
|
338
440
|
}
|
|
339
441
|
if (action === 'cursor_focus') {
|
|
340
|
-
return executeCursorFocus(page, phase
|
|
442
|
+
return executeCursorFocus(page, phase);
|
|
341
443
|
}
|
|
342
444
|
|
|
343
|
-
const error = new Error(
|
|
445
|
+
const error = new Error(
|
|
446
|
+
`phase_action_unsupported:${action || 'empty'} — supported actions: ${SUPPORTED_PHASE_ACTIONS.join(', ')}`
|
|
447
|
+
+ ' (there is no blind scroll_down/scroll_up; use scroll_to_dwell with target_y or focus_region)',
|
|
448
|
+
);
|
|
344
449
|
error.code = 'PHASE_ACTION_UNSUPPORTED';
|
|
345
450
|
throw error;
|
|
346
451
|
}
|
|
@@ -59,13 +59,30 @@ export async function runComposeVideoV2Tool({ segments, outro_paths, format, res
|
|
|
59
59
|
);
|
|
60
60
|
}
|
|
61
61
|
}
|
|
62
|
+
const warnings = [];
|
|
62
63
|
// Heuristic warning: a multi-segment image video that reuses one single image
|
|
63
64
|
// will look near-static — usually a sign the source page didn't render and the
|
|
64
65
|
// agent fell back to one blank screenshot.
|
|
65
|
-
let warning = null;
|
|
66
66
|
if (imagePaths.length >= 2 && new Set(imagePaths).size === 1) {
|
|
67
|
-
|
|
68
|
-
|
|
67
|
+
warnings.push(
|
|
68
|
+
`WARNING: all ${imagePaths.length} image segments reuse the same file (${imagePaths[0]}). `
|
|
69
|
+
+ 'The output will be near-static — verify the source page actually rendered before submitting this video.'
|
|
70
|
+
);
|
|
71
|
+
}
|
|
72
|
+
// Warn when narration is present but no subtitle text is — compose_video_v2 burns
|
|
73
|
+
// subtitles only from `subtitle_text` (or its `text` alias); without it the video
|
|
74
|
+
// ships with no captions. Simplest fix: pass plan_video_segments' output verbatim.
|
|
75
|
+
{
|
|
76
|
+
const hasSubText = s => (typeof s?.subtitle_text === 'string' && s.subtitle_text.trim())
|
|
77
|
+
|| (typeof s?.text === 'string' && s.text.trim());
|
|
78
|
+
const narratedNoSub = segments.filter(s =>
|
|
79
|
+
(typeof s?.audio_path === 'string' && s.audio_path.trim()) && !hasSubText(s)).length;
|
|
80
|
+
if (narratedNoSub > 0) {
|
|
81
|
+
warnings.push(
|
|
82
|
+
`WARNING: ${narratedNoSub} segment(s) have narration audio but no subtitle text — the output will have NO subtitles. `
|
|
83
|
+
+ 'If subtitles are wanted, set subtitle_text per segment (or pass the plan_video_segments output array verbatim).'
|
|
84
|
+
);
|
|
85
|
+
}
|
|
69
86
|
}
|
|
70
87
|
|
|
71
88
|
const outDir = workspaceDir
|
|
@@ -89,7 +106,7 @@ export async function runComposeVideoV2Tool({ segments, outro_paths, format, res
|
|
|
89
106
|
`segments=${segments.length}`,
|
|
90
107
|
`outro_clips=${(outro_paths ?? []).length}`,
|
|
91
108
|
];
|
|
92
|
-
|
|
109
|
+
for (const w of warnings) lines.push(w);
|
|
93
110
|
return toolText(lines.join('\n'));
|
|
94
111
|
} catch (error) {
|
|
95
112
|
return toolError(`compose_video_v2 failed: ${error.message}`);
|
|
@@ -87,11 +87,19 @@ export async function runPlanVideoSegmentsTool({ segments, workspace_id, voice_i
|
|
|
87
87
|
presentation = { duration, ...(kind === 'scroll' ? { style: 'scroll' } : {}) };
|
|
88
88
|
}
|
|
89
89
|
|
|
90
|
+
// dwell_ms lets the same segment double as a record_url_narration plan phase
|
|
91
|
+
// (the recorder reads dwell_ms / duration_ms for how long to hold each beat).
|
|
92
|
+
// Prefer the real measured audio length; fall back to the planned visual duration.
|
|
93
|
+
const dwellMs = audioDurationMs > 0
|
|
94
|
+
? audioDurationMs
|
|
95
|
+
: Math.round((presentation.duration ?? presentation.per_card_duration ?? 4) * 1000);
|
|
96
|
+
|
|
90
97
|
const planned_seg = {
|
|
91
98
|
...seg,
|
|
92
99
|
...(audioResult?.audio_path ? { audio_path: audioResult.audio_path } : {}),
|
|
93
100
|
...(text ? { subtitle_text: text } : {}),
|
|
94
101
|
presentation: { ...presentation, ...(seg.presentation ?? {}) },
|
|
102
|
+
dwell_ms: seg.dwell_ms ?? dwellMs,
|
|
95
103
|
};
|
|
96
104
|
if (audioResult?.audio_duration_ms) {
|
|
97
105
|
planned_seg.audio_duration_ms = audioResult.audio_duration_ms;
|