@lightcone-ai/daemon 0.15.71 → 0.15.73

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -56,7 +56,7 @@ export class KuaishouAdapter {
56
56
  await this._clickByText('放弃');
57
57
  await sleep(500);
58
58
  try { await this._cdp.send('Runtime.evaluate', { expression: 'window.scrollTo(0, 300)', returnByValue: false }); } catch {}
59
- await this._waitForSelector('input[type="file"], [class*="upload"], [class*="Upload"]', 45000);
59
+ await this._waitForSelector('input[type="file"], [class*="upload"], [class*="Upload"]', 120000);
60
60
 
61
61
  const { loggedIn } = await this.checkLoginStatus();
62
62
  if (!loggedIn) throw new Error('LOGIN_EXPIRED: 快手登录已过期,请重新扫码连接');
@@ -97,7 +97,7 @@ export class KuaishouAdapter {
97
97
 
98
98
  // Scroll once to trigger any lazy-rendered upload widgets, then wait
99
99
  try { await this._cdp.send('Runtime.evaluate', { expression: 'window.scrollTo(0, 300)', returnByValue: false }); } catch {}
100
- await this._waitForSelector('input[type="file"], [class*="upload"], [class*="Upload"]', 45000);
100
+ await this._waitForSelector('input[type="file"], [class*="upload"], [class*="Upload"]', 120000);
101
101
 
102
102
  const { loggedIn } = await this.checkLoginStatus();
103
103
  if (!loggedIn) throw new Error('LOGIN_EXPIRED: 快手登录已过期,请重新扫码连接');
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@lightcone-ai/daemon",
3
- "version": "0.15.71",
3
+ "version": "0.15.73",
4
4
  "type": "module",
5
5
  "main": "src/index.js",
6
6
  "bin": {
@@ -9,7 +9,12 @@ const DEFAULT_WIDTH = 1080;
9
9
  const DEFAULT_HEIGHT = 1920;
10
10
  const DEFAULT_FPS = 30;
11
11
  const TRANSITION_DURATION = 0.5;
12
- const SUBTITLE_FONT = 'PingFang SC,Microsoft YaHei,Arial';
12
+ // ASS `Fontname` is a single family name, not a CSS-style fallback list — a
13
+ // comma here shifts every subsequent field in the `Style:` line, corrupting the
14
+ // whole style so libass renders nothing (i.e. burned-in subtitles look missing).
15
+ // Use one installed family; libass + fontconfig handle glyph fallback. Override
16
+ // via SUBTITLE_FONT env if the deployment ships a different CJK font.
17
+ const SUBTITLE_FONT = (process.env.SUBTITLE_FONT || 'Noto Sans CJK SC').split(',')[0].trim() || 'Noto Sans CJK SC';
13
18
  const SUBTITLE_FONT_SIZE = 72;
14
19
  const SUBTITLE_MARGIN_V = 120;
15
20
 
@@ -23,6 +28,28 @@ function msToAssTimestamp(ms) {
23
28
  return `${hr}:${String(min).padStart(2, '0')}:${String(sec).padStart(2, '0')}.${String(cs).padStart(2, '0')}`;
24
29
  }
25
30
 
31
/**
 * Split a subtitle block into display-sized sentence units.
 *
 * The text is broken after CJK sentence punctuation (。！？；), after the ASCII
 * equivalents (! ? ;) and after newlines. The ASCII period is deliberately not
 * a break character. Fragments shorter than six characters are merged into
 * their neighbour so that one- or two-character lines never flash on screen:
 * a short fragment absorbs the fragment that follows it, and a short fragment
 * at the very end is folded back into the previous unit.
 *
 * Whitespace inside each returned unit is collapsed to single spaces; the
 * whitespace that separated two merged fragments is kept (as one space) rather
 * than dropped, so "Hi! How are you?" does not become "Hi!How are you?".
 *
 * @param {unknown} text - Subtitle text; null/undefined are treated as empty.
 * @returns {string[]} Sentence units in reading order ([] for blank input).
 */
function splitSubtitleSentences(text) {
  const MIN_UNIT_CHARS = 6;
  // Both full-width and ASCII sentence marks are listed explicitly: CJK
  // narration normally uses the full-width forms.
  const BREAK_AFTER = /(?<=[。！？!?；;\n])/u;
  const tidy = (value) => value.replace(/\s+/g, ' ').trim();
  // Count code points, not UTF-16 units, so astral characters count once.
  const visibleLength = (value) => Array.from(tidy(value)).length;

  const raw = String(text ?? '').trim();
  if (!raw) return [];

  // Units are accumulated un-trimmed so separators survive a merge; they are
  // tidied only once, at the end.
  const units = [];
  for (const segment of raw.split(BREAK_AFTER)) {
    const lastIndex = units.length - 1;
    const isBlank = segment.trim() === '';
    if (lastIndex >= 0 && (isBlank || visibleLength(units[lastIndex]) < MIN_UNIT_CHARS)) {
      units[lastIndex] += segment;
    } else if (!isBlank) {
      units.push(segment);
    }
  }

  // A short trailing fragment has nothing after it to absorb, so fold it back.
  if (units.length > 1 && visibleLength(units[units.length - 1]) < MIN_UNIT_CHARS) {
    const tail = units.pop();
    units[units.length - 1] += tail;
  }

  return units.map(tidy);
}
52
+
26
53
  function wrapSubtitleText(text, maxChars = 14) {
27
54
  const chars = Array.from(String(text ?? ''));
28
55
  if (chars.length <= maxChars) return chars.join('');
@@ -311,15 +338,29 @@ export async function composeVideoV2({
311
338
  }
312
339
  }
313
340
 
314
- // Build subtitle entries with cumulative timeline timestamps
341
+ // Build subtitle entries with cumulative timeline timestamps. When a clip's
342
+ // subtitle text spans several sentences, split it into one event per sentence
343
+ // and spread them across the clip in proportion to their length, so a long
344
+ // beat reads as sequential lines roughly tracking the narration instead of one
345
+ // static wall of text.
315
346
  let cursorMs = 0;
316
347
  const subtitleEntries = [];
317
348
  for (const clip of readyClips) {
318
349
  if (clip.subtitleText) {
319
- subtitleEntries.push({
320
- text: clip.subtitleText,
321
- start_ms: cursorMs,
322
- end_ms: cursorMs + Math.round(clip.duration * 1000),
350
+ const clipMs = Math.round(clip.duration * 1000);
351
+ const sentences = splitSubtitleSentences(clip.subtitleText);
352
+ const totalLen = sentences.reduce((sum, s) => sum + Array.from(s).length, 0) || 1;
353
+ let offsetMs = 0;
354
+ sentences.forEach((sentence, idx) => {
355
+ const share = Array.from(sentence).length / totalLen;
356
+ const isLast = idx === sentences.length - 1;
357
+ const spanMs = isLast ? clipMs - offsetMs : Math.max(1, Math.round(clipMs * share));
358
+ subtitleEntries.push({
359
+ text: sentence,
360
+ start_ms: cursorMs + offsetMs,
361
+ end_ms: cursorMs + offsetMs + spanMs,
362
+ });
363
+ offsetMs += spanMs;
323
364
  });
324
365
  }
325
366
  cursorMs += Math.round(clip.duration * 1000);
@@ -1,17 +1,12 @@
1
1
  import { spawn } from 'node:child_process';
2
2
  import { mkdirSync } from 'node:fs';
3
- import { stat, writeFile } from 'node:fs/promises';
3
+ import { mkdtemp, rm, stat, writeFile } from 'node:fs/promises';
4
+ import os from 'node:os';
4
5
  import path from 'node:path';
5
6
 
6
7
  import { launchChromiumMobile, openPageAndSettle } from './chromium-driver.js';
7
8
  import { defaultDisplayPool } from './display-pool.js';
8
- import {
9
- createUnexpectedExitWatcher,
10
- startFfmpegCapture,
11
- stopFfmpegCapture,
12
- waitForProcessExit,
13
- } from './ffmpeg-runner.js';
14
- import { estimatePlanDurationMs } from './plan-estimator.js';
9
+ import { createUnexpectedExitWatcher, waitForProcessExit } from './ffmpeg-runner.js';
15
10
  import { executePlanPhases, normalizePlanPhases } from './plan-executor.js';
16
11
 
17
12
  const DEFAULT_VIEWPORT = Object.freeze({ width: 1080, height: 1920 });
@@ -177,6 +172,49 @@ function scalePhaseY(phase, zoom) {
177
172
  };
178
173
  }
179
174
 
175
/**
 * Re-encode the page recording (webm, page content only — no browser chrome)
 * into the mp4 the rest of the pipeline expects, dropping the head segment that
 * covers page load + settle so the clip starts at the first plan phase.
 *
 * @param {object} [options]
 * @param {string} options.webmPath - Source recording.
 * @param {string} options.outputPath - Destination mp4 (overwritten, `-y`).
 * @param {number} [options.startMs=0] - Head to trim, in milliseconds; values
 *   that are not positive numbers disable trimming.
 * @param {number} [options.fps=DEFAULT_FPS] - Output frame rate; `-r` is
 *   omitted when this is not a positive finite number.
 * @param {string} [options.ffmpegBin='ffmpeg'] - ffmpeg executable to spawn.
 * @returns {Promise<void>} Resolves when ffmpeg exits with code 0.
 * @throws {Error} code `FFMPEG_SPAWN_FAILED` when the process cannot be
 *   started, `FFMPEG_TRANSCODE_FAILED` on a non-zero exit (the message carries
 *   the last 2000 characters of stderr).
 */
async function transcodeWebmToMp4({
  webmPath,
  outputPath,
  startMs = 0,
  fps = DEFAULT_FPS,
  ffmpegBin = 'ffmpeg',
} = {}) {
  // Only the tail of stderr is ever reported, so only the tail is retained.
  // 8 KiB is enough to hold 2000 characters even when they are multi-byte.
  const STDERR_TAIL_BYTES = 8192;

  const seekSec = Math.max(0, Number(startMs) || 0) / 1000;
  const frameRate = Number(fps);
  const args = [
    '-y',
    // `-ss` ahead of `-i` applies the seek to the input.
    ...(seekSec > 0 ? ['-ss', seekSec.toFixed(3)] : []),
    '-i', webmPath,
    '-an',
    '-c:v', 'libx264',
    '-preset', 'veryfast',
    '-pix_fmt', 'yuv420p',
    ...(Number.isFinite(frameRate) && frameRate > 0 ? ['-r', String(fps)] : []),
    '-movflags', '+faststart',
    outputPath,
  ];

  await new Promise((resolve, reject) => {
    // stdout is never read, so it must not be a pipe: an unread pipe can fill
    // up and stall the child.
    const proc = spawn(ffmpegBin, args, { stdio: ['ignore', 'ignore', 'pipe'] });

    let stderrTail = Buffer.alloc(0);
    proc.stderr?.on('data', (chunk) => {
      const joined = Buffer.concat([stderrTail, chunk]);
      stderrTail = joined.length > STDERR_TAIL_BYTES
        ? joined.subarray(joined.length - STDERR_TAIL_BYTES)
        : joined;
    });

    // `error` and `close` can both fire for the same child; settle only once.
    let settled = false;
    const fail = (message, code) => {
      if (settled) return;
      settled = true;
      const wrapped = new Error(message);
      wrapped.code = code;
      reject(wrapped);
    };

    proc.once('error', (err) => {
      fail(`ffmpeg_spawn_failed:${err.message}`, 'FFMPEG_SPAWN_FAILED');
    });
    proc.on('close', (code) => {
      if (code === 0) {
        if (!settled) {
          settled = true;
          resolve();
        }
        return;
      }
      fail(
        `ffmpeg_transcode_failed:code=${code}: ${stderrTail.toString().slice(-2000)}`,
        'FFMPEG_TRANSCODE_FAILED'
      );
    });
  });
}
217
+
180
218
  export async function recordUrlNarration({
181
219
  plan,
182
220
  output_path,
@@ -189,11 +227,14 @@ export async function recordUrlNarration({
189
227
  settle_ms = 4000,
190
228
  page_zoom = 1.1,
191
229
  displayPool = defaultDisplayPool,
192
- ffmpegDurationBufferSec = 8,
193
230
  startupProbeMs = 1200,
194
- ffmpegStopTimeoutMs = 10000,
195
231
  xvfbStopTimeoutMs = 5000,
196
232
  postPlanTailMs = 600,
233
+ recordingDir = null,
234
+ launchChromiumFn = launchChromiumMobile,
235
+ openPageFn = openPageAndSettle,
236
+ transcodeFn = transcodeWebmToMp4,
237
+ nowMs = () => Date.now(),
197
238
  } = {}) {
198
239
  const zoom = Number.isFinite(Number(page_zoom)) && Number(page_zoom) > 0 ? Number(page_zoom) : 1.1;
199
240
  const rawPhases = normalizePlanPhases(plan);
@@ -212,14 +253,14 @@ export async function recordUrlNarration({
212
253
  mkdirSync(path.dirname(resolvedOutputPath), { recursive: true });
213
254
  mkdirSync(path.dirname(resolvedEventsPath), { recursive: true });
214
255
 
256
+ const ownTempDir = !recordingDir;
257
+ const recVideoDir = recordingDir || await mkdtemp(path.join(os.tmpdir(), 'lc-recvid-'));
258
+
215
259
  let displayLease;
216
260
  let xvfb;
217
- let ffmpeg;
218
- let browserSession;
219
261
  let xvfbWatcher;
220
- let ffmpegWatcher;
262
+ let browserSession = null;
221
263
  let primaryError = null;
222
-
223
264
  const cleanupErrors = [];
224
265
 
225
266
  try {
@@ -234,11 +275,26 @@ export async function recordUrlNarration({
234
275
  });
235
276
  xvfbWatcher = createUnexpectedExitWatcher(xvfb.child, 'xvfb');
236
277
 
237
- browserSession = await launchChromiumMobile({
278
+ // The page recording captures the page viewport only (no browser chrome),
279
+ // regardless of the on-screen window. recordVideo starts when the page is
280
+ // created, so the webm includes goto + settle; we measure that head and trim
281
+ // it off in transcodeFn.
282
+ const recordStartedAt = nowMs();
283
+ browserSession = await launchChromiumFn({
238
284
  display,
239
285
  viewport: normalizedViewport,
286
+ contextOptions: {
287
+ recordVideo: {
288
+ dir: recVideoDir,
289
+ size: { width: normalizedViewport.width, height: normalizedViewport.height },
290
+ },
291
+ },
240
292
  });
241
- await openPageAndSettle(browserSession.page, {
293
+ const videoHandle = typeof browserSession.page.video === 'function'
294
+ ? browserSession.page.video()
295
+ : null;
296
+
297
+ await openPageFn(browserSession.page, {
242
298
  url: resolvedUrl,
243
299
  settleMs: settle_ms,
244
300
  });
@@ -250,42 +306,53 @@ export async function recordUrlNarration({
250
306
  await browserSession.page.waitForTimeout(300);
251
307
  }
252
308
 
253
- const estimatedDurationMs = estimatePlanDurationMs(executablePlan);
254
- const estimatedDurationSec = Math.max(
255
- 5,
256
- Math.ceil(estimatedDurationMs / 1000) + Math.max(0, Number(ffmpegDurationBufferSec) || 0)
257
- );
258
-
259
- ffmpeg = await startFfmpegCapture({
260
- display,
261
- outputPath: resolvedOutputPath,
262
- width: normalizedViewport.width,
263
- height: normalizedViewport.height,
264
- fps: normalizedFps,
265
- durationSec: estimatedDurationSec,
266
- startupProbeMs,
267
- });
268
- ffmpegWatcher = createUnexpectedExitWatcher(ffmpeg.child, 'ffmpeg');
269
-
270
309
  await scrollToTop(browserSession.page);
271
310
  await browserSession.page.waitForTimeout(350);
272
311
 
312
+ const headTrimMs = Math.max(0, nowMs() - recordStartedAt);
313
+
273
314
  const eventsLog = await Promise.race([
274
315
  executePlanPhases(browserSession.page, executablePlan),
275
316
  xvfbWatcher.promise,
276
- ffmpegWatcher.promise,
277
317
  ]);
278
318
 
279
319
  await browserSession.page.waitForTimeout(Math.max(0, Number(postPlanTailMs) || 0));
280
320
 
281
- ffmpegWatcher.deactivate();
282
- await stopFfmpegCapture(ffmpeg, {
283
- timeoutMs: ffmpegStopTimeoutMs,
284
- });
285
-
286
321
  xvfbWatcher.deactivate();
287
322
 
288
- await writeFile(resolvedEventsPath, JSON.stringify(eventsLog, null, 2), 'utf8');
323
+ // Flush the recording: video is written when the context closes.
324
+ let webmPath = null;
325
+ try {
326
+ await browserSession.context.close();
327
+ } catch (closeError) {
328
+ cleanupErrors.push(`context_close_failed:${closeError.message}`);
329
+ }
330
+ if (videoHandle) {
331
+ try {
332
+ webmPath = await videoHandle.path();
333
+ } catch (pathError) {
334
+ cleanupErrors.push(`video_path_failed:${pathError.message}`);
335
+ }
336
+ }
337
+ try {
338
+ await browserSession.browser.close();
339
+ } catch (closeError) {
340
+ cleanupErrors.push(`browser_close_failed:${closeError.message}`);
341
+ }
342
+ browserSession = null;
343
+
344
+ if (!webmPath) {
345
+ const error = new Error('record_video_not_produced');
346
+ error.code = 'RECORD_VIDEO_NOT_PRODUCED';
347
+ throw error;
348
+ }
349
+
350
+ await transcodeFn({
351
+ webmPath,
352
+ outputPath: resolvedOutputPath,
353
+ startMs: headTrimMs,
354
+ fps: normalizedFps,
355
+ });
289
356
 
290
357
  const videoStat = await stat(resolvedOutputPath);
291
358
  if (!videoStat.isFile() || videoStat.size <= 0) {
@@ -294,42 +361,36 @@ export async function recordUrlNarration({
294
361
  throw error;
295
362
  }
296
363
 
364
+ await writeFile(resolvedEventsPath, JSON.stringify(eventsLog, null, 2), 'utf8');
365
+
366
+ const lastTms = Array.isArray(eventsLog)
367
+ ? eventsLog.reduce((max, ev) => Math.max(max, Number(ev?.t_ms) || 0), 0)
368
+ : 0;
369
+
297
370
  return {
298
371
  video_path: resolvedOutputPath,
299
372
  events_path: resolvedEventsPath,
300
373
  events_log: eventsLog,
374
+ duration_ms: lastTms > 0 ? lastTms : null,
301
375
  display,
302
376
  };
303
377
  } catch (error) {
304
378
  primaryError = error;
305
379
  throw error;
306
380
  } finally {
307
- ffmpegWatcher?.deactivate();
308
381
  xvfbWatcher?.deactivate();
309
382
 
310
383
  if (browserSession) {
311
384
  try {
312
- await browserSession.close();
385
+ await browserSession.browser.close();
313
386
  } catch (closeError) {
314
387
  cleanupErrors.push(`browser_close_failed:${closeError.message}`);
315
388
  }
316
389
  }
317
390
 
318
- if (ffmpeg) {
319
- try {
320
- await stopFfmpegCapture(ffmpeg, {
321
- timeoutMs: ffmpegStopTimeoutMs,
322
- });
323
- } catch (stopError) {
324
- cleanupErrors.push(`ffmpeg_stop_failed:${stopError.message}`);
325
- }
326
- }
327
-
328
391
  if (xvfb) {
329
392
  try {
330
- await stopXvfb(xvfb, {
331
- timeoutMs: xvfbStopTimeoutMs,
332
- });
393
+ await stopXvfb(xvfb, { timeoutMs: xvfbStopTimeoutMs });
333
394
  } catch (stopError) {
334
395
  cleanupErrors.push(`xvfb_stop_failed:${stopError.message}`);
335
396
  }
@@ -339,6 +400,10 @@ export async function recordUrlNarration({
339
400
  displayLease.release();
340
401
  }
341
402
 
403
+ if (ownTempDir) {
404
+ await rm(recVideoDir, { recursive: true, force: true }).catch(() => {});
405
+ }
406
+
342
407
  if (cleanupErrors.length > 0) {
343
408
  if (primaryError) {
344
409
  primaryError.cleanupErrors = cleanupErrors;
@@ -21,6 +21,61 @@ function normalizeRange(value) {
21
21
  return [low, high];
22
22
  }
23
23
 
24
+ // The recorder executes exactly these visual actions. There is no "scroll a bit"
25
+ // blind-scroll action: every scroll phase must say where it lands.
26
+ export const SUPPORTED_PHASE_ACTIONS = Object.freeze([
27
+ 'hold',
28
+ 'smooth_scroll',
29
+ 'fast_scroll',
30
+ 'linear_scroll_during',
31
+ 'scroll_to_dwell',
32
+ 'scroll_back',
33
+ 'cursor_focus',
34
+ ]);
35
+
36
+ // Common spellings authors reach for, mapped onto the canonical action above.
37
+ // Note: scroll_down / scroll_up are intentionally NOT aliased — there is no blind
38
+ // scroll; an unrecognised action raises phase_action_unsupported so the plan gets
39
+ // fixed rather than silently degraded.
40
+ const PHASE_ACTION_ALIASES = new Map([
41
+ ['scroll_to', 'scroll_to_dwell'],
42
+ ['scrollto', 'scroll_to_dwell'],
43
+ ['scroll', 'scroll_to_dwell'],
44
+ ['scroll_to_region', 'scroll_to_dwell'],
45
+ ['scroll_to_y', 'scroll_to_dwell'],
46
+ ['dwell', 'scroll_to_dwell'],
47
+ ['focus_hold', 'scroll_to_dwell'],
48
+ ['pan', 'linear_scroll_during'],
49
+ ['narrated_pan', 'linear_scroll_during'],
50
+ ['linear_scroll', 'linear_scroll_during'],
51
+ ['scroll_during', 'linear_scroll_during'],
52
+ ['scroll_while_narrating', 'linear_scroll_during'],
53
+ ['return', 'scroll_back'],
54
+ ['return_anchor', 'scroll_back'],
55
+ ['back', 'scroll_back'],
56
+ ['scroll_to_top', 'scroll_back'],
57
+ ['wait', 'hold'],
58
+ ['pause', 'hold'],
59
+ ['stay', 'hold'],
60
+ ['focus', 'cursor_focus'],
61
+ ['highlight', 'cursor_focus'],
62
+ ]);
63
+
64
/**
 * Map an author-supplied action name onto the recorder's canonical vocabulary.
 *
 * The raw value is passed through `normalizeText` and lower-cased. A name that
 * is already in SUPPORTED_PHASE_ACTIONS is returned as is; a known alias is
 * replaced by its canonical action; anything else is returned unchanged
 * (lower-cased) so the caller can report it.
 *
 * @param {unknown} rawValue - Action name as written in the plan.
 * @returns {string} Canonical action, the unrecognised lower-cased name, or ''
 *   when the normalised value is empty.
 */
function normalizeActionName(rawValue) {
  const lowered = normalizeText(rawValue).toLowerCase();
  if (lowered === '') {
    return '';
  }

  const isCanonical = SUPPORTED_PHASE_ACTIONS.includes(lowered);
  if (isCanonical) {
    return lowered;
  }

  const aliasTarget = PHASE_ACTION_ALIASES.get(lowered);
  return aliasTarget ? aliasTarget : lowered;
}
70
+
71
/**
 * Return a section's `visual_action` as an object.
 *
 * `visual_action` may be a string (the action name) or an object
 * ({type, target_y, ...}). A non-blank string becomes `{ type }` with the
 * string trimmed; a plain object is returned as is (same reference). Anything
 * else — missing, blank, or an array, which is not a valid action record —
 * yields an empty object so callers can read properties unconditionally.
 *
 * @param {object|null} [section={}] - Plan section/phase.
 * @returns {object} The visual-action record, possibly empty.
 */
function visualActionObject(section = {}) {
  const va = section?.visual_action;
  // Arrays satisfy `typeof === 'object'` but carry no named fields; treat them
  // like any other unusable value.
  if (va && typeof va === 'object' && !Array.isArray(va)) return va;
  if (typeof va === 'string' && va.trim()) return { type: va.trim() };
  return {};
}
78
+
24
79
  function inferActionFromCameraMotion(phase = {}) {
25
80
  const motion = normalizeText(phase.camera_motion ?? phase.cameraMotion).toLowerCase();
26
81
  if (motion === 'narrated_pan') return 'linear_scroll_during';
@@ -30,19 +85,32 @@ function inferActionFromCameraMotion(phase = {}) {
30
85
  return '';
31
86
  }
32
87
 
88
/**
 * Return the first candidate that is a usable number, rounded to an integer.
 *
 * Only finite numbers and non-blank numeric strings count. Everything else is
 * skipped, in particular values that `Number()` would silently coerce:
 * '' / ' ' / false / [] (all 0) and true (1). Without that guard a blank
 * `target_y` would read as "scroll to 0" and hide a later valid candidate.
 *
 * @param {...unknown} values - Candidates in priority order.
 * @returns {number|null} Rounded first usable value, or null when none is.
 */
function pickFirstNumber(...values) {
  for (const value of values) {
    let parsed;
    if (typeof value === 'number') {
      parsed = value;
    } else if (typeof value === 'string' && value.trim() !== '') {
      parsed = Number(value);
    } else {
      continue;
    }
    if (Number.isFinite(parsed)) return Math.round(parsed);
  }
  return null;
}
96
+
33
97
  function normalizeSectionAsPhase(section = {}, index = 0) {
34
- const phaseId = normalizeText(section.id ?? section.phase_id) || `phase_${index + 1}`;
35
- const visualAction = section.visual_action && typeof section.visual_action === 'object'
36
- ? section.visual_action
37
- : {};
98
+ const phaseId = normalizeText(section.id ?? section.phase_id ?? section.name) || `phase_${index + 1}`;
99
+ const visualAction = visualActionObject(section);
38
100
  const focusRegion = normalizeRange(
39
101
  section.focus_region
40
102
  ?? section.focusRegion
41
103
  ?? visualAction.focus_region
42
104
  ?? visualAction.focusRegion
43
105
  );
44
- const explicitAction = normalizeText(section.action ?? visualAction.type).toLowerCase();
45
- const action = explicitAction || inferActionFromCameraMotion(section) || 'scroll_to_dwell';
106
+ const explicitAction = normalizeActionName(section.action ?? visualAction.type ?? visualAction.action);
107
+ const inferred = explicitAction || inferActionFromCameraMotion(section);
108
+ const targetY = pickFirstNumber(
109
+ section.target_y, section.to_y, section.y, section.scroll_y,
110
+ visualAction.target_y, visualAction.to_y, visualAction.y, visualAction.scroll_y,
111
+ );
112
+ const hasTarget = focusRegion != null || targetY != null;
113
+ const action = inferred || (hasTarget ? 'scroll_to_dwell' : 'hold');
46
114
 
47
115
  return {
48
116
  ...section,
@@ -51,21 +119,42 @@ function normalizeSectionAsPhase(section = {}, index = 0) {
51
119
  action,
52
120
  focus_region: focusRegion ?? null,
53
121
  visual_action: visualAction,
54
- target_y: section.target_y ?? visualAction.target_y ?? visualAction.to_y ?? null,
55
- from_y: section.from_y ?? visualAction.from_y ?? null,
56
- to_y: section.to_y ?? visualAction.to_y ?? null,
57
- transition_ms: section.transition_ms ?? visualAction.transition_ms ?? null,
58
- duration_ms: section.duration_ms ?? section.dwell_ms ?? null,
122
+ target_y: targetY,
123
+ from_y: pickFirstNumber(section.from_y, visualAction.from_y),
124
+ to_y: pickFirstNumber(section.to_y, visualAction.to_y, section.y, visualAction.y),
125
+ transition_ms: section.transition_ms ?? visualAction.transition_ms ?? visualAction.duration_ms ?? null,
126
+ duration_ms: section.duration_ms ?? section.dwell_ms ?? section.audio_duration_ms
127
+ ?? (section.presentation && Number.isFinite(Number(section.presentation.duration))
128
+ ? Math.round(Number(section.presentation.duration) * 1000)
129
+ : null),
59
130
  };
60
131
  }
61
132
 
62
133
  export function normalizePlanPhases(plan = {}) {
63
- const phases = Array.isArray(plan?.phases) ? plan.phases : [];
64
- if (phases.length > 0) return phases;
134
+ const topLevelPhases = Array.isArray(plan?.phases) ? plan.phases : [];
135
+ if (topLevelPhases.length > 0) {
136
+ return topLevelPhases.map((phase, index) => normalizeSectionAsPhase(phase, index));
137
+ }
65
138
 
66
139
  const sections = Array.isArray(plan?.sections) ? plan.sections : [];
67
140
  if (sections.length > 0) {
68
- return sections.map((section, index) => normalizeSectionAsPhase(section, index));
141
+ const flattened = [];
142
+ sections.forEach((section, sectionIndex) => {
143
+ const nested = Array.isArray(section?.phases) ? section.phases : null;
144
+ if (nested && nested.length > 0) {
145
+ const prefix = normalizeText(section.id ?? section.phase_id ?? section.name) || `s${sectionIndex + 1}`;
146
+ nested.forEach((subPhase, subIndex) => {
147
+ const merged = {
148
+ ...subPhase,
149
+ id: subPhase.id ?? subPhase.phase_id ?? subPhase.name ?? `${prefix}_${subIndex + 1}`,
150
+ };
151
+ flattened.push(normalizeSectionAsPhase(merged, flattened.length));
152
+ });
153
+ } else {
154
+ flattened.push(normalizeSectionAsPhase(section, flattened.length));
155
+ }
156
+ });
157
+ return flattened;
69
158
  }
70
159
 
71
160
  const error = new Error('plan_phases_required');
@@ -74,13 +163,15 @@ export function normalizePlanPhases(plan = {}) {
74
163
  }
75
164
 
76
165
  function resolvePhaseAction(phase = {}) {
77
- const explicit = normalizeText(phase.action ?? phase.visual_action?.type).toLowerCase();
166
+ const explicit = normalizeActionName(
167
+ phase.action ?? phase.visual_action?.type ?? phase.visual_action?.action
168
+ );
78
169
  if (explicit) return explicit;
79
170
  return inferActionFromCameraMotion(phase);
80
171
  }
81
172
 
82
173
  function resolvePhaseId(phase = {}, index = 0) {
83
- return normalizeText(phase.id ?? phase.phase_id) || `phase_${index + 1}`;
174
+ return normalizeText(phase.id ?? phase.phase_id ?? phase.name) || `phase_${index + 1}`;
84
175
  }
85
176
 
86
177
  function nowMs(getNowMs) {
@@ -94,9 +185,12 @@ function resolveTransitionMs(phase, fallback) {
94
185
  }
95
186
 
96
187
  function resolveTargetY(phase, fallback = null) {
97
- const raw = phase?.target_y ?? phase?.to_y ?? phase?.visual_action?.target_y ?? phase?.visual_action?.to_y;
98
- const parsed = Number(raw);
99
- if (Number.isFinite(parsed)) return Math.round(parsed);
188
+ const explicit = pickFirstNumber(
189
+ phase?.target_y, phase?.to_y, phase?.y, phase?.scroll_y,
190
+ phase?.visual_action?.target_y, phase?.visual_action?.to_y,
191
+ phase?.visual_action?.y, phase?.visual_action?.scroll_y,
192
+ );
193
+ if (explicit != null) return explicit;
100
194
 
101
195
  const focusRegion = normalizeRange(
102
196
  phase?.focus_region
@@ -113,6 +207,20 @@ function resolveTargetY(phase, fallback = null) {
113
207
  return fallback;
114
208
  }
115
209
 
210
/**
 * Resolve where a scroll-type phase lands, refusing to guess.
 *
 * Delegates to `resolveTargetY(phase, null)`; when that returns null/undefined
 * the phase gave no landing position and an error is raised instead of falling
 * back to some default offset.
 *
 * @param {object} phase - Plan phase.
 * @param {string} action - Action name, used only in the error message.
 * @returns {number} The resolved landing Y.
 * @throws {Error} code `PHASE_TARGET_Y_REQUIRED` when no target can be resolved.
 */
function requireTargetY(phase, action) {
  const landingY = resolveTargetY(phase, null);
  if (landingY != null) {
    return landingY;
  }

  const phaseLabel = resolvePhaseId(phase);
  const failure = new Error(
    `phase_target_y_required: phase "${phaseLabel}" uses "${action}" but has no `
      + 'target_y / to_y / y or focus_region — every scroll phase must say where it lands '
      + '(there is no blind scroll)',
  );
  failure.code = 'PHASE_TARGET_Y_REQUIRED';
  throw failure;
}
223
+
116
224
  function resolveFromY(phase, fallback = null) {
117
225
  const raw = phase?.from_y ?? phase?.visual_action?.from_y;
118
226
  const parsed = Number(raw);
@@ -192,8 +300,8 @@ async function executeHold(page, phase) {
192
300
  return { anchorY: null };
193
301
  }
194
302
 
195
- async function executeSmoothScroll(page, phase, { fallbackTargetY = null } = {}) {
196
- const targetY = resolveTargetY(phase, fallbackTargetY);
303
+ async function executeSmoothScroll(page, phase) {
304
+ const targetY = requireTargetY(phase, 'smooth_scroll');
197
305
  const transitionMs = resolveTransitionMs(phase, 900);
198
306
  await animateScroll(page, {
199
307
  targetY,
@@ -205,8 +313,8 @@ async function executeSmoothScroll(page, phase, { fallbackTargetY = null } = {})
205
313
  return { anchorY: targetY };
206
314
  }
207
315
 
208
- async function executeFastScroll(page, phase, { fallbackTargetY = null } = {}) {
209
- const targetY = resolveTargetY(phase, fallbackTargetY);
316
+ async function executeFastScroll(page, phase) {
317
+ const targetY = requireTargetY(phase, 'fast_scroll');
210
318
  const transitionMs = resolveTransitionMs(phase, 420);
211
319
  await animateScroll(page, {
212
320
  targetY,
@@ -218,12 +326,9 @@ async function executeFastScroll(page, phase, { fallbackTargetY = null } = {}) {
218
326
  return { anchorY: targetY };
219
327
  }
220
328
 
221
- async function executeLinearScrollDuring(page, phase, {
222
- fallbackFromY = null,
223
- fallbackTargetY = null,
224
- } = {}) {
329
+ async function executeLinearScrollDuring(page, phase, { fallbackFromY = null } = {}) {
225
330
  const fromY = resolveFromY(phase, fallbackFromY);
226
- const toY = resolveTargetY(phase, fallbackTargetY);
331
+ const toY = requireTargetY(phase, 'linear_scroll_during');
227
332
  const durationMs = resolveDurationMs(phase, null);
228
333
  if (!Number.isFinite(Number(durationMs)) || Number(durationMs) <= 0) {
229
334
  const error = new Error('linear_scroll_duration_required');
@@ -247,8 +352,8 @@ async function executeLinearScrollDuring(page, phase, {
247
352
  return { anchorY: toY };
248
353
  }
249
354
 
250
- async function executeScrollToDwell(page, phase, { fallbackTargetY = null } = {}) {
251
- const targetY = resolveTargetY(phase, fallbackTargetY);
355
+ async function executeScrollToDwell(page, phase) {
356
+ const targetY = requireTargetY(phase, 'scroll_to_dwell');
252
357
  const transitionMs = resolveTransitionMs(phase, 820);
253
358
  await animateScroll(page, {
254
359
  targetY,
@@ -286,8 +391,8 @@ async function executeScrollBack(page, phase, { fallbackTargetY = 0 } = {}) {
286
391
  return { anchorY: targetY };
287
392
  }
288
393
 
289
- async function executeCursorFocus(page, phase, { fallbackTargetY = null } = {}) {
290
- const targetY = resolveTargetY(phase, fallbackTargetY);
394
+ async function executeCursorFocus(page, phase) {
395
+ const targetY = requireTargetY(phase, 'cursor_focus');
291
396
  const transitionMs = resolveTransitionMs(phase, 650);
292
397
  await animateScroll(page, {
293
398
  targetY,
@@ -313,34 +418,34 @@ async function executePhase(page, phase, {
313
418
  initialAnchorY = 0,
314
419
  } = {}) {
315
420
  const action = resolvePhaseAction(phase);
316
- const fallbackY = lastAnchorY ?? initialAnchorY;
421
+ const fallbackFromY = lastAnchorY ?? initialAnchorY;
317
422
 
318
423
  if (action === 'hold') {
319
424
  return executeHold(page, phase);
320
425
  }
321
426
  if (action === 'smooth_scroll') {
322
- return executeSmoothScroll(page, phase, { fallbackTargetY: fallbackY });
427
+ return executeSmoothScroll(page, phase);
323
428
  }
324
429
  if (action === 'fast_scroll') {
325
- return executeFastScroll(page, phase, { fallbackTargetY: fallbackY });
430
+ return executeFastScroll(page, phase);
326
431
  }
327
432
  if (action === 'linear_scroll_during') {
328
- return executeLinearScrollDuring(page, phase, {
329
- fallbackFromY: fallbackY,
330
- fallbackTargetY: fallbackY,
331
- });
433
+ return executeLinearScrollDuring(page, phase, { fallbackFromY });
332
434
  }
333
435
  if (action === 'scroll_to_dwell') {
334
- return executeScrollToDwell(page, phase, { fallbackTargetY: fallbackY });
436
+ return executeScrollToDwell(page, phase);
335
437
  }
336
438
  if (action === 'scroll_back') {
337
439
  return executeScrollBack(page, phase, { fallbackTargetY: 0 });
338
440
  }
339
441
  if (action === 'cursor_focus') {
340
- return executeCursorFocus(page, phase, { fallbackTargetY: fallbackY });
442
+ return executeCursorFocus(page, phase);
341
443
  }
342
444
 
343
- const error = new Error(`phase_action_unsupported:${action || 'empty'}`);
445
+ const error = new Error(
446
+ `phase_action_unsupported:${action || 'empty'} — supported actions: ${SUPPORTED_PHASE_ACTIONS.join(', ')}`
447
+ + ' (there is no blind scroll_down/scroll_up; use scroll_to_dwell with target_y or focus_region)',
448
+ );
344
449
  error.code = 'PHASE_ACTION_UNSUPPORTED';
345
450
  throw error;
346
451
  }
@@ -1430,10 +1430,10 @@ server.tool('get_library_file',
1430
1430
 
1431
1431
  // ── record_url_narration ────────────────────────────────────────────────────────
1432
1432
  server.tool('record_url_narration',
1433
- 'Record a silent video of a URL by orchestrating Xvfb + Chromium + ffmpeg, driven by a video plan. Outputs a silent mp4 that can be passed to compose_video_v2 as a video-kind segment with an audio_path for narration.\n\nUse this as the canonical recording step for URL-narration videos. Falls back: if the page needs interactions outside the visual_action vocabulary (clicks, waits, OCR loops), use Monitor (Bash) with custom Playwright instead.\n\nRuntime requirements: this tool only works on a Linux daemon machine with Xvfb + ffmpeg (x11grab) + Chromium installed. macOS / Windows daemons will fail at startup.',
1433
+ 'Record a silent video of a URL by driving Chromium on an Xvfb display and capturing it with Playwright recordVideo, driven by a video plan; ffmpeg then transcodes the recording to mp4. Outputs a silent mp4 that can be passed to compose_video_v2 as a video-kind segment with an audio_path for narration.\n\nUse this as the canonical recording step for URL-narration videos. Falls back: if the page needs interactions outside the visual_action vocabulary (clicks, waits, OCR loops), use Monitor (Bash) with custom Playwright instead.\n\nRuntime requirements: this tool only works on a Linux daemon machine with Xvfb + Chromium + ffmpeg installed (ffmpeg is used to transcode the recording to mp4; no x11grab device support needed). macOS / Windows daemons will fail at startup.',
1434
1434
  {
1435
1435
  url: z.string().describe('Page URL to record'),
1436
- plan: z.record(z.any()).describe('Must be the full output from detail_sections (not plan_video). detail_sections output includes detail_sections_version, sections[], audio metadata, and dwell_ms per phase.'),
1436
+ plan: z.record(z.any()).describe('A video plan: an object with `phases` (or `sections`), each a "visual beat" with `action` (scroll_to_dwell / linear_scroll_during / scroll_back / hold / ...), a target (`target_y` or `focus_region:[y1,y2]`) for scroll-type actions, and `dwell_ms` (how long to hold that beat — should match the segment\'s TTS duration). It can be hand-written or the output of plan_video_segments (whose returned segments array doubles as a valid plan).'),
1437
1437
  output_path: z.string().optional().describe('Workspace-relative output mp4 path. Default tmp/wx3_video/recorded-{ts}.mp4'),
1438
1438
  events_path: z.string().optional().describe('Workspace-relative events.json path. Default ${output_path}.events.json'),
1439
1439
  viewport: z.object({
@@ -1468,7 +1468,7 @@ server.tool('submit_to_library',
1468
1468
  target_platform: z.string().optional().describe('目标发布平台,如 xhs / douyin'),
1469
1469
  metadata: z.record(z.any()).optional().describe('其它 metadata(brand_voice / persona / account / goal_state 等)'),
1470
1470
  understanding: z.record(z.any()).optional().describe('analyze_page 输出'),
1471
- plan: z.record(z.any()).optional().describe('plan_video / detail_sections 输出'),
1471
+ plan: z.record(z.any()).optional().describe('plan_video_segments 输出(或手写的录屏 plan)'),
1472
1472
  },
1473
1473
  async (args) => {
1474
1474
  if (isBlockedCvmaxEditorVideoTool('submit_to_library')) {
@@ -1529,7 +1529,7 @@ server.tool('request_approval',
1529
1529
  platform: z.string().describe('Target platform, e.g. "x", "xhs", "email"'),
1530
1530
  description: z.string().describe('Human-readable summary of what will happen if approved'),
1531
1531
  payload: z.record(z.any()).describe('Full action parameters (content, media_urls, etc.)'),
1532
- credential_id: z.string().optional().describe('Which account/credential to use. For publishing, prefer a workspace account_id or real credential UUID. Role aliases like primary/test are accepted only if they uniquely match a workspace account.'),
1532
+ credential_id: z.string().optional().describe('Which account/credential to use. Accepts a workspace account_id, a real credential UUID, the account display name, or a role alias (主号/main/primary, 矩阵号/matrix/secondary, 测试号/test/incubator) any value works as long as it uniquely matches one workspace account on the target platform. If publishing fails with publish_account_selection_required/ambiguous, pick a value from the returned candidates\' "selectors" list yourself instead of asking the user to re-type an account name.'),
1533
1533
  },
1534
1534
  async ({ action_type, platform, description, payload, credential_id }) => {
1535
1535
  try {
@@ -45,6 +45,20 @@ function planDurationSec(audioDurationMs, bufferSec = 0.5) {
45
45
  return Math.ceil(raw * 2) / 2; // round up to nearest 0.5s
46
46
  }
47
47
 
48
/**
 * Run `fn` over `items` with a bounded number of concurrent workers.
 *
 * Items are started in input order (FIFO): each worker repeatedly claims the
 * next unclaimed index until none remain. Results are collected in input
 * order regardless of completion order.
 *
 * @param {Array<*>} items - Values to process; snapshotted when the call starts.
 * @param {number} limit - Maximum number of `fn` calls in flight at once.
 *   Values below 1 are clamped to 1. A limit that is not a number
 *   (`undefined`, `NaN`) falls back to 1 (sequential) instead of silently
 *   processing nothing.
 * @param {(item: *, index: number) => *} fn - Callback, awaited per item.
 * @returns {Promise<Array<*>>} The values `fn` resolved to, in input order.
 *   Rejects with the first error raised by `fn`.
 */
async function mapWithConcurrency(items, limit, fn) {
  const snapshot = [...items];
  const total = snapshot.length;
  const results = new Array(total);

  // Math.min(NaN, n) is NaN and Array.from({ length: NaN }) is empty, so an
  // unset limit would otherwise spawn zero workers and skip every item.
  const numericLimit = Number(limit);
  const requested = Number.isNaN(numericLimit) ? 1 : Math.floor(numericLimit);
  const workerCount = Math.max(1, Math.min(requested, total));

  // Shared cursor: claiming an index is synchronous, so no two workers can
  // ever pick up the same item.
  let cursor = 0;
  const drain = async () => {
    while (cursor < total) {
      const index = cursor;
      cursor += 1;
      results[index] = await fn(snapshot[index], index);
    }
  };

  await Promise.all(Array.from({ length: workerCount }, drain));
  return results;
}
+ }
59
+
60
+ const TTS_CONCURRENCY = 5;
61
+
48
62
  export async function runPlanVideoSegmentsTool({ segments, workspace_id, voice_id, currentWorkspaceId, api }) {
49
63
  if (!Array.isArray(segments) || segments.length === 0) {
50
64
  return toolError('segments must be a non-empty array.');
@@ -58,20 +72,31 @@ export async function runPlanVideoSegmentsTool({ segments, workspace_id, voice_i
58
72
  const planned = [];
59
73
  const errors = [];
60
74
 
75
+ // Synthesize TTS for every text-bearing segment up front, in parallel (bounded),
76
+ // so an N-segment plan no longer pays N sequential round-trips to the TTS API.
77
+ const audioResults = new Array(segments.length).fill(null);
78
+ const ttsJobs = segments
79
+ .map((seg, i) => ({ i, text: String(seg.text ?? '').trim() }))
80
+ .filter(job => job.text);
81
+ await mapWithConcurrency(ttsJobs, TTS_CONCURRENCY, async ({ i, text }) => {
82
+ try {
83
+ audioResults[i] = await synthesizeSegmentTts(text, { workspace_id: targetWorkspaceId, voice_id, api });
84
+ } catch (err) {
85
+ errors.push(`segments[${i}]: TTS failed — ${err.message}`);
86
+ audioResults[i] = { audio_path: null, audio_duration_ms: 3000 }; // fallback estimate
87
+ }
88
+ });
89
+ errors.sort((a, b) => {
90
+ const na = Number((a.match(/segments\[(\d+)\]/) ?? [])[1] ?? 0);
91
+ const nb = Number((b.match(/segments\[(\d+)\]/) ?? [])[1] ?? 0);
92
+ return na - nb;
93
+ });
94
+
61
95
  for (let i = 0; i < segments.length; i++) {
62
96
  const seg = segments[i];
63
97
  const text = String(seg.text ?? '').trim();
64
98
  const kind = String(seg.visual_kind ?? 'image');
65
-
66
- let audioResult = null;
67
- if (text) {
68
- try {
69
- audioResult = await synthesizeSegmentTts(text, { workspace_id: targetWorkspaceId, voice_id, api });
70
- } catch (err) {
71
- errors.push(`segments[${i}]: TTS failed — ${err.message}`);
72
- audioResult = { audio_path: null, audio_duration_ms: 3000 }; // fallback estimate
73
- }
74
- }
99
+ const audioResult = audioResults[i];
75
100
 
76
101
  const audioDurationMs = audioResult?.audio_duration_ms ?? 0;
77
102
  let presentation;
@@ -87,11 +112,19 @@ export async function runPlanVideoSegmentsTool({ segments, workspace_id, voice_i
87
112
  presentation = { duration, ...(kind === 'scroll' ? { style: 'scroll' } : {}) };
88
113
  }
89
114
 
115
+ // dwell_ms lets the same segment double as a record_url_narration plan phase
116
+ // (the recorder reads dwell_ms / duration_ms for how long to hold each beat).
117
+ // Prefer the real measured audio length; fall back to the planned visual duration.
118
+ const dwellMs = audioDurationMs > 0
119
+ ? audioDurationMs
120
+ : Math.round((presentation.duration ?? presentation.per_card_duration ?? 4) * 1000);
121
+
90
122
  const planned_seg = {
91
123
  ...seg,
92
124
  ...(audioResult?.audio_path ? { audio_path: audioResult.audio_path } : {}),
93
125
  ...(text ? { subtitle_text: text } : {}),
94
126
  presentation: { ...presentation, ...(seg.presentation ?? {}) },
127
+ dwell_ms: seg.dwell_ms ?? dwellMs,
95
128
  };
96
129
  if (audioResult?.audio_duration_ms) {
97
130
  planned_seg.audio_duration_ms = audioResult.audio_duration_ms;