npm - @lightcone-ai/daemon - Versions diffs - 0.15.71 → 0.15.73 - Mend

@lightcone-ai/daemon 0.15.71 → 0.15.73

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

package/mcp-servers/publisher/adapters/kuaishou.js +2 -2
package/package.json +1 -1
package/src/_vendor/video/composer-v2/index.js +47 -6
package/src/_vendor/video/recorder/index.js +120 -55
package/src/_vendor/video/recorder/plan-executor.js +147 -42
package/src/chat-bridge.js +4 -4
package/src/tools/plan-video-segments.js +43 -10

package/mcp-servers/publisher/adapters/kuaishou.js CHANGED Viewed

@@ -56,7 +56,7 @@ export class KuaishouAdapter {
     await this._clickByText('放弃');
     await sleep(500);
     try { await this._cdp.send('Runtime.evaluate', { expression: 'window.scrollTo(0, 300)', returnByValue: false }); } catch {}
-    await this._waitForSelector('input[type="file"], [class*="upload"], [class*="Upload"]', 45000);
+    await this._waitForSelector('input[type="file"], [class*="upload"], [class*="Upload"]', 120000);
     const { loggedIn } = await this.checkLoginStatus();
     if (!loggedIn) throw new Error('LOGIN_EXPIRED: 快手登录已过期，请重新扫码连接');
@@ -97,7 +97,7 @@ export class KuaishouAdapter {
     // Scroll once to trigger any lazy-rendered upload widgets, then wait
     try { await this._cdp.send('Runtime.evaluate', { expression: 'window.scrollTo(0, 300)', returnByValue: false }); } catch {}
-    await this._waitForSelector('input[type="file"], [class*="upload"], [class*="Upload"]', 45000);
+    await this._waitForSelector('input[type="file"], [class*="upload"], [class*="Upload"]', 120000);
     const { loggedIn } = await this.checkLoginStatus();
     if (!loggedIn) throw new Error('LOGIN_EXPIRED: 快手登录已过期，请重新扫码连接');

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@lightcone-ai/daemon",
-  "version": "0.15.71",
+  "version": "0.15.73",
   "type": "module",
   "main": "src/index.js",
   "bin": {

package/src/_vendor/video/composer-v2/index.js CHANGED Viewed

@@ -9,7 +9,12 @@ const DEFAULT_WIDTH = 1080;
 const DEFAULT_HEIGHT = 1920;
 const DEFAULT_FPS = 30;
 const TRANSITION_DURATION = 0.5;
-const SUBTITLE_FONT = 'PingFang SC,Microsoft YaHei,Arial';
+// ASS `Fontname` is a single family name, not a CSS-style fallback list — a
+// comma here shifts every subsequent field in the `Style:` line, corrupting the
+// whole style so libass renders nothing (i.e. burned-in subtitles look missing).
+// Use one installed family; libass + fontconfig handle glyph fallback. Override
+// via SUBTITLE_FONT env if the deployment ships a different CJK font.
+const SUBTITLE_FONT = (process.env.SUBTITLE_FONT || 'Noto Sans CJK SC').split(',')[0].trim() || 'Noto Sans CJK SC';
 const SUBTITLE_FONT_SIZE = 72;
 const SUBTITLE_MARGIN_V = 120;
@@ -23,6 +28,28 @@ function msToAssTimestamp(ms) {
   return `${hr}:${String(min).padStart(2, '0')}:${String(sec).padStart(2, '0')}.${String(cs).padStart(2, '0')}`;
 }
/**
 * Split a subtitle block into display-sized sentence units.
 *
 * The text is broken after CJK/ASCII sentence punctuation (。！？!?；;) and after
 * newlines; whitespace inside each piece is collapsed to single spaces. Fragments
 * shorter than six code points are folded into a neighbour so that a one- or
 * two-character line is never shown on its own.
 *
 * @param {unknown} text - Subtitle text; coerced with String(), null/undefined become ''.
 * @returns {string[]} Sentence units in reading order; [] for blank input, and
 *   the trimmed input as a single element when there is nothing to split.
 */
function splitSubtitleSentences(text) {
  const MIN_UNIT_CHARS = 6;
  const raw = String(text ?? '').trim();
  if (!raw) return [];

  const pieces = raw
    .split(/(?<=[。！？!?；;\n])/u)
    .map((s) => s.replace(/\s+/g, ' ').trim())
    .filter(Boolean);
  if (pieces.length <= 1) return [raw];

  // Length in code points, so astral characters count once.
  const unitLength = (s) => Array.from(s).length;

  // Pieces were trimmed, so the gap between them is gone. Put one space back when
  // both sides of the seam are printable ASCII ("Hi!" + "There" -> "Hi! There");
  // CJK text is joined directly, as it is normally written without spaces.
  const joinUnits = (left, right) => (
    /[\x21-\x7e]$/.test(left) && /^[\x21-\x7e]/.test(right)
      ? `${left} ${right}`
      : `${left}${right}`
  );

  const merged = [];
  for (const piece of pieces) {
    const lastIndex = merged.length - 1;
    if (lastIndex >= 0 && unitLength(merged[lastIndex]) < MIN_UNIT_CHARS) {
      // Previous unit is too short to stand alone: grow it with this piece.
      merged[lastIndex] = joinUnits(merged[lastIndex], piece);
    } else {
      merged.push(piece);
    }
  }

  // A short final fragment has nothing after it to merge into, so fold it back
  // into the preceding unit instead of letting it flash by itself.
  if (merged.length > 1 && unitLength(merged[merged.length - 1]) < MIN_UNIT_CHARS) {
    const tail = merged.pop();
    merged[merged.length - 1] = joinUnits(merged[merged.length - 1], tail);
  }
  return merged;
}
 function wrapSubtitleText(text, maxChars = 14) {
   const chars = Array.from(String(text ?? ''));
   if (chars.length <= maxChars) return chars.join('');
@@ -311,15 +338,29 @@ export async function composeVideoV2({
       }
     }
-    // Build subtitle entries with cumulative timeline timestamps
+    // Build subtitle entries with cumulative timeline timestamps. When a clip's
+    // subtitle text spans several sentences, split it into one event per sentence
+    // and spread them across the clip in proportion to their length, so a long
+    // beat reads as sequential lines roughly tracking the narration instead of one
+    // static wall of text.
     let cursorMs = 0;
     const subtitleEntries = [];
     for (const clip of readyClips) {
       if (clip.subtitleText) {
-        subtitleEntries.push({
-          text: clip.subtitleText,
-          start_ms: cursorMs,
-          end_ms: cursorMs + Math.round(clip.duration * 1000),
+        const clipMs = Math.round(clip.duration * 1000);
+        const sentences = splitSubtitleSentences(clip.subtitleText);
+        const totalLen = sentences.reduce((sum, s) => sum + Array.from(s).length, 0) || 1;
+        let offsetMs = 0;
+        sentences.forEach((sentence, idx) => {
+          const share = Array.from(sentence).length / totalLen;
+          const isLast = idx === sentences.length - 1;
+          const spanMs = isLast ? clipMs - offsetMs : Math.max(1, Math.round(clipMs * share));
+          subtitleEntries.push({
+            text: sentence,
+            start_ms: cursorMs + offsetMs,
+            end_ms: cursorMs + offsetMs + spanMs,
+          });
+          offsetMs += spanMs;
         });
       }
       cursorMs += Math.round(clip.duration * 1000);

package/src/_vendor/video/recorder/index.js CHANGED Viewed

@@ -1,17 +1,12 @@
 import { spawn } from 'node:child_process';
 import { mkdirSync } from 'node:fs';
-import { stat, writeFile } from 'node:fs/promises';
+import { mkdtemp, rm, stat, writeFile } from 'node:fs/promises';
+import os from 'node:os';
 import path from 'node:path';
 import { launchChromiumMobile, openPageAndSettle } from './chromium-driver.js';
 import { defaultDisplayPool } from './display-pool.js';
-import {
-  createUnexpectedExitWatcher,
-  startFfmpegCapture,
-  stopFfmpegCapture,
-  waitForProcessExit,
-} from './ffmpeg-runner.js';
-import { estimatePlanDurationMs } from './plan-estimator.js';
+import { createUnexpectedExitWatcher, waitForProcessExit } from './ffmpeg-runner.js';
 import { executePlanPhases, normalizePlanPhases } from './plan-executor.js';
 const DEFAULT_VIEWPORT = Object.freeze({ width: 1080, height: 1920 });
@@ -177,6 +172,49 @@ function scalePhaseY(phase, zoom) {
   };
 }
/**
 * Re-encode the page recording (webm, page content only — no browser chrome) into
 * the mp4 the rest of the pipeline expects, dropping the head segment that covers
 * page load + settle so the clip starts at the first plan phase.
 *
 * @param {object} [options]
 * @param {string} options.webmPath - Input recording passed to ffmpeg via `-i`.
 * @param {string} options.outputPath - Destination mp4 path (overwritten, `-y`).
 * @param {number} [options.startMs=0] - Head to skip, in ms; non-numeric or negative
 *   values are treated as 0. Emitted as `-ss` ahead of `-i`.
 * @param {number} [options.fps=DEFAULT_FPS] - Output frame rate; `-r` is omitted when
 *   this is not a positive finite number.
 * @param {string} [options.ffmpegBin='ffmpeg'] - ffmpeg executable to spawn.
 * @returns {Promise<void>} Resolves when ffmpeg exits with code 0.
 * @throws {Error} code `FFMPEG_SPAWN_FAILED` when the process cannot be started;
 *   code `FFMPEG_TRANSCODE_FAILED` when it exits non-zero (message carries the
 *   last 2000 characters of stderr).
 */
async function transcodeWebmToMp4({
  webmPath,
  outputPath,
  startMs = 0,
  fps = DEFAULT_FPS,
  ffmpegBin = 'ffmpeg',
} = {}) {
  // Only the tail of stderr is reported, so there is no reason to retain more.
  const STDERR_TAIL_BYTES = 8192;
  const ss = Math.max(0, Number(startMs) || 0) / 1000;
  const fpsValue = Number(fps);
  const args = [
    '-y',
    ...(ss > 0 ? ['-ss', ss.toFixed(3)] : []),
    '-i', webmPath,
    '-an',
    '-c:v', 'libx264',
    '-preset', 'veryfast',
    '-pix_fmt', 'yuv420p',
    ...(Number.isFinite(fpsValue) && fpsValue > 0 ? ['-r', String(fps)] : []),
    '-movflags', '+faststart',
    outputPath,
  ];
  await new Promise((resolve, reject) => {
    // stdout is ignored rather than piped: nothing reads it here, and an unread
    // pipe can fill up and block the child.
    const proc = spawn(ffmpegBin, args, { stdio: ['ignore', 'ignore', 'pipe'] });
    let stderrTail = Buffer.alloc(0);
    proc.stderr?.on('data', (chunk) => {
      const joined = Buffer.concat([stderrTail, chunk]);
      stderrTail = joined.length > STDERR_TAIL_BYTES
        ? joined.subarray(joined.length - STDERR_TAIL_BYTES)
        : joined;
    });
    proc.once('error', (err) => {
      const wrapped = new Error(`ffmpeg_spawn_failed:${err.message}`);
      wrapped.code = 'FFMPEG_SPAWN_FAILED';
      wrapped.cause = err;
      reject(wrapped);
    });
    proc.once('close', (code, signal) => {
      if (code === 0) {
        resolve();
        return;
      }
      const wrapped = new Error(
        `ffmpeg_transcode_failed:code=${code}: ${stderrTail.toString().slice(-2000)}`
      );
      wrapped.code = 'FFMPEG_TRANSCODE_FAILED';
      // code is null when the child was terminated by a signal; keep the signal name.
      wrapped.signal = signal ?? null;
      reject(wrapped);
    });
  });
}
 export async function recordUrlNarration({
   plan,
   output_path,
@@ -189,11 +227,14 @@ export async function recordUrlNarration({
   settle_ms = 4000,
   page_zoom = 1.1,
   displayPool = defaultDisplayPool,
-  ffmpegDurationBufferSec = 8,
   startupProbeMs = 1200,
-  ffmpegStopTimeoutMs = 10000,
   xvfbStopTimeoutMs = 5000,
   postPlanTailMs = 600,
+  recordingDir = null,
+  launchChromiumFn = launchChromiumMobile,
+  openPageFn = openPageAndSettle,
+  transcodeFn = transcodeWebmToMp4,
+  nowMs = () => Date.now(),
 } = {}) {
   const zoom = Number.isFinite(Number(page_zoom)) && Number(page_zoom) > 0 ? Number(page_zoom) : 1.1;
   const rawPhases = normalizePlanPhases(plan);
@@ -212,14 +253,14 @@ export async function recordUrlNarration({
   mkdirSync(path.dirname(resolvedOutputPath), { recursive: true });
   mkdirSync(path.dirname(resolvedEventsPath), { recursive: true });
+  const ownTempDir = !recordingDir;
+  const recVideoDir = recordingDir || await mkdtemp(path.join(os.tmpdir(), 'lc-recvid-'));
   let displayLease;
   let xvfb;
-  let ffmpeg;
-  let browserSession;
   let xvfbWatcher;
-  let ffmpegWatcher;
+  let browserSession = null;
   let primaryError = null;
   const cleanupErrors = [];
   try {
@@ -234,11 +275,26 @@ export async function recordUrlNarration({
     });
     xvfbWatcher = createUnexpectedExitWatcher(xvfb.child, 'xvfb');
-    browserSession = await launchChromiumMobile({
+    // The page recording captures the page viewport only (no browser chrome),
+    // regardless of the on-screen window. recordVideo starts when the page is
+    // created, so the webm includes goto + settle; we measure that head and trim
+    // it off in transcodeFn.
+    const recordStartedAt = nowMs();
+    browserSession = await launchChromiumFn({
       display,
       viewport: normalizedViewport,
+      contextOptions: {
+        recordVideo: {
+          dir: recVideoDir,
+          size: { width: normalizedViewport.width, height: normalizedViewport.height },
+        },
+      },
     });
-    await openPageAndSettle(browserSession.page, {
+    const videoHandle = typeof browserSession.page.video === 'function'
+      ? browserSession.page.video()
+      : null;
+    await openPageFn(browserSession.page, {
       url: resolvedUrl,
       settleMs: settle_ms,
     });
@@ -250,42 +306,53 @@ export async function recordUrlNarration({
       await browserSession.page.waitForTimeout(300);
     }
-    const estimatedDurationMs = estimatePlanDurationMs(executablePlan);
-    const estimatedDurationSec = Math.max(
-      5,
-      Math.ceil(estimatedDurationMs / 1000) + Math.max(0, Number(ffmpegDurationBufferSec) || 0)
-    );
-    ffmpeg = await startFfmpegCapture({
-      display,
-      outputPath: resolvedOutputPath,
-      width: normalizedViewport.width,
-      height: normalizedViewport.height,
-      fps: normalizedFps,
-      durationSec: estimatedDurationSec,
-      startupProbeMs,
-    });
-    ffmpegWatcher = createUnexpectedExitWatcher(ffmpeg.child, 'ffmpeg');
     await scrollToTop(browserSession.page);
     await browserSession.page.waitForTimeout(350);
+    const headTrimMs = Math.max(0, nowMs() - recordStartedAt);
     const eventsLog = await Promise.race([
       executePlanPhases(browserSession.page, executablePlan),
       xvfbWatcher.promise,
-      ffmpegWatcher.promise,
     ]);
     await browserSession.page.waitForTimeout(Math.max(0, Number(postPlanTailMs) || 0));
-    ffmpegWatcher.deactivate();
-    await stopFfmpegCapture(ffmpeg, {
-      timeoutMs: ffmpegStopTimeoutMs,
-    });
     xvfbWatcher.deactivate();
-    await writeFile(resolvedEventsPath, JSON.stringify(eventsLog, null, 2), 'utf8');
+    // Flush the recording: video is written when the context closes.
+    let webmPath = null;
+    try {
+      await browserSession.context.close();
+    } catch (closeError) {
+      cleanupErrors.push(`context_close_failed:${closeError.message}`);
+    }
+    if (videoHandle) {
+      try {
+        webmPath = await videoHandle.path();
+      } catch (pathError) {
+        cleanupErrors.push(`video_path_failed:${pathError.message}`);
+      }
+    }
+    try {
+      await browserSession.browser.close();
+    } catch (closeError) {
+      cleanupErrors.push(`browser_close_failed:${closeError.message}`);
+    }
+    browserSession = null;
+    if (!webmPath) {
+      const error = new Error('record_video_not_produced');
+      error.code = 'RECORD_VIDEO_NOT_PRODUCED';
+      throw error;
+    }
+    await transcodeFn({
+      webmPath,
+      outputPath: resolvedOutputPath,
+      startMs: headTrimMs,
+      fps: normalizedFps,
+    });
     const videoStat = await stat(resolvedOutputPath);
     if (!videoStat.isFile() || videoStat.size <= 0) {
@@ -294,42 +361,36 @@ export async function recordUrlNarration({
       throw error;
     }
+    await writeFile(resolvedEventsPath, JSON.stringify(eventsLog, null, 2), 'utf8');
+    const lastTms = Array.isArray(eventsLog)
+      ? eventsLog.reduce((max, ev) => Math.max(max, Number(ev?.t_ms) || 0), 0)
+      : 0;
     return {
       video_path: resolvedOutputPath,
       events_path: resolvedEventsPath,
       events_log: eventsLog,
+      duration_ms: lastTms > 0 ? lastTms : null,
       display,
     };
   } catch (error) {
     primaryError = error;
     throw error;
   } finally {
-    ffmpegWatcher?.deactivate();
     xvfbWatcher?.deactivate();
     if (browserSession) {
       try {
-        await browserSession.close();
+        await browserSession.browser.close();
       } catch (closeError) {
         cleanupErrors.push(`browser_close_failed:${closeError.message}`);
       }
     }
-    if (ffmpeg) {
-      try {
-        await stopFfmpegCapture(ffmpeg, {
-          timeoutMs: ffmpegStopTimeoutMs,
-        });
-      } catch (stopError) {
-        cleanupErrors.push(`ffmpeg_stop_failed:${stopError.message}`);
-      }
-    }
     if (xvfb) {
       try {
-        await stopXvfb(xvfb, {
-          timeoutMs: xvfbStopTimeoutMs,
-        });
+        await stopXvfb(xvfb, { timeoutMs: xvfbStopTimeoutMs });
       } catch (stopError) {
         cleanupErrors.push(`xvfb_stop_failed:${stopError.message}`);
       }
@@ -339,6 +400,10 @@ export async function recordUrlNarration({
       displayLease.release();
     }
+    if (ownTempDir) {
+      await rm(recVideoDir, { recursive: true, force: true }).catch(() => {});
+    }
     if (cleanupErrors.length > 0) {
       if (primaryError) {
         primaryError.cleanupErrors = cleanupErrors;

package/src/_vendor/video/recorder/plan-executor.js CHANGED Viewed

@@ -21,6 +21,61 @@ function normalizeRange(value) {
   return [low, high];
 }
+// The recorder executes exactly these visual actions. There is no "scroll a bit"
+// blind-scroll action: every scroll phase must say where it lands.
+export const SUPPORTED_PHASE_ACTIONS = Object.freeze([
+  'hold',
+  'smooth_scroll',
+  'fast_scroll',
+  'linear_scroll_during',
+  'scroll_to_dwell',
+  'scroll_back',
+  'cursor_focus',
+]);
+// Common spellings authors reach for, mapped onto the canonical action above.
+// Note: scroll_down / scroll_up are intentionally NOT aliased — there is no blind
+// scroll; an unrecognised action raises phase_action_unsupported so the plan gets
+// fixed rather than silently degraded.
+const PHASE_ACTION_ALIASES = new Map([
+  ['scroll_to', 'scroll_to_dwell'],
+  ['scrollto', 'scroll_to_dwell'],
+  ['scroll', 'scroll_to_dwell'],
+  ['scroll_to_region', 'scroll_to_dwell'],
+  ['scroll_to_y', 'scroll_to_dwell'],
+  ['dwell', 'scroll_to_dwell'],
+  ['focus_hold', 'scroll_to_dwell'],
+  ['pan', 'linear_scroll_during'],
+  ['narrated_pan', 'linear_scroll_during'],
+  ['linear_scroll', 'linear_scroll_during'],
+  ['scroll_during', 'linear_scroll_during'],
+  ['scroll_while_narrating', 'linear_scroll_during'],
+  ['return', 'scroll_back'],
+  ['return_anchor', 'scroll_back'],
+  ['back', 'scroll_back'],
+  ['scroll_to_top', 'scroll_back'],
+  ['wait', 'hold'],
+  ['pause', 'hold'],
+  ['stay', 'hold'],
+  ['focus', 'cursor_focus'],
+  ['highlight', 'cursor_focus'],
+]);
/**
 * Resolve a raw action spelling to the name the executor dispatches on.
 *
 * The value is passed through normalizeText() and lower-cased. Canonical names
 * listed in SUPPORTED_PHASE_ACTIONS are returned as-is, known aliases are mapped
 * through PHASE_ACTION_ALIASES, and anything else is returned unchanged
 * (lower-cased) so the caller can report it.
 *
 * @param {unknown} rawValue - Action name as written in the plan.
 * @returns {string} Canonical or pass-through action name; '' when the
 *   normalized text is empty.
 */
function normalizeActionName(rawValue) {
  const lowered = normalizeText(rawValue).toLowerCase();
  if (lowered === '') {
    return '';
  }
  const isCanonical = SUPPORTED_PHASE_ACTIONS.includes(lowered);
  if (isCanonical) {
    return lowered;
  }
  const aliased = PHASE_ACTION_ALIASES.get(lowered);
  return aliased ? aliased : lowered;
}
/**
 * Read a section's `visual_action` as an object.
 *
 * `visual_action` may be a string (the action name) or a plain object
 * ({type, target_y, ...}). A non-blank string becomes `{ type: <trimmed string> }`;
 * an object is returned as-is (same reference). Anything else — missing, null,
 * blank string, number, or an array — yields an empty object. Arrays are excluded
 * explicitly because `typeof [] === 'object'` and an array is not an action object.
 *
 * @param {object} [section] - Plan section or phase.
 * @returns {object} The visual action object, or {} when none is usable.
 */
function visualActionObject(section = {}) {
  const va = section?.visual_action;
  if (va !== null && typeof va === 'object' && !Array.isArray(va)) return va;
  if (typeof va === 'string' && va.trim()) return { type: va.trim() };
  return {};
}
 function inferActionFromCameraMotion(phase = {}) {
   const motion = normalizeText(phase.camera_motion ?? phase.cameraMotion).toLowerCase();
   if (motion === 'narrated_pan') return 'linear_scroll_during';
@@ -30,19 +85,32 @@ function inferActionFromCameraMotion(phase = {}) {
   return '';
 }
/**
 * Return the first candidate that is a usable number, rounded to an integer.
 *
 * Only real numbers, bigints and non-blank numeric strings are considered.
 * Blank strings, booleans, arrays and other objects are skipped instead of being
 * coerced (Number('') and Number(false) are 0, Number(true) is 1), so an empty
 * or boolean field cannot mask a later valid coordinate.
 *
 * @param {...unknown} values - Candidates in priority order.
 * @returns {number|null} Math.round() of the first finite candidate, or null
 *   when none qualifies.
 */
function pickFirstNumber(...values) {
  for (const value of values) {
    let parsed;
    if (typeof value === 'number' || typeof value === 'bigint') {
      parsed = Number(value);
    } else if (typeof value === 'string' && value.trim() !== '') {
      parsed = Number(value);
    } else {
      continue;
    }
    if (Number.isFinite(parsed)) return Math.round(parsed);
  }
  return null;
}
 function normalizeSectionAsPhase(section = {}, index = 0) {
-  const phaseId = normalizeText(section.id ?? section.phase_id) || `phase_${index + 1}`;
-  const visualAction = section.visual_action && typeof section.visual_action === 'object'
-    ? section.visual_action
-    : {};
+  const phaseId = normalizeText(section.id ?? section.phase_id ?? section.name) || `phase_${index + 1}`;
+  const visualAction = visualActionObject(section);
   const focusRegion = normalizeRange(
     section.focus_region
     ?? section.focusRegion
     ?? visualAction.focus_region
     ?? visualAction.focusRegion
   );
-  const explicitAction = normalizeText(section.action ?? visualAction.type).toLowerCase();
-  const action = explicitAction || inferActionFromCameraMotion(section) || 'scroll_to_dwell';
+  const explicitAction = normalizeActionName(section.action ?? visualAction.type ?? visualAction.action);
+  const inferred = explicitAction || inferActionFromCameraMotion(section);
+  const targetY = pickFirstNumber(
+    section.target_y, section.to_y, section.y, section.scroll_y,
+    visualAction.target_y, visualAction.to_y, visualAction.y, visualAction.scroll_y,
+  );
+  const hasTarget = focusRegion != null || targetY != null;
+  const action = inferred || (hasTarget ? 'scroll_to_dwell' : 'hold');
   return {
     ...section,
@@ -51,21 +119,42 @@ function normalizeSectionAsPhase(section = {}, index = 0) {
     action,
     focus_region: focusRegion ?? null,
     visual_action: visualAction,
-    target_y: section.target_y ?? visualAction.target_y ?? visualAction.to_y ?? null,
-    from_y: section.from_y ?? visualAction.from_y ?? null,
-    to_y: section.to_y ?? visualAction.to_y ?? null,
-    transition_ms: section.transition_ms ?? visualAction.transition_ms ?? null,
-    duration_ms: section.duration_ms ?? section.dwell_ms ?? null,
+    target_y: targetY,
+    from_y: pickFirstNumber(section.from_y, visualAction.from_y),
+    to_y: pickFirstNumber(section.to_y, visualAction.to_y, section.y, visualAction.y),
+    transition_ms: section.transition_ms ?? visualAction.transition_ms ?? visualAction.duration_ms ?? null,
+    duration_ms: section.duration_ms ?? section.dwell_ms ?? section.audio_duration_ms
+      ?? (section.presentation && Number.isFinite(Number(section.presentation.duration))
+        ? Math.round(Number(section.presentation.duration) * 1000)
+        : null),
   };
 }
 export function normalizePlanPhases(plan = {}) {
-  const phases = Array.isArray(plan?.phases) ? plan.phases : [];
-  if (phases.length > 0) return phases;
+  const topLevelPhases = Array.isArray(plan?.phases) ? plan.phases : [];
+  if (topLevelPhases.length > 0) {
+    return topLevelPhases.map((phase, index) => normalizeSectionAsPhase(phase, index));
+  }
   const sections = Array.isArray(plan?.sections) ? plan.sections : [];
   if (sections.length > 0) {
-    return sections.map((section, index) => normalizeSectionAsPhase(section, index));
+    const flattened = [];
+    sections.forEach((section, sectionIndex) => {
+      const nested = Array.isArray(section?.phases) ? section.phases : null;
+      if (nested && nested.length > 0) {
+        const prefix = normalizeText(section.id ?? section.phase_id ?? section.name) || `s${sectionIndex + 1}`;
+        nested.forEach((subPhase, subIndex) => {
+          const merged = {
+            ...subPhase,
+            id: subPhase.id ?? subPhase.phase_id ?? subPhase.name ?? `${prefix}_${subIndex + 1}`,
+          };
+          flattened.push(normalizeSectionAsPhase(merged, flattened.length));
+        });
+      } else {
+        flattened.push(normalizeSectionAsPhase(section, flattened.length));
+      }
+    });
+    return flattened;
   }
   const error = new Error('plan_phases_required');
@@ -74,13 +163,15 @@ export function normalizePlanPhases(plan = {}) {
 }
 function resolvePhaseAction(phase = {}) {
-  const explicit = normalizeText(phase.action ?? phase.visual_action?.type).toLowerCase();
+  const explicit = normalizeActionName(
+    phase.action ?? phase.visual_action?.type ?? phase.visual_action?.action
+  );
   if (explicit) return explicit;
   return inferActionFromCameraMotion(phase);
 }
 function resolvePhaseId(phase = {}, index = 0) {
-  return normalizeText(phase.id ?? phase.phase_id) || `phase_${index + 1}`;
+  return normalizeText(phase.id ?? phase.phase_id ?? phase.name) || `phase_${index + 1}`;
 }
 function nowMs(getNowMs) {
@@ -94,9 +185,12 @@ function resolveTransitionMs(phase, fallback) {
 }
 function resolveTargetY(phase, fallback = null) {
-  const raw = phase?.target_y ?? phase?.to_y ?? phase?.visual_action?.target_y ?? phase?.visual_action?.to_y;
-  const parsed = Number(raw);
-  if (Number.isFinite(parsed)) return Math.round(parsed);
+  const explicit = pickFirstNumber(
+    phase?.target_y, phase?.to_y, phase?.y, phase?.scroll_y,
+    phase?.visual_action?.target_y, phase?.visual_action?.to_y,
+    phase?.visual_action?.y, phase?.visual_action?.scroll_y,
+  );
+  if (explicit != null) return explicit;
   const focusRegion = normalizeRange(
     phase?.focus_region
@@ -113,6 +207,20 @@ function resolveTargetY(phase, fallback = null) {
   return fallback;
 }
/**
 * Resolve the scroll target for a phase, failing when there is none.
 *
 * Delegates to resolveTargetY(phase, null) and returns its result when it is not
 * null/undefined. Otherwise throws, naming the phase (via resolvePhaseId) and the
 * action that needed a landing position.
 *
 * @param {object} phase - Plan phase being executed.
 * @param {string} action - Action name, used only in the error message.
 * @returns {number} The resolved target Y.
 * @throws {Error} code `PHASE_TARGET_Y_REQUIRED` when no target can be resolved.
 */
function requireTargetY(phase, action) {
  const landingY = resolveTargetY(phase, null);
  if (landingY != null) {
    return landingY;
  }
  const phaseLabel = resolvePhaseId(phase);
  const message = [
    `phase_target_y_required: phase "${phaseLabel}" uses "${action}" but has no`,
    'target_y / to_y / y or focus_region — every scroll phase must say where it lands',
    '(there is no blind scroll)',
  ].join(' ');
  throw Object.assign(new Error(message), { code: 'PHASE_TARGET_Y_REQUIRED' });
}
 function resolveFromY(phase, fallback = null) {
   const raw = phase?.from_y ?? phase?.visual_action?.from_y;
   const parsed = Number(raw);
@@ -192,8 +300,8 @@ async function executeHold(page, phase) {
   return { anchorY: null };
 }
-async function executeSmoothScroll(page, phase, { fallbackTargetY = null } = {}) {
-  const targetY = resolveTargetY(phase, fallbackTargetY);
+async function executeSmoothScroll(page, phase) {
+  const targetY = requireTargetY(phase, 'smooth_scroll');
   const transitionMs = resolveTransitionMs(phase, 900);
   await animateScroll(page, {
     targetY,
@@ -205,8 +313,8 @@ async function executeSmoothScroll(page, phase, { fallbackTargetY = null } = {})
   return { anchorY: targetY };
 }
-async function executeFastScroll(page, phase, { fallbackTargetY = null } = {}) {
-  const targetY = resolveTargetY(phase, fallbackTargetY);
+async function executeFastScroll(page, phase) {
+  const targetY = requireTargetY(phase, 'fast_scroll');
   const transitionMs = resolveTransitionMs(phase, 420);
   await animateScroll(page, {
     targetY,
@@ -218,12 +326,9 @@ async function executeFastScroll(page, phase, { fallbackTargetY = null } = {}) {
   return { anchorY: targetY };
 }
-async function executeLinearScrollDuring(page, phase, {
-  fallbackFromY = null,
-  fallbackTargetY = null,
-} = {}) {
+async function executeLinearScrollDuring(page, phase, { fallbackFromY = null } = {}) {
   const fromY = resolveFromY(phase, fallbackFromY);
-  const toY = resolveTargetY(phase, fallbackTargetY);
+  const toY = requireTargetY(phase, 'linear_scroll_during');
   const durationMs = resolveDurationMs(phase, null);
   if (!Number.isFinite(Number(durationMs)) || Number(durationMs) <= 0) {
     const error = new Error('linear_scroll_duration_required');
@@ -247,8 +352,8 @@ async function executeLinearScrollDuring(page, phase, {
   return { anchorY: toY };
 }
-async function executeScrollToDwell(page, phase, { fallbackTargetY = null } = {}) {
-  const targetY = resolveTargetY(phase, fallbackTargetY);
+async function executeScrollToDwell(page, phase) {
+  const targetY = requireTargetY(phase, 'scroll_to_dwell');
   const transitionMs = resolveTransitionMs(phase, 820);
   await animateScroll(page, {
     targetY,
@@ -286,8 +391,8 @@ async function executeScrollBack(page, phase, { fallbackTargetY = 0 } = {}) {
   return { anchorY: targetY };
 }
-async function executeCursorFocus(page, phase, { fallbackTargetY = null } = {}) {
-  const targetY = resolveTargetY(phase, fallbackTargetY);
+async function executeCursorFocus(page, phase) {
+  const targetY = requireTargetY(phase, 'cursor_focus');
   const transitionMs = resolveTransitionMs(phase, 650);
   await animateScroll(page, {
     targetY,
@@ -313,34 +418,34 @@ async function executePhase(page, phase, {
   initialAnchorY = 0,
 } = {}) {
   const action = resolvePhaseAction(phase);
-  const fallbackY = lastAnchorY ?? initialAnchorY;
+  const fallbackFromY = lastAnchorY ?? initialAnchorY;
   if (action === 'hold') {
     return executeHold(page, phase);
   }
   if (action === 'smooth_scroll') {
-    return executeSmoothScroll(page, phase, { fallbackTargetY: fallbackY });
+    return executeSmoothScroll(page, phase);
   }
   if (action === 'fast_scroll') {
-    return executeFastScroll(page, phase, { fallbackTargetY: fallbackY });
+    return executeFastScroll(page, phase);
   }
   if (action === 'linear_scroll_during') {
-    return executeLinearScrollDuring(page, phase, {
-      fallbackFromY: fallbackY,
-      fallbackTargetY: fallbackY,
-    });
+    return executeLinearScrollDuring(page, phase, { fallbackFromY });
   }
   if (action === 'scroll_to_dwell') {
-    return executeScrollToDwell(page, phase, { fallbackTargetY: fallbackY });
+    return executeScrollToDwell(page, phase);
   }
   if (action === 'scroll_back') {
     return executeScrollBack(page, phase, { fallbackTargetY: 0 });
   }
   if (action === 'cursor_focus') {
-    return executeCursorFocus(page, phase, { fallbackTargetY: fallbackY });
+    return executeCursorFocus(page, phase);
   }
-  const error = new Error(`phase_action_unsupported:${action || 'empty'}`);
+  const error = new Error(
+    `phase_action_unsupported:${action || 'empty'} — supported actions: ${SUPPORTED_PHASE_ACTIONS.join(', ')}`
+    + ' (there is no blind scroll_down/scroll_up; use scroll_to_dwell with target_y or focus_region)',
+  );
   error.code = 'PHASE_ACTION_UNSUPPORTED';
   throw error;
 }

package/src/chat-bridge.js CHANGED Viewed

@@ -1430,10 +1430,10 @@ server.tool('get_library_file',
 // ── record_url_narration ────────────────────────────────────────────────────────
 server.tool('record_url_narration',
-  'Record a silent video of a URL by orchestrating Xvfb + Chromium + ffmpeg, driven by a video plan. Outputs a silent mp4 that can be passed to compose_video_v2 as a video-kind segment with an audio_path for narration.\n\nUse this as the canonical recording step for URL-narration videos. Falls back: if the page needs interactions outside the visual_action vocabulary (clicks, waits, OCR loops), use Monitor (Bash) with custom Playwright instead.\n\nRuntime requirements: this tool only works on a Linux daemon machine with Xvfb + ffmpeg (x11grab) + Chromium installed. macOS / Windows daemons will fail at startup.',
+  'Record a silent video of a URL by driving Chromium on an Xvfb display and capturing it with Playwright recordVideo, driven by a video plan; ffmpeg then transcodes the recording to mp4. Outputs a silent mp4 that can be passed to compose_video_v2 as a video-kind segment with an audio_path for narration.\n\nUse this as the canonical recording step for URL-narration videos. Falls back: if the page needs interactions outside the visual_action vocabulary (clicks, waits, OCR loops), use Monitor (Bash) with custom Playwright instead.\n\nRuntime requirements: this tool only works on a Linux daemon machine with Xvfb + Chromium + ffmpeg installed (ffmpeg is used to transcode the recording to mp4; no x11grab device support needed). macOS / Windows daemons will fail at startup.',
   {
     url: z.string().describe('Page URL to record'),
-    plan: z.record(z.any()).describe('Must be the full output from detail_sections (not plan_video). detail_sections output includes detail_sections_version, sections[], audio metadata, and dwell_ms per phase.'),
+    plan: z.record(z.any()).describe('A video plan: an object with `phases` (or `sections`), each a "visual beat" with `action` (scroll_to_dwell / linear_scroll_during / scroll_back / hold / ...), a target (`target_y` or `focus_region:[y1,y2]`) for scroll-type actions, and `dwell_ms` (how long to hold that beat — should match the segment\'s TTS duration). It can be hand-written or the output of plan_video_segments (whose returned segments array doubles as a valid plan).'),
     output_path: z.string().optional().describe('Workspace-relative output mp4 path. Default tmp/wx3_video/recorded-{ts}.mp4'),
     events_path: z.string().optional().describe('Workspace-relative events.json path. Default ${output_path}.events.json'),
     viewport: z.object({
@@ -1468,7 +1468,7 @@ server.tool('submit_to_library',
     target_platform: z.string().optional().describe('目标发布平台，如 xhs / douyin'),
     metadata: z.record(z.any()).optional().describe('其它 metadata（brand_voice / persona / account / goal_state 等）'),
     understanding: z.record(z.any()).optional().describe('analyze_page 输出'),
-    plan: z.record(z.any()).optional().describe('plan_video / detail_sections 输出'),
+    plan: z.record(z.any()).optional().describe('plan_video_segments 输出（或手写的录屏 plan）'),
   },
   async (args) => {
     if (isBlockedCvmaxEditorVideoTool('submit_to_library')) {
@@ -1529,7 +1529,7 @@ server.tool('request_approval',
     platform:      z.string().describe('Target platform, e.g. "x", "xhs", "email"'),
     description:   z.string().describe('Human-readable summary of what will happen if approved'),
     payload:       z.record(z.any()).describe('Full action parameters (content, media_urls, etc.)'),
-    credential_id: z.string().optional().describe('Which account/credential to use. For publishing, prefer a workspace account_id or real credential UUID. Role aliases like primary/test are accepted only if they uniquely match a workspace account.'),
+    credential_id: z.string().optional().describe('Which account/credential to use. Accepts a workspace account_id, a real credential UUID, the account display name, or a role alias (主号/main/primary, 矩阵号/matrix/secondary, 测试号/test/incubator) — any value works as long as it uniquely matches one workspace account on the target platform. If publishing fails with publish_account_selection_required/ambiguous, pick a value from the returned candidates\' "selectors" list yourself instead of asking the user to re-type an account name.'),
   },
   async ({ action_type, platform, description, payload, credential_id }) => {
     try {

package/src/tools/plan-video-segments.js CHANGED Viewed

@@ -45,6 +45,20 @@ function planDurationSec(audioDurationMs, bufferSec = 0.5) {
   return Math.ceil(raw * 2) / 2; // round up to nearest 0.5s
 }
/**
 * Run `fn` over `items` with a bounded number of concurrent workers.
 *
 * Items are started in index order; each worker picks up the next unstarted item
 * as soon as its current call settles. If any call rejects, the returned promise
 * rejects with that error (workers already running are not cancelled).
 *
 * @param {Iterable<unknown>|null|undefined} items - Inputs; null/undefined is
 *   treated as empty.
 * @param {number} limit - Maximum calls in flight. Values that are not a number
 *   or are below 1 fall back to 1, so work is never silently skipped.
 * @param {(item: unknown, index: number) => unknown} fn - Called once per item.
 * @returns {Promise<unknown[]>} Results of `fn`, in the same order as `items`.
 */
async function mapWithConcurrency(items, limit, fn) {
  const list = Array.from(items ?? []);
  const results = new Array(list.length);
  if (list.length === 0) return results;

  // Math.max(1, NaN) is NaN, which would yield zero workers; validate explicitly.
  const requested = Number(limit);
  const cap = Number.isNaN(requested) || requested < 1 ? 1 : Math.floor(requested);
  const workerCount = Math.min(list.length, cap);

  let nextIndex = 0;
  const runWorker = async () => {
    while (nextIndex < list.length) {
      // Claim the index synchronously, before awaiting, so no item runs twice.
      const index = nextIndex;
      nextIndex += 1;
      results[index] = await fn(list[index], index);
    }
  };
  await Promise.all(Array.from({ length: workerCount }, runWorker));
  return results;
}
+const TTS_CONCURRENCY = 5;
 export async function runPlanVideoSegmentsTool({ segments, workspace_id, voice_id, currentWorkspaceId, api }) {
   if (!Array.isArray(segments) || segments.length === 0) {
     return toolError('segments must be a non-empty array.');
@@ -58,20 +72,31 @@ export async function runPlanVideoSegmentsTool({ segments, workspace_id, voice_i
   const planned = [];
   const errors = [];
+  // Synthesize TTS for every text-bearing segment up front, in parallel (bounded),
+  // so an N-segment plan no longer pays N sequential round-trips to the TTS API.
+  const audioResults = new Array(segments.length).fill(null);
+  const ttsJobs = segments
+    .map((seg, i) => ({ i, text: String(seg.text ?? '').trim() }))
+    .filter(job => job.text);
+  await mapWithConcurrency(ttsJobs, TTS_CONCURRENCY, async ({ i, text }) => {
+    try {
+      audioResults[i] = await synthesizeSegmentTts(text, { workspace_id: targetWorkspaceId, voice_id, api });
+    } catch (err) {
+      errors.push(`segments[${i}]: TTS failed — ${err.message}`);
+      audioResults[i] = { audio_path: null, audio_duration_ms: 3000 }; // fallback estimate
+    }
+  });
+  errors.sort((a, b) => {
+    const na = Number((a.match(/segments\[(\d+)\]/) ?? [])[1] ?? 0);
+    const nb = Number((b.match(/segments\[(\d+)\]/) ?? [])[1] ?? 0);
+    return na - nb;
+  });
   for (let i = 0; i < segments.length; i++) {
     const seg = segments[i];
     const text = String(seg.text ?? '').trim();
     const kind = String(seg.visual_kind ?? 'image');
-    let audioResult = null;
-    if (text) {
-      try {
-        audioResult = await synthesizeSegmentTts(text, { workspace_id: targetWorkspaceId, voice_id, api });
-      } catch (err) {
-        errors.push(`segments[${i}]: TTS failed — ${err.message}`);
-        audioResult = { audio_path: null, audio_duration_ms: 3000 }; // fallback estimate
-      }
-    }
+    const audioResult = audioResults[i];
     const audioDurationMs = audioResult?.audio_duration_ms ?? 0;
     let presentation;
@@ -87,11 +112,19 @@ export async function runPlanVideoSegmentsTool({ segments, workspace_id, voice_i
       presentation = { duration, ...(kind === 'scroll' ? { style: 'scroll' } : {}) };
     }
+    // dwell_ms lets the same segment double as a record_url_narration plan phase
+    // (the recorder reads dwell_ms / duration_ms for how long to hold each beat).
+    // Prefer the real measured audio length; fall back to the planned visual duration.
+    const dwellMs = audioDurationMs > 0
+      ? audioDurationMs
+      : Math.round((presentation.duration ?? presentation.per_card_duration ?? 4) * 1000);
     const planned_seg = {
       ...seg,
       ...(audioResult?.audio_path ? { audio_path: audioResult.audio_path } : {}),
       ...(text ? { subtitle_text: text } : {}),
       presentation: { ...presentation, ...(seg.presentation ?? {}) },
+      dwell_ms: seg.dwell_ms ?? dwellMs,
     };
     if (audioResult?.audio_duration_ms) {
       planned_seg.audio_duration_ms = audioResult.audio_duration_ms;