npm - @lightcone-ai/daemon - Versions diffs - 0.16.1 → 0.17.0 - Mend

@lightcone-ai/daemon 0.16.1 → 0.17.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

package/mcp-servers/official/media-tools/index.js +256 -0
package/mcp-servers/official/media-tools/lib/lightcone-api.js +41 -0
package/mcp-servers/official/media-tools/manifest.json +8 -1
package/package.json +1 -1
package/src/chat-bridge.js +18 -237
package/src/tools/plan-video-segments.js +83 -94
package/src/{record-url-narration-tool.js → tools/record-url-narration.js} +61 -58

package/mcp-servers/official/media-tools/index.js CHANGED Viewed

@@ -5,6 +5,55 @@ import { z } from 'zod';
 import { addTitleEffects } from './lib/render.js';
 import { SUPPORTED_PRESETS } from './lib/presets.js';
+import { runSynthesisTtsTool } from '../../../src/tools/synthesize-tts.js';
+import { runPlanVideoSegmentsTool } from '../../../src/tools/plan-video-segments.js';
+import { runComposeVideoV2Tool } from '../../../src/tools/compose-video-v2.js';
+import { runRecordUrlNarrationTool } from '../../../src/tools/record-url-narration.js';
+import { runRenderTextToImageTool } from '../../../src/tools/render-text-to-image.js';
+import { runRenderHtmlToImageTool } from '../../../src/tools/render-html-to-image.js';
+import { runTakePageScreenshotTool } from '../../../src/tools/take-page-screenshot.js';
+import { lightconeApi, CURRENT_WORKSPACE_ID, CURRENT_AGENT_ID } from './lib/lightcone-api.js';
+const WORKSPACE_DIR = String(process.env.WORKSPACE_DIR ?? '');
+// CVMAX editor_in_chief block: in one workspace, the editor_in_chief agent
+// must not run video production tools directly (short_video_scripter owns
+// that role). Env-gated so ops can rotate workspace/agent IDs without code
+// changes. Previously lived in chat-bridge; moved here alongside the tool
+// it gates (V4 migration). submit_to_library, the other gated tool, still
+// lives in chat-bridge and keeps its own copy of this check.
+const CVMAX_WORKSPACE_ID = String(process.env.BLOCKED_EDITOR_WORKSPACE_ID ?? '');
+const CVMAX_EDITOR_IN_CHIEF_AGENT_ID = String(process.env.BLOCKED_EDITOR_AGENT_ID ?? '');
+function isBlockedCvmaxEditorVideoTool(toolName) {
+  return CURRENT_WORKSPACE_ID === CVMAX_WORKSPACE_ID
+    && CURRENT_AGENT_ID === CVMAX_EDITOR_IN_CHIEF_AGENT_ID
+    && CVMAX_WORKSPACE_ID
+    && CVMAX_EDITOR_IN_CHIEF_AGENT_ID
+    && toolName === 'record_url_narration';
+}
+function cvmaxEditorVideoToolError(toolName) {
+  return {
+    isError: true,
+    content: [{
+      type: 'text',
+      text:
+        `Error: ${toolName} blocked for editor_in_chief in CvMax. `
+        + 'In this workspace, @short_video_scripter owns video production. '
+        + 'editor_in_chief may route, review, or assist with OCR/verification, but must not run video production tools directly.',
+    }],
+  };
+}
+// Session-scoped flag set when plan_video_segments runs successfully.
+// compose_video_v2 refuses TTS-bearing segments (any segment with audio_path)
+// unless this is true, and record_url_narration is handed the same flag — the
+// agent must route audio through plan_video_segments first so durations /
+// subtitle_text are mechanically aligned. media-tools is spawned per-agent, so
+// the flag starts out false in every new agent session and plan_video_segments
+// must be called again there. (This flag previously lived in chat-bridge module
+// scope; moved here alongside the tools it gates, see V2 migration.)
+let _planVideoSegmentsCalledThisSession = false;
 const PRESET_ENUM = z.enum(SUPPORTED_PRESETS);
 const POSITION_ENUM = z.enum(['top', 'center', 'bottom']);
@@ -63,6 +112,213 @@ server.tool(
   }
 );
+// ── synthesize_tts (migrated from chat-bridge) ────────────────────────────
+// Pure TTS atomic tool: text → mp3 file. The lightcone server proxies to
+// MiniMax TTS; this tool downloads the resulting mp3 to a local tmp path so
+// downstream tools (plan_video_segments / compose_video_v2) can read it.
+//
+// Per the video-synthesis-design migration (see docs/upload-pipeline-design.md
+// and docs/scenario-content-creation/video-synthesis-design.md), this tool
+// lives in media-tools rather than chat-bridge so the video pipeline is a
+// single coherent MCP server.
+server.tool(
+  'synthesize_tts',
+  'Run MiniMax TTS on a snippet of narration text and download the resulting mp3 to a local tmp path. '
+  + 'Returns the local path and duration. Call once per video segment — do not concatenate all narration '
+  + 'into a single call (segment-level audio is required for plan_video_segments to align video durations).',
+  {
+    text: z.string().min(1).describe('Narration text for this segment. Will be synthesized as a single mp3.'),
+    voice_id: z.string().optional().describe('TTS voice preset. Omit to use workspace default.'),
+    workspace_id: z.string().optional().describe('Target workspace. Defaults to the current workspace.'),
+  },
+  async ({ text, voice_id, workspace_id }) => runSynthesisTtsTool({
+    text,
+    voice_id,
+    workspace_id,
+    currentWorkspaceId: CURRENT_WORKSPACE_ID,
+    api: lightconeApi,
+  })
+);
+// ── plan_video_segments (migrated from chat-bridge; TTS decoupled) ────────
+// Pure planner — takes per-segment {text, audio_path, visual_kind, ...} and
+// returns segments with audio_duration_ms / presentation.duration / dwell_ms
+// / subtitle_text filled in. Caller MUST run synthesize_tts per segment first
+// and pass the resulting audio_path here. No longer synthesizes TTS itself
+// (V2 migration; see docs/scenario-content-creation/video-synthesis-design.md).
+server.tool(
+  'plan_video_segments',
+  'Universal audio-video sync planning step. For each segment, reads the supplied audio_path via ffprobe, '
+  + 'measures audio duration, and returns a planned segments array with audio_path / audio_duration_ms / '
+  + 'subtitle_text / presentation.duration / dwell_ms filled in — ready to pass directly to both '
+  + 'record_url_narration (as the recording plan) AND compose_video_v2 (as the segment list). '
+  + 'Must be called before compose_video_v2 when any segment has audio_path.\n\n'
+  + 'Inputs per segment: {text, audio_path (required, from synthesize_tts), visual_kind, visual_path or visual_paths, '
+  + 'optionally transition / presentation.style}. Standard chain: synthesize_tts × N → plan_video_segments → '
+  + 'record_url_narration + compose_video_v2 (both use the same plan output).',
+  {
+    segments: z.array(z.object({
+      text: z.string().describe('Narration text for this segment — used as subtitle_text in the output.'),
+      audio_path: z.string().describe('Absolute path to the segment\'s mp3 (from synthesize_tts).'),
+      visual_kind: z.enum(['image', 'video', 'gif', 'carousel']).describe('Type of visual for compose_video_v2.'),
+      visual_path: z.string().optional().describe('Absolute path to a single image / video / gif file.'),
+      visual_paths: z.array(z.string()).optional().describe('For carousel: array of image paths, one per card.'),
+      transition: z.enum(['cut', 'fade', 'crossfade']).optional().describe('Transition to next segment. Default cut.'),
+      presentation: z.object({
+        style: z.enum(['static', 'scroll']).optional(),
+      }).optional().describe('Optional presentation hints (style only). duration/per_card_duration are computed.'),
+      dwell_ms: z.number().optional().describe('Optional override for record_url_narration phase duration. Default = audio_duration_ms.'),
+    })).describe('Segments to plan. audio_path is required for each.'),
+  },
+  async ({ segments }) => {
+    const result = await runPlanVideoSegmentsTool({ segments });
+    if (!result?.isError) _planVideoSegmentsCalledThisSession = true;
+    return result;
+  }
+);
+// ── compose_video_v2 (migrated from chat-bridge) ──────────────────────────
+// Tool-level enforcement of the standard chain: TTS-bearing segments require
+// plan_video_segments to have run earlier in this session. Without it manual
+// dwell/duration math has repeatedly produced misaligned subtitles, silent
+// tails, and re-records (Task #25/#26 trial).
+server.tool(
+  'compose_video_v2',
+  'Compose a video from a list of segments using ffmpeg. Each segment has a visual source (image / scroll / '
+  + 'carousel / video / gif), optional audio, and optional subtitle text. Subtitles are burned in when '
+  + 'subtitle_text is provided. Segments are concatenated in order; outro clips are appended after.\n\n'
+  + 'When any segment has audio_path, MUST be preceded by plan_video_segments in the same session '
+  + '(plan_video_segments fills duration/subtitle_text/audio_path mechanically; manual alignment is rejected). '
+  + 'Returns a local mp4 path + size_bytes.',
+  {
+    segments: z.array(z.object({
+      visual_path: z.string().optional().describe('Absolute path to a single image / video / gif.'),
+      visual_paths: z.array(z.string()).optional().describe('For carousel: array of image paths, one per card.'),
+      visual_kind: z.enum(['image', 'video', 'gif', 'carousel']).describe('Type of visual.'),
+      presentation: z.object({
+        style: z.enum(['static', 'scroll']).optional(),
+        duration: z.number().optional().describe('Segment duration in seconds. Required for image/scroll.'),
+        per_card_duration: z.number().optional().describe('Seconds per card for carousel.'),
+      }).optional(),
+      audio_path: z.string().nullable().optional().describe('Absolute path to audio (mp3). null/omit for silence.'),
+      subtitle_text: z.string().optional().describe('Narration text to burn as subtitle. Displayed for the full segment duration.'),
+      transition: z.enum(['cut', 'fade', 'crossfade']).optional().describe('Transition to next segment. Default cut.'),
+    })).describe('Ordered list of video segments.'),
+    outro_paths: z.array(z.string()).optional().describe('Absolute paths to outro video clips appended at end.'),
+    resolution: z.string().optional().describe('Output resolution WxH. Default "1080x1920".'),
+    output_path: z.string().optional().describe('Absolute output path. Auto-generated if omitted.'),
+  },
+  async (args) => {
+    const segments = Array.isArray(args?.segments) ? args.segments : [];
+    const hasNarration = segments.some(s => typeof s?.audio_path === 'string' && s.audio_path.trim());
+    if (hasNarration && !_planVideoSegmentsCalledThisSession) {
+      return toolError(
+        'compose_video_v2 refused: TTS-bearing segments (audio_path present) require plan_video_segments '
+        + 'to have run earlier in this session — it mechanically aligns audio_duration / video_duration / '
+        + 'subtitle_text with a safety buffer. Manual dwell/duration math has repeatedly produced misaligned '
+        + 'subtitles and silent tails that force re-recording.\n\n'
+        + 'Standard chain: synthesize_tts × N (per segment) → plan_video_segments(segments with text + audio_path + '
+        + 'visual_kind + visual_path) → compose_video_v2 (use the returned segments verbatim, only swap '
+        + 'visual_path/visual_kind for the real media). Call plan_video_segments now and pass its output here.'
+      );
+    }
+    return runComposeVideoV2Tool({ ...args, workspaceDir: WORKSPACE_DIR });
+  }
+);
+// ── record_url_narration (migrated from chat-bridge) ──────────────────────
+// Records a silent mp4 of a URL via Chromium+Xvfb+Playwright recordVideo,
+// driven by a beat-by-beat plan. Hard-block: requires plan_video_segments to
+// have run in this session — hand-written dwell_ms has drifted from TTS
+// audio in production runs (Tasks #20/#25/#26), forcing re-records.
+server.tool(
+  'record_url_narration',
+  'Record a silent video of a URL by driving Chromium on an Xvfb display and capturing it with Playwright recordVideo, driven by a video plan; ffmpeg then transcodes the recording to mp4. Outputs a silent mp4 that can be passed to compose_video_v2 as a video-kind segment with an audio_path for narration.\n\nUse this as the canonical recording step for URL-narration videos. Falls back: if the page needs interactions outside the visual_action vocabulary (clicks, waits, OCR loops), use Monitor (Bash) with custom Playwright instead.\n\nMUST be preceded by plan_video_segments in the same session — feed plan_video_segments\'s `segments` array as `plan.sections` so dwell_ms aligns mechanically with TTS audio_duration_ms (hand-written dwell_ms has drifted and forced re-records in production).\n\nRuntime requirements: this tool only works on a Linux daemon machine with Xvfb + Chromium + ffmpeg installed (ffmpeg is used to transcode the recording to mp4; no x11grab device support needed). macOS / Windows daemons will fail at startup.',
+  {
+    url: z.string().describe('Page URL to record'),
+    plan: z.record(z.any()).describe(
+      'A video plan: an object with `phases` (or `sections`), each a "visual beat" with '
+      + '`action` (scroll_to_dwell / linear_scroll_during / scroll_back / hold / ...), a '
+      + 'target (`target_y` or `focus_region:[y1,y2]`) for scroll-type actions, and '
+      + '`dwell_ms` (how long to hold that beat — should match the segment\'s TTS duration).\n\n'
+      + 'Standard chain: pass plan_video_segments\'s `segments` array directly as `plan.sections` — '
+      + 'each segment\'s `dwell_ms` is already set to its `audio_duration_ms`.\n\n'
+      + 'For RECRUITMENT URLs (mp.weixin.qq.com / 校招 / 实习 / 岗位 content), each section MUST '
+      + 'also declare `target_y_content_label` — a short Chinese label describing what content '
+      + 'sits at that pixel y position on the page (e.g. "标题区" / "岗位信息卡片" / "公司介绍" / '
+      + '"届别说明"). Labels matching forbidden regions ("二维码" / "扫码" / "投递入口" / "投递方式" / '
+      + '"联系方式" / "微信号" / "QR" / "阅读原文" / "外链") will cause the tool to refuse the '
+      + 'recording — recruitment content must NOT dwell on these areas (see fragments.md '
+      + 'frag.short.recruitment_url_mode_policy). Pick a different target_y in the 标题/岗位 '
+      + 'information area and rewrite that section.'
+    ),
+    output_path: z.string().optional().describe('Workspace-relative output mp4 path. Default tmp/wx3_video/recorded-{ts}.mp4'),
+    events_path: z.string().optional().describe('Workspace-relative events.json path. Default ${output_path}.events.json'),
+    viewport: z.object({
+      width: z.number().optional(),
+      height: z.number().optional(),
+    }).optional().describe('Default 1080x1920 (mobile portrait). Override only if the plan requires a different shape.'),
+    fps: z.number().optional().describe('Default 30. Do not change unless needed.'),
+    settle_ms: z.number().optional().describe('Default 4000. Settle wait after navigation before recording starts.'),
+  },
+  async (args) => {
+    if (isBlockedCvmaxEditorVideoTool('record_url_narration')) {
+      return cvmaxEditorVideoToolError('record_url_narration');
+    }
+    return runRecordUrlNarrationTool({
+      args,
+      currentWorkspaceId: CURRENT_WORKSPACE_ID,
+      workspaceDir: WORKSPACE_DIR,
+      planVideoSegmentsCalled: _planVideoSegmentsCalledThisSession,
+    });
+  }
+);
+// ── render_text_to_image (migrated from chat-bridge) ──────────────────────
+server.tool(
+  'render_text_to_image',
+  'Render text content into image(s) for video synthesis. style=scroll produces a single tall image (for a scrolling video segment); style=carousel produces one image per card (for a slide-show segment). Returns local file paths.',
+  {
+    content: z.union([z.string(), z.array(z.string())]).describe('Text content. For carousel, pass an array of strings — one per card. For scroll, pass a single string (or array joined with line breaks).'),
+    style: z.enum(['scroll', 'carousel']).describe('scroll: one tall image; carousel: one image per card.'),
+    theme: z.enum(['dark', 'light']).optional().describe('Color theme. Default dark.'),
+    width: z.number().optional().describe('Image width in pixels. Default 1080.'),
+    card_height: z.number().optional().describe('Card height in pixels (carousel) or viewport height (scroll baseline). Default 1920.'),
+    font_size: z.number().optional().describe('Base font size in pixels. Default 48.'),
+  },
+  async (args) => runRenderTextToImageTool(args)
+);
+// ── render_html_to_image (migrated from chat-bridge) ──────────────────────
+server.tool(
+  'render_html_to_image',
+  'Render a raw HTML string to a PNG image by navigating to it as a local file:// page. Unlike evaluate_script+document.write on about:blank, this preserves file:// origin so <img src="file:///..."> references load correctly. Returns the output image path.',
+  {
+    html: z.string().describe('Full HTML document to render (including <!doctype>, <html>, <head>, <body>).'),
+    output_path: z.string().optional().describe('Absolute path to save the PNG. Auto-generated in /tmp if omitted.'),
+    viewport_width: z.number().optional().describe('Viewport width in pixels. Default 1080.'),
+    viewport_height: z.number().optional().describe('Viewport height in pixels. Default 1920.'),
+    wait_until: z.enum(['load', 'networkidle', 'domcontentloaded']).optional().describe('Navigation wait condition. Default load.'),
+  },
+  async (args) => runRenderHtmlToImageTool(args)
+);
+// ── take_page_screenshot (migrated from chat-bridge) ──────────────────────
+server.tool(
+  'take_page_screenshot',
+  'Open a URL with a headless browser and capture a screenshot. crop=above_fold captures only the visible viewport (ideal for thumbnail-style frames); crop=full_page captures the entire page height.',
+  {
+    url: z.string().describe('Page URL to screenshot.'),
+    crop: z.enum(['above_fold', 'full_page']).optional().describe('Capture mode. Default above_fold.'),
+    viewport: z.object({
+      width: z.number().optional(),
+      height: z.number().optional(),
+    }).optional().describe('Viewport size. Default 390×844 (mobile).'),
+    wait_for: z.enum(['load', 'networkidle', 'domcontentloaded']).optional().describe('Page load event to wait for before screenshotting. Default networkidle.'),
+  },
+  async (args) => runTakePageScreenshotTool(args)
+);
 const transport = new StdioServerTransport();
 await server.connect(transport);
 console.error('[official-media-tools] MCP Server started');

package/mcp-servers/official/media-tools/lib/lightcone-api.js ADDED Viewed

@@ -0,0 +1,41 @@
+// Minimal HTTP helper for media-tools to call lightcone server's internal API.
+// Wraps fetch with the right URL prefix + auth headers + JSON encoding.
+//
+// Unlike daemon/src/chat-bridge.js's `api`, this helper does NOT route through
+// the governance/cache layer — media-tools is a separate stdio MCP server and
+// governance integration is chat-bridge-specific. If a tool here needs
+// governance-mediated execution, route it through chat-bridge's thin-proxy
+// instead (see weixin-tools for the pattern).
+const SERVER_URL = String(process.env.SERVER_URL ?? '').replace(/\/+$/, '');
+const MACHINE_API_KEY = String(process.env.MACHINE_API_KEY ?? '');
+const AGENT_ID = String(process.env.AGENT_ID ?? '');
+if (!SERVER_URL) throw new Error('media-tools: SERVER_URL env var is required');
+if (!MACHINE_API_KEY) throw new Error('media-tools: MACHINE_API_KEY env var is required');
+if (!AGENT_ID) throw new Error('media-tools: AGENT_ID env var is required');
+export async function lightconeApi(method, apiPath, body) {
+  const url = `${SERVER_URL}/internal/agent/${encodeURIComponent(AGENT_ID)}${apiPath}`;
+  const res = await fetch(url, {
+    method,
+    headers: {
+      'Content-Type': 'application/json',
+      'Authorization': `Bearer ${MACHINE_API_KEY}`,
+    },
+    body: body != null ? JSON.stringify(body) : undefined,
+  });
+  if (!res.ok) {
+    let text = '';
+    try { text = await res.text(); } catch { /* ignore */ }
+    const err = new Error(`lightcone ${method} ${apiPath} → ${res.status}: ${text.slice(0, 400)}`);
+    err.status = res.status;
+    err.body = text;
+    throw err;
+  }
+  return res.json();
+}
+// Exposed so tools can construct workspace-aware fallback identifiers.
+export const CURRENT_AGENT_ID = AGENT_ID;
+export const CURRENT_WORKSPACE_ID = String(process.env.WORKSPACE_ID ?? '');

package/mcp-servers/official/media-tools/manifest.json CHANGED Viewed

@@ -5,7 +5,14 @@
   "runtime": "node",
   "entrypoint": "index.js",
   "tool_declarations": [
-    { "name": "add_title_effects", "classification": "cacheable" }
+    { "name": "add_title_effects", "classification": "cacheable" },
+    { "name": "synthesize_tts", "classification": "mandatory" },
+    { "name": "plan_video_segments", "classification": "mandatory" },
+    { "name": "compose_video_v2", "classification": "mandatory" },
+    { "name": "record_url_narration", "classification": "mandatory" },
+    { "name": "render_text_to_image", "classification": "cacheable" },
+    { "name": "render_html_to_image", "classification": "cacheable" },
+    { "name": "take_page_screenshot", "classification": "cacheable" }
   ],
   "smoke_test": {
     "tool": "add_title_effects",

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@lightcone-ai/daemon",
-  "version": "0.16.1",
+  "version": "0.17.0",
   "type": "module",
   "main": "src/index.js",
   "bin": {

package/src/chat-bridge.js CHANGED Viewed

@@ -2,22 +2,19 @@
 import { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js';
 import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js';
 import { z } from 'zod';
-import { createReadStream, existsSync, mkdirSync, readFileSync, writeFileSync, statSync } from 'fs';
+import { existsSync, mkdirSync, readFileSync, writeFileSync, statSync } from 'fs';
 import { createHash, randomUUID } from 'crypto';
 import path, { extname } from 'path';
 import os from 'os';
-import { recordUrlNarration } from './_vendor/video/recorder/index.js';
 import { writeLocalFileToWorkspace, resolveWorkspaceFileUploadPlan } from './workspace-file-upload.js';
 import { UploadJobManager } from './upload-job-manager.js';
 import { createUploadServerApi } from './upload-server-api.js';
-import { runRecordUrlNarrationTool } from './record-url-narration-tool.js';
+// record_url_narration moved to media-tools MCP server (V4 migration);
+// recorder import / handler are now consumed there, not from chat-bridge.
 import { runSubmitToLibraryTool } from './submit-to-library-tool.js';
-import { runRenderTextToImageTool } from './tools/render-text-to-image.js';
-import { runRenderHtmlToImageTool } from './tools/render-html-to-image.js';
-import { runSynthesisTtsTool } from './tools/synthesize-tts.js';
-import { runPlanVideoSegmentsTool } from './tools/plan-video-segments.js';
-import { runComposeVideoV2Tool } from './tools/compose-video-v2.js';
-import { runTakePageScreenshotTool } from './tools/take-page-screenshot.js';
+// render_text_to_image, render_html_to_image, take_page_screenshot moved to
+// media-tools MCP server (V5 migration). Handlers still live in
+// daemon/src/tools/ as shared modules and are imported there.
 import { runGetLibraryFileTool } from './tools/get-library-file.js';
 import { isLeaseInvalidated, clearInvalidatedLease } from './governance-state.js';
 import { classifyLeaseWindow } from './lease-window.js';
@@ -72,8 +69,9 @@ let currentWorkspaceId = WORKSPACE_ID;
 // Remove entirely once the new atomic tool framework is stable and the legacy pipeline retires.
 const CVMAX_WORKSPACE_ID = process.env.BLOCKED_EDITOR_WORKSPACE_ID ?? '';
 const CVMAX_EDITOR_IN_CHIEF_AGENT_ID = process.env.BLOCKED_EDITOR_AGENT_ID ?? '';
+// record_url_narration moved to media-tools and carries its own copy of this
+// block. submit_to_library stays here.
 const CVMAX_EDITOR_BLOCKED_VIDEO_TOOLS = new Set([
-  'record_url_narration',
   'submit_to_library',
 ]);
@@ -162,7 +160,6 @@ const DEFAULT_TOOL_CLASSIFICATION = {
   update_goal_field: 'mandatory',
   supersede_goal_field: 'mandatory',
   request_credential_auth: 'mandatory',
-  record_url_narration: 'mandatory',
   submit_to_library: 'mandatory',
   register_data_source: 'mandatory',
   bind_workspace_scenario: 'mandatory',
@@ -491,37 +488,6 @@ async function directApi(method, apiPath, body) {
   return res.json();
 }
-async function directApiVideoUpload(apiPath, {
-  localPath,
-  filename,
-  contentType = 'video/mp4',
-}) {
-  const url = `${SERVER_URL}/internal/agent/${AGENT_ID}${apiPath}`;
-  const headers = {
-    'Authorization': `Bearer ${MACHINE_API_KEY}`,
-    'Content-Type': contentType,
-  };
-  if (filename) headers['X-File-Name'] = filename;
-  let res;
-  try {
-    res = await fetch(url, {
-      method: 'POST',
-      headers,
-      body: createReadStream(localPath),
-      duplex: 'half',
-    });
-  } catch (error) {
-    throw buildDirectApiTransportError({ method: 'POST', apiPath, error });
-  }
-  if (!res.ok) {
-    const text = await res.text();
-    throw buildDirectApiHttpError({ method: 'POST', apiPath, status: res.status, text });
-  }
-  return res.json();
-}
 async function callGovernance(payload, { retry = true } = {}) {
   const attempts = retry ? 2 : 1;
   let lastError = null;
@@ -609,63 +575,6 @@ async function governanceRoundTrip({ method, apiPath, body, toolName, classifica
   return directApi(method, apiPath, nextBody);
 }
-async function runMandatoryLocalTool({ toolName, toolInput = {}, executor }) {
-  const classification = TOOL_CLASSIFICATION[toolName] ?? 'mandatory';
-  const traceId = randomUUID();
-  enqueueBundleEvent('tool_call_started', {
-    trace_id: traceId,
-    tool_name: toolName,
-    tool_classification: classification,
-    method: 'LOCAL',
-    api_path: '/local-tool',
-  });
-  try {
-    await ensureGovernanceContext();
-    const governancePayload = {
-      spawn_bundle_id: governanceContext.spawnBundleId,
-      policy_version: governanceContext.policyVersion,
-      tool_name: toolName,
-      tool_input: toolInput,
-      tool_classification: classification,
-      agent_id: AGENT_ID,
-      idempotency_key: randomUUID(),
-      lease_id: governanceContext.lease?.lease_id ?? null,
-    };
-    const governance = await callGovernance(governancePayload, { retry: true });
-    if (governance.policy_lease) applyPolicyLease(governance.policy_lease);
-    if (governance.verdict === 'reject' || governance.verdict === 'defer_human') {
-      throw governanceError(governanceReasonCode(governance.reason));
-    }
-    const checkedInput = (governance.verdict === 'modify' && governance.modified_input && typeof governance.modified_input === 'object')
-      ? { ...toolInput, ...governance.modified_input }
-      : toolInput;
-    const result = await executor(checkedInput);
-    if (CACHE_INVALIDATION_TOOLS.has(toolName)) {
-      governanceContext.cache.clear();
-    }
-    enqueueBundleEvent('tool_call_succeeded', {
-      trace_id: traceId,
-      tool_name: toolName,
-      tool_classification: classification,
-      source: 'governance_roundtrip',
-    });
-    return result;
-  } catch (error) {
-    if (shouldEmitToolCallFailed(error)) {
-      enqueueBundleEvent('tool_call_failed', {
-        trace_id: traceId,
-        tool_name: toolName,
-        tool_classification: classification,
-        reason: toolCallFailedReason(error),
-      });
-    }
-    throw error;
-  }
-}
 function renewCacheInBackground({ method, apiPath, body, toolName, cacheKey }) {
   if (governanceContext.renewalInFlight.has(cacheKey)) return;
   governanceContext.renewalInFlight.add(cacheKey);
@@ -1375,102 +1284,12 @@ server.tool('request_credential_auth',
   }
 );
-// ── render_text_to_image ───────────────────────────────────────────────────────
-server.tool('render_text_to_image',
-  'Render text content into image(s) for video synthesis. style=scroll produces a single tall image (for a scrolling video segment); style=carousel produces one image per card (for a slide-show segment). Returns local file paths.',
-  {
-    content: z.union([z.string(), z.array(z.string())]).describe('Text content. For carousel, pass an array of strings — one per card. For scroll, pass a single string (or array joined with line breaks).'),
-    style: z.enum(['scroll', 'carousel']).describe('scroll: one tall image; carousel: one image per card.'),
-    theme: z.enum(['dark', 'light']).optional().describe('Color theme. Default dark.'),
-    width: z.number().optional().describe('Image width in pixels. Default 1080.'),
-    card_height: z.number().optional().describe('Card height in pixels (carousel) or viewport height (scroll baseline). Default 1920.'),
-    font_size: z.number().optional().describe('Base font size in pixels. Default 48.'),
-  },
-  async (args) => runRenderTextToImageTool(args)
-);
-// ── render_html_to_image ───────────────────────────────────────────────────────
-server.tool('render_html_to_image',
-  'Render a raw HTML string to a PNG image by navigating to it as a local file:// page. Unlike evaluate_script+document.write on about:blank, this preserves file:// origin so <img src="file:///..."> references load correctly. Returns the output image path.',
-  {
-    html: z.string().describe('Full HTML document to render (including <!doctype>, <html>, <head>, <body>).'),
-    output_path: z.string().optional().describe('Absolute path to save the PNG. Auto-generated in /tmp if omitted.'),
-    viewport_width: z.number().optional().describe('Viewport width in pixels. Default 1080.'),
-    viewport_height: z.number().optional().describe('Viewport height in pixels. Default 1920.'),
-    wait_until: z.enum(['load', 'networkidle', 'domcontentloaded']).optional().describe('Navigation wait condition. Default load.'),
-  },
-  async (args) => runRenderHtmlToImageTool(args)
-);
-// ── synthesize_tts ─────────────────────────────────────────────────────────────
-server.tool('synthesize_tts',
-  'Convert text to speech using the workspace MiniMax TTS credential. Returns a local mp3 file path and duration. Use this to generate narration audio for individual video segments.',
-  {
-    text: z.string().describe('Text to synthesize. Keep under 500 characters per call for reliable results.'),
-    voice_id: z.string().optional().describe('MiniMax voice ID. Omit to use the workspace default voice.'),
-    workspace_id: z.string().optional().describe('Target workspace. Defaults to current workspace context.'),
-  },
-  async (args) => runSynthesisTtsTool({ ...args, currentWorkspaceId, api })
-);
-// ── plan_video_segments ────────────────────────────────────────────────────────
-server.tool('plan_video_segments',
-  'Universal audio-video sync planning step. For each segment, call TTS to get the real audio duration, then compute the visual duration with a safety buffer. Returns a planned segments array ready to pass directly to compose_video_v2 (with audio_path, presentation.duration/per_card_duration, and subtitle_text pre-filled). Always call this before compose_video_v2 when you have narration text.',
-  {
-    segments: z.array(z.object({
-      text: z.string().describe('Narration text for this segment. TTS will be generated from this.'),
-      visual_kind: z.enum(['image', 'video', 'gif', 'carousel']).describe('Type of visual.'),
-      visual_path: z.string().optional().describe('Absolute path to a single image, video, or gif file.'),
-      visual_paths: z.array(z.string()).optional().describe('For carousel: array of image paths, one per card.'),
-      transition: z.enum(['cut', 'fade', 'crossfade']).optional().describe('Transition to next segment. Default cut.'),
-      presentation: z.object({
-        style: z.enum(['static', 'scroll']).optional(),
-      }).optional().describe('Partial presentation hints (style only). duration/per_card_duration are computed from TTS.'),
-    })).describe('Segments to plan. Each must have narration text and visual info.'),
-    voice_id: z.string().optional().describe('TTS voice ID. Omit to use workspace default.'),
-    workspace_id: z.string().optional().describe('Target workspace. Defaults to current workspace context.'),
-  },
-  async (args) => runPlanVideoSegmentsTool({ ...args, currentWorkspaceId, api })
-);
-// ── compose_video_v2 ───────────────────────────────────────────────────────────
-server.tool('compose_video_v2',
-  'Compose a video from a list of segments using ffmpeg. Each segment has a visual source (image/scroll/carousel/video/gif), optional audio, and optional subtitle text. Subtitles are burned into the video by default when subtitle_text is provided. Segments are concatenated in order; outro clips are appended at the end. Returns a local mp4 path.\n\nTypical flow: plan_video_segments → compose_video_v2 (segments output fed directly in).',
-  {
-    segments: z.array(z.object({
-      visual_path: z.string().optional().describe('Absolute path to a single image, video, or gif file.'),
-      visual_paths: z.array(z.string()).optional().describe('For carousel: array of image paths, one per card.'),
-      visual_kind: z.enum(['image', 'video', 'gif', 'carousel']).describe('Type of visual.'),
-      presentation: z.object({
-        style: z.enum(['static', 'scroll']).optional().describe('For image: static (default) or scroll (pan upward).'),
-        duration: z.number().optional().describe('Segment duration in seconds. Required for image/scroll.'),
-        per_card_duration: z.number().optional().describe('Seconds per card for carousel.'),
-      }).optional(),
-      audio_path: z.string().nullable().optional().describe('Absolute path to audio (mp3). null or omit for silence.'),
-      subtitle_text: z.string().optional().describe('Narration text to burn as subtitle for this segment. Displayed for the full segment duration.'),
-      transition: z.enum(['cut', 'fade', 'crossfade']).optional().describe('Transition to next segment. Default cut.'),
-    })).describe('Ordered list of video segments.'),
-    outro_paths: z.array(z.string()).optional().describe('Absolute paths to outro video clips appended after all segments.'),
-    resolution: z.string().optional().describe('Output resolution WxH. Default "1080x1920".'),
-    output_path: z.string().optional().describe('Absolute output path for the mp4. Auto-generated if omitted.'),
-  },
-  async (args) => runComposeVideoV2Tool({ ...args, workspaceDir: WORKSPACE_DIR })
-);
-// ── take_page_screenshot ───────────────────────────────────────────────────────
-server.tool('take_page_screenshot',
-  'Open a URL with a headless browser and capture a screenshot. crop=above_fold captures only the visible viewport (ideal for thumbnail-style frames); crop=full_page captures the entire page height.',
-  {
-    url: z.string().describe('Page URL to screenshot.'),
-    crop: z.enum(['above_fold', 'full_page']).optional().describe('Capture mode. Default above_fold.'),
-    viewport: z.object({
-      width: z.number().optional(),
-      height: z.number().optional(),
-    }).optional().describe('Viewport size. Default 390×844 (mobile).'),
-    wait_for: z.enum(['load', 'networkidle', 'domcontentloaded']).optional().describe('Page load event to wait for before screenshotting. Default networkidle.'),
-  },
-  async (args) => runTakePageScreenshotTool(args)
-);
+// render_text_to_image, render_html_to_image, take_page_screenshot, synthesize_tts,
+// plan_video_segments, compose_video_v2, record_url_narration — all moved to the
+// media-tools MCP server (V1–V5 migration; see
+// docs/scenario-content-creation/video-synthesis-design.md). The whole video
+// pipeline now lives in one stdio server so session-scoped flags (such as
+// "plan_video_segments was called") can gate downstream tools.
 // ── get_library_file ───────────────────────────────────────────────────────────
 server.tool('get_library_file',
@@ -1482,48 +1301,10 @@ server.tool('get_library_file',
   async (args) => runGetLibraryFileTool({ ...args, currentWorkspaceId, api, SERVER_URL, MACHINE_API_KEY, workspaceDir: WORKSPACE_DIR })
 );
-// ── record_url_narration ────────────────────────────────────────────────────────
-server.tool('record_url_narration',
-  'Record a silent video of a URL by driving Chromium on an Xvfb display and capturing it with Playwright recordVideo, driven by a video plan; ffmpeg then transcodes the recording to mp4. Outputs a silent mp4 that can be passed to compose_video_v2 as a video-kind segment with an audio_path for narration.\n\nUse this as the canonical recording step for URL-narration videos. Falls back: if the page needs interactions outside the visual_action vocabulary (clicks, waits, OCR loops), use Monitor (Bash) with custom Playwright instead.\n\nRuntime requirements: this tool only works on a Linux daemon machine with Xvfb + Chromium + ffmpeg installed (ffmpeg is used to transcode the recording to mp4; no x11grab device support needed). macOS / Windows daemons will fail at startup.',
-  {
-    url: z.string().describe('Page URL to record'),
-    plan: z.record(z.any()).describe(
-      'A video plan: an object with `phases` (or `sections`), each a "visual beat" with '
-      + '`action` (scroll_to_dwell / linear_scroll_during / scroll_back / hold / ...), a '
-      + 'target (`target_y` or `focus_region:[y1,y2]`) for scroll-type actions, and '
-      + '`dwell_ms` (how long to hold that beat — should match the segment\'s TTS duration).\n\n'
-      + 'For RECRUITMENT URLs (mp.weixin.qq.com / 校招 / 实习 / 岗位 content), each section MUST '
-      + 'also declare `target_y_content_label` — a short Chinese label describing what content '
-      + 'sits at that pixel y position on the page (e.g. "标题区" / "岗位信息卡片" / "公司介绍" / '
-      + '"届别说明"). Look at the take_page_screenshot output, find the y-pixel, and label it. '
-      + 'Labels matching forbidden regions ("二维码" / "扫码" / "投递入口" / "投递方式" / "联系方式" / '
-      + '"微信号" / "QR" / "阅读原文" / "外链") will cause the tool to refuse the recording — '
-      + 'recruitment content must NOT dwell on these areas (see fragments.md '
-      + 'frag.short.recruitment_url_mode_policy). Pick a different target_y in the 标题/岗位 '
-      + 'information area and rewrite that section.'
-    ),
-    output_path: z.string().optional().describe('Workspace-relative output mp4 path. Default tmp/wx3_video/recorded-{ts}.mp4'),
-    events_path: z.string().optional().describe('Workspace-relative events.json path. Default ${output_path}.events.json'),
-    viewport: z.object({
-      width: z.number().optional(),
-      height: z.number().optional(),
-    }).optional().describe('Default 1080x1920 (mobile portrait). Override only if the plan requires a different shape.'),
-    fps: z.number().optional().describe('Default 30. Do not change unless needed.'),
-    settle_ms: z.number().optional().describe('Default 4000. Settle wait after navigation before recording starts.'),
-  },
-  async (args) => {
-    if (isBlockedCvmaxEditorVideoTool('record_url_narration')) {
-      return cvmaxEditorVideoToolError('record_url_narration');
-    }
-    return runRecordUrlNarrationTool({
-      args,
-      currentWorkspaceId,
-      workspaceDir: WORKSPACE_DIR,
-      runMandatoryLocalToolFn: runMandatoryLocalTool,
-      recordUrlNarrationFn: recordUrlNarration,
-    });
-  }
-);
+// record_url_narration moved to media-tools MCP server (V4 migration). The
+// session-scoped plan_video_segments check now lives there alongside the
+// other video pipeline tools (synthesize_tts / plan_video_segments /
+// compose_video_v2). The CVMAX editor_in_chief block also moved with it.
 // ── submit_to_library ──────────────────────────────────────────────────────────
 server.tool('submit_to_library',

package/src/tools/plan-video-segments.js CHANGED Viewed

@@ -1,7 +1,22 @@
-import { mkdirSync, writeFileSync } from 'fs';
-import { randomUUID } from 'crypto';
-import path from 'path';
-import os from 'os';
+// plan_video_segments — pure audio/video alignment planner.
+//
+// Takes per-segment {text, audio_path, visual_kind, ...} and returns unified
+// plan segments with:
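+//   - audio_duration_ms (read via ffprobe from the provided audio_path; falls
+//     back to 3000 ms with a warning if the probe fails)
+//   - subtitle_text (= text)
+//   - presentation.duration (audio_duration + buffer, rounded up to 0.5s) or,
+//     for carousel, per_card_duration (that total split across the cards,
+//     rounded up to 0.5s, minimum 2s each)
+//   - dwell_ms (= audio_duration unless the segment already sets dwell_ms;
+//     lets the same segment drive record_url_narration)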
+//
+// Previously this tool ALSO synthesized TTS internally — which duplicated
+// the work when callers had already run synthesize_tts, and caused the
+// "wrong standard chain" confusion in fragments.md. TTS is now decoupled:
+// callers must run synthesize_tts per segment first and pass the resulting
+// audio_path here. See docs/scenario-content-creation/video-synthesis-design.md.
+//
+// Lives in daemon/src/tools/ as a reusable handler module; the MCP-tool
+// registration lives in daemon/mcp-servers/official/media-tools/index.js.
+import { spawn } from 'node:child_process';
 function toolText(text) {
   return { content: [{ type: 'text', text }] };
@@ -11,125 +26,99 @@ function toolError(text) {
   return { isError: true, content: [{ type: 'text', text }] };
 }
-function inferAudioExt(url) {
-  const clean = String(url ?? '').split('?')[0];
-  const ext = path.extname(clean).toLowerCase();
-  return ['.mp3', '.wav', '.flac', '.aac', '.ogg'].includes(ext) ? ext : '.mp3';
-}
-async function synthesizeSegmentTts(text, { workspace_id, voice_id, api }) {
-  const payload = { workspace_id, text, speed: 1, format: 'mp3' };
-  if (voice_id) payload.voice_preset = String(voice_id).trim();
-  const data = await api('POST', '/tts/voiceover', payload);
-  const remoteAudioUrl = String(data.audio_url ?? '').trim();
-  if (!remoteAudioUrl) throw new Error('TTS API did not return audio_url');
-  const downloadRes = await fetch(remoteAudioUrl);
-  if (!downloadRes.ok) throw new Error(`Failed to download audio (${downloadRes.status})`);
-  const fileBuffer = Buffer.from(await downloadRes.arrayBuffer());
-  const outDir = path.join(os.tmpdir(), 'lightcone-tts');
-  mkdirSync(outDir, { recursive: true });
-  const ext = inferAudioExt(remoteAudioUrl);
-  const outPath = path.join(outDir, `tts-${Date.now()}-${randomUUID().slice(0, 8)}${ext}`);
-  writeFileSync(outPath, fileBuffer);
-  const durationMs = Number(data.duration_ms ?? 0);
-  return { audio_path: outPath, audio_duration_ms: durationMs };
+async function probeAudioDurationMs(audioPath) { // Resolves to audioPath's duration in whole milliseconds as measured by ffprobe; rejects on any failure.
+  return new Promise((resolve, reject) => {
+    const proc = spawn('ffprobe', [
+      '-v', 'error', // log level: errors only
+      '-show_entries', 'format=duration', // request just the container-level duration
+      '-of', 'csv=p=0', // bare value output, parsed below as seconds
+      audioPath,
+    ], { stdio: ['ignore', 'pipe', 'pipe'] }); // no stdin; stdout and stderr are captured
+    const out = []; // stdout chunks
+    const err = []; // stderr chunks
+    proc.stdout.on('data', c => out.push(c));
+    proc.stderr.on('data', c => err.push(c));
+    proc.on('close', code => {
+      if (code !== 0) {
+        return reject(new Error(`ffprobe exited ${code}: ${Buffer.concat(err).toString().slice(-300)}`)); // only the last 300 chars of stderr are reported
+      }
+      const seconds = parseFloat(Buffer.concat(out).toString().trim());
+      if (!Number.isFinite(seconds)) { // empty or non-numeric output parses to NaN
+        return reject(new Error(`ffprobe returned non-numeric duration: ${Buffer.concat(out).toString().slice(0, 200)}`)); // echo at most the first 200 chars of stdout
+      }
+      resolve(Math.round(seconds * 1000)); // seconds -> integer milliseconds
+    });
+    proc.on('error', err2 => reject(new Error(`ffprobe spawn failed: ${err2.message}`))); // child-process 'error' event; presumably ffprobe is missing or not executable — confirm against Node docs
+  });
 }
-// Compute segment duration from audio duration: audio + 0.5s buffer, rounded up to nearest 0.5s.
+// Plan visual duration (seconds) from audio duration (ms): audio + bufferSec,
+// rounded up to the nearest 0.5s. The caller chooses the buffer; scroll-style
+// images are given a longer one because eyes need extra time to follow the motion.
 function planDurationSec(audioDurationMs, bufferSec = 0.5) {
   const raw = audioDurationMs / 1000 + bufferSec;
-  return Math.ceil(raw * 2) / 2; // round up to nearest 0.5s
+  return Math.ceil(raw * 2) / 2;
 }
-// Run fn over items with a bounded number of concurrent workers (FIFO drain).
-async function mapWithConcurrency(items, limit, fn) {
-  const queue = items.map((item, index) => ({ item, index }));
-  const workers = Array.from({ length: Math.max(1, Math.min(limit, queue.length)) }, async () => {
-    while (queue.length > 0) {
-      const next = queue.shift();
-      await fn(next.item, next.index);
-    }
-  });
-  await Promise.all(workers);
-}
-const TTS_CONCURRENCY = 5;
-export async function runPlanVideoSegmentsTool({ segments, workspace_id, voice_id, currentWorkspaceId, api }) {
+export async function runPlanVideoSegmentsTool({ segments } = {}) {
   if (!Array.isArray(segments) || segments.length === 0) {
     return toolError('segments must be a non-empty array.');
   }
-  const targetWorkspaceId = String(workspace_id ?? currentWorkspaceId ?? '').trim();
-  if (!targetWorkspaceId) {
-    return toolError('workspace_id is required (no current workspace context).');
+  // Up-front validation — fail fast before any work.
+  for (let i = 0; i < segments.length; i += 1) {
+    const seg = segments[i] ?? {};
+    if (typeof seg.audio_path !== 'string' || !seg.audio_path.trim()) {
+      return toolError(
+        `segments[${i}]: audio_path is required. plan_video_segments no longer synthesizes TTS — call synthesize_tts(text) `
+        + 'first and pass the returned path as audio_path. Standard chain: synthesize_tts × N → plan_video_segments → '
+        + 'record_url_narration + compose_video_v2 (share the same plan).'
+      );
+    }
+    const kind = String(seg.visual_kind ?? '');
+    if (!kind) {
+      return toolError(`segments[${i}]: visual_kind is required (image / video / gif / carousel).`);
+    }
   }
   const planned = [];
-  const errors = [];
+  const warnings = [];
+  for (let i = 0; i < segments.length; i += 1) {
+    const seg = segments[i];
+    const text = String(seg.text ?? '').trim();
+    const kind = String(seg.visual_kind);
-  // Synthesize TTS for every text-bearing segment up front, in parallel (bounded),
-  // so an N-segment plan no longer pays N sequential round-trips to the TTS API.
-  const audioResults = new Array(segments.length).fill(null);
-  const ttsJobs = segments
-    .map((seg, i) => ({ i, text: String(seg.text ?? '').trim() }))
-    .filter(job => job.text);
-  await mapWithConcurrency(ttsJobs, TTS_CONCURRENCY, async ({ i, text }) => {
+    let audioDurationMs;
     try {
-      audioResults[i] = await synthesizeSegmentTts(text, { workspace_id: targetWorkspaceId, voice_id, api });
+      audioDurationMs = await probeAudioDurationMs(seg.audio_path);
     } catch (err) {
-      errors.push(`segments[${i}]: TTS failed — ${err.message}`);
-      audioResults[i] = { audio_path: null, audio_duration_ms: 3000 }; // fallback estimate
+      warnings.push(`segments[${i}]: audio probe failed (${err.message}); falling back to 3000ms`);
+      audioDurationMs = 3000;
     }
-  });
-  errors.sort((a, b) => {
-    const na = Number((a.match(/segments\[(\d+)\]/) ?? [])[1] ?? 0);
-    const nb = Number((b.match(/segments\[(\d+)\]/) ?? [])[1] ?? 0);
-    return na - nb;
-  });
-  for (let i = 0; i < segments.length; i++) {
-    const seg = segments[i];
-    const text = String(seg.text ?? '').trim();
-    const kind = String(seg.visual_kind ?? 'image');
-    const audioResult = audioResults[i];
-    const audioDurationMs = audioResult?.audio_duration_ms ?? 0;
     let presentation;
     if (kind === 'carousel') {
       const numCards = Array.isArray(seg.visual_paths) ? seg.visual_paths.length : 1;
-      const totalDuration = audioDurationMs > 0 ? planDurationSec(audioDurationMs) : numCards * 4;
+      const totalDuration = planDurationSec(audioDurationMs);
       const perCard = Math.max(2, Math.ceil((totalDuration / numCards) * 2) / 2);
       presentation = { per_card_duration: perCard };
     } else {
-      // image, scroll, video, gif
-      const duration = audioDurationMs > 0 ? planDurationSec(audioDurationMs, kind === 'scroll' ? 1.0 : 0.5) : 4;
+      // image / scroll / video / gif
+      const duration = planDurationSec(audioDurationMs, kind === 'scroll' ? 1.0 : 0.5);
       presentation = { duration, ...(kind === 'scroll' ? { style: 'scroll' } : {}) };
     }
-    // dwell_ms lets the same segment double as a record_url_narration plan phase
-    // (the recorder reads dwell_ms / duration_ms for how long to hold each beat).
-    // Prefer the real measured audio length; fall back to the planned visual duration.
-    const dwellMs = audioDurationMs > 0
-      ? audioDurationMs
-      : Math.round((presentation.duration ?? presentation.per_card_duration ?? 4) * 1000);
-    const planned_seg = {
+    planned.push({
       ...seg,
-      ...(audioResult?.audio_path ? { audio_path: audioResult.audio_path } : {}),
+      audio_path: seg.audio_path,
+      audio_duration_ms: audioDurationMs,
       ...(text ? { subtitle_text: text } : {}),
       presentation: { ...presentation, ...(seg.presentation ?? {}) },
-      dwell_ms: seg.dwell_ms ?? dwellMs,
-    };
-    if (audioResult?.audio_duration_ms) {
-      planned_seg.audio_duration_ms = audioResult.audio_duration_ms;
-    }
-    planned.push(planned_seg);
+      // dwell_ms doubles as record_url_narration's per-phase hold duration so
+      // recording naturally tracks the narration audio.
+      dwell_ms: seg.dwell_ms ?? audioDurationMs,
+    });
   }
   const result = {
@@ -141,7 +130,7 @@ export async function runPlanVideoSegmentsTool({ segments, workspace_id, voice_i
         : (s.presentation?.duration ?? 4);
       return sum + Math.round(d * 1000);
     }, 0),
-    ...(errors.length > 0 ? { warnings: errors } : {}),
+    ...(warnings.length > 0 ? { warnings } : {}),
   };
   return toolText(JSON.stringify(result, null, 2));

package/src/{record-url-narration-tool.js → tools/record-url-narration.js} RENAMED Viewed

@@ -1,6 +1,23 @@
+// record_url_narration — atomic recording tool.
+//
+// Drives Chromium on Xvfb + Playwright recordVideo to capture a silent mp4 of
+// a URL following a beat-by-beat visual plan, then ffmpeg-transcodes it. The
+// resulting silent mp4 feeds into compose_video_v2 as a video-kind segment
+// alongside narration audio.
+//
+// Lives in daemon/src/tools/ as a reusable handler module; the MCP-tool
+// registration lives in daemon/mcp-servers/official/media-tools/index.js.
+// Migrated out of chat-bridge.js (V4) — no longer wrapped by
+// runMandatoryLocalTool / governance round-trip. media-tools is a separate
+// stdio MCP server and governance integration is chat-bridge-specific;
+// matches the precedent set by synthesize_tts / plan_video_segments /
+// compose_video_v2 in V1/V2/V3.
 import { mkdirSync } from 'fs';
 import path from 'path';
+import { recordUrlNarration as defaultRecordUrlNarrationFn } from '../_vendor/video/recorder/index.js';
 function toolText(text) { // Wrap a string as a tool result holding a single text content item (no isError flag).
   return { content: [{ type: 'text', text }] };
 }
@@ -55,10 +72,6 @@ function derivePhaseCount({ plan, recorderOutput }) {
   return segments ? segments.length : null;
 }
-// record_url_narration is an atomic tool, not the tail of a fixed pipeline.
-// The plan may be hand-written by the scripter or produced by plan_video_segments;
-// it just needs a non-empty list of segments with per-segment visual action + duration
-// so the recording stays in sync with the narration audio.
 function assertPipelineCompliance(plan) {
   if (!isPlainObject(plan)) return;
   if (!planSegments(plan)) {
@@ -86,10 +99,6 @@ const FORBIDDEN_REGION_PATTERNS = [
 ];
 function isRecruitmentLikeUrl(url) { // True when url is a string containing "mp.weixin.qq.com"; such URLs get the safe-region label check.
-  // Conservative URL-based heuristic: mp.weixin.qq.com pages forwarding 招聘 /
-  // 校招 / 实习 / job content. Until we have content classification, treat
-  // mp.weixin.qq.com URLs as recruitment-class for safety — the cost of a
-  // mis-flag is "agent must add a label", not "recording fails permanently".
   if (typeof url !== 'string') return false; // non-string input is never treated as recruitment-like
   return /mp\.weixin\.qq\.com/.test(url); // unanchored match: the host text may appear anywhere in the string
 }
@@ -101,11 +110,6 @@ function describeForbiddenMatch(label) {
   return null;
 }
-/**
- * For recruitment-class URLs, every plan section must declare what content
- * sits at its target_y, and the label must NOT match the forbidden-region
- * patterns. Returns null on pass, error message string on fail.
- */
 function checkSafeRegionLabels({ url, plan }) {
   if (!isRecruitmentLikeUrl(url)) return null;
   const segments = planSegments(plan);
@@ -181,16 +185,13 @@ export async function runRecordUrlNarrationTool({
   args = {},
   currentWorkspaceId = '',
   workspaceDir = process.cwd(),
-  runMandatoryLocalToolFn,
-  recordUrlNarrationFn,
+  planVideoSegmentsCalled = false,
+  recordUrlNarrationFn = defaultRecordUrlNarrationFn,
   nowMs = () => Date.now(),
 } = {}) {
   if (!currentWorkspaceId) {
     return toolError('No workspace context.');
   }
-  if (typeof runMandatoryLocalToolFn !== 'function') {
-    return toolError('Error: record_url_narration runMandatoryLocalToolFn is required.');
-  }
   if (typeof recordUrlNarrationFn !== 'function') {
     return toolError('Error: record_url_narration executor is required.');
   }
@@ -220,51 +221,53 @@ export async function runRecordUrlNarrationTool({
     return toolError(`Error: ${safeRegionError}`);
   }
+  // Standard-chain hard block: refuse recordings unless plan_video_segments
+  // ran in this session. Discovered repeatedly in Tasks #20/#25/#26 that
+  // agents hand-write dwell_ms by guessing, producing recordings whose phase
+  // boundaries drift from the TTS audio they will eventually be paired with —
+  // forcing a full re-record. plan_video_segments fills dwell_ms mechanically
+  // from ffprobe audio duration, eliminating the drift.
+  if (!planVideoSegmentsCalled) {
+    return toolError(
+      'Error: record_url_narration refused: plan_video_segments must run earlier in this '
+      + 'session so dwell_ms / phase durations are mechanically aligned with the segment\'s '
+      + 'TTS audio (audio_duration_ms). Hand-written dwell_ms has repeatedly drifted from '
+      + 'the actual TTS duration in production runs, forcing full re-records.\n\n'
+      + 'Standard chain: synthesize_tts × N (per segment) → plan_video_segments(segments with '
+      + 'text + audio_path + visual_kind=video + visual_path) → record_url_narration (feed '
+      + 'plan_video_segments output as plan.sections — each section\'s dwell_ms is already '
+      + 'set to audio_duration_ms) + compose_video_v2 (same plan output). Call plan_video_segments '
+      + 'now, then pass its `segments` array as `plan.sections` here.'
+    );
+  }
   try {
-    const result = await runMandatoryLocalToolFn({
-      toolName: 'record_url_narration',
-      toolInput: validatedInput,
-      executor: async (checkedInput = {}) => {
-        const mergedInput = {
-          ...validatedInput,
-          ...checkedInput,
-        };
-        const finalInput = validateRecordUrlNarrationArgs(mergedInput);
-        const { resolvedOutputPath, resolvedEventsPath } = resolveRecordUrlNarrationPaths({
-          workspaceDir,
-          outputPath: finalInput.output_path,
-          eventsPath: finalInput.events_path,
-          nowMs,
-        });
-        mkdirSync(path.dirname(resolvedOutputPath), { recursive: true });
-        mkdirSync(path.dirname(resolvedEventsPath), { recursive: true });
-        const recorderOutput = await recordUrlNarrationFn({
-          url: finalInput.url,
-          plan: finalInput.plan,
-          output_path: resolvedOutputPath,
-          events_path: resolvedEventsPath,
-          viewport: finalInput.viewport,
-          fps: finalInput.fps,
-          settle_ms: finalInput.settle_ms,
-        });
-        return {
-          videoPath: resolvedOutputPath,
-          eventsPath: resolvedEventsPath,
-          durationMs: deriveDurationMs(recorderOutput),
-          phases: derivePhaseCount({ plan: finalInput.plan, recorderOutput }),
-        };
-      },
+    const { resolvedOutputPath, resolvedEventsPath } = resolveRecordUrlNarrationPaths({
+      workspaceDir,
+      outputPath: validatedInput.output_path,
+      eventsPath: validatedInput.events_path,
+      nowMs,
+    });
+    mkdirSync(path.dirname(resolvedOutputPath), { recursive: true });
+    mkdirSync(path.dirname(resolvedEventsPath), { recursive: true });
+    const recorderOutput = await recordUrlNarrationFn({
+      url: validatedInput.url,
+      plan: validatedInput.plan,
+      output_path: resolvedOutputPath,
+      events_path: resolvedEventsPath,
+      viewport: validatedInput.viewport,
+      fps: validatedInput.fps,
+      settle_ms: validatedInput.settle_ms,
     });
     return toolText(
       `Recorded URL narration.\n`
-      + `video_path=${result.videoPath}\n`
-      + `events_path=${result.eventsPath}\n`
-      + `duration_ms=${result.durationMs ?? 'unknown'}\n`
-      + `phases=${result.phases ?? 'n/a'}`
+      + `video_path=${resolvedOutputPath}\n`
+      + `events_path=${resolvedEventsPath}\n`
+      + `duration_ms=${deriveDurationMs(recorderOutput) ?? 'unknown'}\n`
+      + `phases=${derivePhaseCount({ plan: validatedInput.plan, recorderOutput }) ?? 'n/a'}`
     );
   } catch (error) {
     return toolError(`Error: ${error.message}`);