npm - @lightcone-ai/daemon - Versions diffs - 0.16.2 → 0.17.1 - Mend

@lightcone-ai/daemon 0.16.2 → 0.17.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

package/mcp-servers/official/media-tools/index.js +288 -0
package/mcp-servers/official/media-tools/lib/lightcone-api.js +41 -0
package/mcp-servers/official/media-tools/manifest.json +8 -1
package/package.json +1 -1
package/src/chat-bridge.js +28 -276
package/src/mcp-config.js +5 -0
package/src/tools/plan-video-segments.js +83 -94
package/src/{record-url-narration-tool.js → tools/record-url-narration.js} +61 -58
package/src/video-brief-flag.js +78 -0

package/mcp-servers/official/media-tools/index.js CHANGED Viewed

@@ -5,6 +5,56 @@ import { z } from 'zod';
 import { addTitleEffects } from './lib/render.js';
 import { SUPPORTED_PRESETS } from './lib/presets.js';
+import { runSynthesisTtsTool } from '../../../src/tools/synthesize-tts.js';
+import { runPlanVideoSegmentsTool } from '../../../src/tools/plan-video-segments.js';
+import { runComposeVideoV2Tool } from '../../../src/tools/compose-video-v2.js';
+import { runRecordUrlNarrationTool } from '../../../src/tools/record-url-narration.js';
+import { runRenderTextToImageTool } from '../../../src/tools/render-text-to-image.js';
+import { runRenderHtmlToImageTool } from '../../../src/tools/render-html-to-image.js';
+import { runTakePageScreenshotTool } from '../../../src/tools/take-page-screenshot.js';
+import { hasFreshVideoBrief } from '../../../src/video-brief-flag.js';
+import { lightconeApi, CURRENT_WORKSPACE_ID, CURRENT_AGENT_ID } from './lib/lightcone-api.js';
+const WORKSPACE_DIR = String(process.env.WORKSPACE_DIR ?? ''); // '' when unset; forwarded as workspaceDir to compose_video_v2 and record_url_narration
+// CVMAX editor_in_chief block: in one workspace, the editor_in_chief agent
+// must not run video production tools directly (short_video_scripter owns
+// that role). Env-gated so ops can rotate workspace/agent IDs without code
+// changes. Previously lived in chat-bridge; moved here alongside the tool
+// it gates (V4 migration). submit_to_library, the other gated tool, still
+// lives in chat-bridge and keeps its own copy of this check.
+const CVMAX_WORKSPACE_ID = String(process.env.BLOCKED_EDITOR_WORKSPACE_ID ?? ''); // '' when unset, which leaves the block inactive
+const CVMAX_EDITOR_IN_CHIEF_AGENT_ID = String(process.env.BLOCKED_EDITOR_AGENT_ID ?? ''); // '' when unset, which leaves the block inactive
+function isBlockedCvmaxEditorVideoTool(toolName) { // Truthy only when this process is the env-configured blocked agent in the blocked workspace AND toolName is the gated tool.
+  return CURRENT_WORKSPACE_ID === CVMAX_WORKSPACE_ID // this process's workspace must equal the configured blocked workspace
+    && CURRENT_AGENT_ID === CVMAX_EDITOR_IN_CHIEF_AGENT_ID // this process's agent must equal the configured blocked agent
+    && CVMAX_WORKSPACE_ID // guards the "both ids empty" match; NOTE(review): the chain then yields '' (falsy), not false
+    && CVMAX_EDITOR_IN_CHIEF_AGENT_ID // same empty-string guard for the agent id
+    && toolName === 'record_url_narration'; // record_url_narration is the only tool name this copy of the check accepts
+}
+function cvmaxEditorVideoToolError(toolName) { // Builds the error result returned to the caller when the CvMax editor block fires for toolName.
+  return {
+    isError: true, // same { isError, content: [{ type: 'text', text }] } shape the record_url_narration handler builds inline
+    content: [{
+      type: 'text',
+      text:
+        `Error: ${toolName} blocked for editor_in_chief in CvMax. `
+        + 'In this workspace, @short_video_scripter owns video production. '
+        + 'editor_in_chief may route, review, or assist with OCR/verification, but must not run video production tools directly.',
+    }],
+  };
+}
+// Session-scoped flag, set once plan_video_segments returns a non-error result.
+// compose_video_v2 refuses TTS-bearing segments (any segment with audio_path)
+// unless this is true — the agent must route audio through plan_video_segments
+// first so durations / subtitle_text are mechanically aligned. The flag is also
+// forwarded to record_url_narration as planVideoSegmentsCalled. media-tools is
+// spawned per-agent, so each new agent session must call plan_video_segments
+// again. (Moved here from chat-bridge module scope alongside the tools it gates; see V2 migration.)
+let _planVideoSegmentsCalledThisSession = false;
 const PRESET_ENUM = z.enum(SUPPORTED_PRESETS);
 const POSITION_ENUM = z.enum(['top', 'center', 'bottom']);
@@ -63,6 +113,244 @@ server.tool(
   }
 );
+// ── synthesize_tts (migrated from chat-bridge) ────────────────────────────
+// Pure TTS atomic tool: text → mp3 file. The lightcone server proxies to
+// MiniMax TTS; this tool downloads the resulting mp3 to a local tmp path so
+// downstream tools (plan_video_segments / compose_video_v2) can read it.
+// The handler itself only forwards arguments; the work is in runSynthesisTtsTool.
+// Per the video-synthesis-design migration (see docs/upload-pipeline-design.md
+// and docs/scenario-content-creation/video-synthesis-design.md), this tool
+// lives in media-tools rather than chat-bridge so the video pipeline is a
+// single coherent MCP server.
+server.tool(
+  'synthesize_tts',
+  'Run MiniMax TTS on a snippet of narration text and download the resulting mp3 to a local tmp path. '
+  + 'Returns the local path and duration. Call once per video segment — do not concatenate all narration '
+  + 'into a single call (segment-level audio is required for plan_video_segments to align video durations).',
+  {
+    text: z.string().min(1).describe('Narration text for this segment. Will be synthesized as a single mp3.'),
+    voice_id: z.string().optional().describe('TTS voice preset. Omit to use workspace default.'),
+    workspace_id: z.string().optional().describe('Target workspace. Defaults to the current workspace.'),
+  },
+  async ({ text, voice_id, workspace_id }) => runSynthesisTtsTool({
+    text,
+    voice_id,
+    workspace_id, // optional caller override; may be undefined
+    currentWorkspaceId: CURRENT_WORKSPACE_ID, // this process's workspace (WORKSPACE_ID env) — presumably the fallback when workspace_id is omitted
+    api: lightconeApi, // HTTP helper from ./lib/lightcone-api.js, which does not go through chat-bridge's governance/cache layer
+  })
+);
+// ── plan_video_segments (migrated from chat-bridge; TTS decoupled) ────────
+// Pure planner — takes per-segment {text, audio_path, visual_kind, ...} and
+// returns segments with audio_duration_ms / presentation.duration / dwell_ms
+// / subtitle_text filled in. Caller MUST run synthesize_tts per segment first
+// and pass the resulting audio_path here. No longer synthesizes TTS itself
+// (V2 migration; see docs/scenario-content-creation/video-synthesis-design.md).
+server.tool(
+  'plan_video_segments',
+  'Universal audio-video sync planning step. For each segment, reads the supplied audio_path via ffprobe, '
+  + 'measures audio duration, and returns a planned segments array with audio_path / audio_duration_ms / '
+  + 'subtitle_text / presentation.duration / dwell_ms filled in — ready to pass directly to both '
+  + 'record_url_narration (as the recording plan) AND compose_video_v2 (as the segment list). '
+  + 'Must be called before compose_video_v2 when any segment has audio_path.\n\n'
+  + 'Inputs per segment: {text, audio_path (required, from synthesize_tts), visual_kind, visual_path or visual_paths, '
+  + 'optionally transition / presentation.style}. Standard chain: synthesize_tts × N → plan_video_segments → '
+  + 'record_url_narration + compose_video_v2 (both use the same plan output).',
+  {
+    segments: z.array(z.object({
+      text: z.string().describe('Narration text for this segment — used as subtitle_text in the output.'),
+      audio_path: z.string().describe('Absolute path to the segment\'s mp3 (from synthesize_tts).'),
+      visual_kind: z.enum(['image', 'video', 'gif', 'carousel']).describe('Type of visual for compose_video_v2.'),
+      visual_path: z.string().optional().describe('Absolute path to a single image / video / gif file.'),
+      visual_paths: z.array(z.string()).optional().describe('For carousel: array of image paths, one per card.'),
+      transition: z.enum(['cut', 'fade', 'crossfade']).optional().describe('Transition to next segment. Default cut.'),
+      presentation: z.object({
+        style: z.enum(['static', 'scroll']).optional(),
+      }).optional().describe('Optional presentation hints (style only). duration/per_card_duration are computed.'),
+      dwell_ms: z.number().optional().describe('Optional override for record_url_narration phase duration. Default = audio_duration_ms.'),
+    })).describe('Segments to plan. audio_path is required for each.'),
+  },
+  async ({ segments }) => {
+    const result = await runPlanVideoSegmentsTool({ segments }); // planning itself is delegated to the shared handler in src/tools
+    if (!result?.isError) _planVideoSegmentsCalledThisSession = true; // set the session flag on any non-error result; NOTE(review): a nullish result also sets it
+    return result; // returned unchanged, error or not
+  }
+);
+// ── compose_video_v2 (migrated from chat-bridge) ──────────────────────────
+// Tool-level enforcement of the standard chain: TTS-bearing segments require
+// plan_video_segments to have run earlier in this session (without it, manual
+// dwell/duration math has repeatedly produced misaligned subtitles, silent
+// tails, and re-records — Task #25/#26 trial) and a fresh video brief check.
+server.tool(
+  'compose_video_v2',
+  'Compose a video from a list of segments using ffmpeg. Each segment has a visual source (image / scroll / '
+  + 'carousel / video / gif), optional audio, and optional subtitle text. Subtitles are burned in when '
+  + 'subtitle_text is provided. Segments are concatenated in order; outro clips are appended after.\n\n'
+  + 'When any segment has audio_path, MUST be preceded by plan_video_segments in the same session '
+  + '(plan_video_segments fills duration/subtitle_text/audio_path mechanically; manual alignment is rejected). '
+  + 'Returns a local mp4 path + size_bytes.',
+  {
+    segments: z.array(z.object({
+      visual_path: z.string().optional().describe('Absolute path to a single image / video / gif.'),
+      visual_paths: z.array(z.string()).optional().describe('For carousel: array of image paths, one per card.'),
+      visual_kind: z.enum(['image', 'video', 'gif', 'carousel']).describe('Type of visual.'),
+      presentation: z.object({
+        style: z.enum(['static', 'scroll']).optional(),
+        duration: z.number().optional().describe('Segment duration in seconds. Required for image/scroll.'),
+        per_card_duration: z.number().optional().describe('Seconds per card for carousel.'),
+      }).optional(),
+      audio_path: z.string().nullable().optional().describe('Absolute path to audio (mp3). null/omit for silence.'),
+      subtitle_text: z.string().optional().describe('Narration text to burn as subtitle. Displayed for the full segment duration.'),
+      transition: z.enum(['cut', 'fade', 'crossfade']).optional().describe('Transition to next segment. Default cut.'),
+    })).describe('Ordered list of video segments.'),
+    outro_paths: z.array(z.string()).optional().describe('Absolute paths to outro video clips appended at end.'),
+    resolution: z.string().optional().describe('Output resolution WxH. Default "1080x1920".'),
+    output_path: z.string().optional().describe('Absolute output path. Auto-generated if omitted.'),
+  },
+  async (args) => {
+    const segments = Array.isArray(args?.segments) ? args.segments : []; // defensive: a missing or non-array segments is treated as empty
+    const hasNarration = segments.some(s => typeof s?.audio_path === 'string' && s.audio_path.trim()); // narration = at least one non-blank string audio_path
+    if (hasNarration && !_planVideoSegmentsCalledThisSession) { // gate 1: the plan step must have succeeded in this process
+      return toolError(
+        'compose_video_v2 refused: TTS-bearing segments (audio_path present) require plan_video_segments '
+        + 'to have run earlier in this session — it mechanically aligns audio_duration / video_duration / '
+        + 'subtitle_text with a safety buffer. Manual dwell/duration math has repeatedly produced misaligned '
+        + 'subtitles and silent tails that force re-recording.\n\n'
+        + 'Standard chain: synthesize_tts × N (per segment) → plan_video_segments(segments with text + audio_path + '
+        + 'visual_kind + visual_path) → compose_video_v2 (use the returned segments verbatim, only swap '
+        + 'visual_path/visual_kind for the real media). Call plan_video_segments now and pass its output here.'
+      );
+    }
+    if (hasNarration && !hasFreshVideoBrief({ workspaceId: CURRENT_WORKSPACE_ID, agentId: CURRENT_AGENT_ID })) { // gate 2: brief check, applied only when narration is present
+      return toolError(
+        'compose_video_v2 refused: must send a 确认稿 (production-brief) to the user via send_message before '
+        + 'compositing a narration video. The system scans send_message content for a brief — a message that '
+        + 'BOTH asks the user to confirm (确认 / 你看 / OK 吗 / 可以吗 / 同意 / 通过 / 行不行) AND describes '
+        + 'at least two of: 画面 / 时长 / 文案 / 口播 / 字幕 / 顺序 / 口吻 / 分镜 / 配音 — no such message '
+        + 'was sent in the last 6 hours for this workspace+agent.\n\n'
+        + '"已生成 TTS" / "开始合成" / progress reports do NOT count. Send a concrete confirmation draft '
+        + 'first (e.g. "我准备这么做：画面是真录屏，时长约 1 分钟，文案如下…，字幕开启，公司顺序 A→B→C，'
+        + '口吻是…—— 你 OK 吗？") and wait for the user to reply OK before calling compose_video_v2 again.'
+      );
+    }
+    return runComposeVideoV2Tool({ ...args, workspaceDir: WORKSPACE_DIR }); // tool args are spread at top level; workspaceDir is '' when WORKSPACE_DIR is unset
+  }
+);
+// ── record_url_narration (migrated from chat-bridge) ──────────────────────
+// Records a silent mp4 of a URL via Chromium+Xvfb+Playwright recordVideo,
+// driven by a beat-by-beat plan. Hard-block: requires plan_video_segments to
+// have run in this session — hand-written dwell_ms has drifted from TTS
+// audio in production runs (Tasks #20/#25/#26), forcing re-records.
+server.tool(
+  'record_url_narration',
+  'Record a silent video of a URL by driving Chromium on an Xvfb display and capturing it with Playwright recordVideo, driven by a video plan; ffmpeg then transcodes the recording to mp4. Outputs a silent mp4 that can be passed to compose_video_v2 as a video-kind segment with an audio_path for narration.\n\nUse this as the canonical recording step for URL-narration videos. Falls back: if the page needs interactions outside the visual_action vocabulary (clicks, waits, OCR loops), use Monitor (Bash) with custom Playwright instead.\n\nMUST be preceded by plan_video_segments in the same session — feed plan_video_segments\'s `segments` array as `plan.sections` so dwell_ms aligns mechanically with TTS audio_duration_ms (hand-written dwell_ms has drifted and forced re-records in production).\n\nRuntime requirements: this tool only works on a Linux daemon machine with Xvfb + Chromium + ffmpeg installed (ffmpeg is used to transcode the recording to mp4; no x11grab device support needed). macOS / Windows daemons will fail at startup.',
+  {
+    url: z.string().describe('Page URL to record'),
+    plan: z.record(z.any()).describe(
+      'A video plan: an object with `phases` (or `sections`), each a "visual beat" with '
+      + '`action` (scroll_to_dwell / linear_scroll_during / scroll_back / hold / ...), a '
+      + 'target (`target_y` or `focus_region:[y1,y2]`) for scroll-type actions, and '
+      + '`dwell_ms` (how long to hold that beat — should match the segment\'s TTS duration).\n\n'
+      + 'Standard chain: pass plan_video_segments\'s `segments` array directly as `plan.sections` — '
+      + 'each segment\'s `dwell_ms` is already set to its `audio_duration_ms`.\n\n'
+      + 'For RECRUITMENT URLs (mp.weixin.qq.com / 校招 / 实习 / 岗位 content), each section MUST '
+      + 'also declare `target_y_content_label` — a short Chinese label describing what content '
+      + 'sits at that pixel y position on the page (e.g. "标题区" / "岗位信息卡片" / "公司介绍" / '
+      + '"届别说明"). Labels matching forbidden regions ("二维码" / "扫码" / "投递入口" / "投递方式" / '
+      + '"联系方式" / "微信号" / "QR" / "阅读原文" / "外链") will cause the tool to refuse the '
+      + 'recording — recruitment content must NOT dwell on these areas (see fragments.md '
+      + 'frag.short.recruitment_url_mode_policy). Pick a different target_y in the 标题/岗位 '
+      + 'information area and rewrite that section.'
+    ),
+    output_path: z.string().optional().describe('Workspace-relative output mp4 path. Default tmp/wx3_video/recorded-{ts}.mp4'),
+    events_path: z.string().optional().describe('Workspace-relative events.json path. Default ${output_path}.events.json'),
+    viewport: z.object({
+      width: z.number().optional(),
+      height: z.number().optional(),
+    }).optional().describe('Default 1080x1920 (mobile portrait). Override only if the plan requires a different shape.'),
+    fps: z.number().optional().describe('Default 30. Do not change unless needed.'),
+    settle_ms: z.number().optional().describe('Default 4000. Settle wait after navigation before recording starts.'),
+  },
+  async (args) => {
+    if (isBlockedCvmaxEditorVideoTool('record_url_narration')) { // gate 1: CvMax editor_in_chief block (env-configured)
+      return cvmaxEditorVideoToolError('record_url_narration');
+    }
+    // record_url_narration is part of the narration-video pipeline (paired
+    // with synthesize_tts + plan_video_segments + compose_video_v2), so it
+    // requires the same 确认稿 gate as compose_video_v2 — catching the skip
+    // earlier saves TTS + recording time.
+    if (!hasFreshVideoBrief({ workspaceId: CURRENT_WORKSPACE_ID, agentId: CURRENT_AGENT_ID })) { // gate 2: unconditional here (compose_video_v2 applies it only when narration is present)
+      return { // error object is built inline, with its own 'Error: ' prefix, rather than via toolError as compose_video_v2 does
+        isError: true,
+        content: [{ type: 'text', text:
+          'Error: record_url_narration refused: must send a 确认稿 (production-brief) to the user via '
+          + 'send_message before starting a narration recording. The system scans send_message content for '
+          + 'a brief — a message that BOTH asks the user to confirm (确认 / 你看 / OK 吗 / 可以吗 / 同意 / '
+          + '通过 / 行不行) AND describes at least two of: 画面 / 时长 / 文案 / 口播 / 字幕 / 顺序 / 口吻 / '
+          + '分镜 / 配音 — no such message was sent in the last 6 hours for this workspace+agent.\n\n'
+          + '"已生成 TTS" / "开始合成" / progress reports do NOT count. Send a concrete confirmation draft '
+          + 'first (e.g. "我准备这么做：画面是真录屏，时长约 1 分钟，文案如下…，字幕开启，公司顺序 A→B→C，'
+          + '口吻是…—— 你 OK 吗？") and wait for the user to reply OK before calling record_url_narration.'
+        }],
+      };
+    }
+    return runRecordUrlNarrationTool({
+      args, // raw tool args passed nested under `args` (compose_video_v2 spreads them instead)
+      currentWorkspaceId: CURRENT_WORKSPACE_ID,
+      workspaceDir: WORKSPACE_DIR, // '' when the WORKSPACE_DIR env var is unset
+      planVideoSegmentsCalled: _planVideoSegmentsCalledThisSession, // flag is only forwarded; the plan_video_segments hard-block is presumably enforced inside the handler — not visible here
+    });
+  }
+);
+// ── render_text_to_image (migrated from chat-bridge; no gating in this server) ──
+server.tool(
+  'render_text_to_image',
+  'Render text content into image(s) for video synthesis. style=scroll produces a single tall image (for a scrolling video segment); style=carousel produces one image per card (for a slide-show segment). Returns local file paths.',
+  {
+    content: z.union([z.string(), z.array(z.string())]).describe('Text content. For carousel, pass an array of strings — one per card. For scroll, pass a single string (or array joined with line breaks).'),
+    style: z.enum(['scroll', 'carousel']).describe('scroll: one tall image; carousel: one image per card.'),
+    theme: z.enum(['dark', 'light']).optional().describe('Color theme. Default dark.'),
+    width: z.number().optional().describe('Image width in pixels. Default 1080.'),
+    card_height: z.number().optional().describe('Card height in pixels (carousel) or viewport height (scroll baseline). Default 1920.'),
+    font_size: z.number().optional().describe('Base font size in pixels. Default 48.'),
+  },
+  async (args) => runRenderTextToImageTool(args) // thin pass-through: args go unchanged to the shared handler in src/tools/render-text-to-image.js
+);
+// ── render_html_to_image (migrated from chat-bridge; no gating in this server) ──
+server.tool(
+  'render_html_to_image',
+  'Render a raw HTML string to a PNG image by navigating to it as a local file:// page. Unlike evaluate_script+document.write on about:blank, this preserves file:// origin so <img src="file:///..."> references load correctly. Returns the output image path.',
+  {
+    html: z.string().describe('Full HTML document to render (including <!doctype>, <html>, <head>, <body>).'),
+    output_path: z.string().optional().describe('Absolute path to save the PNG. Auto-generated in /tmp if omitted.'),
+    viewport_width: z.number().optional().describe('Viewport width in pixels. Default 1080.'),
+    viewport_height: z.number().optional().describe('Viewport height in pixels. Default 1920.'),
+    wait_until: z.enum(['load', 'networkidle', 'domcontentloaded']).optional().describe('Navigation wait condition. Default load.'),
+  },
+  async (args) => runRenderHtmlToImageTool(args) // thin pass-through: args go unchanged to the shared handler in src/tools/render-html-to-image.js
+);
+// ── take_page_screenshot (migrated from chat-bridge; no gating in this server) ──
+server.tool(
+  'take_page_screenshot',
+  'Open a URL with a headless browser and capture a screenshot. crop=above_fold captures only the visible viewport (ideal for thumbnail-style frames); crop=full_page captures the entire page height.',
+  {
+    url: z.string().describe('Page URL to screenshot.'),
+    crop: z.enum(['above_fold', 'full_page']).optional().describe('Capture mode. Default above_fold.'),
+    viewport: z.object({
+      width: z.number().optional(),
+      height: z.number().optional(),
+    }).optional().describe('Viewport size. Default 390×844 (mobile).'),
+    wait_for: z.enum(['load', 'networkidle', 'domcontentloaded']).optional().describe('Page load event to wait for before screenshotting. Default networkidle.'),
+  },
+  async (args) => runTakePageScreenshotTool(args) // thin pass-through: args go unchanged to the shared handler in src/tools/take-page-screenshot.js
+);
 const transport = new StdioServerTransport();
 await server.connect(transport);
 console.error('[official-media-tools] MCP Server started');

package/mcp-servers/official/media-tools/lib/lightcone-api.js ADDED Viewed

@@ -0,0 +1,41 @@
+// Minimal HTTP helper for media-tools to call lightcone server's internal API.
+// Wraps fetch with the right URL prefix + auth headers + JSON encoding.
+//
+// Unlike daemon/src/chat-bridge.js's `api`, this helper does NOT route through
+// the governance/cache layer — media-tools is a separate stdio MCP server and
+// governance integration is chat-bridge-specific. If a tool here needs
+// governance-mediated execution, route it through chat-bridge's thin-proxy
+// instead (see weixin-tools for the pattern).
+const SERVER_URL = String(process.env.SERVER_URL ?? '').replace(/\/+$/, ''); // trailing slashes stripped so the path join below does not double them
+const MACHINE_API_KEY = String(process.env.MACHINE_API_KEY ?? ''); // sent as the Bearer token on every request
+const AGENT_ID = String(process.env.AGENT_ID ?? ''); // becomes the /internal/agent/<id> path segment
+if (!SERVER_URL) throw new Error('media-tools: SERVER_URL env var is required'); // thrown at module top level, so importing this file fails when the var is missing
+if (!MACHINE_API_KEY) throw new Error('media-tools: MACHINE_API_KEY env var is required'); // same import-time failure
+if (!AGENT_ID) throw new Error('media-tools: AGENT_ID env var is required'); // same import-time failure
+export async function lightconeApi(method, apiPath, body) { // Calls the lightcone internal agent API; resolves to the parsed JSON response, throws an Error with .status/.body on non-2xx.
+  const url = `${SERVER_URL}/internal/agent/${encodeURIComponent(AGENT_ID)}${apiPath}`; // apiPath is appended verbatim — presumably callers pass it with a leading '/'
+  const res = await fetch(url, { // no timeout or abort signal is supplied; transport errors propagate unwrapped
+    method,
+    headers: {
+      'Content-Type': 'application/json',
+      'Authorization': `Bearer ${MACHINE_API_KEY}`,
+    },
+    body: body != null ? JSON.stringify(body) : undefined, // null/undefined body → request is sent without a payload
+  });
+  if (!res.ok) {
+    let text = '';
+    try { text = await res.text(); } catch { /* ignore */ } // best-effort: an unreadable error body leaves text as ''
+    const err = new Error(`lightcone ${method} ${apiPath} → ${res.status}: ${text.slice(0, 400)}`); // message keeps only the first 400 chars of the body
+    err.status = res.status; // HTTP status, for callers that branch on it
+    err.body = text; // full, untruncated response text
+    throw err;
+  }
+  return res.json(); // NOTE(review): rejects if a 2xx response body is empty or not valid JSON
+}
+// Exposed so tools can construct workspace-aware fallback identifiers.
+export const CURRENT_AGENT_ID = AGENT_ID; // always non-empty: the AGENT_ID guard above throws otherwise
+export const CURRENT_WORKSPACE_ID = String(process.env.WORKSPACE_ID ?? ''); // unlike AGENT_ID, not validated — '' when WORKSPACE_ID is unset

package/mcp-servers/official/media-tools/manifest.json CHANGED Viewed

@@ -5,7 +5,14 @@
   "runtime": "node",
   "entrypoint": "index.js",
   "tool_declarations": [
-    { "name": "add_title_effects", "classification": "cacheable" }
+    { "name": "add_title_effects", "classification": "cacheable" },
+    { "name": "synthesize_tts", "classification": "mandatory" },
+    { "name": "plan_video_segments", "classification": "mandatory" },
+    { "name": "compose_video_v2", "classification": "mandatory" },
+    { "name": "record_url_narration", "classification": "mandatory" },
+    { "name": "render_text_to_image", "classification": "cacheable" },
+    { "name": "render_html_to_image", "classification": "cacheable" },
+    { "name": "take_page_screenshot", "classification": "cacheable" }
   ],
   "smoke_test": {
     "tool": "add_title_effects",

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@lightcone-ai/daemon",
-  "version": "0.16.2",
+  "version": "0.17.1",
   "type": "module",
   "main": "src/index.js",
   "bin": {

package/src/chat-bridge.js CHANGED Viewed

@@ -2,23 +2,21 @@
 import { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js';
 import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js';
 import { z } from 'zod';
-import { createReadStream, existsSync, mkdirSync, readFileSync, writeFileSync, statSync } from 'fs';
+import { existsSync, mkdirSync, readFileSync, writeFileSync, statSync } from 'fs';
 import { createHash, randomUUID } from 'crypto';
 import path, { extname } from 'path';
 import os from 'os';
-import { recordUrlNarration } from './_vendor/video/recorder/index.js';
 import { writeLocalFileToWorkspace, resolveWorkspaceFileUploadPlan } from './workspace-file-upload.js';
 import { UploadJobManager } from './upload-job-manager.js';
 import { createUploadServerApi } from './upload-server-api.js';
-import { runRecordUrlNarrationTool } from './record-url-narration-tool.js';
+// record_url_narration moved to media-tools MCP server (V4 migration);
+// recorder import / handler are now consumed there, not from chat-bridge.
 import { runSubmitToLibraryTool } from './submit-to-library-tool.js';
-import { runRenderTextToImageTool } from './tools/render-text-to-image.js';
-import { runRenderHtmlToImageTool } from './tools/render-html-to-image.js';
-import { runSynthesisTtsTool } from './tools/synthesize-tts.js';
-import { runPlanVideoSegmentsTool } from './tools/plan-video-segments.js';
-import { runComposeVideoV2Tool } from './tools/compose-video-v2.js';
-import { runTakePageScreenshotTool } from './tools/take-page-screenshot.js';
+// render_text_to_image, render_html_to_image, take_page_screenshot moved to
+// media-tools MCP server (V5 migration). Their handlers still live in
+// daemon/src/tools/ as shared modules; media-tools imports them from there.
 import { runGetLibraryFileTool } from './tools/get-library-file.js';
+import { markVideoBriefSent, looksLikeVideoBrief } from './video-brief-flag.js';
 import { isLeaseInvalidated, clearInvalidatedLease } from './governance-state.js';
 import { classifyLeaseWindow } from './lease-window.js';
 import {
@@ -72,8 +70,9 @@ let currentWorkspaceId = WORKSPACE_ID;
 // Remove entirely once the new atomic tool framework is stable and the legacy pipeline retires.
 const CVMAX_WORKSPACE_ID = process.env.BLOCKED_EDITOR_WORKSPACE_ID ?? '';
 const CVMAX_EDITOR_IN_CHIEF_AGENT_ID = process.env.BLOCKED_EDITOR_AGENT_ID ?? '';
+// record_url_narration moved to media-tools and carries its own copy of this
+// block. submit_to_library stays here.
 const CVMAX_EDITOR_BLOCKED_VIDEO_TOOLS = new Set([
-  'record_url_narration',
   'submit_to_library',
 ]);
@@ -162,7 +161,6 @@ const DEFAULT_TOOL_CLASSIFICATION = {
   update_goal_field: 'mandatory',
   supersede_goal_field: 'mandatory',
   request_credential_auth: 'mandatory',
-  record_url_narration: 'mandatory',
   submit_to_library: 'mandatory',
   register_data_source: 'mandatory',
   bind_workspace_scenario: 'mandatory',
@@ -491,37 +489,6 @@ async function directApi(method, apiPath, body) {
   return res.json();
 }
-async function directApiVideoUpload(apiPath, {
-  localPath,
-  filename,
-  contentType = 'video/mp4',
-}) {
-  const url = `${SERVER_URL}/internal/agent/${AGENT_ID}${apiPath}`;
-  const headers = {
-    'Authorization': `Bearer ${MACHINE_API_KEY}`,
-    'Content-Type': contentType,
-  };
-  if (filename) headers['X-File-Name'] = filename;
-  let res;
-  try {
-    res = await fetch(url, {
-      method: 'POST',
-      headers,
-      body: createReadStream(localPath),
-      duplex: 'half',
-    });
-  } catch (error) {
-    throw buildDirectApiTransportError({ method: 'POST', apiPath, error });
-  }
-  if (!res.ok) {
-    const text = await res.text();
-    throw buildDirectApiHttpError({ method: 'POST', apiPath, status: res.status, text });
-  }
-  return res.json();
-}
 async function callGovernance(payload, { retry = true } = {}) {
   const attempts = retry ? 2 : 1;
   let lastError = null;
@@ -609,63 +576,6 @@ async function governanceRoundTrip({ method, apiPath, body, toolName, classifica
   return directApi(method, apiPath, nextBody);
 }
-async function runMandatoryLocalTool({ toolName, toolInput = {}, executor }) {
-  const classification = TOOL_CLASSIFICATION[toolName] ?? 'mandatory';
-  const traceId = randomUUID();
-  enqueueBundleEvent('tool_call_started', {
-    trace_id: traceId,
-    tool_name: toolName,
-    tool_classification: classification,
-    method: 'LOCAL',
-    api_path: '/local-tool',
-  });
-  try {
-    await ensureGovernanceContext();
-    const governancePayload = {
-      spawn_bundle_id: governanceContext.spawnBundleId,
-      policy_version: governanceContext.policyVersion,
-      tool_name: toolName,
-      tool_input: toolInput,
-      tool_classification: classification,
-      agent_id: AGENT_ID,
-      idempotency_key: randomUUID(),
-      lease_id: governanceContext.lease?.lease_id ?? null,
-    };
-    const governance = await callGovernance(governancePayload, { retry: true });
-    if (governance.policy_lease) applyPolicyLease(governance.policy_lease);
-    if (governance.verdict === 'reject' || governance.verdict === 'defer_human') {
-      throw governanceError(governanceReasonCode(governance.reason));
-    }
-    const checkedInput = (governance.verdict === 'modify' && governance.modified_input && typeof governance.modified_input === 'object')
-      ? { ...toolInput, ...governance.modified_input }
-      : toolInput;
-    const result = await executor(checkedInput);
-    if (CACHE_INVALIDATION_TOOLS.has(toolName)) {
-      governanceContext.cache.clear();
-    }
-    enqueueBundleEvent('tool_call_succeeded', {
-      trace_id: traceId,
-      tool_name: toolName,
-      tool_classification: classification,
-      source: 'governance_roundtrip',
-    });
-    return result;
-  } catch (error) {
-    if (shouldEmitToolCallFailed(error)) {
-      enqueueBundleEvent('tool_call_failed', {
-        trace_id: traceId,
-        tool_name: toolName,
-        tool_classification: classification,
-        reason: toolCallFailedReason(error),
-      });
-    }
-    throw error;
-  }
-}
 function renewCacheInBackground({ method, apiPath, body, toolName, cacheKey }) {
   if (governanceContext.renewalInFlight.has(cacheKey)) return;
   governanceContext.renewalInFlight.add(cacheKey);
@@ -886,6 +796,15 @@ server.tool('send_message', 'Send a message to a workspace, DM, or thread', {
   content: z.string().describe('Message content'),
 }, async ({ target, content }) => {
   const data = await api('POST', '/send', { target, content });
+  // Heuristic: if this looks like a video-production 确认稿 (asks permission +
+  // describes plan), mark a cross-process flag so media-tools' compose_video_v2
+  // / record_url_narration can verify a brief was actually sent before running.
+  // See daemon/src/video-brief-flag.js for the detection rules.
+  if (looksLikeVideoBrief(content) && AGENT_ID && currentWorkspaceId) {
+    try {
+      markVideoBriefSent({ workspaceId: currentWorkspaceId, agentId: AGENT_ID, content });
+    } catch { /* best-effort; failure to mark is non-fatal */ }
+  }
   return { content: [{ type: 'text', text: `Sent. messageId=${data.messageId} threadTarget=${data.threadTarget}` }] };
 });
@@ -1375,141 +1294,12 @@ server.tool('request_credential_auth',
   }
 );
-// ── render_text_to_image ───────────────────────────────────────────────────────
-server.tool('render_text_to_image',
-  'Render text content into image(s) for video synthesis. style=scroll produces a single tall image (for a scrolling video segment); style=carousel produces one image per card (for a slide-show segment). Returns local file paths.',
-  {
-    content: z.union([z.string(), z.array(z.string())]).describe('Text content. For carousel, pass an array of strings — one per card. For scroll, pass a single string (or array joined with line breaks).'),
-    style: z.enum(['scroll', 'carousel']).describe('scroll: one tall image; carousel: one image per card.'),
-    theme: z.enum(['dark', 'light']).optional().describe('Color theme. Default dark.'),
-    width: z.number().optional().describe('Image width in pixels. Default 1080.'),
-    card_height: z.number().optional().describe('Card height in pixels (carousel) or viewport height (scroll baseline). Default 1920.'),
-    font_size: z.number().optional().describe('Base font size in pixels. Default 48.'),
-  },
-  async (args) => runRenderTextToImageTool(args)
-);
-// ── render_html_to_image ───────────────────────────────────────────────────────
-server.tool('render_html_to_image',
-  'Render a raw HTML string to a PNG image by navigating to it as a local file:// page. Unlike evaluate_script+document.write on about:blank, this preserves file:// origin so <img src="file:///..."> references load correctly. Returns the output image path.',
-  {
-    html: z.string().describe('Full HTML document to render (including <!doctype>, <html>, <head>, <body>).'),
-    output_path: z.string().optional().describe('Absolute path to save the PNG. Auto-generated in /tmp if omitted.'),
-    viewport_width: z.number().optional().describe('Viewport width in pixels. Default 1080.'),
-    viewport_height: z.number().optional().describe('Viewport height in pixels. Default 1920.'),
-    wait_until: z.enum(['load', 'networkidle', 'domcontentloaded']).optional().describe('Navigation wait condition. Default load.'),
-  },
-  async (args) => runRenderHtmlToImageTool(args)
-);
-// ── synthesize_tts ─────────────────────────────────────────────────────────────
-server.tool('synthesize_tts',
-  'Convert text to speech using the workspace MiniMax TTS credential. Returns a local mp3 file path and duration. Use this to generate narration audio for individual video segments.',
-  {
-    text: z.string().describe('Text to synthesize. Keep under 500 characters per call for reliable results.'),
-    voice_id: z.string().optional().describe('MiniMax voice ID. Omit to use the workspace default voice.'),
-    workspace_id: z.string().optional().describe('Target workspace. Defaults to current workspace context.'),
-  },
-  async (args) => runSynthesisTtsTool({ ...args, currentWorkspaceId, api })
-);
-// ── plan_video_segments ────────────────────────────────────────────────────────
-// Session-scoped flag set when plan_video_segments runs. compose_video_v2
-// refuses TTS-bearing segments unless this is true — the agent must route
-// audio through plan_video_segments first so durations / subtitle_text /
-// audio_path are mechanically aligned. This is a per-chat-bridge-process
-// flag, so a fresh codex session must call plan_video_segments fresh.
-let _planVideoSegmentsCalledThisSession = false;
-server.tool('plan_video_segments',
-  'Universal audio-video sync planning step. For each segment, call TTS to get the real audio duration, then compute the visual duration with a safety buffer. Returns a planned segments array ready to pass directly to compose_video_v2 (with audio_path, presentation.duration/per_card_duration, and subtitle_text pre-filled). Always call this before compose_video_v2 when you have narration text.',
-  {
-    segments: z.array(z.object({
-      text: z.string().describe('Narration text for this segment. TTS will be generated from this.'),
-      visual_kind: z.enum(['image', 'video', 'gif', 'carousel']).describe('Type of visual.'),
-      visual_path: z.string().optional().describe('Absolute path to a single image, video, or gif file.'),
-      visual_paths: z.array(z.string()).optional().describe('For carousel: array of image paths, one per card.'),
-      transition: z.enum(['cut', 'fade', 'crossfade']).optional().describe('Transition to next segment. Default cut.'),
-      presentation: z.object({
-        style: z.enum(['static', 'scroll']).optional(),
-      }).optional().describe('Partial presentation hints (style only). duration/per_card_duration are computed from TTS.'),
-    })).describe('Segments to plan. Each must have narration text and visual info.'),
-    voice_id: z.string().optional().describe('TTS voice ID. Omit to use workspace default.'),
-    workspace_id: z.string().optional().describe('Target workspace. Defaults to current workspace context.'),
-  },
-  async (args) => {
-    const result = await runPlanVideoSegmentsTool({ ...args, currentWorkspaceId, api });
-    if (!result?.isError) _planVideoSegmentsCalledThisSession = true;
-    return result;
-  }
-);
-// ── compose_video_v2 ───────────────────────────────────────────────────────────
-server.tool('compose_video_v2',
-  'Compose a video from a list of segments using ffmpeg. Each segment has a visual source (image/scroll/carousel/video/gif), optional audio, and optional subtitle text. Subtitles are burned into the video by default when subtitle_text is provided. Segments are concatenated in order; outro clips are appended at the end. Returns a local mp4 path.\n\nTypical flow: plan_video_segments → compose_video_v2 (segments output fed directly in).',
-  {
-    segments: z.array(z.object({
-      visual_path: z.string().optional().describe('Absolute path to a single image, video, or gif file.'),
-      visual_paths: z.array(z.string()).optional().describe('For carousel: array of image paths, one per card.'),
-      visual_kind: z.enum(['image', 'video', 'gif', 'carousel']).describe('Type of visual.'),
-      presentation: z.object({
-        style: z.enum(['static', 'scroll']).optional().describe('For image: static (default) or scroll (pan upward).'),
-        duration: z.number().optional().describe('Segment duration in seconds. Required for image/scroll.'),
-        per_card_duration: z.number().optional().describe('Seconds per card for carousel.'),
-      }).optional(),
-      audio_path: z.string().nullable().optional().describe('Absolute path to audio (mp3). null or omit for silence.'),
-      subtitle_text: z.string().optional().describe('Narration text to burn as subtitle for this segment. Displayed for the full segment duration.'),
-      transition: z.enum(['cut', 'fade', 'crossfade']).optional().describe('Transition to next segment. Default cut.'),
-    })).describe('Ordered list of video segments.'),
-    outro_paths: z.array(z.string()).optional().describe('Absolute paths to outro video clips appended after all segments.'),
-    resolution: z.string().optional().describe('Output resolution WxH. Default "1080x1920".'),
-    output_path: z.string().optional().describe('Absolute output path for the mp4. Auto-generated if omitted.'),
-  },
-  async (args) => {
-    // Tool-level enforcement of the synthesize_tts → plan_video_segments →
-    // compose_video_v2 standard chain. If any segment has audio_path (i.e.
-    // narration is involved) and the agent never invoked plan_video_segments
-    // in this session, refuse the compose — manual dwell/duration math is
-    // unreliable (last syllable cut, silent tails, subtitle drift). Observed
-    // twice in row: agent skipped plan_video_segments, manually estimated
-    // dwell_ms wrong, ended up with too-long records and silent tails it then
-    // re-recorded to fix — wasting record_url_narration runs that
-    // plan_video_segments would have prevented.
-    const segments = Array.isArray(args?.segments) ? args.segments : [];
-    const hasNarration = segments.some(s => typeof s?.audio_path === 'string' && s.audio_path.trim());
-    if (hasNarration && !_planVideoSegmentsCalledThisSession) {
-      return {
-        isError: true,
-        content: [{
-          type: 'text',
-          text: 'compose_video_v2 refused: TTS-bearing segments (audio_path present) require plan_video_segments '
-            + 'to have run earlier in this session — it mechanically aligns audio_duration / video_duration / '
-            + 'subtitle_text with a safety buffer. Manual dwell/duration math has repeatedly produced misaligned '
-            + 'subtitles and silent tails that force re-recording.\n\n'
-            + 'Standard chain: synthesize_tts(per segment) → plan_video_segments(with text+visual_kind+visual_path) '
-            + '→ compose_video_v2(use the returned segments verbatim, only swap visual_path/visual_kind for real '
-            + 'media). Call plan_video_segments now and pass its output here.',
-        }],
-      };
-    }
-    return runComposeVideoV2Tool({ ...args, workspaceDir: WORKSPACE_DIR });
-  }
-);
-// ── take_page_screenshot ───────────────────────────────────────────────────────
-server.tool('take_page_screenshot',
-  'Open a URL with a headless browser and capture a screenshot. crop=above_fold captures only the visible viewport (ideal for thumbnail-style frames); crop=full_page captures the entire page height.',
-  {
-    url: z.string().describe('Page URL to screenshot.'),
-    crop: z.enum(['above_fold', 'full_page']).optional().describe('Capture mode. Default above_fold.'),
-    viewport: z.object({
-      width: z.number().optional(),
-      height: z.number().optional(),
-    }).optional().describe('Viewport size. Default 390×844 (mobile).'),
-    wait_for: z.enum(['load', 'networkidle', 'domcontentloaded']).optional().describe('Page load event to wait for before screenshotting. Default networkidle.'),
-  },
-  async (args) => runTakePageScreenshotTool(args)
-);
+// render_text_to_image, render_html_to_image, take_page_screenshot, synthesize_tts,
+// plan_video_segments, compose_video_v2, record_url_narration — all moved to
+// the media-tools MCP server (V1–V5 migration; see
+// docs/scenario-content-creation/video-synthesis-design.md). The whole video
+// pipeline now lives in one stdio server so that session-scoped flags (such as
+// "plan was called") can gate downstream tools.
 // ── get_library_file ───────────────────────────────────────────────────────────
 server.tool('get_library_file',
@@ -1521,48 +1311,10 @@ server.tool('get_library_file',
   async (args) => runGetLibraryFileTool({ ...args, currentWorkspaceId, api, SERVER_URL, MACHINE_API_KEY, workspaceDir: WORKSPACE_DIR })
 );
-// ── record_url_narration ────────────────────────────────────────────────────────
-server.tool('record_url_narration',
-  'Record a silent video of a URL by driving Chromium on an Xvfb display and capturing it with Playwright recordVideo, driven by a video plan; ffmpeg then transcodes the recording to mp4. Outputs a silent mp4 that can be passed to compose_video_v2 as a video-kind segment with an audio_path for narration.\n\nUse this as the canonical recording step for URL-narration videos. Falls back: if the page needs interactions outside the visual_action vocabulary (clicks, waits, OCR loops), use Monitor (Bash) with custom Playwright instead.\n\nRuntime requirements: this tool only works on a Linux daemon machine with Xvfb + Chromium + ffmpeg installed (ffmpeg is used to transcode the recording to mp4; no x11grab device support needed). macOS / Windows daemons will fail at startup.',
-  {
-    url: z.string().describe('Page URL to record'),
-    plan: z.record(z.any()).describe(
-      'A video plan: an object with `phases` (or `sections`), each a "visual beat" with '
-      + '`action` (scroll_to_dwell / linear_scroll_during / scroll_back / hold / ...), a '
-      + 'target (`target_y` or `focus_region:[y1,y2]`) for scroll-type actions, and '
-      + '`dwell_ms` (how long to hold that beat — should match the segment\'s TTS duration).\n\n'
-      + 'For RECRUITMENT URLs (mp.weixin.qq.com / 校招 / 实习 / 岗位 content), each section MUST '
-      + 'also declare `target_y_content_label` — a short Chinese label describing what content '
-      + 'sits at that pixel y position on the page (e.g. "标题区" / "岗位信息卡片" / "公司介绍" / '
-      + '"届别说明"). Look at the take_page_screenshot output, find the y-pixel, and label it. '
-      + 'Labels matching forbidden regions ("二维码" / "扫码" / "投递入口" / "投递方式" / "联系方式" / '
-      + '"微信号" / "QR" / "阅读原文" / "外链") will cause the tool to refuse the recording — '
-      + 'recruitment content must NOT dwell on these areas (see fragments.md '
-      + 'frag.short.recruitment_url_mode_policy). Pick a different target_y in the 标题/岗位 '
-      + 'information area and rewrite that section.'
-    ),
-    output_path: z.string().optional().describe('Workspace-relative output mp4 path. Default tmp/wx3_video/recorded-{ts}.mp4'),
-    events_path: z.string().optional().describe('Workspace-relative events.json path. Default ${output_path}.events.json'),
-    viewport: z.object({
-      width: z.number().optional(),
-      height: z.number().optional(),
-    }).optional().describe('Default 1080x1920 (mobile portrait). Override only if the plan requires a different shape.'),
-    fps: z.number().optional().describe('Default 30. Do not change unless needed.'),
-    settle_ms: z.number().optional().describe('Default 4000. Settle wait after navigation before recording starts.'),
-  },
-  async (args) => {
-    if (isBlockedCvmaxEditorVideoTool('record_url_narration')) {
-      return cvmaxEditorVideoToolError('record_url_narration');
-    }
-    return runRecordUrlNarrationTool({
-      args,
-      currentWorkspaceId,
-      workspaceDir: WORKSPACE_DIR,
-      runMandatoryLocalToolFn: runMandatoryLocalTool,
-      recordUrlNarrationFn: recordUrlNarration,
-    });
-  }
-);
+// record_url_narration moved to media-tools MCP server (V4 migration). The
+// session-scoped plan_video_segments check now lives there alongside the
+// other video pipeline tools (synthesize_tts / plan_video_segments /
+// compose_video_v2). The CVMAX editor_in_chief block also moved with it.
 // ── submit_to_library ──────────────────────────────────────────────────────────
 server.tool('submit_to_library',

package/src/mcp-config.js CHANGED Viewed

@@ -89,6 +89,11 @@ const SERVER_BACKED_MCP_SERVERS = new Set([
   'audience-research',
   'hook-pattern-library',
   'weixin-tools',
+  // media-tools (V1–V5 chat-bridge → media-tools migration): synthesize_tts
+  // hits /tts/voiceover, and both the CVMAX editor_in_chief gate and the
+  // video-brief checks read CURRENT_AGENT_ID / CURRENT_WORKSPACE_ID.
+  // lib/lightcone-api.js throws at module load without the
+  // SERVER_URL/MACHINE_API_KEY/AGENT_ID triple.
+  'media-tools',
 ]);
 function baseEnvForServer(serverKey, { serverUrl, authToken, agentId, workspaceId, workspaceDir }) {

package/src/tools/plan-video-segments.js CHANGED Viewed

@@ -1,7 +1,22 @@
-import { mkdirSync, writeFileSync } from 'fs';
-import { randomUUID } from 'crypto';
-import path from 'path';
-import os from 'os';
+// plan_video_segments — pure audio/video alignment planner.
+//
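+// Takes per-segment {text, audio_path, visual_kind, ...} and returns unified
+// plan segments with:
+//   - audio_duration_ms (read via ffprobe from the provided audio_path; if the
+//     probe fails, a 3000ms fallback is used and a warning is returned)
+//   - subtitle_text (= text; omitted when text is empty)
+//   - presentation.duration (audio_duration + buffer, rounded up to 0.5s) or,
+//     for carousel, per_card_duration (that total split across the cards,
+//     minimum 2s); fields in the caller's own presentation take precedence
+//   - dwell_ms (= audio_duration unless the caller already set dwell_ms; lets
+//     the same segment drive record_url_narration)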
+//
+// Previously this tool ALSO synthesized TTS internally — which duplicated
+// the work when callers had already run synthesize_tts, and caused the
+// "wrong standard chain" confusion in fragments.md. TTS is now decoupled:
+// callers must run synthesize_tts per segment first and pass the resulting
+// audio_path here. See docs/scenario-content-creation/video-synthesis-design.md.
+//
+// Lives in daemon/src/tools/ as a reusable handler module; the MCP-tool
+// registration lives in daemon/mcp-servers/official/media-tools/index.js.
+import { spawn } from 'node:child_process';
 function toolText(text) {
   return { content: [{ type: 'text', text }] };
@@ -11,125 +26,99 @@ function toolError(text) {
   return { isError: true, content: [{ type: 'text', text }] };
 }
-function inferAudioExt(url) {
-  const clean = String(url ?? '').split('?')[0];
-  const ext = path.extname(clean).toLowerCase();
-  return ['.mp3', '.wav', '.flac', '.aac', '.ogg'].includes(ext) ? ext : '.mp3';
-}
-async function synthesizeSegmentTts(text, { workspace_id, voice_id, api }) {
-  const payload = { workspace_id, text, speed: 1, format: 'mp3' };
-  if (voice_id) payload.voice_preset = String(voice_id).trim();
-  const data = await api('POST', '/tts/voiceover', payload);
-  const remoteAudioUrl = String(data.audio_url ?? '').trim();
-  if (!remoteAudioUrl) throw new Error('TTS API did not return audio_url');
-  const downloadRes = await fetch(remoteAudioUrl);
-  if (!downloadRes.ok) throw new Error(`Failed to download audio (${downloadRes.status})`);
-  const fileBuffer = Buffer.from(await downloadRes.arrayBuffer());
-  const outDir = path.join(os.tmpdir(), 'lightcone-tts');
-  mkdirSync(outDir, { recursive: true });
-  const ext = inferAudioExt(remoteAudioUrl);
-  const outPath = path.join(outDir, `tts-${Date.now()}-${randomUUID().slice(0, 8)}${ext}`);
-  writeFileSync(outPath, fileBuffer);
-  const durationMs = Number(data.duration_ms ?? 0);
-  return { audio_path: outPath, audio_duration_ms: durationMs };
+async function probeAudioDurationMs(audioPath) {
+  return new Promise((resolve, reject) => {
+    const proc = spawn('ffprobe', [
+      '-v', 'error',
+      '-show_entries', 'format=duration',
+      '-of', 'csv=p=0',
+      audioPath,
+    ], { stdio: ['ignore', 'pipe', 'pipe'] });
+    const out = [];
+    const err = [];
+    proc.stdout.on('data', c => out.push(c));
+    proc.stderr.on('data', c => err.push(c));
+    proc.on('close', code => {
+      if (code !== 0) {
+        return reject(new Error(`ffprobe exited ${code}: ${Buffer.concat(err).toString().slice(-300)}`));
+      }
+      const seconds = parseFloat(Buffer.concat(out).toString().trim());
+      if (!Number.isFinite(seconds)) {
+        return reject(new Error(`ffprobe returned non-numeric duration: ${Buffer.concat(out).toString().slice(0, 200)}`));
+      }
+      resolve(Math.round(seconds * 1000));
+    });
+    proc.on('error', err2 => reject(new Error(`ffprobe spawn failed: ${err2.message}`)));
+  });
 }
-// Compute segment duration from audio duration: audio + 0.5s buffer, rounded up to nearest 0.5s.
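+// Plan visual duration from audio duration: audio + buffer, rounded up to the
+// nearest 0.5s. The default buffer is 0.5s; the caller passes 1.0s when
+// visual_kind is 'scroll' (presentation.style is not consulted), because eyes
+// need extra time to follow the motion.
+// NOTE(review): 'scroll' is not among the visual_kind values named in this
+// file's validation error (image / video / gif / carousel) — confirm the 1.0s
+// path is actually reachable.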
 function planDurationSec(audioDurationMs, bufferSec = 0.5) {
   const raw = audioDurationMs / 1000 + bufferSec;
-  return Math.ceil(raw * 2) / 2; // round up to nearest 0.5s
+  return Math.ceil(raw * 2) / 2;
 }
-// Run fn over items with a bounded number of concurrent workers (FIFO drain).
-async function mapWithConcurrency(items, limit, fn) {
-  const queue = items.map((item, index) => ({ item, index }));
-  const workers = Array.from({ length: Math.max(1, Math.min(limit, queue.length)) }, async () => {
-    while (queue.length > 0) {
-      const next = queue.shift();
-      await fn(next.item, next.index);
-    }
-  });
-  await Promise.all(workers);
-}
-const TTS_CONCURRENCY = 5;
-export async function runPlanVideoSegmentsTool({ segments, workspace_id, voice_id, currentWorkspaceId, api }) {
+export async function runPlanVideoSegmentsTool({ segments } = {}) {
   if (!Array.isArray(segments) || segments.length === 0) {
     return toolError('segments must be a non-empty array.');
   }
-  const targetWorkspaceId = String(workspace_id ?? currentWorkspaceId ?? '').trim();
-  if (!targetWorkspaceId) {
-    return toolError('workspace_id is required (no current workspace context).');
+  // Up-front validation — fail fast before any work.
+  for (let i = 0; i < segments.length; i += 1) {
+    const seg = segments[i] ?? {};
+    if (typeof seg.audio_path !== 'string' || !seg.audio_path.trim()) {
+      return toolError(
+        `segments[${i}]: audio_path is required. plan_video_segments no longer synthesizes TTS — call synthesize_tts(text) `
+        + 'first and pass the returned path as audio_path. Standard chain: synthesize_tts × N → plan_video_segments → '
+        + 'record_url_narration + compose_video_v2 (share the same plan).'
+      );
+    }
+    const kind = String(seg.visual_kind ?? '');
+    if (!kind) {
+      return toolError(`segments[${i}]: visual_kind is required (image / video / gif / carousel).`);
+    }
   }
   const planned = [];
-  const errors = [];
+  const warnings = [];
+  for (let i = 0; i < segments.length; i += 1) {
+    const seg = segments[i];
+    const text = String(seg.text ?? '').trim();
+    const kind = String(seg.visual_kind);
-  // Synthesize TTS for every text-bearing segment up front, in parallel (bounded),
-  // so an N-segment plan no longer pays N sequential round-trips to the TTS API.
-  const audioResults = new Array(segments.length).fill(null);
-  const ttsJobs = segments
-    .map((seg, i) => ({ i, text: String(seg.text ?? '').trim() }))
-    .filter(job => job.text);
-  await mapWithConcurrency(ttsJobs, TTS_CONCURRENCY, async ({ i, text }) => {
+    let audioDurationMs;
     try {
-      audioResults[i] = await synthesizeSegmentTts(text, { workspace_id: targetWorkspaceId, voice_id, api });
+      audioDurationMs = await probeAudioDurationMs(seg.audio_path);
     } catch (err) {
-      errors.push(`segments[${i}]: TTS failed — ${err.message}`);
-      audioResults[i] = { audio_path: null, audio_duration_ms: 3000 }; // fallback estimate
+      warnings.push(`segments[${i}]: audio probe failed (${err.message}); falling back to 3000ms`);
+      audioDurationMs = 3000;
     }
-  });
-  errors.sort((a, b) => {
-    const na = Number((a.match(/segments\[(\d+)\]/) ?? [])[1] ?? 0);
-    const nb = Number((b.match(/segments\[(\d+)\]/) ?? [])[1] ?? 0);
-    return na - nb;
-  });
-  for (let i = 0; i < segments.length; i++) {
-    const seg = segments[i];
-    const text = String(seg.text ?? '').trim();
-    const kind = String(seg.visual_kind ?? 'image');
-    const audioResult = audioResults[i];
-    const audioDurationMs = audioResult?.audio_duration_ms ?? 0;
     let presentation;
     if (kind === 'carousel') {
       const numCards = Array.isArray(seg.visual_paths) ? seg.visual_paths.length : 1;
-      const totalDuration = audioDurationMs > 0 ? planDurationSec(audioDurationMs) : numCards * 4;
+      const totalDuration = planDurationSec(audioDurationMs);
       const perCard = Math.max(2, Math.ceil((totalDuration / numCards) * 2) / 2);
       presentation = { per_card_duration: perCard };
     } else {
-      // image, scroll, video, gif
-      const duration = audioDurationMs > 0 ? planDurationSec(audioDurationMs, kind === 'scroll' ? 1.0 : 0.5) : 4;
+      // image / scroll / video / gif
+      const duration = planDurationSec(audioDurationMs, kind === 'scroll' ? 1.0 : 0.5);
       presentation = { duration, ...(kind === 'scroll' ? { style: 'scroll' } : {}) };
     }
-    // dwell_ms lets the same segment double as a record_url_narration plan phase
-    // (the recorder reads dwell_ms / duration_ms for how long to hold each beat).
-    // Prefer the real measured audio length; fall back to the planned visual duration.
-    const dwellMs = audioDurationMs > 0
-      ? audioDurationMs
-      : Math.round((presentation.duration ?? presentation.per_card_duration ?? 4) * 1000);
-    const planned_seg = {
+    planned.push({
       ...seg,
-      ...(audioResult?.audio_path ? { audio_path: audioResult.audio_path } : {}),
+      audio_path: seg.audio_path,
+      audio_duration_ms: audioDurationMs,
       ...(text ? { subtitle_text: text } : {}),
       presentation: { ...presentation, ...(seg.presentation ?? {}) },
-      dwell_ms: seg.dwell_ms ?? dwellMs,
-    };
-    if (audioResult?.audio_duration_ms) {
-      planned_seg.audio_duration_ms = audioResult.audio_duration_ms;
-    }
-    planned.push(planned_seg);
+      // dwell_ms doubles as record_url_narration's per-phase hold duration so
+      // recording naturally tracks the narration audio.
+      dwell_ms: seg.dwell_ms ?? audioDurationMs,
+    });
   }
   const result = {
@@ -141,7 +130,7 @@ export async function runPlanVideoSegmentsTool({ segments, workspace_id, voice_i
         : (s.presentation?.duration ?? 4);
       return sum + Math.round(d * 1000);
     }, 0),
-    ...(errors.length > 0 ? { warnings: errors } : {}),
+    ...(warnings.length > 0 ? { warnings } : {}),
   };
   return toolText(JSON.stringify(result, null, 2));

package/src/{record-url-narration-tool.js → tools/record-url-narration.js} RENAMED Viewed

@@ -1,6 +1,23 @@
+// record_url_narration — atomic recording tool.
+//
+// Drives Chromium on Xvfb + Playwright recordVideo to capture a silent mp4 of
+// a URL following a beat-by-beat visual plan, then ffmpeg-transcodes it. The
+// resulting silent mp4 feeds into compose_video_v2 as a video-kind segment
+// alongside narration audio.
+//
+// Lives in daemon/src/tools/ as a reusable handler module; the MCP-tool
+// registration lives in daemon/mcp-servers/official/media-tools/index.js.
+// Migrated out of chat-bridge.js (V4) — no longer wrapped by
+// runMandatoryLocalTool / governance round-trip. media-tools is a separate
+// stdio MCP server and governance integration is chat-bridge-specific;
+// matches the precedent set by synthesize_tts / plan_video_segments /
+// compose_video_v2 in V1/V2/V3.
 import { mkdirSync } from 'fs';
 import path from 'path';
+import { recordUrlNarration as defaultRecordUrlNarrationFn } from '../_vendor/video/recorder/index.js';
 function toolText(text) {
   return { content: [{ type: 'text', text }] };
 }
@@ -55,10 +72,6 @@ function derivePhaseCount({ plan, recorderOutput }) {
   return segments ? segments.length : null;
 }
-// record_url_narration is an atomic tool, not the tail of a fixed pipeline.
-// The plan may be hand-written by the scripter or produced by plan_video_segments;
-// it just needs a non-empty list of segments with per-segment visual action + duration
-// so the recording stays in sync with the narration audio.
 function assertPipelineCompliance(plan) {
   if (!isPlainObject(plan)) return;
   if (!planSegments(plan)) {
@@ -86,10 +99,6 @@ const FORBIDDEN_REGION_PATTERNS = [
 ];
 function isRecruitmentLikeUrl(url) {
-  // Conservative URL-based heuristic: mp.weixin.qq.com pages forwarding 招聘 /
-  // 校招 / 实习 / job content. Until we have content classification, treat
-  // mp.weixin.qq.com URLs as recruitment-class for safety — the cost of a
-  // mis-flag is "agent must add a label", not "recording fails permanently".
   if (typeof url !== 'string') return false;
   return /mp\.weixin\.qq\.com/.test(url);
 }
@@ -101,11 +110,6 @@ function describeForbiddenMatch(label) {
   return null;
 }
-/**
- * For recruitment-class URLs, every plan section must declare what content
- * sits at its target_y, and the label must NOT match the forbidden-region
- * patterns. Returns null on pass, error message string on fail.
- */
 function checkSafeRegionLabels({ url, plan }) {
   if (!isRecruitmentLikeUrl(url)) return null;
   const segments = planSegments(plan);
@@ -181,16 +185,13 @@ export async function runRecordUrlNarrationTool({
   args = {},
   currentWorkspaceId = '',
   workspaceDir = process.cwd(),
-  runMandatoryLocalToolFn,
-  recordUrlNarrationFn,
+  planVideoSegmentsCalled = false,
+  recordUrlNarrationFn = defaultRecordUrlNarrationFn,
   nowMs = () => Date.now(),
 } = {}) {
   if (!currentWorkspaceId) {
     return toolError('No workspace context.');
   }
-  if (typeof runMandatoryLocalToolFn !== 'function') {
-    return toolError('Error: record_url_narration runMandatoryLocalToolFn is required.');
-  }
   if (typeof recordUrlNarrationFn !== 'function') {
     return toolError('Error: record_url_narration executor is required.');
   }
@@ -220,51 +221,53 @@ export async function runRecordUrlNarrationTool({
     return toolError(`Error: ${safeRegionError}`);
   }
+  // Standard-chain hard block: refuse recordings unless plan_video_segments
+  // ran in this session (the caller signals this via planVideoSegmentsCalled,
+  // which defaults to false). Tasks #20/#25/#26 repeatedly showed agents
+  // hand-writing dwell_ms by guessing, producing recordings whose phase
+  // boundaries drift from the TTS audio they will eventually be paired with —
+  // forcing a full re-record. plan_video_segments fills dwell_ms mechanically
+  // from ffprobe audio duration, eliminating the drift.
+  if (!planVideoSegmentsCalled) {
+    return toolError(
+      'Error: record_url_narration refused: plan_video_segments must run earlier in this '
+      + 'session so dwell_ms / phase durations are mechanically aligned with the segment\'s '
+      + 'TTS audio (audio_duration_ms). Hand-written dwell_ms has repeatedly drifted from '
+      + 'the actual TTS duration in production runs, forcing full re-records.\n\n'
+      + 'Standard chain: synthesize_tts × N (per segment) → plan_video_segments(segments with '
+      + 'text + audio_path + visual_kind=video + visual_path) → record_url_narration (feed '
+      + 'plan_video_segments output as plan.sections — each section\'s dwell_ms is already '
+      + 'set to audio_duration_ms) + compose_video_v2 (same plan output). Call plan_video_segments '
+      + 'now, then pass its `segments` array as `plan.sections` here.'
+    );
+  }
   try {
-    const result = await runMandatoryLocalToolFn({
-      toolName: 'record_url_narration',
-      toolInput: validatedInput,
-      executor: async (checkedInput = {}) => {
-        const mergedInput = {
-          ...validatedInput,
-          ...checkedInput,
-        };
-        const finalInput = validateRecordUrlNarrationArgs(mergedInput);
-        const { resolvedOutputPath, resolvedEventsPath } = resolveRecordUrlNarrationPaths({
-          workspaceDir,
-          outputPath: finalInput.output_path,
-          eventsPath: finalInput.events_path,
-          nowMs,
-        });
-        mkdirSync(path.dirname(resolvedOutputPath), { recursive: true });
-        mkdirSync(path.dirname(resolvedEventsPath), { recursive: true });
-        const recorderOutput = await recordUrlNarrationFn({
-          url: finalInput.url,
-          plan: finalInput.plan,
-          output_path: resolvedOutputPath,
-          events_path: resolvedEventsPath,
-          viewport: finalInput.viewport,
-          fps: finalInput.fps,
-          settle_ms: finalInput.settle_ms,
-        });
-        return {
-          videoPath: resolvedOutputPath,
-          eventsPath: resolvedEventsPath,
-          durationMs: deriveDurationMs(recorderOutput),
-          phases: derivePhaseCount({ plan: finalInput.plan, recorderOutput }),
-        };
-      },
+    const { resolvedOutputPath, resolvedEventsPath } = resolveRecordUrlNarrationPaths({
+      workspaceDir,
+      outputPath: validatedInput.output_path,
+      eventsPath: validatedInput.events_path,
+      nowMs,
+    });
+    mkdirSync(path.dirname(resolvedOutputPath), { recursive: true });
+    mkdirSync(path.dirname(resolvedEventsPath), { recursive: true });
+    const recorderOutput = await recordUrlNarrationFn({
+      url: validatedInput.url,
+      plan: validatedInput.plan,
+      output_path: resolvedOutputPath,
+      events_path: resolvedEventsPath,
+      viewport: validatedInput.viewport,
+      fps: validatedInput.fps,
+      settle_ms: validatedInput.settle_ms,
     });
     return toolText(
       `Recorded URL narration.\n`
-      + `video_path=${result.videoPath}\n`
-      + `events_path=${result.eventsPath}\n`
-      + `duration_ms=${result.durationMs ?? 'unknown'}\n`
-      + `phases=${result.phases ?? 'n/a'}`
+      + `video_path=${resolvedOutputPath}\n`
+      + `events_path=${resolvedEventsPath}\n`
+      + `duration_ms=${deriveDurationMs(recorderOutput) ?? 'unknown'}\n`
+      + `phases=${derivePhaseCount({ plan: validatedInput.plan, recorderOutput }) ?? 'n/a'}`
     );
   } catch (error) {
     return toolError(`Error: ${error.message}`);

package/src/video-brief-flag.js ADDED Viewed

@@ -0,0 +1,78 @@
+// Cross-process flag for "the agent has sent a video-production 确认稿 (brief)
+// to the user in this workspace+agent context recently". chat-bridge's
+// send_message tool writes the flag when the outgoing message heuristically
+// looks like a confirmation brief; media-tools' compose_video_v2 and
+// record_url_narration read the flag and refuse to proceed without it.
+//
+// Why a file flag instead of in-process state: send_message lives in
+// chat-bridge (one stdio MCP server), compose_video_v2/record_url_narration
+// live in media-tools (a different stdio MCP server, same machine). Both are
+// spawned by the same codex CLI session per agent, so they share env (notably
+// AGENT_ID / WORKSPACE_ID) but not memory. A flag file under ~/.lightcone is
+// the simplest cross-process medium and survives short-lived MCP restarts.
+//
+// The heuristic is intentionally specific (asks-permission marker AND
+// 2+ plan-describing markers) so casual progress reports like "已生成 TTS"
+// or "画面已就绪" do NOT satisfy it. A motivated agent could game the
+// detection by stuffing keywords into any send_message, but the default
+// codex behavior (which silently skipped the soft prompt rule) is what we
+// need to interrupt — and gaming is observable in chat history.
+import { mkdirSync, statSync, utimesSync, writeFileSync, existsSync } from 'node:fs';
+import path from 'node:path';
+import os from 'node:os';
+const TTL_MS = 6 * 60 * 60 * 1000; // 6 hours; default value of ttlMs in hasFreshVideoBrief
+const FILE_NAME = 'video-brief-sent.flag'; // basename of the flag file that flagPath appends to flagDir
+function flagDir(workspaceId, agentId) { // directory that holds the flag file for one workspace+agent pair
+  return path.join(os.homedir(), '.lightcone', 'sessions', workspaceId, agentId); // <home>/.lightcone/sessions/<workspaceId>/<agentId>
+} // NOTE(review): ids are joined unvalidated; assumes they contain no path separators or ".." segments — confirm with callers
+function flagPath(workspaceId, agentId) { // full path of the flag file for one workspace+agent pair
+  return path.join(flagDir(workspaceId, agentId), FILE_NAME); // flagDir(...) + 'video-brief-sent.flag'
+}
+export function markVideoBriefSent({ workspaceId, agentId, content }) { // create or refresh the brief flag for this workspace+agent pair
+  if (!workspaceId || !agentId) return; // silent no-op when either id is missing or empty
+  const dir = flagDir(workspaceId, agentId);
+  const p = flagPath(workspaceId, agentId);
+  mkdirSync(dir, { recursive: true }); // recursive: creates missing parents and does not fail if the directory already exists
+  writeFileSync(p, String(content ?? '').slice(0, 4096)); // stores at most the first 4096 UTF-16 code units; null/undefined content becomes ''
+  const now = new Date();
+  utimesSync(p, now, now); // sets atime and mtime to now; hasFreshVideoBrief measures the flag's age from mtime
+} // fs errors are not caught here and propagate to the caller
+export function hasFreshVideoBrief({ workspaceId, agentId, ttlMs = TTL_MS } = {}) { // true when the flag file exists and its mtime is at most ttlMs old
+  if (!workspaceId || !agentId) return false; // missing or empty id is treated as "no brief"
+  const p = flagPath(workspaceId, agentId);
+  if (!existsSync(p)) return false;
+  try {
+    const st = statSync(p); // may still throw if the file vanishes after the existsSync check; the catch below covers that
+    return (Date.now() - st.mtimeMs) <= ttlMs; // a future mtime gives a negative age, which also counts as fresh
+  } catch { return false; } // any stat failure is reported as "no fresh brief"
+}
+// Permission-asking markers — the message must ask the user to decide / OK.
+// 确认 alone is too broad (matches "无需确认" / "已确认硬约束" / "确认收到"); require
+// a permission-ask shape: 请…确认 / 你…确认 / 等…确认 / 确认[吗?？] / 确认稿. "…" is `.*` (any text on the same line), so e.g. "请注意，无需确认" still matches.
+const PERMISSION_MARKERS = [
+  /请.*确认/, /你.*确认/, /确认\s*[吗?？]/, /等.*确认/, /确认稿/,
+  /你看/, /OK\s*吗/i, /可以吗/, /同意吗/, /通过吗/, /行不行/, /如何\?|如何？/, // colloquial ask-for-approval phrasings; only the OK pattern is case-insensitive
+];
+// Plan-describing markers — looksLikeVideoBrief requires at least 2 of these patterns to match.
+const PLAN_MARKERS = [
+  /画面/, /时长/, /文案/, /口播/, /字幕/, /顺序/, /口吻/, /分镜/, /配音/, // visuals, duration, copy, spoken narration, subtitles, order, tone, storyboard, dubbing
+];
+export function looksLikeVideoBrief(content) { // heuristic: does this message read like a confirmation brief? Returns a boolean.
+  if (typeof content !== 'string') return false;
+  // Min length 20 (UTF-16 code units) — Chinese is character-dense, so the bar is deliberately low.
+  // NOTE(review): the sample brief "请确认：画面/时长/字幕已定。同意吗？" is 19 units long and is rejected by this check — confirm the threshold.
+  if (content.length < 20) return false;
+  const hasPermissionAsk = PERMISSION_MARKERS.some(rx => rx.test(content)); // at least one permission-ask pattern must match
+  if (!hasPermissionAsk) return false;
+  const distinctPlanHits = PLAN_MARKERS.filter(rx => rx.test(content)).length; // counts matching patterns, not occurrences
+  return distinctPlanHits >= 2;
+}