@lightcone-ai/daemon 0.16.1 → 0.17.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -5,6 +5,55 @@ import { z } from 'zod';
5
5
 
6
6
  import { addTitleEffects } from './lib/render.js';
7
7
  import { SUPPORTED_PRESETS } from './lib/presets.js';
8
+ import { runSynthesisTtsTool } from '../../../src/tools/synthesize-tts.js';
9
+ import { runPlanVideoSegmentsTool } from '../../../src/tools/plan-video-segments.js';
10
+ import { runComposeVideoV2Tool } from '../../../src/tools/compose-video-v2.js';
11
+ import { runRecordUrlNarrationTool } from '../../../src/tools/record-url-narration.js';
12
+ import { runRenderTextToImageTool } from '../../../src/tools/render-text-to-image.js';
13
+ import { runRenderHtmlToImageTool } from '../../../src/tools/render-html-to-image.js';
14
+ import { runTakePageScreenshotTool } from '../../../src/tools/take-page-screenshot.js';
15
+ import { lightconeApi, CURRENT_WORKSPACE_ID, CURRENT_AGENT_ID } from './lib/lightcone-api.js';
16
+
17
+ const WORKSPACE_DIR = String(process.env.WORKSPACE_DIR ?? '');
18
+
19
+ // CVMAX editor_in_chief block: in one workspace, the editor_in_chief agent
20
+ // must not run video production tools directly (short_video_scripter owns
21
+ // that role). Env-gated so ops can rotate workspace/agent IDs without code
22
+ // changes. Previously lived in chat-bridge; moved here alongside the tool
23
+ // it gates (V4 migration). submit_to_library, the other gated tool, still
24
+ // lives in chat-bridge and keeps its own copy of this check.
25
+ const CVMAX_WORKSPACE_ID = String(process.env.BLOCKED_EDITOR_WORKSPACE_ID ?? '');
26
+ const CVMAX_EDITOR_IN_CHIEF_AGENT_ID = String(process.env.BLOCKED_EDITOR_AGENT_ID ?? '');
27
+
28
+ function isBlockedCvmaxEditorVideoTool(toolName) {
29
+ return CURRENT_WORKSPACE_ID === CVMAX_WORKSPACE_ID
30
+ && CURRENT_AGENT_ID === CVMAX_EDITOR_IN_CHIEF_AGENT_ID
31
+ && CVMAX_WORKSPACE_ID
32
+ && CVMAX_EDITOR_IN_CHIEF_AGENT_ID
33
+ && toolName === 'record_url_narration';
34
+ }
35
+
36
+ function cvmaxEditorVideoToolError(toolName) {
37
+ return {
38
+ isError: true,
39
+ content: [{
40
+ type: 'text',
41
+ text:
42
+ `Error: ${toolName} blocked for editor_in_chief in CvMax. `
43
+ + 'In this workspace, @short_video_scripter owns video production. '
44
+ + 'editor_in_chief may route, review, or assist with OCR/verification, but must not run video production tools directly.',
45
+ }],
46
+ };
47
+ }
48
+
49
+ // Session-scoped flag set when plan_video_segments runs successfully.
50
+ // compose_video_v2 refuses TTS-bearing segments (any segment with audio_path)
51
+ // unless this is true — the agent must route audio through plan_video_segments
52
+ // first so durations / subtitle_text are mechanically aligned. media-tools is
53
+ // spawned per-agent, so each new agent session must call plan_video_segments
54
+ // again. (This flag previously lived in chat-bridge module scope; moved here
55
+ // alongside the tools it gates, see V2 migration.)
56
+ let _planVideoSegmentsCalledThisSession = false;
8
57
 
9
58
  const PRESET_ENUM = z.enum(SUPPORTED_PRESETS);
10
59
  const POSITION_ENUM = z.enum(['top', 'center', 'bottom']);
@@ -63,6 +112,213 @@ server.tool(
63
112
  }
64
113
  );
65
114
 
115
+ // ── synthesize_tts (migrated from chat-bridge) ────────────────────────────
116
+ // Pure TTS atomic tool: text → mp3 file. The lightcone server proxies to
117
+ // MiniMax TTS; this tool downloads the resulting mp3 to a local tmp path so
118
+ // downstream tools (plan_video_segments / compose_video_v2) can read it.
119
+ //
120
+ // Per the video-synthesis-design migration (see docs/upload-pipeline-design.md
121
+ // and docs/scenario-content-creation/video-synthesis-design.md), this tool
122
+ // lives in media-tools rather than chat-bridge so the video pipeline is a
123
+ // single coherent MCP server.
124
+ server.tool(
125
+ 'synthesize_tts',
126
+ 'Run MiniMax TTS on a snippet of narration text and download the resulting mp3 to a local tmp path. '
127
+ + 'Returns the local path and duration. Call once per video segment — do not concatenate all narration '
128
+ + 'into a single call (segment-level audio is required for plan_video_segments to align video durations).',
129
+ {
130
+ text: z.string().min(1).describe('Narration text for this segment. Will be synthesized as a single mp3.'),
131
+ voice_id: z.string().optional().describe('TTS voice preset. Omit to use workspace default.'),
132
+ workspace_id: z.string().optional().describe('Target workspace. Defaults to the current workspace.'),
133
+ },
134
+ async ({ text, voice_id, workspace_id }) => runSynthesisTtsTool({
135
+ text,
136
+ voice_id,
137
+ workspace_id,
138
+ currentWorkspaceId: CURRENT_WORKSPACE_ID,
139
+ api: lightconeApi,
140
+ })
141
+ );
142
+
143
+ // ── plan_video_segments (migrated from chat-bridge; TTS decoupled) ────────
144
+ // Pure planner — takes per-segment {text, audio_path, visual_kind, ...} and
145
+ // returns segments with audio_duration_ms / presentation.duration / dwell_ms
146
+ // / subtitle_text filled in. Caller MUST run synthesize_tts per segment first
147
+ // and pass the resulting audio_path here. No longer synthesizes TTS itself
148
+ // (V2 migration; see docs/scenario-content-creation/video-synthesis-design.md).
149
+ server.tool(
150
+ 'plan_video_segments',
151
+ 'Universal audio-video sync planning step. For each segment, reads the supplied audio_path via ffprobe, '
152
+ + 'measures audio duration, and returns a planned segments array with audio_path / audio_duration_ms / '
153
+ + 'subtitle_text / presentation.duration / dwell_ms filled in — ready to pass directly to both '
154
+ + 'record_url_narration (as the recording plan) AND compose_video_v2 (as the segment list). '
155
+ + 'Must be called before compose_video_v2 when any segment has audio_path.\n\n'
156
+ + 'Inputs per segment: {text, audio_path (required, from synthesize_tts), visual_kind, visual_path or visual_paths, '
157
+ + 'optionally transition / presentation.style}. Standard chain: synthesize_tts × N → plan_video_segments → '
158
+ + 'record_url_narration + compose_video_v2 (both use the same plan output).',
159
+ {
160
+ segments: z.array(z.object({
161
+ text: z.string().describe('Narration text for this segment — used as subtitle_text in the output.'),
162
+ audio_path: z.string().describe('Absolute path to the segment\'s mp3 (from synthesize_tts).'),
163
+ visual_kind: z.enum(['image', 'video', 'gif', 'carousel']).describe('Type of visual for compose_video_v2.'),
164
+ visual_path: z.string().optional().describe('Absolute path to a single image / video / gif file.'),
165
+ visual_paths: z.array(z.string()).optional().describe('For carousel: array of image paths, one per card.'),
166
+ transition: z.enum(['cut', 'fade', 'crossfade']).optional().describe('Transition to next segment. Default cut.'),
167
+ presentation: z.object({
168
+ style: z.enum(['static', 'scroll']).optional(),
169
+ }).optional().describe('Optional presentation hints (style only). duration/per_card_duration are computed.'),
170
+ dwell_ms: z.number().optional().describe('Optional override for record_url_narration phase duration. Default = audio_duration_ms.'),
171
+ })).describe('Segments to plan. audio_path is required for each.'),
172
+ },
173
+ async ({ segments }) => {
174
+ const result = await runPlanVideoSegmentsTool({ segments });
175
+ if (!result?.isError) _planVideoSegmentsCalledThisSession = true;
176
+ return result;
177
+ }
178
+ );
179
+
180
+ // ── compose_video_v2 (migrated from chat-bridge) ──────────────────────────
181
+ // Tool-level enforcement of the standard chain: TTS-bearing segments require
182
+ // plan_video_segments to have run earlier in this session. Without it, manual
183
+ // dwell/duration math has repeatedly produced misaligned subtitles, silent
184
+ // tails, and re-records (Task #25/#26 trial).
185
+ server.tool(
186
+ 'compose_video_v2',
187
+ 'Compose a video from a list of segments using ffmpeg. Each segment has a visual source (image / scroll / '
188
+ + 'carousel / video / gif), optional audio, and optional subtitle text. Subtitles are burned in when '
189
+ + 'subtitle_text is provided. Segments are concatenated in order; outro clips are appended after.\n\n'
190
+ + 'When any segment has audio_path, MUST be preceded by plan_video_segments in the same session '
191
+ + '(plan_video_segments fills duration/subtitle_text/audio_path mechanically; manual alignment is rejected). '
192
+ + 'Returns a local mp4 path + size_bytes.',
193
+ {
194
+ segments: z.array(z.object({
195
+ visual_path: z.string().optional().describe('Absolute path to a single image / video / gif.'),
196
+ visual_paths: z.array(z.string()).optional().describe('For carousel: array of image paths, one per card.'),
197
+ visual_kind: z.enum(['image', 'video', 'gif', 'carousel']).describe('Type of visual.'),
198
+ presentation: z.object({
199
+ style: z.enum(['static', 'scroll']).optional(),
200
+ duration: z.number().optional().describe('Segment duration in seconds. Required for image/scroll.'),
201
+ per_card_duration: z.number().optional().describe('Seconds per card for carousel.'),
202
+ }).optional(),
203
+ audio_path: z.string().nullable().optional().describe('Absolute path to audio (mp3). null/omit for silence.'),
204
+ subtitle_text: z.string().optional().describe('Narration text to burn as subtitle. Displayed for the full segment duration.'),
205
+ transition: z.enum(['cut', 'fade', 'crossfade']).optional().describe('Transition to next segment. Default cut.'),
206
+ })).describe('Ordered list of video segments.'),
207
+ outro_paths: z.array(z.string()).optional().describe('Absolute paths to outro video clips appended at end.'),
208
+ resolution: z.string().optional().describe('Output resolution WxH. Default "1080x1920".'),
209
+ output_path: z.string().optional().describe('Absolute output path. Auto-generated if omitted.'),
210
+ },
211
+ async (args) => {
212
+ const segments = Array.isArray(args?.segments) ? args.segments : [];
213
+ const hasNarration = segments.some(s => typeof s?.audio_path === 'string' && s.audio_path.trim());
214
+ if (hasNarration && !_planVideoSegmentsCalledThisSession) {
215
+ return toolError(
216
+ 'compose_video_v2 refused: TTS-bearing segments (audio_path present) require plan_video_segments '
217
+ + 'to have run earlier in this session — it mechanically aligns audio_duration / video_duration / '
218
+ + 'subtitle_text with a safety buffer. Manual dwell/duration math has repeatedly produced misaligned '
219
+ + 'subtitles and silent tails that force re-recording.\n\n'
220
+ + 'Standard chain: synthesize_tts × N (per segment) → plan_video_segments(segments with text + audio_path + '
221
+ + 'visual_kind + visual_path) → compose_video_v2 (use the returned segments verbatim, only swap '
222
+ + 'visual_path/visual_kind for the real media). Call plan_video_segments now and pass its output here.'
223
+ );
224
+ }
225
+ return runComposeVideoV2Tool({ ...args, workspaceDir: WORKSPACE_DIR });
226
+ }
227
+ );
228
+
229
+ // ── record_url_narration (migrated from chat-bridge) ──────────────────────
230
+ // Records a silent mp4 of a URL via Chromium+Xvfb+Playwright recordVideo,
231
+ // driven by a beat-by-beat plan. Hard-block: requires plan_video_segments to
232
+ // have run in this session — hand-written dwell_ms has drifted from TTS
233
+ // audio in production runs (Tasks #20/#25/#26), forcing re-records.
234
+ server.tool(
235
+ 'record_url_narration',
236
+ 'Record a silent video of a URL by driving Chromium on an Xvfb display and capturing it with Playwright recordVideo, driven by a video plan; ffmpeg then transcodes the recording to mp4. Outputs a silent mp4 that can be passed to compose_video_v2 as a video-kind segment with an audio_path for narration.\n\nUse this as the canonical recording step for URL-narration videos. Falls back: if the page needs interactions outside the visual_action vocabulary (clicks, waits, OCR loops), use Monitor (Bash) with custom Playwright instead.\n\nMUST be preceded by plan_video_segments in the same session — feed plan_video_segments\'s `segments` array as `plan.sections` so dwell_ms aligns mechanically with TTS audio_duration_ms (hand-written dwell_ms has drifted and forced re-records in production).\n\nRuntime requirements: this tool only works on a Linux daemon machine with Xvfb + Chromium + ffmpeg installed (ffmpeg is used to transcode the recording to mp4; no x11grab device support needed). macOS / Windows daemons will fail at startup.',
237
+ {
238
+ url: z.string().describe('Page URL to record'),
239
+ plan: z.record(z.any()).describe(
240
+ 'A video plan: an object with `phases` (or `sections`), each a "visual beat" with '
241
+ + '`action` (scroll_to_dwell / linear_scroll_during / scroll_back / hold / ...), a '
242
+ + 'target (`target_y` or `focus_region:[y1,y2]`) for scroll-type actions, and '
243
+ + '`dwell_ms` (how long to hold that beat — should match the segment\'s TTS duration).\n\n'
244
+ + 'Standard chain: pass plan_video_segments\'s `segments` array directly as `plan.sections` — '
245
+ + 'each segment\'s `dwell_ms` is already set to its `audio_duration_ms`.\n\n'
246
+ + 'For RECRUITMENT URLs (mp.weixin.qq.com / 校招 / 实习 / 岗位 content), each section MUST '
247
+ + 'also declare `target_y_content_label` — a short Chinese label describing what content '
248
+ + 'sits at that pixel y position on the page (e.g. "标题区" / "岗位信息卡片" / "公司介绍" / '
249
+ + '"届别说明"). Labels matching forbidden regions ("二维码" / "扫码" / "投递入口" / "投递方式" / '
250
+ + '"联系方式" / "微信号" / "QR" / "阅读原文" / "外链") will cause the tool to refuse the '
251
+ + 'recording — recruitment content must NOT dwell on these areas (see fragments.md '
252
+ + 'frag.short.recruitment_url_mode_policy). Pick a different target_y in the 标题/岗位 '
253
+ + 'information area and rewrite that section.'
254
+ ),
255
+ output_path: z.string().optional().describe('Workspace-relative output mp4 path. Default tmp/wx3_video/recorded-{ts}.mp4'),
256
+ events_path: z.string().optional().describe('Workspace-relative events.json path. Default ${output_path}.events.json'),
257
+ viewport: z.object({
258
+ width: z.number().optional(),
259
+ height: z.number().optional(),
260
+ }).optional().describe('Default 1080x1920 (mobile portrait). Override only if the plan requires a different shape.'),
261
+ fps: z.number().optional().describe('Default 30. Do not change unless needed.'),
262
+ settle_ms: z.number().optional().describe('Default 4000. Settle wait after navigation before recording starts.'),
263
+ },
264
+ async (args) => {
265
+ if (isBlockedCvmaxEditorVideoTool('record_url_narration')) {
266
+ return cvmaxEditorVideoToolError('record_url_narration');
267
+ }
268
+ return runRecordUrlNarrationTool({
269
+ args,
270
+ currentWorkspaceId: CURRENT_WORKSPACE_ID,
271
+ workspaceDir: WORKSPACE_DIR,
272
+ planVideoSegmentsCalled: _planVideoSegmentsCalledThisSession,
273
+ });
274
+ }
275
+ );
276
+
277
+ // ── render_text_to_image (migrated from chat-bridge) ──────────────────────
278
+ server.tool(
279
+ 'render_text_to_image',
280
+ 'Render text content into image(s) for video synthesis. style=scroll produces a single tall image (for a scrolling video segment); style=carousel produces one image per card (for a slide-show segment). Returns local file paths.',
281
+ {
282
+ content: z.union([z.string(), z.array(z.string())]).describe('Text content. For carousel, pass an array of strings — one per card. For scroll, pass a single string (or array joined with line breaks).'),
283
+ style: z.enum(['scroll', 'carousel']).describe('scroll: one tall image; carousel: one image per card.'),
284
+ theme: z.enum(['dark', 'light']).optional().describe('Color theme. Default dark.'),
285
+ width: z.number().optional().describe('Image width in pixels. Default 1080.'),
286
+ card_height: z.number().optional().describe('Card height in pixels (carousel) or viewport height (scroll baseline). Default 1920.'),
287
+ font_size: z.number().optional().describe('Base font size in pixels. Default 48.'),
288
+ },
289
+ async (args) => runRenderTextToImageTool(args)
290
+ );
291
+
292
+ // ── render_html_to_image (migrated from chat-bridge) ──────────────────────
293
+ server.tool(
294
+ 'render_html_to_image',
295
+ 'Render a raw HTML string to a PNG image by navigating to it as a local file:// page. Unlike evaluate_script+document.write on about:blank, this preserves file:// origin so <img src="file:///..."> references load correctly. Returns the output image path.',
296
+ {
297
+ html: z.string().describe('Full HTML document to render (including <!doctype>, <html>, <head>, <body>).'),
298
+ output_path: z.string().optional().describe('Absolute path to save the PNG. Auto-generated in /tmp if omitted.'),
299
+ viewport_width: z.number().optional().describe('Viewport width in pixels. Default 1080.'),
300
+ viewport_height: z.number().optional().describe('Viewport height in pixels. Default 1920.'),
301
+ wait_until: z.enum(['load', 'networkidle', 'domcontentloaded']).optional().describe('Navigation wait condition. Default load.'),
302
+ },
303
+ async (args) => runRenderHtmlToImageTool(args)
304
+ );
305
+
306
+ // ── take_page_screenshot (migrated from chat-bridge) ──────────────────────
307
+ server.tool(
308
+ 'take_page_screenshot',
309
+ 'Open a URL with a headless browser and capture a screenshot. crop=above_fold captures only the visible viewport (ideal for thumbnail-style frames); crop=full_page captures the entire page height.',
310
+ {
311
+ url: z.string().describe('Page URL to screenshot.'),
312
+ crop: z.enum(['above_fold', 'full_page']).optional().describe('Capture mode. Default above_fold.'),
313
+ viewport: z.object({
314
+ width: z.number().optional(),
315
+ height: z.number().optional(),
316
+ }).optional().describe('Viewport size. Default 390×844 (mobile).'),
317
+ wait_for: z.enum(['load', 'networkidle', 'domcontentloaded']).optional().describe('Page load event to wait for before screenshotting. Default networkidle.'),
318
+ },
319
+ async (args) => runTakePageScreenshotTool(args)
320
+ );
321
+
66
322
  const transport = new StdioServerTransport();
67
323
  await server.connect(transport);
68
324
  console.error('[official-media-tools] MCP Server started');
@@ -0,0 +1,41 @@
1
+ // Minimal HTTP helper for media-tools to call lightcone server's internal API.
2
+ // Wraps fetch with the right URL prefix + auth headers + JSON encoding.
3
+ //
4
+ // Unlike daemon/src/chat-bridge.js's `api`, this helper does NOT route through
5
+ // the governance/cache layer — media-tools is a separate stdio MCP server and
6
+ // governance integration is chat-bridge-specific. If a tool here needs
7
+ // governance-mediated execution, route it through chat-bridge's thin-proxy
8
+ // instead (see weixin-tools for the pattern).
9
+
10
+ const SERVER_URL = String(process.env.SERVER_URL ?? '').replace(/\/+$/, '');
11
+ const MACHINE_API_KEY = String(process.env.MACHINE_API_KEY ?? '');
12
+ const AGENT_ID = String(process.env.AGENT_ID ?? '');
13
+
14
+ if (!SERVER_URL) throw new Error('media-tools: SERVER_URL env var is required');
15
+ if (!MACHINE_API_KEY) throw new Error('media-tools: MACHINE_API_KEY env var is required');
16
+ if (!AGENT_ID) throw new Error('media-tools: AGENT_ID env var is required');
17
+
18
+ export async function lightconeApi(method, apiPath, body) {
19
+ const url = `${SERVER_URL}/internal/agent/${encodeURIComponent(AGENT_ID)}${apiPath}`;
20
+ const res = await fetch(url, {
21
+ method,
22
+ headers: {
23
+ 'Content-Type': 'application/json',
24
+ 'Authorization': `Bearer ${MACHINE_API_KEY}`,
25
+ },
26
+ body: body != null ? JSON.stringify(body) : undefined,
27
+ });
28
+ if (!res.ok) {
29
+ let text = '';
30
+ try { text = await res.text(); } catch { /* ignore */ }
31
+ const err = new Error(`lightcone ${method} ${apiPath} → ${res.status}: ${text.slice(0, 400)}`);
32
+ err.status = res.status;
33
+ err.body = text;
34
+ throw err;
35
+ }
36
+ return res.json();
37
+ }
38
+
39
+ // Exposed so tools can construct workspace-aware fallback identifiers.
40
+ export const CURRENT_AGENT_ID = AGENT_ID;
41
+ export const CURRENT_WORKSPACE_ID = String(process.env.WORKSPACE_ID ?? '');
@@ -5,7 +5,14 @@
5
5
  "runtime": "node",
6
6
  "entrypoint": "index.js",
7
7
  "tool_declarations": [
8
- { "name": "add_title_effects", "classification": "cacheable" }
8
+ { "name": "add_title_effects", "classification": "cacheable" },
9
+ { "name": "synthesize_tts", "classification": "mandatory" },
10
+ { "name": "plan_video_segments", "classification": "mandatory" },
11
+ { "name": "compose_video_v2", "classification": "mandatory" },
12
+ { "name": "record_url_narration", "classification": "mandatory" },
13
+ { "name": "render_text_to_image", "classification": "cacheable" },
14
+ { "name": "render_html_to_image", "classification": "cacheable" },
15
+ { "name": "take_page_screenshot", "classification": "cacheable" }
9
16
  ],
10
17
  "smoke_test": {
11
18
  "tool": "add_title_effects",
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@lightcone-ai/daemon",
3
- "version": "0.16.1",
3
+ "version": "0.17.0",
4
4
  "type": "module",
5
5
  "main": "src/index.js",
6
6
  "bin": {
@@ -2,22 +2,19 @@
2
2
  import { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js';
3
3
  import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js';
4
4
  import { z } from 'zod';
5
- import { createReadStream, existsSync, mkdirSync, readFileSync, writeFileSync, statSync } from 'fs';
5
+ import { existsSync, mkdirSync, readFileSync, writeFileSync, statSync } from 'fs';
6
6
  import { createHash, randomUUID } from 'crypto';
7
7
  import path, { extname } from 'path';
8
8
  import os from 'os';
9
- import { recordUrlNarration } from './_vendor/video/recorder/index.js';
10
9
  import { writeLocalFileToWorkspace, resolveWorkspaceFileUploadPlan } from './workspace-file-upload.js';
11
10
  import { UploadJobManager } from './upload-job-manager.js';
12
11
  import { createUploadServerApi } from './upload-server-api.js';
13
- import { runRecordUrlNarrationTool } from './record-url-narration-tool.js';
12
+ // record_url_narration moved to media-tools MCP server (V4 migration);
13
+ // recorder import / handler are now consumed there, not from chat-bridge.
14
14
  import { runSubmitToLibraryTool } from './submit-to-library-tool.js';
15
- import { runRenderTextToImageTool } from './tools/render-text-to-image.js';
16
- import { runRenderHtmlToImageTool } from './tools/render-html-to-image.js';
17
- import { runSynthesisTtsTool } from './tools/synthesize-tts.js';
18
- import { runPlanVideoSegmentsTool } from './tools/plan-video-segments.js';
19
- import { runComposeVideoV2Tool } from './tools/compose-video-v2.js';
20
- import { runTakePageScreenshotTool } from './tools/take-page-screenshot.js';
15
+ // render_text_to_image, render_html_to_image, take_page_screenshot moved to
16
+ // media-tools MCP server (V5 migration). Handlers still live in
17
+ // daemon/src/tools/ as shared modules and are imported by media-tools.
21
18
  import { runGetLibraryFileTool } from './tools/get-library-file.js';
22
19
  import { isLeaseInvalidated, clearInvalidatedLease } from './governance-state.js';
23
20
  import { classifyLeaseWindow } from './lease-window.js';
@@ -72,8 +69,9 @@ let currentWorkspaceId = WORKSPACE_ID;
72
69
  // Remove entirely once the new atomic tool framework is stable and the legacy pipeline retires.
73
70
  const CVMAX_WORKSPACE_ID = process.env.BLOCKED_EDITOR_WORKSPACE_ID ?? '';
74
71
  const CVMAX_EDITOR_IN_CHIEF_AGENT_ID = process.env.BLOCKED_EDITOR_AGENT_ID ?? '';
72
+ // record_url_narration moved to media-tools and carries its own copy of this
73
+ // block. submit_to_library stays here.
75
74
  const CVMAX_EDITOR_BLOCKED_VIDEO_TOOLS = new Set([
76
- 'record_url_narration',
77
75
  'submit_to_library',
78
76
  ]);
79
77
 
@@ -162,7 +160,6 @@ const DEFAULT_TOOL_CLASSIFICATION = {
162
160
  update_goal_field: 'mandatory',
163
161
  supersede_goal_field: 'mandatory',
164
162
  request_credential_auth: 'mandatory',
165
- record_url_narration: 'mandatory',
166
163
  submit_to_library: 'mandatory',
167
164
  register_data_source: 'mandatory',
168
165
  bind_workspace_scenario: 'mandatory',
@@ -491,37 +488,6 @@ async function directApi(method, apiPath, body) {
491
488
  return res.json();
492
489
  }
493
490
 
494
- async function directApiVideoUpload(apiPath, {
495
- localPath,
496
- filename,
497
- contentType = 'video/mp4',
498
- }) {
499
- const url = `${SERVER_URL}/internal/agent/${AGENT_ID}${apiPath}`;
500
- const headers = {
501
- 'Authorization': `Bearer ${MACHINE_API_KEY}`,
502
- 'Content-Type': contentType,
503
- };
504
- if (filename) headers['X-File-Name'] = filename;
505
-
506
- let res;
507
- try {
508
- res = await fetch(url, {
509
- method: 'POST',
510
- headers,
511
- body: createReadStream(localPath),
512
- duplex: 'half',
513
- });
514
- } catch (error) {
515
- throw buildDirectApiTransportError({ method: 'POST', apiPath, error });
516
- }
517
-
518
- if (!res.ok) {
519
- const text = await res.text();
520
- throw buildDirectApiHttpError({ method: 'POST', apiPath, status: res.status, text });
521
- }
522
- return res.json();
523
- }
524
-
525
491
  async function callGovernance(payload, { retry = true } = {}) {
526
492
  const attempts = retry ? 2 : 1;
527
493
  let lastError = null;
@@ -609,63 +575,6 @@ async function governanceRoundTrip({ method, apiPath, body, toolName, classifica
609
575
  return directApi(method, apiPath, nextBody);
610
576
  }
611
577
 
612
- async function runMandatoryLocalTool({ toolName, toolInput = {}, executor }) {
613
- const classification = TOOL_CLASSIFICATION[toolName] ?? 'mandatory';
614
- const traceId = randomUUID();
615
- enqueueBundleEvent('tool_call_started', {
616
- trace_id: traceId,
617
- tool_name: toolName,
618
- tool_classification: classification,
619
- method: 'LOCAL',
620
- api_path: '/local-tool',
621
- });
622
-
623
- try {
624
- await ensureGovernanceContext();
625
- const governancePayload = {
626
- spawn_bundle_id: governanceContext.spawnBundleId,
627
- policy_version: governanceContext.policyVersion,
628
- tool_name: toolName,
629
- tool_input: toolInput,
630
- tool_classification: classification,
631
- agent_id: AGENT_ID,
632
- idempotency_key: randomUUID(),
633
- lease_id: governanceContext.lease?.lease_id ?? null,
634
- };
635
- const governance = await callGovernance(governancePayload, { retry: true });
636
- if (governance.policy_lease) applyPolicyLease(governance.policy_lease);
637
- if (governance.verdict === 'reject' || governance.verdict === 'defer_human') {
638
- throw governanceError(governanceReasonCode(governance.reason));
639
- }
640
-
641
- const checkedInput = (governance.verdict === 'modify' && governance.modified_input && typeof governance.modified_input === 'object')
642
- ? { ...toolInput, ...governance.modified_input }
643
- : toolInput;
644
- const result = await executor(checkedInput);
645
- if (CACHE_INVALIDATION_TOOLS.has(toolName)) {
646
- governanceContext.cache.clear();
647
- }
648
-
649
- enqueueBundleEvent('tool_call_succeeded', {
650
- trace_id: traceId,
651
- tool_name: toolName,
652
- tool_classification: classification,
653
- source: 'governance_roundtrip',
654
- });
655
- return result;
656
- } catch (error) {
657
- if (shouldEmitToolCallFailed(error)) {
658
- enqueueBundleEvent('tool_call_failed', {
659
- trace_id: traceId,
660
- tool_name: toolName,
661
- tool_classification: classification,
662
- reason: toolCallFailedReason(error),
663
- });
664
- }
665
- throw error;
666
- }
667
- }
668
-
669
578
  function renewCacheInBackground({ method, apiPath, body, toolName, cacheKey }) {
670
579
  if (governanceContext.renewalInFlight.has(cacheKey)) return;
671
580
  governanceContext.renewalInFlight.add(cacheKey);
@@ -1375,102 +1284,12 @@ server.tool('request_credential_auth',
1375
1284
  }
1376
1285
  );
1377
1286
 
1378
- // ── render_text_to_image ───────────────────────────────────────────────────────
1379
- server.tool('render_text_to_image',
1380
- 'Render text content into image(s) for video synthesis. style=scroll produces a single tall image (for a scrolling video segment); style=carousel produces one image per card (for a slide-show segment). Returns local file paths.',
1381
- {
1382
- content: z.union([z.string(), z.array(z.string())]).describe('Text content. For carousel, pass an array of strings — one per card. For scroll, pass a single string (or array joined with line breaks).'),
1383
- style: z.enum(['scroll', 'carousel']).describe('scroll: one tall image; carousel: one image per card.'),
1384
- theme: z.enum(['dark', 'light']).optional().describe('Color theme. Default dark.'),
1385
- width: z.number().optional().describe('Image width in pixels. Default 1080.'),
1386
- card_height: z.number().optional().describe('Card height in pixels (carousel) or viewport height (scroll baseline). Default 1920.'),
1387
- font_size: z.number().optional().describe('Base font size in pixels. Default 48.'),
1388
- },
1389
- async (args) => runRenderTextToImageTool(args)
1390
- );
1391
-
1392
- // ── render_html_to_image ───────────────────────────────────────────────────────
1393
- server.tool('render_html_to_image',
1394
- 'Render a raw HTML string to a PNG image by navigating to it as a local file:// page. Unlike evaluate_script+document.write on about:blank, this preserves file:// origin so <img src="file:///..."> references load correctly. Returns the output image path.',
1395
- {
1396
- html: z.string().describe('Full HTML document to render (including <!doctype>, <html>, <head>, <body>).'),
1397
- output_path: z.string().optional().describe('Absolute path to save the PNG. Auto-generated in /tmp if omitted.'),
1398
- viewport_width: z.number().optional().describe('Viewport width in pixels. Default 1080.'),
1399
- viewport_height: z.number().optional().describe('Viewport height in pixels. Default 1920.'),
1400
- wait_until: z.enum(['load', 'networkidle', 'domcontentloaded']).optional().describe('Navigation wait condition. Default load.'),
1401
- },
1402
- async (args) => runRenderHtmlToImageTool(args)
1403
- );
1404
-
1405
- // ── synthesize_tts ─────────────────────────────────────────────────────────────
1406
- server.tool('synthesize_tts',
1407
- 'Convert text to speech using the workspace MiniMax TTS credential. Returns a local mp3 file path and duration. Use this to generate narration audio for individual video segments.',
1408
- {
1409
- text: z.string().describe('Text to synthesize. Keep under 500 characters per call for reliable results.'),
1410
- voice_id: z.string().optional().describe('MiniMax voice ID. Omit to use the workspace default voice.'),
1411
- workspace_id: z.string().optional().describe('Target workspace. Defaults to current workspace context.'),
1412
- },
1413
- async (args) => runSynthesisTtsTool({ ...args, currentWorkspaceId, api })
1414
- );
1415
-
1416
- // ── plan_video_segments ────────────────────────────────────────────────────────
1417
- server.tool('plan_video_segments',
1418
- 'Universal audio-video sync planning step. For each segment, call TTS to get the real audio duration, then compute the visual duration with a safety buffer. Returns a planned segments array ready to pass directly to compose_video_v2 (with audio_path, presentation.duration/per_card_duration, and subtitle_text pre-filled). Always call this before compose_video_v2 when you have narration text.',
1419
- {
1420
- segments: z.array(z.object({
1421
- text: z.string().describe('Narration text for this segment. TTS will be generated from this.'),
1422
- visual_kind: z.enum(['image', 'video', 'gif', 'carousel']).describe('Type of visual.'),
1423
- visual_path: z.string().optional().describe('Absolute path to a single image, video, or gif file.'),
1424
- visual_paths: z.array(z.string()).optional().describe('For carousel: array of image paths, one per card.'),
1425
- transition: z.enum(['cut', 'fade', 'crossfade']).optional().describe('Transition to next segment. Default cut.'),
1426
- presentation: z.object({
1427
- style: z.enum(['static', 'scroll']).optional(),
1428
- }).optional().describe('Partial presentation hints (style only). duration/per_card_duration are computed from TTS.'),
1429
- })).describe('Segments to plan. Each must have narration text and visual info.'),
1430
- voice_id: z.string().optional().describe('TTS voice ID. Omit to use workspace default.'),
1431
- workspace_id: z.string().optional().describe('Target workspace. Defaults to current workspace context.'),
1432
- },
1433
- async (args) => runPlanVideoSegmentsTool({ ...args, currentWorkspaceId, api })
1434
- );
1435
-
1436
- // ── compose_video_v2 ───────────────────────────────────────────────────────────
1437
- server.tool('compose_video_v2',
1438
- 'Compose a video from a list of segments using ffmpeg. Each segment has a visual source (image/scroll/carousel/video/gif), optional audio, and optional subtitle text. Subtitles are burned into the video by default when subtitle_text is provided. Segments are concatenated in order; outro clips are appended at the end. Returns a local mp4 path.\n\nTypical flow: plan_video_segments → compose_video_v2 (segments output fed directly in).',
1439
- {
1440
- segments: z.array(z.object({
1441
- visual_path: z.string().optional().describe('Absolute path to a single image, video, or gif file.'),
1442
- visual_paths: z.array(z.string()).optional().describe('For carousel: array of image paths, one per card.'),
1443
- visual_kind: z.enum(['image', 'video', 'gif', 'carousel']).describe('Type of visual.'),
1444
- presentation: z.object({
1445
- style: z.enum(['static', 'scroll']).optional().describe('For image: static (default) or scroll (pan upward).'),
1446
- duration: z.number().optional().describe('Segment duration in seconds. Required for image/scroll.'),
1447
- per_card_duration: z.number().optional().describe('Seconds per card for carousel.'),
1448
- }).optional(),
1449
- audio_path: z.string().nullable().optional().describe('Absolute path to audio (mp3). null or omit for silence.'),
1450
- subtitle_text: z.string().optional().describe('Narration text to burn as subtitle for this segment. Displayed for the full segment duration.'),
1451
- transition: z.enum(['cut', 'fade', 'crossfade']).optional().describe('Transition to next segment. Default cut.'),
1452
- })).describe('Ordered list of video segments.'),
1453
- outro_paths: z.array(z.string()).optional().describe('Absolute paths to outro video clips appended after all segments.'),
1454
- resolution: z.string().optional().describe('Output resolution WxH. Default "1080x1920".'),
1455
- output_path: z.string().optional().describe('Absolute output path for the mp4. Auto-generated if omitted.'),
1456
- },
1457
- async (args) => runComposeVideoV2Tool({ ...args, workspaceDir: WORKSPACE_DIR })
1458
- );
1459
-
1460
- // ── take_page_screenshot ───────────────────────────────────────────────────────
1461
- server.tool('take_page_screenshot',
1462
- 'Open a URL with a headless browser and capture a screenshot. crop=above_fold captures only the visible viewport (ideal for thumbnail-style frames); crop=full_page captures the entire page height.',
1463
- {
1464
- url: z.string().describe('Page URL to screenshot.'),
1465
- crop: z.enum(['above_fold', 'full_page']).optional().describe('Capture mode. Default above_fold.'),
1466
- viewport: z.object({
1467
- width: z.number().optional(),
1468
- height: z.number().optional(),
1469
- }).optional().describe('Viewport size. Default 390×844 (mobile).'),
1470
- wait_for: z.enum(['load', 'networkidle', 'domcontentloaded']).optional().describe('Page load event to wait for before screenshotting. Default networkidle.'),
1471
- },
1472
- async (args) => runTakePageScreenshotTool(args)
1473
- );
1287
+ // render_text_to_image, render_html_to_image, take_page_screenshot, synthesize_tts,
1288
+ // plan_video_segments, compose_video_v2, record_url_narration — all moved to
1289
+ // media-tools MCP server (V1–V5 migration; see
1290
+ // docs/scenario-content-creation/video-synthesis-design.md). The whole video
1291
+ // pipeline now lives in one stdio server so session-scoped flags (plan-was-
1292
+ // called) can gate downstream tools.
1474
1293
 
1475
1294
  // ── get_library_file ───────────────────────────────────────────────────────────
1476
1295
  server.tool('get_library_file',
@@ -1482,48 +1301,10 @@ server.tool('get_library_file',
1482
1301
  async (args) => runGetLibraryFileTool({ ...args, currentWorkspaceId, api, SERVER_URL, MACHINE_API_KEY, workspaceDir: WORKSPACE_DIR })
1483
1302
  );
1484
1303
 
1485
- // ── record_url_narration ────────────────────────────────────────────────────────
1486
- server.tool('record_url_narration',
1487
- 'Record a silent video of a URL by driving Chromium on an Xvfb display and capturing it with Playwright recordVideo, driven by a video plan; ffmpeg then transcodes the recording to mp4. Outputs a silent mp4 that can be passed to compose_video_v2 as a video-kind segment with an audio_path for narration.\n\nUse this as the canonical recording step for URL-narration videos. Falls back: if the page needs interactions outside the visual_action vocabulary (clicks, waits, OCR loops), use Monitor (Bash) with custom Playwright instead.\n\nRuntime requirements: this tool only works on a Linux daemon machine with Xvfb + Chromium + ffmpeg installed (ffmpeg is used to transcode the recording to mp4; no x11grab device support needed). macOS / Windows daemons will fail at startup.',
1488
- {
1489
- url: z.string().describe('Page URL to record'),
1490
- plan: z.record(z.any()).describe(
1491
- 'A video plan: an object with `phases` (or `sections`), each a "visual beat" with '
1492
- + '`action` (scroll_to_dwell / linear_scroll_during / scroll_back / hold / ...), a '
1493
- + 'target (`target_y` or `focus_region:[y1,y2]`) for scroll-type actions, and '
1494
- + '`dwell_ms` (how long to hold that beat — should match the segment\'s TTS duration).\n\n'
1495
- + 'For RECRUITMENT URLs (mp.weixin.qq.com / 校招 / 实习 / 岗位 content), each section MUST '
1496
- + 'also declare `target_y_content_label` — a short Chinese label describing what content '
1497
- + 'sits at that pixel y position on the page (e.g. "标题区" / "岗位信息卡片" / "公司介绍" / '
1498
- + '"届别说明"). Look at the take_page_screenshot output, find the y-pixel, and label it. '
1499
- + 'Labels matching forbidden regions ("二维码" / "扫码" / "投递入口" / "投递方式" / "联系方式" / '
1500
- + '"微信号" / "QR" / "阅读原文" / "外链") will cause the tool to refuse the recording — '
1501
- + 'recruitment content must NOT dwell on these areas (see fragments.md '
1502
- + 'frag.short.recruitment_url_mode_policy). Pick a different target_y in the 标题/岗位 '
1503
- + 'information area and rewrite that section.'
1504
- ),
1505
- output_path: z.string().optional().describe('Workspace-relative output mp4 path. Default tmp/wx3_video/recorded-{ts}.mp4'),
1506
- events_path: z.string().optional().describe('Workspace-relative events.json path. Default ${output_path}.events.json'),
1507
- viewport: z.object({
1508
- width: z.number().optional(),
1509
- height: z.number().optional(),
1510
- }).optional().describe('Default 1080x1920 (mobile portrait). Override only if the plan requires a different shape.'),
1511
- fps: z.number().optional().describe('Default 30. Do not change unless needed.'),
1512
- settle_ms: z.number().optional().describe('Default 4000. Settle wait after navigation before recording starts.'),
1513
- },
1514
- async (args) => {
1515
- if (isBlockedCvmaxEditorVideoTool('record_url_narration')) {
1516
- return cvmaxEditorVideoToolError('record_url_narration');
1517
- }
1518
- return runRecordUrlNarrationTool({
1519
- args,
1520
- currentWorkspaceId,
1521
- workspaceDir: WORKSPACE_DIR,
1522
- runMandatoryLocalToolFn: runMandatoryLocalTool,
1523
- recordUrlNarrationFn: recordUrlNarration,
1524
- });
1525
- }
1526
- );
1304
+ // record_url_narration moved to media-tools MCP server (V4 migration). The
1305
+ // session-scoped plan_video_segments check now lives there alongside the
1306
+ // other video pipeline tools (synthesize_tts / plan_video_segments /
1307
+ // compose_video_v2). The CVMAX editor_in_chief block also moved with it.
1527
1308
 
1528
1309
  // ── submit_to_library ──────────────────────────────────────────────────────────
1529
1310
  server.tool('submit_to_library',
@@ -1,7 +1,22 @@
1
- import { mkdirSync, writeFileSync } from 'fs';
2
- import { randomUUID } from 'crypto';
3
- import path from 'path';
4
- import os from 'os';
1
+ // plan_video_segments — pure audio/video alignment planner.
2
+ //
3
+ // Takes per-segment {text, audio_path, visual_kind, ...} and returns unified
4
+ // plan segments with:
5
+ // - audio_duration_ms (read via ffprobe from the provided audio_path)
6
+ // - subtitle_text (= text)
7
+ // - presentation.duration / per_card_duration (audio_duration + buffer)
8
+ // - dwell_ms (= audio_duration; lets the same segment drive record_url_narration)
9
+ //
10
+ // Previously this tool ALSO synthesized TTS internally — which duplicated
11
+ // the work when callers had already run synthesize_tts, and caused the
12
+ // "wrong standard chain" confusion in fragments.md. TTS is now decoupled:
13
+ // callers must run synthesize_tts per segment first and pass the resulting
14
+ // audio_path here. See docs/scenario-content-creation/video-synthesis-design.md.
15
+ //
16
+ // Lives in daemon/src/tools/ as a reusable handler module; the MCP-tool
17
+ // registration lives in daemon/mcp-servers/official/media-tools/index.js.
18
+
19
+ import { spawn } from 'node:child_process';
5
20
 
6
21
  function toolText(text) {
7
22
  return { content: [{ type: 'text', text }] };
@@ -11,125 +26,99 @@ function toolError(text) {
11
26
  return { isError: true, content: [{ type: 'text', text }] };
12
27
  }
13
28
 
14
- function inferAudioExt(url) {
15
- const clean = String(url ?? '').split('?')[0];
16
- const ext = path.extname(clean).toLowerCase();
17
- return ['.mp3', '.wav', '.flac', '.aac', '.ogg'].includes(ext) ? ext : '.mp3';
18
- }
19
-
20
- async function synthesizeSegmentTts(text, { workspace_id, voice_id, api }) {
21
- const payload = { workspace_id, text, speed: 1, format: 'mp3' };
22
- if (voice_id) payload.voice_preset = String(voice_id).trim();
23
-
24
- const data = await api('POST', '/tts/voiceover', payload);
25
- const remoteAudioUrl = String(data.audio_url ?? '').trim();
26
- if (!remoteAudioUrl) throw new Error('TTS API did not return audio_url');
27
-
28
- const downloadRes = await fetch(remoteAudioUrl);
29
- if (!downloadRes.ok) throw new Error(`Failed to download audio (${downloadRes.status})`);
30
-
31
- const fileBuffer = Buffer.from(await downloadRes.arrayBuffer());
32
- const outDir = path.join(os.tmpdir(), 'lightcone-tts');
33
- mkdirSync(outDir, { recursive: true });
34
- const ext = inferAudioExt(remoteAudioUrl);
35
- const outPath = path.join(outDir, `tts-${Date.now()}-${randomUUID().slice(0, 8)}${ext}`);
36
- writeFileSync(outPath, fileBuffer);
37
-
38
- const durationMs = Number(data.duration_ms ?? 0);
39
- return { audio_path: outPath, audio_duration_ms: durationMs };
29
+ async function probeAudioDurationMs(audioPath) {
30
+ return new Promise((resolve, reject) => {
31
+ const proc = spawn('ffprobe', [
32
+ '-v', 'error',
33
+ '-show_entries', 'format=duration',
34
+ '-of', 'csv=p=0',
35
+ audioPath,
36
+ ], { stdio: ['ignore', 'pipe', 'pipe'] });
37
+ const out = [];
38
+ const err = [];
39
+ proc.stdout.on('data', c => out.push(c));
40
+ proc.stderr.on('data', c => err.push(c));
41
+ proc.on('close', code => {
42
+ if (code !== 0) {
43
+ return reject(new Error(`ffprobe exited ${code}: ${Buffer.concat(err).toString().slice(-300)}`));
44
+ }
45
+ const seconds = parseFloat(Buffer.concat(out).toString().trim());
46
+ if (!Number.isFinite(seconds)) {
47
+ return reject(new Error(`ffprobe returned non-numeric duration: ${Buffer.concat(out).toString().slice(0, 200)}`));
48
+ }
49
+ resolve(Math.round(seconds * 1000));
50
+ });
51
+ proc.on('error', err2 => reject(new Error(`ffprobe spawn failed: ${err2.message}`)));
52
+ });
40
53
  }
41
54
 
42
- // Compute segment duration from audio duration: audio + 0.5s buffer, rounded up to nearest 0.5s.
55
+ // Plan visual duration from audio duration: audio + buffer, rounded up to the
56
+ // nearest 0.5s. scroll-style images get a longer buffer because eyes need
57
+ // extra time to follow the motion.
43
58
  function planDurationSec(audioDurationMs, bufferSec = 0.5) {
44
59
  const raw = audioDurationMs / 1000 + bufferSec;
45
- return Math.ceil(raw * 2) / 2; // round up to nearest 0.5s
60
+ return Math.ceil(raw * 2) / 2;
46
61
  }
47
62
 
48
- // Run fn over items with a bounded number of concurrent workers (FIFO drain).
49
- async function mapWithConcurrency(items, limit, fn) {
50
- const queue = items.map((item, index) => ({ item, index }));
51
- const workers = Array.from({ length: Math.max(1, Math.min(limit, queue.length)) }, async () => {
52
- while (queue.length > 0) {
53
- const next = queue.shift();
54
- await fn(next.item, next.index);
55
- }
56
- });
57
- await Promise.all(workers);
58
- }
59
-
60
- const TTS_CONCURRENCY = 5;
61
-
62
- export async function runPlanVideoSegmentsTool({ segments, workspace_id, voice_id, currentWorkspaceId, api }) {
63
+ export async function runPlanVideoSegmentsTool({ segments } = {}) {
63
64
  if (!Array.isArray(segments) || segments.length === 0) {
64
65
  return toolError('segments must be a non-empty array.');
65
66
  }
66
67
 
67
- const targetWorkspaceId = String(workspace_id ?? currentWorkspaceId ?? '').trim();
68
- if (!targetWorkspaceId) {
69
- return toolError('workspace_id is required (no current workspace context).');
68
+ // Up-front validation — fail fast before any work.
69
+ for (let i = 0; i < segments.length; i += 1) {
70
+ const seg = segments[i] ?? {};
71
+ if (typeof seg.audio_path !== 'string' || !seg.audio_path.trim()) {
72
+ return toolError(
73
+ `segments[${i}]: audio_path is required. plan_video_segments no longer synthesizes TTS — call synthesize_tts(text) `
74
+ + 'first and pass the returned path as audio_path. Standard chain: synthesize_tts × N → plan_video_segments → '
75
+ + 'record_url_narration + compose_video_v2 (share the same plan).'
76
+ );
77
+ }
78
+ const kind = String(seg.visual_kind ?? '');
79
+ if (!kind) {
80
+ return toolError(`segments[${i}]: visual_kind is required (image / video / gif / carousel).`);
81
+ }
70
82
  }
71
83
 
72
84
  const planned = [];
73
- const errors = [];
85
+ const warnings = [];
86
+
87
+ for (let i = 0; i < segments.length; i += 1) {
88
+ const seg = segments[i];
89
+ const text = String(seg.text ?? '').trim();
90
+ const kind = String(seg.visual_kind);
74
91
 
75
- // Synthesize TTS for every text-bearing segment up front, in parallel (bounded),
76
- // so an N-segment plan no longer pays N sequential round-trips to the TTS API.
77
- const audioResults = new Array(segments.length).fill(null);
78
- const ttsJobs = segments
79
- .map((seg, i) => ({ i, text: String(seg.text ?? '').trim() }))
80
- .filter(job => job.text);
81
- await mapWithConcurrency(ttsJobs, TTS_CONCURRENCY, async ({ i, text }) => {
92
+ let audioDurationMs;
82
93
  try {
83
- audioResults[i] = await synthesizeSegmentTts(text, { workspace_id: targetWorkspaceId, voice_id, api });
94
+ audioDurationMs = await probeAudioDurationMs(seg.audio_path);
84
95
  } catch (err) {
85
- errors.push(`segments[${i}]: TTS failed ${err.message}`);
86
- audioResults[i] = { audio_path: null, audio_duration_ms: 3000 }; // fallback estimate
96
+ warnings.push(`segments[${i}]: audio probe failed (${err.message}); falling back to 3000ms`);
97
+ audioDurationMs = 3000;
87
98
  }
88
- });
89
- errors.sort((a, b) => {
90
- const na = Number((a.match(/segments\[(\d+)\]/) ?? [])[1] ?? 0);
91
- const nb = Number((b.match(/segments\[(\d+)\]/) ?? [])[1] ?? 0);
92
- return na - nb;
93
- });
94
99
 
95
- for (let i = 0; i < segments.length; i++) {
96
- const seg = segments[i];
97
- const text = String(seg.text ?? '').trim();
98
- const kind = String(seg.visual_kind ?? 'image');
99
- const audioResult = audioResults[i];
100
-
101
- const audioDurationMs = audioResult?.audio_duration_ms ?? 0;
102
100
  let presentation;
103
-
104
101
  if (kind === 'carousel') {
105
102
  const numCards = Array.isArray(seg.visual_paths) ? seg.visual_paths.length : 1;
106
- const totalDuration = audioDurationMs > 0 ? planDurationSec(audioDurationMs) : numCards * 4;
103
+ const totalDuration = planDurationSec(audioDurationMs);
107
104
  const perCard = Math.max(2, Math.ceil((totalDuration / numCards) * 2) / 2);
108
105
  presentation = { per_card_duration: perCard };
109
106
  } else {
110
- // image, scroll, video, gif
111
- const duration = audioDurationMs > 0 ? planDurationSec(audioDurationMs, kind === 'scroll' ? 1.0 : 0.5) : 4;
107
+ // image / scroll / video / gif
108
+ const duration = planDurationSec(audioDurationMs, kind === 'scroll' ? 1.0 : 0.5);
112
109
  presentation = { duration, ...(kind === 'scroll' ? { style: 'scroll' } : {}) };
113
110
  }
114
111
 
115
- // dwell_ms lets the same segment double as a record_url_narration plan phase
116
- // (the recorder reads dwell_ms / duration_ms for how long to hold each beat).
117
- // Prefer the real measured audio length; fall back to the planned visual duration.
118
- const dwellMs = audioDurationMs > 0
119
- ? audioDurationMs
120
- : Math.round((presentation.duration ?? presentation.per_card_duration ?? 4) * 1000);
121
-
122
- const planned_seg = {
112
+ planned.push({
123
113
  ...seg,
124
- ...(audioResult?.audio_path ? { audio_path: audioResult.audio_path } : {}),
114
+ audio_path: seg.audio_path,
115
+ audio_duration_ms: audioDurationMs,
125
116
  ...(text ? { subtitle_text: text } : {}),
126
117
  presentation: { ...presentation, ...(seg.presentation ?? {}) },
127
- dwell_ms: seg.dwell_ms ?? dwellMs,
128
- };
129
- if (audioResult?.audio_duration_ms) {
130
- planned_seg.audio_duration_ms = audioResult.audio_duration_ms;
131
- }
132
- planned.push(planned_seg);
118
+ // dwell_ms doubles as record_url_narration's per-phase hold duration so
119
+ // recording naturally tracks the narration audio.
120
+ dwell_ms: seg.dwell_ms ?? audioDurationMs,
121
+ });
133
122
  }
134
123
 
135
124
  const result = {
@@ -141,7 +130,7 @@ export async function runPlanVideoSegmentsTool({ segments, workspace_id, voice_i
141
130
  : (s.presentation?.duration ?? 4);
142
131
  return sum + Math.round(d * 1000);
143
132
  }, 0),
144
- ...(errors.length > 0 ? { warnings: errors } : {}),
133
+ ...(warnings.length > 0 ? { warnings } : {}),
145
134
  };
146
135
 
147
136
  return toolText(JSON.stringify(result, null, 2));
@@ -1,6 +1,23 @@
1
+ // record_url_narration — atomic recording tool.
2
+ //
3
+ // Drives Chromium on Xvfb + Playwright recordVideo to capture a silent mp4 of
4
+ // a URL following a beat-by-beat visual plan, then ffmpeg-transcodes it. The
5
+ // resulting silent mp4 feeds into compose_video_v2 as a video-kind segment
6
+ // alongside narration audio.
7
+ //
8
+ // Lives in daemon/src/tools/ as a reusable handler module; the MCP-tool
9
+ // registration lives in daemon/mcp-servers/official/media-tools/index.js.
10
+ // Migrated out of chat-bridge.js (V4) — no longer wrapped by
11
+ // runMandatoryLocalTool / governance round-trip. media-tools is a separate
12
+ // stdio MCP server and governance integration is chat-bridge-specific;
13
+ // matches the precedent set by synthesize_tts / plan_video_segments /
14
+ // compose_video_v2 in V1/V2/V3.
15
+
1
16
  import { mkdirSync } from 'fs';
2
17
  import path from 'path';
3
18
 
19
+ import { recordUrlNarration as defaultRecordUrlNarrationFn } from '../_vendor/video/recorder/index.js';
20
+
4
21
  function toolText(text) {
5
22
  return { content: [{ type: 'text', text }] };
6
23
  }
@@ -55,10 +72,6 @@ function derivePhaseCount({ plan, recorderOutput }) {
55
72
  return segments ? segments.length : null;
56
73
  }
57
74
 
58
- // record_url_narration is an atomic tool, not the tail of a fixed pipeline.
59
- // The plan may be hand-written by the scripter or produced by plan_video_segments;
60
- // it just needs a non-empty list of segments with per-segment visual action + duration
61
- // so the recording stays in sync with the narration audio.
62
75
  function assertPipelineCompliance(plan) {
63
76
  if (!isPlainObject(plan)) return;
64
77
  if (!planSegments(plan)) {
@@ -86,10 +99,6 @@ const FORBIDDEN_REGION_PATTERNS = [
86
99
  ];
87
100
 
88
101
  function isRecruitmentLikeUrl(url) {
89
- // Conservative URL-based heuristic: mp.weixin.qq.com pages forwarding 招聘 /
90
- // 校招 / 实习 / job content. Until we have content classification, treat
91
- // mp.weixin.qq.com URLs as recruitment-class for safety — the cost of a
92
- // mis-flag is "agent must add a label", not "recording fails permanently".
93
102
  if (typeof url !== 'string') return false;
94
103
  return /mp\.weixin\.qq\.com/.test(url);
95
104
  }
@@ -101,11 +110,6 @@ function describeForbiddenMatch(label) {
101
110
  return null;
102
111
  }
103
112
 
104
- /**
105
- * For recruitment-class URLs, every plan section must declare what content
106
- * sits at its target_y, and the label must NOT match the forbidden-region
107
- * patterns. Returns null on pass, error message string on fail.
108
- */
109
113
  function checkSafeRegionLabels({ url, plan }) {
110
114
  if (!isRecruitmentLikeUrl(url)) return null;
111
115
  const segments = planSegments(plan);
@@ -181,16 +185,13 @@ export async function runRecordUrlNarrationTool({
181
185
  args = {},
182
186
  currentWorkspaceId = '',
183
187
  workspaceDir = process.cwd(),
184
- runMandatoryLocalToolFn,
185
- recordUrlNarrationFn,
188
+ planVideoSegmentsCalled = false,
189
+ recordUrlNarrationFn = defaultRecordUrlNarrationFn,
186
190
  nowMs = () => Date.now(),
187
191
  } = {}) {
188
192
  if (!currentWorkspaceId) {
189
193
  return toolError('No workspace context.');
190
194
  }
191
- if (typeof runMandatoryLocalToolFn !== 'function') {
192
- return toolError('Error: record_url_narration runMandatoryLocalToolFn is required.');
193
- }
194
195
  if (typeof recordUrlNarrationFn !== 'function') {
195
196
  return toolError('Error: record_url_narration executor is required.');
196
197
  }
@@ -220,51 +221,53 @@ export async function runRecordUrlNarrationTool({
220
221
  return toolError(`Error: ${safeRegionError}`);
221
222
  }
222
223
 
224
+ // Standard-chain hard block: refuse recordings unless plan_video_segments
225
+ // ran in this session. Discovered repeatedly in Tasks #20/#25/#26 that
226
+ // agents hand-write dwell_ms by guessing, producing recordings whose phase
227
+ // boundaries drift from the TTS audio they will eventually be paired with —
228
+ // forcing a full re-record. plan_video_segments fills dwell_ms mechanically
229
+ // from ffprobe audio duration, eliminating the drift.
230
+ if (!planVideoSegmentsCalled) {
231
+ return toolError(
232
+ 'Error: record_url_narration refused: plan_video_segments must run earlier in this '
233
+ + 'session so dwell_ms / phase durations are mechanically aligned with the segment\'s '
234
+ + 'TTS audio (audio_duration_ms). Hand-written dwell_ms has repeatedly drifted from '
235
+ + 'the actual TTS duration in production runs, forcing full re-records.\n\n'
236
+ + 'Standard chain: synthesize_tts × N (per segment) → plan_video_segments(segments with '
237
+ + 'text + audio_path + visual_kind=video + visual_path) → record_url_narration (feed '
238
+ + 'plan_video_segments output as plan.sections — each section\'s dwell_ms is already '
239
+ + 'set to audio_duration_ms) + compose_video_v2 (same plan output). Call plan_video_segments '
240
+ + 'now, then pass its `segments` array as `plan.sections` here.'
241
+ );
242
+ }
243
+
223
244
  try {
224
- const result = await runMandatoryLocalToolFn({
225
- toolName: 'record_url_narration',
226
- toolInput: validatedInput,
227
- executor: async (checkedInput = {}) => {
228
- const mergedInput = {
229
- ...validatedInput,
230
- ...checkedInput,
231
- };
232
- const finalInput = validateRecordUrlNarrationArgs(mergedInput);
233
- const { resolvedOutputPath, resolvedEventsPath } = resolveRecordUrlNarrationPaths({
234
- workspaceDir,
235
- outputPath: finalInput.output_path,
236
- eventsPath: finalInput.events_path,
237
- nowMs,
238
- });
239
-
240
- mkdirSync(path.dirname(resolvedOutputPath), { recursive: true });
241
- mkdirSync(path.dirname(resolvedEventsPath), { recursive: true });
242
-
243
- const recorderOutput = await recordUrlNarrationFn({
244
- url: finalInput.url,
245
- plan: finalInput.plan,
246
- output_path: resolvedOutputPath,
247
- events_path: resolvedEventsPath,
248
- viewport: finalInput.viewport,
249
- fps: finalInput.fps,
250
- settle_ms: finalInput.settle_ms,
251
- });
252
-
253
- return {
254
- videoPath: resolvedOutputPath,
255
- eventsPath: resolvedEventsPath,
256
- durationMs: deriveDurationMs(recorderOutput),
257
- phases: derivePhaseCount({ plan: finalInput.plan, recorderOutput }),
258
- };
259
- },
245
+ const { resolvedOutputPath, resolvedEventsPath } = resolveRecordUrlNarrationPaths({
246
+ workspaceDir,
247
+ outputPath: validatedInput.output_path,
248
+ eventsPath: validatedInput.events_path,
249
+ nowMs,
250
+ });
251
+
252
+ mkdirSync(path.dirname(resolvedOutputPath), { recursive: true });
253
+ mkdirSync(path.dirname(resolvedEventsPath), { recursive: true });
254
+
255
+ const recorderOutput = await recordUrlNarrationFn({
256
+ url: validatedInput.url,
257
+ plan: validatedInput.plan,
258
+ output_path: resolvedOutputPath,
259
+ events_path: resolvedEventsPath,
260
+ viewport: validatedInput.viewport,
261
+ fps: validatedInput.fps,
262
+ settle_ms: validatedInput.settle_ms,
260
263
  });
261
264
 
262
265
  return toolText(
263
266
  `Recorded URL narration.\n`
264
- + `video_path=${result.videoPath}\n`
265
- + `events_path=${result.eventsPath}\n`
266
- + `duration_ms=${result.durationMs ?? 'unknown'}\n`
267
- + `phases=${result.phases ?? 'n/a'}`
267
+ + `video_path=${resolvedOutputPath}\n`
268
+ + `events_path=${resolvedEventsPath}\n`
269
+ + `duration_ms=${deriveDurationMs(recorderOutput) ?? 'unknown'}\n`
270
+ + `phases=${derivePhaseCount({ plan: validatedInput.plan, recorderOutput }) ?? 'n/a'}`
268
271
  );
269
272
  } catch (error) {
270
273
  return toolError(`Error: ${error.message}`);