@lightcone-ai/daemon 0.16.2 → 0.17.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -5,6 +5,56 @@ import { z } from 'zod';
5
5
 
6
6
  import { addTitleEffects } from './lib/render.js';
7
7
  import { SUPPORTED_PRESETS } from './lib/presets.js';
8
+ import { runSynthesisTtsTool } from '../../../src/tools/synthesize-tts.js';
9
+ import { runPlanVideoSegmentsTool } from '../../../src/tools/plan-video-segments.js';
10
+ import { runComposeVideoV2Tool } from '../../../src/tools/compose-video-v2.js';
11
+ import { runRecordUrlNarrationTool } from '../../../src/tools/record-url-narration.js';
12
+ import { runRenderTextToImageTool } from '../../../src/tools/render-text-to-image.js';
13
+ import { runRenderHtmlToImageTool } from '../../../src/tools/render-html-to-image.js';
14
+ import { runTakePageScreenshotTool } from '../../../src/tools/take-page-screenshot.js';
15
+ import { hasFreshVideoBrief } from '../../../src/video-brief-flag.js';
16
+ import { lightconeApi, CURRENT_WORKSPACE_ID, CURRENT_AGENT_ID } from './lib/lightcone-api.js';
17
+
18
+ const WORKSPACE_DIR = String(process.env.WORKSPACE_DIR ?? '');
19
+
20
+ // CVMAX editor_in_chief block: in one workspace, the editor_in_chief agent
21
+ // must not run video production tools directly (short_video_scripter owns
22
+ // that role). Env-gated so ops can rotate workspace/agent IDs without code
23
+ // changes. Previously lived in chat-bridge; moved here alongside the tool
24
+ // it gates (V4 migration). submit_to_library, the other gated tool, still
25
+ // lives in chat-bridge and keeps its own copy of this check.
26
+ const CVMAX_WORKSPACE_ID = String(process.env.BLOCKED_EDITOR_WORKSPACE_ID ?? '');
27
+ const CVMAX_EDITOR_IN_CHIEF_AGENT_ID = String(process.env.BLOCKED_EDITOR_AGENT_ID ?? '');
28
+
29
+ function isBlockedCvmaxEditorVideoTool(toolName) {
30
+ return CURRENT_WORKSPACE_ID === CVMAX_WORKSPACE_ID
31
+ && CURRENT_AGENT_ID === CVMAX_EDITOR_IN_CHIEF_AGENT_ID
32
+ && CVMAX_WORKSPACE_ID
33
+ && CVMAX_EDITOR_IN_CHIEF_AGENT_ID
34
+ && toolName === 'record_url_narration';
35
+ }
36
+
37
+ function cvmaxEditorVideoToolError(toolName) {
38
+ return {
39
+ isError: true,
40
+ content: [{
41
+ type: 'text',
42
+ text:
43
+ `Error: ${toolName} blocked for editor_in_chief in CvMax. `
44
+ + 'In this workspace, @short_video_scripter owns video production. '
45
+ + 'editor_in_chief may route, review, or assist with OCR/verification, but must not run video production tools directly.',
46
+ }],
47
+ };
48
+ }
49
+
50
+ // Session-scoped flag set when plan_video_segments runs successfully.
51
+ // compose_video_v2 refuses TTS-bearing segments (any segment with audio_path)
52
+ // unless this is true — the agent must route audio through plan_video_segments
53
+ // first so durations / subtitle_text are mechanically aligned. media-tools is
54
+ // spawned per-agent, so a fresh agent session must call plan_video_segments
55
+ // fresh. (This flag previously lived in chat-bridge module scope; moved here
56
+ // alongside the tools it gates, see V2 migration.)
57
+ let _planVideoSegmentsCalledThisSession = false;
8
58
 
9
59
  const PRESET_ENUM = z.enum(SUPPORTED_PRESETS);
10
60
  const POSITION_ENUM = z.enum(['top', 'center', 'bottom']);
@@ -63,6 +113,244 @@ server.tool(
63
113
  }
64
114
  );
65
115
 
116
+ // ── synthesize_tts (migrated from chat-bridge) ────────────────────────────
117
+ // Pure TTS atomic tool: text → mp3 file. The lightcone server proxies to
118
+ // MiniMax TTS; this tool downloads the resulting mp3 to a local tmp path so
119
+ // downstream tools (plan_video_segments / compose_video_v2) can read it.
120
+ //
121
+ // Per the video-synthesis-design migration (see docs/upload-pipeline-design.md
122
+ // and docs/scenario-content-creation/video-synthesis-design.md), this tool
123
+ // lives in media-tools rather than chat-bridge so the video pipeline is a
124
+ // single coherent MCP server.
125
+ server.tool(
126
+ 'synthesize_tts',
127
+ 'Run MiniMax TTS on a snippet of narration text and download the resulting mp3 to a local tmp path. '
128
+ + 'Returns the local path and duration. Call once per video segment — do not concatenate all narration '
129
+ + 'into a single call (segment-level audio is required for plan_video_segments to align video durations).',
130
+ {
131
+ text: z.string().min(1).describe('Narration text for this segment. Will be synthesized as a single mp3.'),
132
+ voice_id: z.string().optional().describe('TTS voice preset. Omit to use workspace default.'),
133
+ workspace_id: z.string().optional().describe('Target workspace. Defaults to the current workspace.'),
134
+ },
135
+ async ({ text, voice_id, workspace_id }) => runSynthesisTtsTool({
136
+ text,
137
+ voice_id,
138
+ workspace_id,
139
+ currentWorkspaceId: CURRENT_WORKSPACE_ID,
140
+ api: lightconeApi,
141
+ })
142
+ );
143
+
144
+ // ── plan_video_segments (migrated from chat-bridge; TTS decoupled) ────────
145
+ // Pure planner — takes per-segment {text, audio_path, visual_kind, ...} and
146
+ // returns segments with audio_duration_ms / presentation.duration / dwell_ms
147
+ // / subtitle_text filled in. Caller MUST run synthesize_tts per segment first
148
+ // and pass the resulting audio_path here. No longer synthesizes TTS itself
149
+ // (V2 migration; see docs/scenario-content-creation/video-synthesis-design.md).
150
+ server.tool(
151
+ 'plan_video_segments',
152
+ 'Universal audio-video sync planning step. For each segment, reads the supplied audio_path via ffprobe, '
153
+ + 'measures audio duration, and returns a planned segments array with audio_path / audio_duration_ms / '
154
+ + 'subtitle_text / presentation.duration / dwell_ms filled in — ready to pass directly to both '
155
+ + 'record_url_narration (as the recording plan) AND compose_video_v2 (as the segment list). '
156
+ + 'Must be called before compose_video_v2 when any segment has audio_path.\n\n'
157
+ + 'Inputs per segment: {text, audio_path (required, from synthesize_tts), visual_kind, visual_path or visual_paths, '
158
+ + 'optionally transition / presentation.style}. Standard chain: synthesize_tts × N → plan_video_segments → '
159
+ + 'record_url_narration + compose_video_v2 (both use the same plan output).',
160
+ {
161
+ segments: z.array(z.object({
162
+ text: z.string().describe('Narration text for this segment — used as subtitle_text in the output.'),
163
+ audio_path: z.string().describe('Absolute path to the segment\'s mp3 (from synthesize_tts).'),
164
+ visual_kind: z.enum(['image', 'video', 'gif', 'carousel']).describe('Type of visual for compose_video_v2.'),
165
+ visual_path: z.string().optional().describe('Absolute path to a single image / video / gif file.'),
166
+ visual_paths: z.array(z.string()).optional().describe('For carousel: array of image paths, one per card.'),
167
+ transition: z.enum(['cut', 'fade', 'crossfade']).optional().describe('Transition to next segment. Default cut.'),
168
+ presentation: z.object({
169
+ style: z.enum(['static', 'scroll']).optional(),
170
+ }).optional().describe('Optional presentation hints (style only). duration/per_card_duration are computed.'),
171
+ dwell_ms: z.number().optional().describe('Optional override for record_url_narration phase duration. Default = audio_duration_ms.'),
172
+ })).describe('Segments to plan. audio_path is required for each.'),
173
+ },
174
+ async ({ segments }) => {
175
+ const result = await runPlanVideoSegmentsTool({ segments });
176
+ if (!result?.isError) _planVideoSegmentsCalledThisSession = true;
177
+ return result;
178
+ }
179
+ );
180
+
181
+ // ── compose_video_v2 (migrated from chat-bridge) ──────────────────────────
182
+ // Tool-level enforcement of the standard chain: TTS-bearing segments require
183
+ // plan_video_segments to have run earlier in this session. Without it manual
184
+ // dwell/duration math has repeatedly produced misaligned subtitles, silent
185
+ // tails, and re-records (Task #25/#26 trial).
186
+ server.tool(
187
+ 'compose_video_v2',
188
+ 'Compose a video from a list of segments using ffmpeg. Each segment has a visual source (image / scroll / '
189
+ + 'carousel / video / gif), optional audio, and optional subtitle text. Subtitles are burned in when '
190
+ + 'subtitle_text is provided. Segments are concatenated in order; outro clips are appended after.\n\n'
191
+ + 'When any segment has audio_path, MUST be preceded by plan_video_segments in the same session '
192
+ + '(plan_video_segments fills duration/subtitle_text/audio_path mechanically; manual alignment is rejected). '
193
+ + 'Returns a local mp4 path + size_bytes.',
194
+ {
195
+ segments: z.array(z.object({
196
+ visual_path: z.string().optional().describe('Absolute path to a single image / video / gif.'),
197
+ visual_paths: z.array(z.string()).optional().describe('For carousel: array of image paths, one per card.'),
198
+ visual_kind: z.enum(['image', 'video', 'gif', 'carousel']).describe('Type of visual.'),
199
+ presentation: z.object({
200
+ style: z.enum(['static', 'scroll']).optional(),
201
+ duration: z.number().optional().describe('Segment duration in seconds. Required for image/scroll.'),
202
+ per_card_duration: z.number().optional().describe('Seconds per card for carousel.'),
203
+ }).optional(),
204
+ audio_path: z.string().nullable().optional().describe('Absolute path to audio (mp3). null/omit for silence.'),
205
+ subtitle_text: z.string().optional().describe('Narration text to burn as subtitle. Displayed for the full segment duration.'),
206
+ transition: z.enum(['cut', 'fade', 'crossfade']).optional().describe('Transition to next segment. Default cut.'),
207
+ })).describe('Ordered list of video segments.'),
208
+ outro_paths: z.array(z.string()).optional().describe('Absolute paths to outro video clips appended at end.'),
209
+ resolution: z.string().optional().describe('Output resolution WxH. Default "1080x1920".'),
210
+ output_path: z.string().optional().describe('Absolute output path. Auto-generated if omitted.'),
211
+ },
212
+ async (args) => {
213
+ const segments = Array.isArray(args?.segments) ? args.segments : [];
214
+ const hasNarration = segments.some(s => typeof s?.audio_path === 'string' && s.audio_path.trim());
215
+ if (hasNarration && !_planVideoSegmentsCalledThisSession) {
216
+ return toolError(
217
+ 'compose_video_v2 refused: TTS-bearing segments (audio_path present) require plan_video_segments '
218
+ + 'to have run earlier in this session — it mechanically aligns audio_duration / video_duration / '
219
+ + 'subtitle_text with a safety buffer. Manual dwell/duration math has repeatedly produced misaligned '
220
+ + 'subtitles and silent tails that force re-recording.\n\n'
221
+ + 'Standard chain: synthesize_tts × N (per segment) → plan_video_segments(segments with text + audio_path + '
222
+ + 'visual_kind + visual_path) → compose_video_v2 (use the returned segments verbatim, only swap '
223
+ + 'visual_path/visual_kind for the real media). Call plan_video_segments now and pass its output here.'
224
+ );
225
+ }
226
+ if (hasNarration && !hasFreshVideoBrief({ workspaceId: CURRENT_WORKSPACE_ID, agentId: CURRENT_AGENT_ID })) {
227
+ return toolError(
228
+ 'compose_video_v2 refused: must send a 确认稿 (production-brief) to the user via send_message before '
229
+ + 'compositing a narration video. The system scans send_message content for a brief — a message that '
230
+ + 'BOTH asks the user to confirm (确认 / 你看 / OK 吗 / 可以吗 / 同意 / 通过 / 行不行) AND describes '
231
+ + 'at least two of: 画面 / 时长 / 文案 / 口播 / 字幕 / 顺序 / 口吻 / 分镜 / 配音 — no such message '
232
+ + 'was sent in the last 6 hours for this workspace+agent.\n\n'
233
+ + '"已生成 TTS" / "开始合成" / progress reports do NOT count. Send a concrete confirmation draft '
234
+ + 'first (e.g. "我准备这么做:画面是真录屏,时长约 1 分钟,文案如下…,字幕开启,公司顺序 A→B→C,'
235
+ + '口吻是…—— 你 OK 吗?") and wait for the user to reply OK before calling compose_video_v2 again.'
236
+ );
237
+ }
238
+ return runComposeVideoV2Tool({ ...args, workspaceDir: WORKSPACE_DIR });
239
+ }
240
+ );
241
+
242
+ // ── record_url_narration (migrated from chat-bridge) ──────────────────────
243
+ // Records a silent mp4 of a URL via Chromium+Xvfb+Playwright recordVideo,
244
+ // driven by a beat-by-beat plan. Hard-block: requires plan_video_segments to
245
+ // have run in this session — hand-written dwell_ms has drifted from TTS
246
+ // audio in production runs (Tasks #20/#25/#26), forcing re-records.
247
+ server.tool(
248
+ 'record_url_narration',
249
+ 'Record a silent video of a URL by driving Chromium on an Xvfb display and capturing it with Playwright recordVideo, driven by a video plan; ffmpeg then transcodes the recording to mp4. Outputs a silent mp4 that can be passed to compose_video_v2 as a video-kind segment with an audio_path for narration.\n\nUse this as the canonical recording step for URL-narration videos. Falls back: if the page needs interactions outside the visual_action vocabulary (clicks, waits, OCR loops), use Monitor (Bash) with custom Playwright instead.\n\nMUST be preceded by plan_video_segments in the same session — feed plan_video_segments\'s `segments` array as `plan.sections` so dwell_ms aligns mechanically with TTS audio_duration_ms (hand-written dwell_ms has drifted and forced re-records in production).\n\nRuntime requirements: this tool only works on a Linux daemon machine with Xvfb + Chromium + ffmpeg installed (ffmpeg is used to transcode the recording to mp4; no x11grab device support needed). macOS / Windows daemons will fail at startup.',
250
+ {
251
+ url: z.string().describe('Page URL to record'),
252
+ plan: z.record(z.any()).describe(
253
+ 'A video plan: an object with `phases` (or `sections`), each a "visual beat" with '
254
+ + '`action` (scroll_to_dwell / linear_scroll_during / scroll_back / hold / ...), a '
255
+ + 'target (`target_y` or `focus_region:[y1,y2]`) for scroll-type actions, and '
256
+ + '`dwell_ms` (how long to hold that beat — should match the segment\'s TTS duration).\n\n'
257
+ + 'Standard chain: pass plan_video_segments\'s `segments` array directly as `plan.sections` — '
258
+ + 'each segment\'s `dwell_ms` is already set to its `audio_duration_ms`.\n\n'
259
+ + 'For RECRUITMENT URLs (mp.weixin.qq.com / 校招 / 实习 / 岗位 content), each section MUST '
260
+ + 'also declare `target_y_content_label` — a short Chinese label describing what content '
261
+ + 'sits at that pixel y position on the page (e.g. "标题区" / "岗位信息卡片" / "公司介绍" / '
262
+ + '"届别说明"). Labels matching forbidden regions ("二维码" / "扫码" / "投递入口" / "投递方式" / '
263
+ + '"联系方式" / "微信号" / "QR" / "阅读原文" / "外链") will cause the tool to refuse the '
264
+ + 'recording — recruitment content must NOT dwell on these areas (see fragments.md '
265
+ + 'frag.short.recruitment_url_mode_policy). Pick a different target_y in the 标题/岗位 '
266
+ + 'information area and rewrite that section.'
267
+ ),
268
+ output_path: z.string().optional().describe('Workspace-relative output mp4 path. Default tmp/wx3_video/recorded-{ts}.mp4'),
269
+ events_path: z.string().optional().describe('Workspace-relative events.json path. Default ${output_path}.events.json'),
270
+ viewport: z.object({
271
+ width: z.number().optional(),
272
+ height: z.number().optional(),
273
+ }).optional().describe('Default 1080x1920 (mobile portrait). Override only if the plan requires a different shape.'),
274
+ fps: z.number().optional().describe('Default 30. Do not change unless needed.'),
275
+ settle_ms: z.number().optional().describe('Default 4000. Settle wait after navigation before recording starts.'),
276
+ },
277
+ async (args) => {
278
+ if (isBlockedCvmaxEditorVideoTool('record_url_narration')) {
279
+ return cvmaxEditorVideoToolError('record_url_narration');
280
+ }
281
+ // record_url_narration is part of the narration-video pipeline (paired
282
+ // with synthesize_tts + plan_video_segments + compose_video_v2), so it
283
+ // requires the same 确认稿 gate as compose_video_v2 — catching the skip
284
+ // earlier saves TTS + recording time.
285
+ if (!hasFreshVideoBrief({ workspaceId: CURRENT_WORKSPACE_ID, agentId: CURRENT_AGENT_ID })) {
286
+ return {
287
+ isError: true,
288
+ content: [{ type: 'text', text:
289
+ 'Error: record_url_narration refused: must send a 确认稿 (production-brief) to the user via '
290
+ + 'send_message before starting a narration recording. The system scans send_message content for '
291
+ + 'a brief — a message that BOTH asks the user to confirm (确认 / 你看 / OK 吗 / 可以吗 / 同意 / '
292
+ + '通过 / 行不行) AND describes at least two of: 画面 / 时长 / 文案 / 口播 / 字幕 / 顺序 / 口吻 / '
293
+ + '分镜 / 配音 — no such message was sent in the last 6 hours for this workspace+agent.\n\n'
294
+ + '"已生成 TTS" / "开始合成" / progress reports do NOT count. Send a concrete confirmation draft '
295
+ + 'first (e.g. "我准备这么做:画面是真录屏,时长约 1 分钟,文案如下…,字幕开启,公司顺序 A→B→C,'
296
+ + '口吻是…—— 你 OK 吗?") and wait for the user to reply OK before calling record_url_narration.'
297
+ }],
298
+ };
299
+ }
300
+ return runRecordUrlNarrationTool({
301
+ args,
302
+ currentWorkspaceId: CURRENT_WORKSPACE_ID,
303
+ workspaceDir: WORKSPACE_DIR,
304
+ planVideoSegmentsCalled: _planVideoSegmentsCalledThisSession,
305
+ });
306
+ }
307
+ );
308
+
309
+ // ── render_text_to_image (migrated from chat-bridge) ──────────────────────
310
+ server.tool(
311
+ 'render_text_to_image',
312
+ 'Render text content into image(s) for video synthesis. style=scroll produces a single tall image (for a scrolling video segment); style=carousel produces one image per card (for a slide-show segment). Returns local file paths.',
313
+ {
314
+ content: z.union([z.string(), z.array(z.string())]).describe('Text content. For carousel, pass an array of strings — one per card. For scroll, pass a single string (or array joined with line breaks).'),
315
+ style: z.enum(['scroll', 'carousel']).describe('scroll: one tall image; carousel: one image per card.'),
316
+ theme: z.enum(['dark', 'light']).optional().describe('Color theme. Default dark.'),
317
+ width: z.number().optional().describe('Image width in pixels. Default 1080.'),
318
+ card_height: z.number().optional().describe('Card height in pixels (carousel) or viewport height (scroll baseline). Default 1920.'),
319
+ font_size: z.number().optional().describe('Base font size in pixels. Default 48.'),
320
+ },
321
+ async (args) => runRenderTextToImageTool(args)
322
+ );
323
+
324
+ // ── render_html_to_image (migrated from chat-bridge) ──────────────────────
325
+ server.tool(
326
+ 'render_html_to_image',
327
+ 'Render a raw HTML string to a PNG image by navigating to it as a local file:// page. Unlike evaluate_script+document.write on about:blank, this preserves file:// origin so <img src="file:///..."> references load correctly. Returns the output image path.',
328
+ {
329
+ html: z.string().describe('Full HTML document to render (including <!doctype>, <html>, <head>, <body>).'),
330
+ output_path: z.string().optional().describe('Absolute path to save the PNG. Auto-generated in /tmp if omitted.'),
331
+ viewport_width: z.number().optional().describe('Viewport width in pixels. Default 1080.'),
332
+ viewport_height: z.number().optional().describe('Viewport height in pixels. Default 1920.'),
333
+ wait_until: z.enum(['load', 'networkidle', 'domcontentloaded']).optional().describe('Navigation wait condition. Default load.'),
334
+ },
335
+ async (args) => runRenderHtmlToImageTool(args)
336
+ );
337
+
338
+ // ── take_page_screenshot (migrated from chat-bridge) ──────────────────────
339
+ server.tool(
340
+ 'take_page_screenshot',
341
+ 'Open a URL with a headless browser and capture a screenshot. crop=above_fold captures only the visible viewport (ideal for thumbnail-style frames); crop=full_page captures the entire page height.',
342
+ {
343
+ url: z.string().describe('Page URL to screenshot.'),
344
+ crop: z.enum(['above_fold', 'full_page']).optional().describe('Capture mode. Default above_fold.'),
345
+ viewport: z.object({
346
+ width: z.number().optional(),
347
+ height: z.number().optional(),
348
+ }).optional().describe('Viewport size. Default 390×844 (mobile).'),
349
+ wait_for: z.enum(['load', 'networkidle', 'domcontentloaded']).optional().describe('Page load event to wait for before screenshotting. Default networkidle.'),
350
+ },
351
+ async (args) => runTakePageScreenshotTool(args)
352
+ );
353
+
66
354
  const transport = new StdioServerTransport();
67
355
  await server.connect(transport);
68
356
  console.error('[official-media-tools] MCP Server started');
@@ -0,0 +1,41 @@
1
+ // Minimal HTTP helper for media-tools to call lightcone server's internal API.
2
+ // Wraps fetch with the right URL prefix + auth headers + JSON encoding.
3
+ //
4
+ // Unlike daemon/src/chat-bridge.js's `api`, this helper does NOT route through
5
+ // the governance/cache layer — media-tools is a separate stdio MCP server and
6
+ // governance integration is chat-bridge-specific. If a tool here needs
7
+ // governance-mediated execution, route it through chat-bridge's thin-proxy
8
+ // instead (see weixin-tools for the pattern).
9
+
10
+ const SERVER_URL = String(process.env.SERVER_URL ?? '').replace(/\/+$/, '');
11
+ const MACHINE_API_KEY = String(process.env.MACHINE_API_KEY ?? '');
12
+ const AGENT_ID = String(process.env.AGENT_ID ?? '');
13
+
14
+ if (!SERVER_URL) throw new Error('media-tools: SERVER_URL env var is required');
15
+ if (!MACHINE_API_KEY) throw new Error('media-tools: MACHINE_API_KEY env var is required');
16
+ if (!AGENT_ID) throw new Error('media-tools: AGENT_ID env var is required');
17
+
18
+ export async function lightconeApi(method, apiPath, body) {
19
+ const url = `${SERVER_URL}/internal/agent/${encodeURIComponent(AGENT_ID)}${apiPath}`;
20
+ const res = await fetch(url, {
21
+ method,
22
+ headers: {
23
+ 'Content-Type': 'application/json',
24
+ 'Authorization': `Bearer ${MACHINE_API_KEY}`,
25
+ },
26
+ body: body != null ? JSON.stringify(body) : undefined,
27
+ });
28
+ if (!res.ok) {
29
+ let text = '';
30
+ try { text = await res.text(); } catch { /* ignore */ }
31
+ const err = new Error(`lightcone ${method} ${apiPath} → ${res.status}: ${text.slice(0, 400)}`);
32
+ err.status = res.status;
33
+ err.body = text;
34
+ throw err;
35
+ }
36
+ return res.json();
37
+ }
38
+
39
+ // Exposed so tools can construct workspace-aware fallback identifiers.
40
+ export const CURRENT_AGENT_ID = AGENT_ID;
41
+ export const CURRENT_WORKSPACE_ID = String(process.env.WORKSPACE_ID ?? '');
@@ -5,7 +5,14 @@
5
5
  "runtime": "node",
6
6
  "entrypoint": "index.js",
7
7
  "tool_declarations": [
8
- { "name": "add_title_effects", "classification": "cacheable" }
8
+ { "name": "add_title_effects", "classification": "cacheable" },
9
+ { "name": "synthesize_tts", "classification": "mandatory" },
10
+ { "name": "plan_video_segments", "classification": "mandatory" },
11
+ { "name": "compose_video_v2", "classification": "mandatory" },
12
+ { "name": "record_url_narration", "classification": "mandatory" },
13
+ { "name": "render_text_to_image", "classification": "cacheable" },
14
+ { "name": "render_html_to_image", "classification": "cacheable" },
15
+ { "name": "take_page_screenshot", "classification": "cacheable" }
9
16
  ],
10
17
  "smoke_test": {
11
18
  "tool": "add_title_effects",
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@lightcone-ai/daemon",
3
- "version": "0.16.2",
3
+ "version": "0.17.1",
4
4
  "type": "module",
5
5
  "main": "src/index.js",
6
6
  "bin": {
@@ -2,23 +2,21 @@
2
2
  import { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js';
3
3
  import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js';
4
4
  import { z } from 'zod';
5
- import { createReadStream, existsSync, mkdirSync, readFileSync, writeFileSync, statSync } from 'fs';
5
+ import { existsSync, mkdirSync, readFileSync, writeFileSync, statSync } from 'fs';
6
6
  import { createHash, randomUUID } from 'crypto';
7
7
  import path, { extname } from 'path';
8
8
  import os from 'os';
9
- import { recordUrlNarration } from './_vendor/video/recorder/index.js';
10
9
  import { writeLocalFileToWorkspace, resolveWorkspaceFileUploadPlan } from './workspace-file-upload.js';
11
10
  import { UploadJobManager } from './upload-job-manager.js';
12
11
  import { createUploadServerApi } from './upload-server-api.js';
13
- import { runRecordUrlNarrationTool } from './record-url-narration-tool.js';
12
+ // record_url_narration moved to media-tools MCP server (V4 migration);
13
+ // recorder import / handler are now consumed there, not from chat-bridge.
14
14
  import { runSubmitToLibraryTool } from './submit-to-library-tool.js';
15
- import { runRenderTextToImageTool } from './tools/render-text-to-image.js';
16
- import { runRenderHtmlToImageTool } from './tools/render-html-to-image.js';
17
- import { runSynthesisTtsTool } from './tools/synthesize-tts.js';
18
- import { runPlanVideoSegmentsTool } from './tools/plan-video-segments.js';
19
- import { runComposeVideoV2Tool } from './tools/compose-video-v2.js';
20
- import { runTakePageScreenshotTool } from './tools/take-page-screenshot.js';
15
+ // render_text_to_image, render_html_to_image, take_page_screenshot moved to
16
+ // media-tools MCP server (V5 migration). Handlers still live in
17
+ // daemon/src/tools/ as shared modules and are imported there.
21
18
  import { runGetLibraryFileTool } from './tools/get-library-file.js';
19
+ import { markVideoBriefSent, looksLikeVideoBrief } from './video-brief-flag.js';
22
20
  import { isLeaseInvalidated, clearInvalidatedLease } from './governance-state.js';
23
21
  import { classifyLeaseWindow } from './lease-window.js';
24
22
  import {
@@ -72,8 +70,9 @@ let currentWorkspaceId = WORKSPACE_ID;
72
70
  // Remove entirely once the new atomic tool framework is stable and the legacy pipeline retires.
73
71
  const CVMAX_WORKSPACE_ID = process.env.BLOCKED_EDITOR_WORKSPACE_ID ?? '';
74
72
  const CVMAX_EDITOR_IN_CHIEF_AGENT_ID = process.env.BLOCKED_EDITOR_AGENT_ID ?? '';
73
+ // record_url_narration moved to media-tools and carries its own copy of this
74
+ // block. submit_to_library stays here.
75
75
  const CVMAX_EDITOR_BLOCKED_VIDEO_TOOLS = new Set([
76
- 'record_url_narration',
77
76
  'submit_to_library',
78
77
  ]);
79
78
 
@@ -162,7 +161,6 @@ const DEFAULT_TOOL_CLASSIFICATION = {
162
161
  update_goal_field: 'mandatory',
163
162
  supersede_goal_field: 'mandatory',
164
163
  request_credential_auth: 'mandatory',
165
- record_url_narration: 'mandatory',
166
164
  submit_to_library: 'mandatory',
167
165
  register_data_source: 'mandatory',
168
166
  bind_workspace_scenario: 'mandatory',
@@ -491,37 +489,6 @@ async function directApi(method, apiPath, body) {
491
489
  return res.json();
492
490
  }
493
491
 
494
- async function directApiVideoUpload(apiPath, {
495
- localPath,
496
- filename,
497
- contentType = 'video/mp4',
498
- }) {
499
- const url = `${SERVER_URL}/internal/agent/${AGENT_ID}${apiPath}`;
500
- const headers = {
501
- 'Authorization': `Bearer ${MACHINE_API_KEY}`,
502
- 'Content-Type': contentType,
503
- };
504
- if (filename) headers['X-File-Name'] = filename;
505
-
506
- let res;
507
- try {
508
- res = await fetch(url, {
509
- method: 'POST',
510
- headers,
511
- body: createReadStream(localPath),
512
- duplex: 'half',
513
- });
514
- } catch (error) {
515
- throw buildDirectApiTransportError({ method: 'POST', apiPath, error });
516
- }
517
-
518
- if (!res.ok) {
519
- const text = await res.text();
520
- throw buildDirectApiHttpError({ method: 'POST', apiPath, status: res.status, text });
521
- }
522
- return res.json();
523
- }
524
-
525
492
  async function callGovernance(payload, { retry = true } = {}) {
526
493
  const attempts = retry ? 2 : 1;
527
494
  let lastError = null;
@@ -609,63 +576,6 @@ async function governanceRoundTrip({ method, apiPath, body, toolName, classifica
609
576
  return directApi(method, apiPath, nextBody);
610
577
  }
611
578
 
612
- async function runMandatoryLocalTool({ toolName, toolInput = {}, executor }) {
613
- const classification = TOOL_CLASSIFICATION[toolName] ?? 'mandatory';
614
- const traceId = randomUUID();
615
- enqueueBundleEvent('tool_call_started', {
616
- trace_id: traceId,
617
- tool_name: toolName,
618
- tool_classification: classification,
619
- method: 'LOCAL',
620
- api_path: '/local-tool',
621
- });
622
-
623
- try {
624
- await ensureGovernanceContext();
625
- const governancePayload = {
626
- spawn_bundle_id: governanceContext.spawnBundleId,
627
- policy_version: governanceContext.policyVersion,
628
- tool_name: toolName,
629
- tool_input: toolInput,
630
- tool_classification: classification,
631
- agent_id: AGENT_ID,
632
- idempotency_key: randomUUID(),
633
- lease_id: governanceContext.lease?.lease_id ?? null,
634
- };
635
- const governance = await callGovernance(governancePayload, { retry: true });
636
- if (governance.policy_lease) applyPolicyLease(governance.policy_lease);
637
- if (governance.verdict === 'reject' || governance.verdict === 'defer_human') {
638
- throw governanceError(governanceReasonCode(governance.reason));
639
- }
640
-
641
- const checkedInput = (governance.verdict === 'modify' && governance.modified_input && typeof governance.modified_input === 'object')
642
- ? { ...toolInput, ...governance.modified_input }
643
- : toolInput;
644
- const result = await executor(checkedInput);
645
- if (CACHE_INVALIDATION_TOOLS.has(toolName)) {
646
- governanceContext.cache.clear();
647
- }
648
-
649
- enqueueBundleEvent('tool_call_succeeded', {
650
- trace_id: traceId,
651
- tool_name: toolName,
652
- tool_classification: classification,
653
- source: 'governance_roundtrip',
654
- });
655
- return result;
656
- } catch (error) {
657
- if (shouldEmitToolCallFailed(error)) {
658
- enqueueBundleEvent('tool_call_failed', {
659
- trace_id: traceId,
660
- tool_name: toolName,
661
- tool_classification: classification,
662
- reason: toolCallFailedReason(error),
663
- });
664
- }
665
- throw error;
666
- }
667
- }
668
-
669
579
  function renewCacheInBackground({ method, apiPath, body, toolName, cacheKey }) {
670
580
  if (governanceContext.renewalInFlight.has(cacheKey)) return;
671
581
  governanceContext.renewalInFlight.add(cacheKey);
@@ -886,6 +796,15 @@ server.tool('send_message', 'Send a message to a workspace, DM, or thread', {
886
796
  content: z.string().describe('Message content'),
887
797
  }, async ({ target, content }) => {
888
798
  const data = await api('POST', '/send', { target, content });
799
+ // Heuristic: if this looks like a video-production 确认稿 (asks permission +
800
+ // describes plan), mark a cross-process flag so media-tools' compose_video_v2
801
+ // / record_url_narration can verify a brief was actually sent before running.
802
+ // See daemon/src/video-brief-flag.js for the detection rules.
803
+ if (looksLikeVideoBrief(content) && AGENT_ID && currentWorkspaceId) {
804
+ try {
805
+ markVideoBriefSent({ workspaceId: currentWorkspaceId, agentId: AGENT_ID, content });
806
+ } catch { /* best-effort; failure to mark is non-fatal */ }
807
+ }
889
808
  return { content: [{ type: 'text', text: `Sent. messageId=${data.messageId} threadTarget=${data.threadTarget}` }] };
890
809
  });
891
810
 
@@ -1375,141 +1294,12 @@ server.tool('request_credential_auth',
1375
1294
  }
1376
1295
  );
1377
1296
 
1378
- // ── render_text_to_image ───────────────────────────────────────────────────────
1379
- server.tool('render_text_to_image',
1380
- 'Render text content into image(s) for video synthesis. style=scroll produces a single tall image (for a scrolling video segment); style=carousel produces one image per card (for a slide-show segment). Returns local file paths.',
1381
- {
1382
- content: z.union([z.string(), z.array(z.string())]).describe('Text content. For carousel, pass an array of strings — one per card. For scroll, pass a single string (or array joined with line breaks).'),
1383
- style: z.enum(['scroll', 'carousel']).describe('scroll: one tall image; carousel: one image per card.'),
1384
- theme: z.enum(['dark', 'light']).optional().describe('Color theme. Default dark.'),
1385
- width: z.number().optional().describe('Image width in pixels. Default 1080.'),
1386
- card_height: z.number().optional().describe('Card height in pixels (carousel) or viewport height (scroll baseline). Default 1920.'),
1387
- font_size: z.number().optional().describe('Base font size in pixels. Default 48.'),
1388
- },
1389
- async (args) => runRenderTextToImageTool(args)
1390
- );
1391
-
1392
- // ── render_html_to_image ───────────────────────────────────────────────────────
1393
- server.tool('render_html_to_image',
1394
- 'Render a raw HTML string to a PNG image by navigating to it as a local file:// page. Unlike evaluate_script+document.write on about:blank, this preserves file:// origin so <img src="file:///..."> references load correctly. Returns the output image path.',
1395
- {
1396
- html: z.string().describe('Full HTML document to render (including <!doctype>, <html>, <head>, <body>).'),
1397
- output_path: z.string().optional().describe('Absolute path to save the PNG. Auto-generated in /tmp if omitted.'),
1398
- viewport_width: z.number().optional().describe('Viewport width in pixels. Default 1080.'),
1399
- viewport_height: z.number().optional().describe('Viewport height in pixels. Default 1920.'),
1400
- wait_until: z.enum(['load', 'networkidle', 'domcontentloaded']).optional().describe('Navigation wait condition. Default load.'),
1401
- },
1402
- async (args) => runRenderHtmlToImageTool(args)
1403
- );
1404
-
1405
- // ── synthesize_tts ─────────────────────────────────────────────────────────────
1406
- server.tool('synthesize_tts',
1407
- 'Convert text to speech using the workspace MiniMax TTS credential. Returns a local mp3 file path and duration. Use this to generate narration audio for individual video segments.',
1408
- {
1409
- text: z.string().describe('Text to synthesize. Keep under 500 characters per call for reliable results.'),
1410
- voice_id: z.string().optional().describe('MiniMax voice ID. Omit to use the workspace default voice.'),
1411
- workspace_id: z.string().optional().describe('Target workspace. Defaults to current workspace context.'),
1412
- },
1413
- async (args) => runSynthesisTtsTool({ ...args, currentWorkspaceId, api })
1414
- );
1415
-
1416
- // ── plan_video_segments ────────────────────────────────────────────────────────
1417
- // Session-scoped flag set when plan_video_segments runs. compose_video_v2
1418
- // refuses TTS-bearing segments unless this is true — the agent must route
1419
- // audio through plan_video_segments first so durations / subtitle_text /
1420
- // audio_path are mechanically aligned. This is a per-chat-bridge-process
1421
- // flag, so a fresh codex session must call plan_video_segments fresh.
1422
- let _planVideoSegmentsCalledThisSession = false;
1423
-
1424
- server.tool('plan_video_segments',
1425
- 'Universal audio-video sync planning step. For each segment, call TTS to get the real audio duration, then compute the visual duration with a safety buffer. Returns a planned segments array ready to pass directly to compose_video_v2 (with audio_path, presentation.duration/per_card_duration, and subtitle_text pre-filled). Always call this before compose_video_v2 when you have narration text.',
1426
- {
1427
- segments: z.array(z.object({
1428
- text: z.string().describe('Narration text for this segment. TTS will be generated from this.'),
1429
- visual_kind: z.enum(['image', 'video', 'gif', 'carousel']).describe('Type of visual.'),
1430
- visual_path: z.string().optional().describe('Absolute path to a single image, video, or gif file.'),
1431
- visual_paths: z.array(z.string()).optional().describe('For carousel: array of image paths, one per card.'),
1432
- transition: z.enum(['cut', 'fade', 'crossfade']).optional().describe('Transition to next segment. Default cut.'),
1433
- presentation: z.object({
1434
- style: z.enum(['static', 'scroll']).optional(),
1435
- }).optional().describe('Partial presentation hints (style only). duration/per_card_duration are computed from TTS.'),
1436
- })).describe('Segments to plan. Each must have narration text and visual info.'),
1437
- voice_id: z.string().optional().describe('TTS voice ID. Omit to use workspace default.'),
1438
- workspace_id: z.string().optional().describe('Target workspace. Defaults to current workspace context.'),
1439
- },
1440
- async (args) => {
1441
- const result = await runPlanVideoSegmentsTool({ ...args, currentWorkspaceId, api });
1442
- if (!result?.isError) _planVideoSegmentsCalledThisSession = true;
1443
- return result;
1444
- }
1445
- );
1446
-
1447
- // ── compose_video_v2 ───────────────────────────────────────────────────────────
1448
- server.tool('compose_video_v2',
1449
- 'Compose a video from a list of segments using ffmpeg. Each segment has a visual source (image/scroll/carousel/video/gif), optional audio, and optional subtitle text. Subtitles are burned into the video by default when subtitle_text is provided. Segments are concatenated in order; outro clips are appended at the end. Returns a local mp4 path.\n\nTypical flow: plan_video_segments → compose_video_v2 (segments output fed directly in).',
1450
- {
1451
- segments: z.array(z.object({
1452
- visual_path: z.string().optional().describe('Absolute path to a single image, video, or gif file.'),
1453
- visual_paths: z.array(z.string()).optional().describe('For carousel: array of image paths, one per card.'),
1454
- visual_kind: z.enum(['image', 'video', 'gif', 'carousel']).describe('Type of visual.'),
1455
- presentation: z.object({
1456
- style: z.enum(['static', 'scroll']).optional().describe('For image: static (default) or scroll (pan upward).'),
1457
- duration: z.number().optional().describe('Segment duration in seconds. Required for image/scroll.'),
1458
- per_card_duration: z.number().optional().describe('Seconds per card for carousel.'),
1459
- }).optional(),
1460
- audio_path: z.string().nullable().optional().describe('Absolute path to audio (mp3). null or omit for silence.'),
1461
- subtitle_text: z.string().optional().describe('Narration text to burn as subtitle for this segment. Displayed for the full segment duration.'),
1462
- transition: z.enum(['cut', 'fade', 'crossfade']).optional().describe('Transition to next segment. Default cut.'),
1463
- })).describe('Ordered list of video segments.'),
1464
- outro_paths: z.array(z.string()).optional().describe('Absolute paths to outro video clips appended after all segments.'),
1465
- resolution: z.string().optional().describe('Output resolution WxH. Default "1080x1920".'),
1466
- output_path: z.string().optional().describe('Absolute output path for the mp4. Auto-generated if omitted.'),
1467
- },
1468
- async (args) => {
1469
- // Tool-level enforcement of the synthesize_tts → plan_video_segments →
1470
- // compose_video_v2 standard chain. If any segment has audio_path (i.e.
1471
- // narration is involved) and the agent never invoked plan_video_segments
1472
- // in this session, refuse the compose — manual dwell/duration math is
1473
- // unreliable (last syllable cut, silent tails, subtitle drift). Observed
1474
- // twice in row: agent skipped plan_video_segments, manually estimated
1475
- // dwell_ms wrong, ended up with too-long records and silent tails it then
1476
- // re-recorded to fix — wasting record_url_narration runs that
1477
- // plan_video_segments would have prevented.
1478
- const segments = Array.isArray(args?.segments) ? args.segments : [];
1479
- const hasNarration = segments.some(s => typeof s?.audio_path === 'string' && s.audio_path.trim());
1480
- if (hasNarration && !_planVideoSegmentsCalledThisSession) {
1481
- return {
1482
- isError: true,
1483
- content: [{
1484
- type: 'text',
1485
- text: 'compose_video_v2 refused: TTS-bearing segments (audio_path present) require plan_video_segments '
1486
- + 'to have run earlier in this session — it mechanically aligns audio_duration / video_duration / '
1487
- + 'subtitle_text with a safety buffer. Manual dwell/duration math has repeatedly produced misaligned '
1488
- + 'subtitles and silent tails that force re-recording.\n\n'
1489
- + 'Standard chain: synthesize_tts(per segment) → plan_video_segments(with text+visual_kind+visual_path) '
1490
- + '→ compose_video_v2(use the returned segments verbatim, only swap visual_path/visual_kind for real '
1491
- + 'media). Call plan_video_segments now and pass its output here.',
1492
- }],
1493
- };
1494
- }
1495
- return runComposeVideoV2Tool({ ...args, workspaceDir: WORKSPACE_DIR });
1496
- }
1497
- );
1498
-
1499
- // ── take_page_screenshot ───────────────────────────────────────────────────────
1500
- server.tool('take_page_screenshot',
1501
- 'Open a URL with a headless browser and capture a screenshot. crop=above_fold captures only the visible viewport (ideal for thumbnail-style frames); crop=full_page captures the entire page height.',
1502
- {
1503
- url: z.string().describe('Page URL to screenshot.'),
1504
- crop: z.enum(['above_fold', 'full_page']).optional().describe('Capture mode. Default above_fold.'),
1505
- viewport: z.object({
1506
- width: z.number().optional(),
1507
- height: z.number().optional(),
1508
- }).optional().describe('Viewport size. Default 390×844 (mobile).'),
1509
- wait_for: z.enum(['load', 'networkidle', 'domcontentloaded']).optional().describe('Page load event to wait for before screenshotting. Default networkidle.'),
1510
- },
1511
- async (args) => runTakePageScreenshotTool(args)
1512
- );
1297
+ // render_text_to_image, render_html_to_image, take_page_screenshot, synthesize_tts,
1298
+ // plan_video_segments, compose_video_v2, record_url_narration — all moved to
1299
+ // media-tools MCP server (V1–V5 migration; see
1300
+ // docs/scenario-content-creation/video-synthesis-design.md). The whole video
1301
+ // pipeline now lives in one stdio server so session-scoped flags (plan-was-
1302
+ // called) can gate downstream tools.
1513
1303
 
1514
1304
  // ── get_library_file ───────────────────────────────────────────────────────────
1515
1305
  server.tool('get_library_file',
@@ -1521,48 +1311,10 @@ server.tool('get_library_file',
1521
1311
  async (args) => runGetLibraryFileTool({ ...args, currentWorkspaceId, api, SERVER_URL, MACHINE_API_KEY, workspaceDir: WORKSPACE_DIR })
1522
1312
  );
1523
1313
 
1524
- // ── record_url_narration ────────────────────────────────────────────────────────
1525
- server.tool('record_url_narration',
1526
- 'Record a silent video of a URL by driving Chromium on an Xvfb display and capturing it with Playwright recordVideo, driven by a video plan; ffmpeg then transcodes the recording to mp4. Outputs a silent mp4 that can be passed to compose_video_v2 as a video-kind segment with an audio_path for narration.\n\nUse this as the canonical recording step for URL-narration videos. Falls back: if the page needs interactions outside the visual_action vocabulary (clicks, waits, OCR loops), use Monitor (Bash) with custom Playwright instead.\n\nRuntime requirements: this tool only works on a Linux daemon machine with Xvfb + Chromium + ffmpeg installed (ffmpeg is used to transcode the recording to mp4; no x11grab device support needed). macOS / Windows daemons will fail at startup.',
1527
- {
1528
- url: z.string().describe('Page URL to record'),
1529
- plan: z.record(z.any()).describe(
1530
- 'A video plan: an object with `phases` (or `sections`), each a "visual beat" with '
1531
- + '`action` (scroll_to_dwell / linear_scroll_during / scroll_back / hold / ...), a '
1532
- + 'target (`target_y` or `focus_region:[y1,y2]`) for scroll-type actions, and '
1533
- + '`dwell_ms` (how long to hold that beat — should match the segment\'s TTS duration).\n\n'
1534
- + 'For RECRUITMENT URLs (mp.weixin.qq.com / 校招 / 实习 / 岗位 content), each section MUST '
1535
- + 'also declare `target_y_content_label` — a short Chinese label describing what content '
1536
- + 'sits at that pixel y position on the page (e.g. "标题区" / "岗位信息卡片" / "公司介绍" / '
1537
- + '"届别说明"). Look at the take_page_screenshot output, find the y-pixel, and label it. '
1538
- + 'Labels matching forbidden regions ("二维码" / "扫码" / "投递入口" / "投递方式" / "联系方式" / '
1539
- + '"微信号" / "QR" / "阅读原文" / "外链") will cause the tool to refuse the recording — '
1540
- + 'recruitment content must NOT dwell on these areas (see fragments.md '
1541
- + 'frag.short.recruitment_url_mode_policy). Pick a different target_y in the 标题/岗位 '
1542
- + 'information area and rewrite that section.'
1543
- ),
1544
- output_path: z.string().optional().describe('Workspace-relative output mp4 path. Default tmp/wx3_video/recorded-{ts}.mp4'),
1545
- events_path: z.string().optional().describe('Workspace-relative events.json path. Default ${output_path}.events.json'),
1546
- viewport: z.object({
1547
- width: z.number().optional(),
1548
- height: z.number().optional(),
1549
- }).optional().describe('Default 1080x1920 (mobile portrait). Override only if the plan requires a different shape.'),
1550
- fps: z.number().optional().describe('Default 30. Do not change unless needed.'),
1551
- settle_ms: z.number().optional().describe('Default 4000. Settle wait after navigation before recording starts.'),
1552
- },
1553
- async (args) => {
1554
- if (isBlockedCvmaxEditorVideoTool('record_url_narration')) {
1555
- return cvmaxEditorVideoToolError('record_url_narration');
1556
- }
1557
- return runRecordUrlNarrationTool({
1558
- args,
1559
- currentWorkspaceId,
1560
- workspaceDir: WORKSPACE_DIR,
1561
- runMandatoryLocalToolFn: runMandatoryLocalTool,
1562
- recordUrlNarrationFn: recordUrlNarration,
1563
- });
1564
- }
1565
- );
1314
+ // record_url_narration moved to media-tools MCP server (V4 migration). The
1315
+ // session-scoped plan_video_segments check now lives there alongside the
1316
+ // other video pipeline tools (synthesize_tts / plan_video_segments /
1317
+ // compose_video_v2). The CVMAX editor_in_chief block also moved with it.
1566
1318
 
1567
1319
  // ── submit_to_library ──────────────────────────────────────────────────────────
1568
1320
  server.tool('submit_to_library',
package/src/mcp-config.js CHANGED
@@ -89,6 +89,11 @@ const SERVER_BACKED_MCP_SERVERS = new Set([
89
89
  'audience-research',
90
90
  'hook-pattern-library',
91
91
  'weixin-tools',
92
+ // media-tools (V1–V5 chat-bridge → media-tools migration): synthesize_tts
93
+ // hits /tts/voiceover and the CvMax editor_in_chief gate + video-brief
94
+ // checks read CURRENT_AGENT_ID / CURRENT_WORKSPACE_ID. lib/lightcone-api.js
95
+ // throws at module load without the SERVER_URL/MACHINE_API_KEY/AGENT_ID triple.
96
+ 'media-tools',
92
97
  ]);
93
98
 
94
99
  function baseEnvForServer(serverKey, { serverUrl, authToken, agentId, workspaceId, workspaceDir }) {
@@ -1,7 +1,22 @@
1
- import { mkdirSync, writeFileSync } from 'fs';
2
- import { randomUUID } from 'crypto';
3
- import path from 'path';
4
- import os from 'os';
1
+ // plan_video_segments pure audio/video alignment planner.
2
+ //
3
+ // Takes per-segment {text, audio_path, visual_kind, ...} and returns unified
4
+ // plan segments with:
5
+ // - audio_duration_ms (read via ffprobe from the provided audio_path)
6
+ // - subtitle_text (= text)
7
+ // - presentation.duration / per_card_duration (audio_duration + buffer)
8
+ // - dwell_ms (= audio_duration; lets the same segment drive record_url_narration)
9
+ //
10
+ // Previously this tool ALSO synthesized TTS internally — which duplicated
11
+ // the work when callers had already run synthesize_tts, and caused the
12
+ // "wrong standard chain" confusion in fragments.md. TTS is now decoupled:
13
+ // callers must run synthesize_tts per segment first and pass the resulting
14
+ // audio_path here. See docs/scenario-content-creation/video-synthesis-design.md.
15
+ //
16
+ // Lives in daemon/src/tools/ as a reusable handler module; the MCP-tool
17
+ // registration lives in daemon/mcp-servers/official/media-tools/index.js.
18
+
19
+ import { spawn } from 'node:child_process';
5
20
 
6
21
  function toolText(text) {
7
22
  return { content: [{ type: 'text', text }] };
@@ -11,125 +26,99 @@ function toolError(text) {
11
26
  return { isError: true, content: [{ type: 'text', text }] };
12
27
  }
13
28
 
14
- function inferAudioExt(url) {
15
- const clean = String(url ?? '').split('?')[0];
16
- const ext = path.extname(clean).toLowerCase();
17
- return ['.mp3', '.wav', '.flac', '.aac', '.ogg'].includes(ext) ? ext : '.mp3';
18
- }
19
-
20
- async function synthesizeSegmentTts(text, { workspace_id, voice_id, api }) {
21
- const payload = { workspace_id, text, speed: 1, format: 'mp3' };
22
- if (voice_id) payload.voice_preset = String(voice_id).trim();
23
-
24
- const data = await api('POST', '/tts/voiceover', payload);
25
- const remoteAudioUrl = String(data.audio_url ?? '').trim();
26
- if (!remoteAudioUrl) throw new Error('TTS API did not return audio_url');
27
-
28
- const downloadRes = await fetch(remoteAudioUrl);
29
- if (!downloadRes.ok) throw new Error(`Failed to download audio (${downloadRes.status})`);
30
-
31
- const fileBuffer = Buffer.from(await downloadRes.arrayBuffer());
32
- const outDir = path.join(os.tmpdir(), 'lightcone-tts');
33
- mkdirSync(outDir, { recursive: true });
34
- const ext = inferAudioExt(remoteAudioUrl);
35
- const outPath = path.join(outDir, `tts-${Date.now()}-${randomUUID().slice(0, 8)}${ext}`);
36
- writeFileSync(outPath, fileBuffer);
37
-
38
- const durationMs = Number(data.duration_ms ?? 0);
39
- return { audio_path: outPath, audio_duration_ms: durationMs };
29
+ async function probeAudioDurationMs(audioPath) {
30
+ return new Promise((resolve, reject) => {
31
+ const proc = spawn('ffprobe', [
32
+ '-v', 'error',
33
+ '-show_entries', 'format=duration',
34
+ '-of', 'csv=p=0',
35
+ audioPath,
36
+ ], { stdio: ['ignore', 'pipe', 'pipe'] });
37
+ const out = [];
38
+ const err = [];
39
+ proc.stdout.on('data', c => out.push(c));
40
+ proc.stderr.on('data', c => err.push(c));
41
+ proc.on('close', code => {
42
+ if (code !== 0) {
43
+ return reject(new Error(`ffprobe exited ${code}: ${Buffer.concat(err).toString().slice(-300)}`));
44
+ }
45
+ const seconds = parseFloat(Buffer.concat(out).toString().trim());
46
+ if (!Number.isFinite(seconds)) {
47
+ return reject(new Error(`ffprobe returned non-numeric duration: ${Buffer.concat(out).toString().slice(0, 200)}`));
48
+ }
49
+ resolve(Math.round(seconds * 1000));
50
+ });
51
+ proc.on('error', err2 => reject(new Error(`ffprobe spawn failed: ${err2.message}`)));
52
+ });
40
53
  }
41
54
 
42
- // Compute segment duration from audio duration: audio + 0.5s buffer, rounded up to nearest 0.5s.
55
+ // Plan visual duration from audio duration: audio + buffer, rounded up to the
56
+ // nearest 0.5s. scroll-style images get a longer buffer because eyes need
57
+ // extra time to follow the motion.
43
58
  function planDurationSec(audioDurationMs, bufferSec = 0.5) {
44
59
  const raw = audioDurationMs / 1000 + bufferSec;
45
- return Math.ceil(raw * 2) / 2; // round up to nearest 0.5s
60
+ return Math.ceil(raw * 2) / 2;
46
61
  }
47
62
 
48
- // Run fn over items with a bounded number of concurrent workers (FIFO drain).
49
- async function mapWithConcurrency(items, limit, fn) {
50
- const queue = items.map((item, index) => ({ item, index }));
51
- const workers = Array.from({ length: Math.max(1, Math.min(limit, queue.length)) }, async () => {
52
- while (queue.length > 0) {
53
- const next = queue.shift();
54
- await fn(next.item, next.index);
55
- }
56
- });
57
- await Promise.all(workers);
58
- }
59
-
60
- const TTS_CONCURRENCY = 5;
61
-
62
- export async function runPlanVideoSegmentsTool({ segments, workspace_id, voice_id, currentWorkspaceId, api }) {
63
+ export async function runPlanVideoSegmentsTool({ segments } = {}) {
63
64
  if (!Array.isArray(segments) || segments.length === 0) {
64
65
  return toolError('segments must be a non-empty array.');
65
66
  }
66
67
 
67
- const targetWorkspaceId = String(workspace_id ?? currentWorkspaceId ?? '').trim();
68
- if (!targetWorkspaceId) {
69
- return toolError('workspace_id is required (no current workspace context).');
68
+ // Up-front validation fail fast before any work.
69
+ for (let i = 0; i < segments.length; i += 1) {
70
+ const seg = segments[i] ?? {};
71
+ if (typeof seg.audio_path !== 'string' || !seg.audio_path.trim()) {
72
+ return toolError(
73
+ `segments[${i}]: audio_path is required. plan_video_segments no longer synthesizes TTS — call synthesize_tts(text) `
74
+ + 'first and pass the returned path as audio_path. Standard chain: synthesize_tts × N → plan_video_segments → '
75
+ + 'record_url_narration + compose_video_v2 (share the same plan).'
76
+ );
77
+ }
78
+ const kind = String(seg.visual_kind ?? '');
79
+ if (!kind) {
80
+ return toolError(`segments[${i}]: visual_kind is required (image / video / gif / carousel).`);
81
+ }
70
82
  }
71
83
 
72
84
  const planned = [];
73
- const errors = [];
85
+ const warnings = [];
86
+
87
+ for (let i = 0; i < segments.length; i += 1) {
88
+ const seg = segments[i];
89
+ const text = String(seg.text ?? '').trim();
90
+ const kind = String(seg.visual_kind);
74
91
 
75
- // Synthesize TTS for every text-bearing segment up front, in parallel (bounded),
76
- // so an N-segment plan no longer pays N sequential round-trips to the TTS API.
77
- const audioResults = new Array(segments.length).fill(null);
78
- const ttsJobs = segments
79
- .map((seg, i) => ({ i, text: String(seg.text ?? '').trim() }))
80
- .filter(job => job.text);
81
- await mapWithConcurrency(ttsJobs, TTS_CONCURRENCY, async ({ i, text }) => {
92
+ let audioDurationMs;
82
93
  try {
83
- audioResults[i] = await synthesizeSegmentTts(text, { workspace_id: targetWorkspaceId, voice_id, api });
94
+ audioDurationMs = await probeAudioDurationMs(seg.audio_path);
84
95
  } catch (err) {
85
- errors.push(`segments[${i}]: TTS failed ${err.message}`);
86
- audioResults[i] = { audio_path: null, audio_duration_ms: 3000 }; // fallback estimate
96
+ warnings.push(`segments[${i}]: audio probe failed (${err.message}); falling back to 3000ms`);
97
+ audioDurationMs = 3000;
87
98
  }
88
- });
89
- errors.sort((a, b) => {
90
- const na = Number((a.match(/segments\[(\d+)\]/) ?? [])[1] ?? 0);
91
- const nb = Number((b.match(/segments\[(\d+)\]/) ?? [])[1] ?? 0);
92
- return na - nb;
93
- });
94
99
 
95
- for (let i = 0; i < segments.length; i++) {
96
- const seg = segments[i];
97
- const text = String(seg.text ?? '').trim();
98
- const kind = String(seg.visual_kind ?? 'image');
99
- const audioResult = audioResults[i];
100
-
101
- const audioDurationMs = audioResult?.audio_duration_ms ?? 0;
102
100
  let presentation;
103
-
104
101
  if (kind === 'carousel') {
105
102
  const numCards = Array.isArray(seg.visual_paths) ? seg.visual_paths.length : 1;
106
- const totalDuration = audioDurationMs > 0 ? planDurationSec(audioDurationMs) : numCards * 4;
103
+ const totalDuration = planDurationSec(audioDurationMs);
107
104
  const perCard = Math.max(2, Math.ceil((totalDuration / numCards) * 2) / 2);
108
105
  presentation = { per_card_duration: perCard };
109
106
  } else {
110
- // image, scroll, video, gif
111
- const duration = audioDurationMs > 0 ? planDurationSec(audioDurationMs, kind === 'scroll' ? 1.0 : 0.5) : 4;
107
+ // image / scroll / video / gif
108
+ const duration = planDurationSec(audioDurationMs, kind === 'scroll' ? 1.0 : 0.5);
112
109
  presentation = { duration, ...(kind === 'scroll' ? { style: 'scroll' } : {}) };
113
110
  }
114
111
 
115
- // dwell_ms lets the same segment double as a record_url_narration plan phase
116
- // (the recorder reads dwell_ms / duration_ms for how long to hold each beat).
117
- // Prefer the real measured audio length; fall back to the planned visual duration.
118
- const dwellMs = audioDurationMs > 0
119
- ? audioDurationMs
120
- : Math.round((presentation.duration ?? presentation.per_card_duration ?? 4) * 1000);
121
-
122
- const planned_seg = {
112
+ planned.push({
123
113
  ...seg,
124
- ...(audioResult?.audio_path ? { audio_path: audioResult.audio_path } : {}),
114
+ audio_path: seg.audio_path,
115
+ audio_duration_ms: audioDurationMs,
125
116
  ...(text ? { subtitle_text: text } : {}),
126
117
  presentation: { ...presentation, ...(seg.presentation ?? {}) },
127
- dwell_ms: seg.dwell_ms ?? dwellMs,
128
- };
129
- if (audioResult?.audio_duration_ms) {
130
- planned_seg.audio_duration_ms = audioResult.audio_duration_ms;
131
- }
132
- planned.push(planned_seg);
118
+ // dwell_ms doubles as record_url_narration's per-phase hold duration so
119
+ // recording naturally tracks the narration audio.
120
+ dwell_ms: seg.dwell_ms ?? audioDurationMs,
121
+ });
133
122
  }
134
123
 
135
124
  const result = {
@@ -141,7 +130,7 @@ export async function runPlanVideoSegmentsTool({ segments, workspace_id, voice_i
141
130
  : (s.presentation?.duration ?? 4);
142
131
  return sum + Math.round(d * 1000);
143
132
  }, 0),
144
- ...(errors.length > 0 ? { warnings: errors } : {}),
133
+ ...(warnings.length > 0 ? { warnings } : {}),
145
134
  };
146
135
 
147
136
  return toolText(JSON.stringify(result, null, 2));
@@ -1,6 +1,23 @@
1
+ // record_url_narration — atomic recording tool.
2
+ //
3
+ // Drives Chromium on Xvfb + Playwright recordVideo to capture a silent mp4 of
4
+ // a URL following a beat-by-beat visual plan, then ffmpeg-transcodes it. The
5
+ // resulting silent mp4 feeds into compose_video_v2 as a video-kind segment
6
+ // alongside narration audio.
7
+ //
8
+ // Lives in daemon/src/tools/ as a reusable handler module; the MCP-tool
9
+ // registration lives in daemon/mcp-servers/official/media-tools/index.js.
10
+ // Migrated out of chat-bridge.js (V4) — no longer wrapped by
11
+ // runMandatoryLocalTool / governance round-trip. media-tools is a separate
12
+ // stdio MCP server and governance integration is chat-bridge-specific;
13
+ // matches the precedent set by synthesize_tts / plan_video_segments /
14
+ // compose_video_v2 in V1/V2/V3.
15
+
1
16
  import { mkdirSync } from 'fs';
2
17
  import path from 'path';
3
18
 
19
+ import { recordUrlNarration as defaultRecordUrlNarrationFn } from '../_vendor/video/recorder/index.js';
20
+
4
21
  function toolText(text) {
5
22
  return { content: [{ type: 'text', text }] };
6
23
  }
@@ -55,10 +72,6 @@ function derivePhaseCount({ plan, recorderOutput }) {
55
72
  return segments ? segments.length : null;
56
73
  }
57
74
 
58
- // record_url_narration is an atomic tool, not the tail of a fixed pipeline.
59
- // The plan may be hand-written by the scripter or produced by plan_video_segments;
60
- // it just needs a non-empty list of segments with per-segment visual action + duration
61
- // so the recording stays in sync with the narration audio.
62
75
  function assertPipelineCompliance(plan) {
63
76
  if (!isPlainObject(plan)) return;
64
77
  if (!planSegments(plan)) {
@@ -86,10 +99,6 @@ const FORBIDDEN_REGION_PATTERNS = [
86
99
  ];
87
100
 
88
101
  function isRecruitmentLikeUrl(url) {
89
- // Conservative URL-based heuristic: mp.weixin.qq.com pages forwarding 招聘 /
90
- // 校招 / 实习 / job content. Until we have content classification, treat
91
- // mp.weixin.qq.com URLs as recruitment-class for safety — the cost of a
92
- // mis-flag is "agent must add a label", not "recording fails permanently".
93
102
  if (typeof url !== 'string') return false;
94
103
  return /mp\.weixin\.qq\.com/.test(url);
95
104
  }
@@ -101,11 +110,6 @@ function describeForbiddenMatch(label) {
101
110
  return null;
102
111
  }
103
112
 
104
- /**
105
- * For recruitment-class URLs, every plan section must declare what content
106
- * sits at its target_y, and the label must NOT match the forbidden-region
107
- * patterns. Returns null on pass, error message string on fail.
108
- */
109
113
  function checkSafeRegionLabels({ url, plan }) {
110
114
  if (!isRecruitmentLikeUrl(url)) return null;
111
115
  const segments = planSegments(plan);
@@ -181,16 +185,13 @@ export async function runRecordUrlNarrationTool({
181
185
  args = {},
182
186
  currentWorkspaceId = '',
183
187
  workspaceDir = process.cwd(),
184
- runMandatoryLocalToolFn,
185
- recordUrlNarrationFn,
188
+ planVideoSegmentsCalled = false,
189
+ recordUrlNarrationFn = defaultRecordUrlNarrationFn,
186
190
  nowMs = () => Date.now(),
187
191
  } = {}) {
188
192
  if (!currentWorkspaceId) {
189
193
  return toolError('No workspace context.');
190
194
  }
191
- if (typeof runMandatoryLocalToolFn !== 'function') {
192
- return toolError('Error: record_url_narration runMandatoryLocalToolFn is required.');
193
- }
194
195
  if (typeof recordUrlNarrationFn !== 'function') {
195
196
  return toolError('Error: record_url_narration executor is required.');
196
197
  }
@@ -220,51 +221,53 @@ export async function runRecordUrlNarrationTool({
220
221
  return toolError(`Error: ${safeRegionError}`);
221
222
  }
222
223
 
224
+ // Standard-chain hard block: refuse recordings unless plan_video_segments
225
+ // ran in this session. Discovered repeatedly in Tasks #20/#25/#26 that
226
+ // agents hand-write dwell_ms by guessing, producing recordings whose phase
227
+ // boundaries drift from the TTS audio they will eventually be paired with —
228
+ // forcing a full re-record. plan_video_segments fills dwell_ms mechanically
229
+ // from ffprobe audio duration, eliminating the drift.
230
+ if (!planVideoSegmentsCalled) {
231
+ return toolError(
232
+ 'Error: record_url_narration refused: plan_video_segments must run earlier in this '
233
+ + 'session so dwell_ms / phase durations are mechanically aligned with the segment\'s '
234
+ + 'TTS audio (audio_duration_ms). Hand-written dwell_ms has repeatedly drifted from '
235
+ + 'the actual TTS duration in production runs, forcing full re-records.\n\n'
236
+ + 'Standard chain: synthesize_tts × N (per segment) → plan_video_segments(segments with '
237
+ + 'text + audio_path + visual_kind=video + visual_path) → record_url_narration (feed '
238
+ + 'plan_video_segments output as plan.sections — each section\'s dwell_ms is already '
239
+ + 'set to audio_duration_ms) + compose_video_v2 (same plan output). Call plan_video_segments '
240
+ + 'now, then pass its `segments` array as `plan.sections` here.'
241
+ );
242
+ }
243
+
223
244
  try {
224
- const result = await runMandatoryLocalToolFn({
225
- toolName: 'record_url_narration',
226
- toolInput: validatedInput,
227
- executor: async (checkedInput = {}) => {
228
- const mergedInput = {
229
- ...validatedInput,
230
- ...checkedInput,
231
- };
232
- const finalInput = validateRecordUrlNarrationArgs(mergedInput);
233
- const { resolvedOutputPath, resolvedEventsPath } = resolveRecordUrlNarrationPaths({
234
- workspaceDir,
235
- outputPath: finalInput.output_path,
236
- eventsPath: finalInput.events_path,
237
- nowMs,
238
- });
239
-
240
- mkdirSync(path.dirname(resolvedOutputPath), { recursive: true });
241
- mkdirSync(path.dirname(resolvedEventsPath), { recursive: true });
242
-
243
- const recorderOutput = await recordUrlNarrationFn({
244
- url: finalInput.url,
245
- plan: finalInput.plan,
246
- output_path: resolvedOutputPath,
247
- events_path: resolvedEventsPath,
248
- viewport: finalInput.viewport,
249
- fps: finalInput.fps,
250
- settle_ms: finalInput.settle_ms,
251
- });
252
-
253
- return {
254
- videoPath: resolvedOutputPath,
255
- eventsPath: resolvedEventsPath,
256
- durationMs: deriveDurationMs(recorderOutput),
257
- phases: derivePhaseCount({ plan: finalInput.plan, recorderOutput }),
258
- };
259
- },
245
+ const { resolvedOutputPath, resolvedEventsPath } = resolveRecordUrlNarrationPaths({
246
+ workspaceDir,
247
+ outputPath: validatedInput.output_path,
248
+ eventsPath: validatedInput.events_path,
249
+ nowMs,
250
+ });
251
+
252
+ mkdirSync(path.dirname(resolvedOutputPath), { recursive: true });
253
+ mkdirSync(path.dirname(resolvedEventsPath), { recursive: true });
254
+
255
+ const recorderOutput = await recordUrlNarrationFn({
256
+ url: validatedInput.url,
257
+ plan: validatedInput.plan,
258
+ output_path: resolvedOutputPath,
259
+ events_path: resolvedEventsPath,
260
+ viewport: validatedInput.viewport,
261
+ fps: validatedInput.fps,
262
+ settle_ms: validatedInput.settle_ms,
260
263
  });
261
264
 
262
265
  return toolText(
263
266
  `Recorded URL narration.\n`
264
- + `video_path=${result.videoPath}\n`
265
- + `events_path=${result.eventsPath}\n`
266
- + `duration_ms=${result.durationMs ?? 'unknown'}\n`
267
- + `phases=${result.phases ?? 'n/a'}`
267
+ + `video_path=${resolvedOutputPath}\n`
268
+ + `events_path=${resolvedEventsPath}\n`
269
+ + `duration_ms=${deriveDurationMs(recorderOutput) ?? 'unknown'}\n`
270
+ + `phases=${derivePhaseCount({ plan: validatedInput.plan, recorderOutput }) ?? 'n/a'}`
268
271
  );
269
272
  } catch (error) {
270
273
  return toolError(`Error: ${error.message}`);
@@ -0,0 +1,78 @@
1
+ // Cross-process flag for "the agent has sent a video-production 确认稿 (brief)
2
+ // to the user in this workspace+agent context recently". chat-bridge's
3
+ // send_message tool writes the flag when the outgoing message heuristically
4
+ // looks like a confirmation brief; media-tools' compose_video_v2 and
5
+ // record_url_narration read the flag and refuse to proceed without it.
6
+ //
7
+ // Why a file flag instead of in-process state: send_message lives in
8
+ // chat-bridge (one stdio MCP server), compose_video_v2/record_url_narration
9
+ // live in media-tools (a different stdio MCP server, same machine). Both are
10
+ // spawned by the same codex CLI session per agent, so they share env (notably
11
+ // AGENT_ID / WORKSPACE_ID) but not memory. A flag file under ~/.lightcone is
12
+ // the simplest cross-process medium and survives short-lived MCP restarts.
13
+ //
14
+ // The heuristic is intentionally specific (asks-permission marker AND
15
+ // 2+ plan-describing markers) so casual progress reports like "已生成 TTS"
16
+ // or "画面已就绪" do NOT satisfy it. A motivated agent could game the
17
+ // detection by stuffing keywords into any send_message, but the default
18
+ // codex behavior (which silently skipped the soft prompt rule) is what we
19
+ // need to interrupt — and gaming is observable in chat history.
20
+
21
+ import { mkdirSync, statSync, utimesSync, writeFileSync, existsSync } from 'node:fs';
22
+ import path from 'node:path';
23
+ import os from 'node:os';
24
+
25
// How long a written brief flag counts as "fresh" for hasFreshVideoBrief.
const TTL_MS = 6 * 60 * 60 * 1000; // 6 hours

// Flag file name, created under ~/.lightcone/sessions/<workspace>/<agent>/
// (see flagDir / flagPath below).
const FILE_NAME = 'video-brief-sent.flag';
+
28
// Per-(workspace, agent) session directory under the user's home where the
// brief flag lives: ~/.lightcone/sessions/<workspaceId>/<agentId>.
function flagDir(workspaceId, agentId) {
  const sessionsRoot = path.join(os.homedir(), '.lightcone', 'sessions');
  return path.join(sessionsRoot, workspaceId, agentId);
}
31
+
32
// Absolute path of the flag file for a (workspace, agent) pair.
function flagPath(workspaceId, agentId) {
  const dir = flagDir(workspaceId, agentId);
  return path.join(dir, FILE_NAME);
}
35
+
36
// Record that a video-production brief was just sent for this
// workspace+agent: write (a truncated copy of) the message into the flag
// file and stamp its mtime to "now". Silently a no-op when either id is
// missing — callers treat the flag as best-effort.
export function markVideoBriefSent({ workspaceId, agentId, content }) {
  if (!workspaceId || !agentId) return;
  mkdirSync(flagDir(workspaceId, agentId), { recursive: true });
  const file = flagPath(workspaceId, agentId);
  // Keep only the first 4 KiB of the message — the flag's freshness (mtime)
  // is what matters; the content is just a debugging aid.
  writeFileSync(file, String(content ?? '').slice(0, 4096));
  const stamp = new Date();
  utimesSync(file, stamp, stamp);
}
45
+
46
/**
 * True when a brief flag exists for this workspace+agent and its mtime is
 * within ttlMs (default TTL_MS, 6h) of now; false otherwise, including when
 * either id is missing or the file cannot be stat'ed.
 *
 * Fix: the original checked existsSync(p) before statSync(p) — a TOCTOU
 * race (the file can disappear between the two calls) plus a redundant
 * stat syscall. statSync inside the try/catch already yields `false` for a
 * missing file, so the pre-check is dropped; behavior is otherwise
 * unchanged.
 */
export function hasFreshVideoBrief({ workspaceId, agentId, ttlMs = TTL_MS } = {}) {
  if (!workspaceId || !agentId) return false;
  try {
    const st = statSync(flagPath(workspaceId, agentId));
    return (Date.now() - st.mtimeMs) <= ttlMs;
  } catch {
    // ENOENT (no flag written yet) and any other stat failure both mean
    // "no fresh brief".
    return false;
  }
}
55
+
56
// Permission-asking markers — the message must ask the user to decide / OK.
// 确认 alone is too broad (matches "无需确认" / "已确认硬约束" / "确认收到"); require
// a specific permission-ask shape: 请确认 / 确认稿 / 你确认 / 确认[吗??] / 等确认.
// NOTE: `.` in these patterns does not match newlines, so e.g. 请…确认 must
// occur within a single line of the message.
const PERMISSION_MARKERS = [
  /请.*确认/, /你.*确认/, /确认\s*[吗??]/, /等.*确认/, /确认稿/,
  /你看/, /OK\s*吗/i, /可以吗/, /同意吗/, /通过吗/, /行不行/, /如何\?|如何?/,
];

// Plan-describing markers — the brief must cover at least 2 different aspects
// of the production plan (visuals, duration, copy, narration, subtitles,
// ordering, tone, storyboard, voice-over). Counted per-pattern, so the same
// aspect mentioned twice still scores once.
const PLAN_MARKERS = [
  /画面/, /时长/, /文案/, /口播/, /字幕/, /顺序/, /口吻/, /分镜/, /配音/,
];
68
+
69
// Heuristic classifier: does this outgoing message read like a
// video-production confirmation brief? Requires (1) non-trivial length,
// (2) at least one permission-asking marker, and (3) hits on 2+ distinct
// plan-describing markers — so casual progress reports do not qualify.
export function looksLikeVideoBrief(content) {
  if (typeof content !== 'string') return false;
  // Length floor of 20 — Chinese is character-dense; a terse but real brief
  // like "请确认:画面/时长/字幕已定。同意吗?" still clears ~20 chars.
  if (content.length < 20) return false;
  const asksPermission = PERMISSION_MARKERS.some((rx) => rx.test(content));
  if (!asksPermission) return false;
  let planAspects = 0;
  for (const rx of PLAN_MARKERS) {
    if (rx.test(content)) planAspects += 1;
  }
  return planAspects >= 2;
}