@lightcone-ai/daemon 0.17.1 → 0.18.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -6,13 +6,17 @@ import { z } from 'zod';
6
6
  import { addTitleEffects } from './lib/render.js';
7
7
  import { SUPPORTED_PRESETS } from './lib/presets.js';
8
8
  import { runSynthesisTtsTool } from '../../../src/tools/synthesize-tts.js';
9
+ import { runListTtsVoicesTool } from '../../../src/tools/list-tts-voices.js';
10
+ import { runPreviewTtsVoiceTool } from '../../../src/tools/preview-tts-voice.js';
11
+ import { runGetDefaultVoiceTool } from '../../../src/tools/get-default-voice.js';
12
+ import { runSetDefaultVoiceTool } from '../../../src/tools/set-default-voice.js';
9
13
  import { runPlanVideoSegmentsTool } from '../../../src/tools/plan-video-segments.js';
10
14
  import { runComposeVideoV2Tool } from '../../../src/tools/compose-video-v2.js';
11
15
  import { runRecordUrlNarrationTool } from '../../../src/tools/record-url-narration.js';
12
16
  import { runRenderTextToImageTool } from '../../../src/tools/render-text-to-image.js';
13
17
  import { runRenderHtmlToImageTool } from '../../../src/tools/render-html-to-image.js';
14
18
  import { runTakePageScreenshotTool } from '../../../src/tools/take-page-screenshot.js';
15
- import { hasFreshVideoBrief } from '../../../src/video-brief-flag.js';
19
+ import { hasVideoBriefSent } from '../../../src/video-brief-flag.js';
16
20
  import { lightconeApi, CURRENT_WORKSPACE_ID, CURRENT_AGENT_ID } from './lib/lightcone-api.js';
17
21
 
18
22
  const WORKSPACE_DIR = String(process.env.WORKSPACE_DIR ?? '');
@@ -129,7 +133,9 @@ server.tool(
129
133
  + 'into a single call (segment-level audio is required for plan_video_segments to align video durations).',
130
134
  {
131
135
  text: z.string().min(1).describe('Narration text for this segment. Will be synthesized as a single mp3.'),
132
- voice_id: z.string().optional().describe('TTS voice preset. Omit to use workspace default.'),
136
+ voice_id: z.string().optional().describe('MiniMax voice_id (e.g. "Chinese (Mandarin)_Warm_Girl"). Omit to use the workspace default. '
137
+ + 'If neither is set the call fails with tts_default_voice_required — call list_tts_voices to discover options '
138
+ + 'and set_default_voice once the user picks one.'),
133
139
  workspace_id: z.string().optional().describe('Target workspace. Defaults to the current workspace.'),
134
140
  },
135
141
  async ({ text, voice_id, workspace_id }) => runSynthesisTtsTool({
@@ -141,6 +147,87 @@ server.tool(
141
147
  })
142
148
  );
143
149
 
150
+ // ── TTS voice catalog + workspace preference (added in the voice-selection
151
+ // rebuild — see docs/scenario-content-creation/video-synthesis-design.md).
152
+ // These four tools let the agent discover MiniMax voices, sample them, and
153
+ // persist a workspace-level default — all from inside the IM conversation,
154
+ // no separate UI. There is no fallback default voice: synthesize_tts will
155
+ // throw tts_default_voice_required if neither the request nor the workspace
156
+ // has a voice_id set, which is the agent's cue to run this discovery flow.
157
+ server.tool(
158
+ 'list_tts_voices',
159
+ 'List TTS voices available to the current workspace (system catalog + any cloned voices owned by this workspace). '
160
+ + 'Use this to show the user options before calling set_default_voice or synthesize_tts. '
161
+ + 'Filter by language (e.g. "zh-CN"), style_tag (e.g. "recommended", "news", "warm"), or free-text query.',
162
+ {
163
+ language: z.string().optional().describe('BCP-47 language tag, e.g. "zh-CN", "en-US".'),
164
+ origin: z.enum(['system', 'cloned']).optional().describe('Restrict to system catalog or this workspace\'s cloned voices.'),
165
+ query: z.string().optional().describe('Free-text match against voice_id and display_name.'),
166
+ style_tag: z.string().optional().describe('Match a single style tag (e.g. "recommended" for the curated shortlist).'),
167
+ limit: z.number().int().positive().max(200).optional().describe('Max rows to return. Default 100.'),
168
+ workspace_id: z.string().optional().describe('Target workspace. Defaults to the current workspace.'),
169
+ },
170
+ async ({ language, origin, query, style_tag, limit, workspace_id }) => runListTtsVoicesTool({
171
+ language,
172
+ origin,
173
+ query,
174
+ style_tag,
175
+ limit,
176
+ workspace_id,
177
+ currentWorkspaceId: CURRENT_WORKSPACE_ID,
178
+ api: lightconeApi,
179
+ })
180
+ );
181
+
182
+ server.tool(
183
+ 'preview_tts_voice',
184
+ 'Synthesize a short sample so the user can hear a voice before committing. Returns a playable mp3 path + audio_url. '
185
+ + 'Pass voice_id from list_tts_voices; text is optional (defaults to a neutral Chinese sample sentence).',
186
+ {
187
+ voice_id: z.string().min(1).describe('voice_id to preview, from list_tts_voices.'),
188
+ text: z.string().optional().describe('Custom sample text. Defaults to a short Chinese sentence.'),
189
+ workspace_id: z.string().optional().describe('Target workspace. Defaults to the current workspace.'),
190
+ },
191
+ async ({ voice_id, text, workspace_id }) => runPreviewTtsVoiceTool({
192
+ voice_id,
193
+ text,
194
+ workspace_id,
195
+ currentWorkspaceId: CURRENT_WORKSPACE_ID,
196
+ api: lightconeApi,
197
+ })
198
+ );
199
+
200
+ server.tool(
201
+ 'get_default_voice',
202
+ 'Read the current workspace\'s default TTS voice. Returns null when not set — that is the signal to run the discovery flow '
203
+ + '(list_tts_voices → user picks → set_default_voice) before doing any synthesis.',
204
+ {
205
+ workspace_id: z.string().optional().describe('Target workspace. Defaults to the current workspace.'),
206
+ },
207
+ async ({ workspace_id }) => runGetDefaultVoiceTool({
208
+ workspace_id,
209
+ currentWorkspaceId: CURRENT_WORKSPACE_ID,
210
+ api: lightconeApi,
211
+ })
212
+ );
213
+
214
+ server.tool(
215
+ 'set_default_voice',
216
+ 'Persist a workspace-level default TTS voice. The voice_id must exist in the workspace\'s visible catalog '
217
+ + '(system voice or a cloned voice owned by this workspace) — otherwise this tool fails with tts_voice_not_found. '
218
+ + 'Call this after the user picks from list_tts_voices, OR when the user explicitly asks to change their default.',
219
+ {
220
+ voice_id: z.string().min(1).describe('voice_id to make the default. Must be in this workspace\'s catalog.'),
221
+ workspace_id: z.string().optional().describe('Target workspace. Defaults to the current workspace.'),
222
+ },
223
+ async ({ voice_id, workspace_id }) => runSetDefaultVoiceTool({
224
+ voice_id,
225
+ workspace_id,
226
+ currentWorkspaceId: CURRENT_WORKSPACE_ID,
227
+ api: lightconeApi,
228
+ })
229
+ );
230
+
144
231
  // ── plan_video_segments (migrated from chat-bridge; TTS decoupled) ────────
145
232
  // Pure planner — takes per-segment {text, audio_path, visual_kind, ...} and
146
233
  // returns segments with audio_duration_ms / presentation.duration / dwell_ms
@@ -187,10 +274,14 @@ server.tool(
187
274
  'compose_video_v2',
188
275
  'Compose a video from a list of segments using ffmpeg. Each segment has a visual source (image / scroll / '
189
276
  + 'carousel / video / gif), optional audio, and optional subtitle text. Subtitles are burned in when '
190
- + 'subtitle_text is provided. Segments are concatenated in order; outro clips are appended after.\n\n'
277
+ + 'subtitle_text is provided AND burn_subtitles is not false. Segments are concatenated in order; outro clips are appended after.\n\n'
191
278
  + 'When any segment has audio_path, MUST be preceded by plan_video_segments in the same session '
192
279
  + '(plan_video_segments fills duration/subtitle_text/audio_path mechanically; manual alignment is rejected). '
193
- + 'Returns a local mp4 path + size_bytes.',
280
+ + 'Returns a local mp4 path + size_bytes.\n\n'
281
+ + 'Dual / multi-version delivery (e.g. subtitled+voiced + clean silent): pass the variants[] array — one call '
282
+ + 'runs the heavy per-segment ffmpeg work ONCE and only diverges at audio mux + concat + subtitle burn per '
283
+ + 'variant. That is ~1.2-1.4× single-version time vs ~2× when calling this tool twice. Each variant chooses '
284
+ + 'its own burn_subtitles and include_audio independently.',
194
285
  {
195
286
  segments: z.array(z.object({
196
287
  visual_path: z.string().optional().describe('Absolute path to a single image / video / gif.'),
@@ -207,7 +298,18 @@ server.tool(
207
298
  })).describe('Ordered list of video segments.'),
208
299
  outro_paths: z.array(z.string()).optional().describe('Absolute paths to outro video clips appended at end.'),
209
300
  resolution: z.string().optional().describe('Output resolution WxH. Default "1080x1920".'),
210
- output_path: z.string().optional().describe('Absolute output path. Auto-generated if omitted.'),
301
+ output_path: z.string().optional().describe('Absolute output path (single-output mode). Auto-generated if omitted. Ignored when variants[] is provided.'),
302
+ burn_subtitles: z.boolean().optional().describe('Single-output mode only: whether to burn subtitle_text. Default true. '
303
+ + 'For producing multiple variants in one call, use variants[] instead.'),
304
+ variants: z.array(z.object({
305
+ output_path: z.string().describe('Absolute output path for this variant. Each variant must use a unique path.'),
306
+ burn_subtitles: z.boolean().optional().describe('Whether to burn subtitle_text into THIS variant. Default true.'),
307
+ include_audio: z.boolean().optional().describe('Whether to mux segment.audio_path into THIS variant. Default true. '
308
+ + 'Pass false for a fully silent copy (skips audio mux entirely; segment.audio_path is ignored for this variant).'),
309
+ })).optional().describe('Multi-output mode: one call produces all variants. '
310
+ + 'Visual segment processing (the heavy work) runs once; each variant only repeats audio mux + concat + optional subtitle burn. '
311
+ + 'Typical use: [{output_path:"with-sub.mp4"}, {output_path:"clean.mp4", burn_subtitles:false, include_audio:false}] '
312
+ + 'to deliver a subtitled+voiced version and a silent clean version together.'),
211
313
  },
212
314
  async (args) => {
213
315
  const segments = Array.isArray(args?.segments) ? args.segments : [];
@@ -223,13 +325,13 @@ server.tool(
223
325
  + 'visual_path/visual_kind for the real media). Call plan_video_segments now and pass its output here.'
224
326
  );
225
327
  }
226
- if (hasNarration && !hasFreshVideoBrief({ workspaceId: CURRENT_WORKSPACE_ID, agentId: CURRENT_AGENT_ID })) {
328
+ if (hasNarration && !hasVideoBriefSent({ workspaceId: CURRENT_WORKSPACE_ID, agentId: CURRENT_AGENT_ID })) {
227
329
  return toolError(
228
330
  'compose_video_v2 refused: must send a 确认稿 (production-brief) to the user via send_message before '
229
331
  + 'compositing a narration video. The system scans send_message content for a brief — a message that '
230
332
  + 'BOTH asks the user to confirm (确认 / 你看 / OK 吗 / 可以吗 / 同意 / 通过 / 行不行) AND describes '
231
333
  + 'at least two of: 画面 / 时长 / 文案 / 口播 / 字幕 / 顺序 / 口吻 / 分镜 / 配音 — no such message '
232
- + 'was sent in the last 6 hours for this workspace+agent.\n\n'
334
+ + 'has been sent in this workspace+agent.\n\n'
233
335
  + '"已生成 TTS" / "开始合成" / progress reports do NOT count. Send a concrete confirmation draft '
234
336
  + 'first (e.g. "我准备这么做:画面是真录屏,时长约 1 分钟,文案如下…,字幕开启,公司顺序 A→B→C,'
235
337
  + '口吻是…—— 你 OK 吗?") and wait for the user to reply OK before calling compose_video_v2 again.'
@@ -282,7 +384,7 @@ server.tool(
282
384
  // with synthesize_tts + plan_video_segments + compose_video_v2), so it
283
385
  // requires the same 确认稿 gate as compose_video_v2 — catching the skip
284
386
  // earlier saves TTS + recording time.
285
- if (!hasFreshVideoBrief({ workspaceId: CURRENT_WORKSPACE_ID, agentId: CURRENT_AGENT_ID })) {
387
+ if (!hasVideoBriefSent({ workspaceId: CURRENT_WORKSPACE_ID, agentId: CURRENT_AGENT_ID })) {
286
388
  return {
287
389
  isError: true,
288
390
  content: [{ type: 'text', text:
@@ -290,7 +392,7 @@ server.tool(
290
392
  + 'send_message before starting a narration recording. The system scans send_message content for '
291
393
  + 'a brief — a message that BOTH asks the user to confirm (确认 / 你看 / OK 吗 / 可以吗 / 同意 / '
292
394
  + '通过 / 行不行) AND describes at least two of: 画面 / 时长 / 文案 / 口播 / 字幕 / 顺序 / 口吻 / '
293
- + '分镜 / 配音 — no such message was sent in the last 6 hours for this workspace+agent.\n\n'
395
+ + '分镜 / 配音 — no such message has been sent in this workspace+agent.\n\n'
294
396
  + '"已生成 TTS" / "开始合成" / progress reports do NOT count. Send a concrete confirmation draft '
295
397
  + 'first (e.g. "我准备这么做:画面是真录屏,时长约 1 分钟,文案如下…,字幕开启,公司顺序 A→B→C,'
296
398
  + '口吻是…—— 你 OK 吗?") and wait for the user to reply OK before calling record_url_narration.'
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@lightcone-ai/daemon",
3
- "version": "0.17.1",
3
+ "version": "0.18.1",
4
4
  "type": "module",
5
5
  "main": "src/index.js",
6
6
  "bin": {
@@ -249,16 +249,64 @@ async function applyFadeTransition({ clipA, clipB, tmpDir, style = 'fade' }) {
249
249
  return outPath;
250
250
  }
251
251
 
252
+ // compose_video_v2 supports two modes:
253
+ //
254
+ // 1. Legacy single-output: pass output_path (+ optional burn_subtitles).
255
+ // Returns { path, duration_ms, size_bytes, variants: [..1 entry..] }.
256
+ //
257
+ // 2. Multi-variant: pass variants=[{output_path, burn_subtitles?, include_audio?}, ...].
258
+ // Visual segment processing runs ONCE (the heavy part — per-segment ffmpeg
259
+ // transcode/scale/scroll). Each variant then diverges only at audio mux +
260
+ // concat + subtitle burn — typically a few seconds per extra variant.
261
+ // Returns { path, duration_ms, size_bytes, variants: [{path, duration_ms,
262
+ // size_bytes, burn_subtitles, include_audio}, ...] } — the flat fields mirror variants[0].
263
+ //
264
+ // Use the multi-variant mode when shipping the same content with different
265
+ // subtitle/audio combinations (e.g. subtitled+voiced + clean silent). Calling
266
+ // the legacy mode twice produces correct outputs but redoes per-segment work.
252
267
  export async function composeVideoV2({
253
268
  segments = [],
254
269
  outro_paths = [],
255
270
  resolution = '1080x1920',
256
271
  output_path,
272
+ burn_subtitles = true,
273
+ variants,
257
274
  }) {
258
275
  if (!Array.isArray(segments) || segments.length === 0) {
259
276
  throw new Error('segments must be a non-empty array');
260
277
  }
261
278
 
279
+ // Normalize variants. If caller did not pass an explicit variants array,
280
+ // synthesize a single variant from the legacy output_path + burn_subtitles.
281
+ // include_audio defaults to true (auto-include any segment.audio_path).
282
+ const normalizedVariants = (Array.isArray(variants) && variants.length > 0)
283
+ ? variants.map((v, idx) => {
284
+ if (!v || typeof v !== 'object') {
285
+ throw new Error(`variants[${idx}]: must be an object`);
286
+ }
287
+ const outPath = String(v.output_path ?? '').trim();
288
+ if (!outPath) throw new Error(`variants[${idx}]: output_path is required`);
289
+ return {
290
+ output_path: outPath,
291
+ burn_subtitles: v.burn_subtitles !== false,
292
+ include_audio: v.include_audio !== false,
293
+ };
294
+ })
295
+ : [{
296
+ output_path: output_path ?? path.join(os.tmpdir(), `lightcone-video-${Date.now()}.mp4`),
297
+ burn_subtitles: burn_subtitles !== false,
298
+ include_audio: true,
299
+ }];
300
+
301
+ // Disallow two variants writing to the same file — variants are produced one after another, so a later one would collide with (and clobber) an earlier output.
302
+ const seenOutputs = new Set();
303
+ for (const v of normalizedVariants) {
304
+ if (seenOutputs.has(v.output_path)) {
305
+ throw new Error(`variants share output_path "${v.output_path}" — each variant needs a unique destination`);
306
+ }
307
+ seenOutputs.add(v.output_path);
308
+ }
309
+
262
310
  const [widthStr, heightStr] = String(resolution).split('x');
263
311
  const width = parseInt(widthStr, 10) || DEFAULT_WIDTH;
264
312
  const height = parseInt(heightStr, 10) || DEFAULT_HEIGHT;
@@ -267,23 +315,19 @@ export async function composeVideoV2({
267
315
  const tmpDir = path.join(os.tmpdir(), `compose-v2-${randomUUID().slice(0, 8)}`);
268
316
  await mkdir(tmpDir, { recursive: true });
269
317
 
270
- const outPath = output_path ?? path.join(os.tmpdir(), `lightcone-video-${Date.now()}.mp4`);
271
- await mkdir(path.dirname(outPath), { recursive: true });
272
-
273
318
  try {
274
- const readyClips = [];
275
-
319
+ // ── Shared phase: generate visual clips per segment ONCE ──────────────────
320
+ // This is the heavy work (image scaling, scroll rendering, video resize +
321
+ // re-encode). Reused across every variant.
322
+ const visualClips = [];
276
323
  for (let i = 0; i < segments.length; i++) {
277
324
  const seg = segments[i];
278
325
  const kind = String(seg.visual_kind ?? 'image');
279
326
  const presentation = seg.presentation ?? {};
280
327
  const style = String(presentation.style ?? 'static');
281
328
  const duration = Number(presentation.duration ?? presentation.per_card_duration ?? 4);
282
- const audioPath = seg.audio_path ?? null;
283
- const transition = String(seg.transition ?? 'cut');
284
329
 
285
330
  let visualClip;
286
-
287
331
  if (kind === 'image') {
288
332
  const imgPath = String(seg.visual_path ?? '');
289
333
  if (!imgPath) throw new Error(`segments[${i}]: visual_path required for kind=image`);
@@ -311,114 +355,145 @@ export async function composeVideoV2({
311
355
  } else {
312
356
  throw new Error(`segments[${i}]: unknown visual_kind "${kind}"`);
313
357
  }
314
-
315
- let finalClip;
316
- if (audioPath && await fileExists(audioPath)) {
317
- finalClip = await muxAudio({ videoPath: visualClip.path, audioPath, duration: visualClip.duration, tmpDir });
318
- } else {
319
- finalClip = await silentClip({ videoPath: visualClip.path, duration: visualClip.duration, tmpDir });
320
- }
321
-
322
- // Accept `text` as an alias for `subtitle_text`: plan_video_segments takes
323
- // segment narration as `text` on input, compose_video_v2's canonical name is
324
- // `subtitle_text`. Either reaches the burn pass so subtitles aren't silently dropped.
325
- const subtitleText = (
326
- typeof seg.subtitle_text === 'string' ? seg.subtitle_text
327
- : typeof seg.text === 'string' ? seg.text
328
- : ''
329
- ).trim();
330
- readyClips.push({ path: finalClip, duration: visualClip.duration, transition, subtitleText });
358
+ visualClips.push(visualClip);
331
359
  }
332
360
 
333
- const outroClips = [];
361
+ // Outros are also shared — they don't depend on subtitle/audio choices.
362
+ const outroClipPaths = [];
334
363
  for (const outroPath of (outro_paths ?? [])) {
335
364
  if (outroPath && await fileExists(outroPath)) {
336
365
  const c = await videoToClip({ videoPath: outroPath, tmpDir, width, height, fps });
337
- outroClips.push(c.path);
366
+ outroClipPaths.push(c.path);
338
367
  }
339
368
  }
340
369
 
341
- // Build subtitle entries with cumulative timeline timestamps. When a clip's
342
- // subtitle text spans several sentences, split it into one event per sentence
343
- // and spread them across the clip in proportion to their length, so a long
344
- // beat reads as sequential lines roughly tracking the narration instead of one
345
- // static wall of text.
346
- let cursorMs = 0;
347
- const subtitleEntries = [];
348
- for (const clip of readyClips) {
349
- if (clip.subtitleText) {
350
- const clipMs = Math.round(clip.duration * 1000);
351
- const sentences = splitSubtitleSentences(clip.subtitleText);
352
- const totalLen = sentences.reduce((sum, s) => sum + Array.from(s).length, 0) || 1;
353
- let offsetMs = 0;
354
- sentences.forEach((sentence, idx) => {
355
- const share = Array.from(sentence).length / totalLen;
356
- const isLast = idx === sentences.length - 1;
357
- const spanMs = isLast ? clipMs - offsetMs : Math.max(1, Math.round(clipMs * share));
358
- subtitleEntries.push({
359
- text: sentence,
360
- start_ms: cursorMs + offsetMs,
361
- end_ms: cursorMs + offsetMs + spanMs,
370
+ // ── Per-variant phase ─────────────────────────────────────────────────────
371
+ // For each variant: mux audio (or silent), build subtitle text, concat with
372
+ // transitions, optionally burn subtitles. Writes to variant.output_path.
373
+ const variantOutputs = [];
374
+ for (let vi = 0; vi < normalizedVariants.length; vi++) {
375
+ const variant = normalizedVariants[vi];
376
+ await mkdir(path.dirname(variant.output_path), { recursive: true });
377
+
378
+ const variantTag = `v${vi}`;
379
+ const readyClips = [];
380
+
381
+ for (let i = 0; i < segments.length; i++) {
382
+ const seg = segments[i];
383
+ const visualClip = visualClips[i];
384
+ const transition = String(seg.transition ?? 'cut');
385
+ const audioPath = variant.include_audio ? (seg.audio_path ?? null) : null;
386
+
387
+ let finalClip;
388
+ if (audioPath && await fileExists(audioPath)) {
389
+ finalClip = await muxAudio({ videoPath: visualClip.path, audioPath, duration: visualClip.duration, tmpDir });
390
+ } else {
391
+ finalClip = await silentClip({ videoPath: visualClip.path, duration: visualClip.duration, tmpDir });
392
+ }
393
+
394
+ // Accept `text` as an alias for `subtitle_text`: plan_video_segments takes
395
+ // segment narration as `text` on input, compose_video_v2's canonical name is
396
+ // `subtitle_text`. burn_subtitles=false drops the text here so the burn-in
397
+ // pass skips entirely.
398
+ const subtitleText = variant.burn_subtitles
399
+ ? (
400
+ typeof seg.subtitle_text === 'string' ? seg.subtitle_text
401
+ : typeof seg.text === 'string' ? seg.text
402
+ : ''
403
+ ).trim()
404
+ : '';
405
+ readyClips.push({ path: finalClip, duration: visualClip.duration, transition, subtitleText });
406
+ }
407
+
408
+ // Subtitle entries: cumulative timeline timestamps. Same logic as before,
409
+ // scoped per variant (subtitleText is already gated by variant.burn_subtitles).
410
+ let cursorMs = 0;
411
+ const subtitleEntries = [];
412
+ for (const clip of readyClips) {
413
+ if (clip.subtitleText) {
414
+ const clipMs = Math.round(clip.duration * 1000);
415
+ const sentences = splitSubtitleSentences(clip.subtitleText);
416
+ const totalLen = sentences.reduce((sum, s) => sum + Array.from(s).length, 0) || 1;
417
+ let offsetMs = 0;
418
+ sentences.forEach((sentence, idx) => {
419
+ const share = Array.from(sentence).length / totalLen;
420
+ const isLast = idx === sentences.length - 1;
421
+ const spanMs = isLast ? clipMs - offsetMs : Math.max(1, Math.round(clipMs * share));
422
+ subtitleEntries.push({
423
+ text: sentence,
424
+ start_ms: cursorMs + offsetMs,
425
+ end_ms: cursorMs + offsetMs + spanMs,
426
+ });
427
+ offsetMs += spanMs;
362
428
  });
363
- offsetMs += spanMs;
364
- });
429
+ }
430
+ cursorMs += Math.round(clip.duration * 1000);
365
431
  }
366
- cursorMs += Math.round(clip.duration * 1000);
367
- }
368
432
 
369
- const allClips = [];
370
- let accumulated = readyClips[0].path;
371
- for (let i = 1; i < readyClips.length; i++) {
372
- const { path: nextClip, transition } = readyClips[i];
373
- if (transition === 'fade' || transition === 'crossfade') {
374
- accumulated = await applyFadeTransition({ clipA: accumulated, clipB: nextClip, tmpDir, style: transition });
375
- } else {
376
- allClips.push(accumulated);
377
- accumulated = nextClip;
433
+ const allClips = [];
434
+ let accumulated = readyClips[0].path;
435
+ for (let i = 1; i < readyClips.length; i++) {
436
+ const { path: nextClip, transition } = readyClips[i];
437
+ if (transition === 'fade' || transition === 'crossfade') {
438
+ accumulated = await applyFadeTransition({ clipA: accumulated, clipB: nextClip, tmpDir, style: transition });
439
+ } else {
440
+ allClips.push(accumulated);
441
+ accumulated = nextClip;
442
+ }
378
443
  }
379
- }
380
- allClips.push(accumulated);
444
+ allClips.push(accumulated);
381
445
 
382
- const finalSequence = [...allClips, ...outroClips];
446
+ const finalSequence = [...allClips, ...outroClipPaths];
383
447
 
384
- // Compose without subtitles first (subtitles are burned in a separate pass)
385
- const preSubPath = subtitleEntries.length > 0
386
- ? path.join(tmpDir, `pre-sub-${randomUUID().slice(0, 8)}.mp4`)
387
- : outPath;
448
+ // Compose without subtitles first (subtitles are burned in a separate pass)
449
+ const preSubPath = subtitleEntries.length > 0
450
+ ? path.join(tmpDir, `${variantTag}-pre-sub-${randomUUID().slice(0, 8)}.mp4`)
451
+ : variant.output_path;
388
452
 
389
- if (finalSequence.length === 1) {
390
- await runFfmpeg(['-i', finalSequence[0], '-c', 'copy', '-movflags', '+faststart', preSubPath], 'ffmpeg copy');
391
- } else {
392
- await concatWithCuts({ clips: finalSequence, outputPath: preSubPath });
393
- }
453
+ if (finalSequence.length === 1) {
454
+ await runFfmpeg(['-i', finalSequence[0], '-c', 'copy', '-movflags', '+faststart', preSubPath], `ffmpeg copy ${variantTag}`);
455
+ } else {
456
+ await concatWithCuts({ clips: finalSequence, outputPath: preSubPath });
457
+ }
394
458
 
395
- // Burn subtitles into final output
396
- if (subtitleEntries.length > 0) {
397
- const assPath = path.join(tmpDir, `subs-${randomUUID().slice(0, 8)}.ass`);
398
- await writeFile(assPath, buildAssContent(subtitleEntries, { playResX: width, playResY: height }));
399
- const escapedAssPath = assPath.replace(/\\/g, '/').replace(/:/g, '\\:').replace(/'/g, "\\'");
400
- await runFfmpeg([
401
- '-i', preSubPath,
402
- '-vf', `subtitles='${escapedAssPath}'`,
403
- '-c:a', 'copy',
404
- '-movflags', '+faststart',
405
- outPath,
406
- ], 'ffmpeg burn-subtitles');
407
- }
459
+ if (subtitleEntries.length > 0) {
460
+ const assPath = path.join(tmpDir, `${variantTag}-subs-${randomUUID().slice(0, 8)}.ass`);
461
+ await writeFile(assPath, buildAssContent(subtitleEntries, { playResX: width, playResY: height }));
462
+ const escapedAssPath = assPath.replace(/\\/g, '/').replace(/:/g, '\\:').replace(/'/g, "\\'");
463
+ await runFfmpeg([
464
+ '-i', preSubPath,
465
+ '-vf', `subtitles='${escapedAssPath}'`,
466
+ '-c:a', 'copy',
467
+ '-movflags', '+faststart',
468
+ variant.output_path,
469
+ ], `ffmpeg burn-subtitles ${variantTag}`);
470
+ }
408
471
 
409
- const totalDuration = await probeDurationSec(outPath);
410
-
411
- // Stat the final file before returning so the caller can rely on size and
412
- // so we can detect the (rare but observed) case where ffmpeg's `close`
413
- // arrived but the kernel writeback wasn't complete. A 0-byte / tiny mp4
414
- // here means the burn-subtitles pass produced nothing usable — fail loudly
415
- // instead of letting a broken file flow into write_workspace_file / submit.
416
- const finalStat = await statAsync(outPath);
417
- const sizeBytes = Number(finalStat.size ?? 0);
418
- if (!Number.isFinite(sizeBytes) || sizeBytes < 1024) {
419
- throw new Error(`compose_video_v2 produced an invalid output: ${outPath} size=${sizeBytes} bytes`);
472
+ const totalDuration = await probeDurationSec(variant.output_path);
473
+ const finalStat = await statAsync(variant.output_path);
474
+ const sizeBytes = Number(finalStat.size ?? 0);
475
+ if (!Number.isFinite(sizeBytes) || sizeBytes < 1024) {
476
+ throw new Error(`compose_video_v2 produced an invalid output: ${variant.output_path} size=${sizeBytes} bytes (variant ${vi})`);
477
+ }
478
+
479
+ variantOutputs.push({
480
+ path: variant.output_path,
481
+ duration_ms: Math.round(totalDuration * 1000),
482
+ size_bytes: sizeBytes,
483
+ burn_subtitles: variant.burn_subtitles,
484
+ include_audio: variant.include_audio,
485
+ });
420
486
  }
421
- return { path: outPath, duration_ms: Math.round(totalDuration * 1000), size_bytes: sizeBytes };
487
+
488
+ // Legacy single-output callers (didn't pass variants) get the same flat
489
+ // shape they used to get, plus the variants array for forward-compat.
490
+ const first = variantOutputs[0];
491
+ return {
492
+ path: first.path,
493
+ duration_ms: first.duration_ms,
494
+ size_bytes: first.size_bytes,
495
+ variants: variantOutputs,
496
+ };
422
497
  } finally {
423
498
  await rm(tmpDir, { recursive: true, force: true });
424
499
  }
@@ -22,7 +22,16 @@ function statSizeOrNull(p) {
22
22
  try { return fs.statSync(p).size; } catch { return null; }
23
23
  }
24
24
 
25
- export async function runComposeVideoV2Tool({ segments, outro_paths, format, resolution, output_path, workspaceDir }) {
25
+ export async function runComposeVideoV2Tool({
26
+ segments,
27
+ outro_paths,
28
+ format,
29
+ resolution,
30
+ output_path,
31
+ burn_subtitles,
32
+ variants,
33
+ workspaceDir,
34
+ }) {
26
35
  if (!Array.isArray(segments) || segments.length === 0) {
27
36
  return toolError('segments must be a non-empty array.');
28
37
  }
@@ -59,6 +68,37 @@ export async function runComposeVideoV2Tool({ segments, outro_paths, format, res
59
68
  );
60
69
  }
61
70
  }
71
+
72
+ // Normalize variants. If caller passed a variants[] array, that takes
73
+ // priority — multi-output mode. Otherwise build a single-element variants
74
+ // array from the legacy output_path + burn_subtitles params.
75
+ const outDir = workspaceDir
76
+ ? path.join(workspaceDir, 'artifacts', 'video')
77
+ : path.join(os.tmpdir(), 'lightcone-video');
78
+
79
+ let normalizedVariants;
80
+ if (Array.isArray(variants) && variants.length > 0) {
81
+ normalizedVariants = variants.map((v, idx) => {
82
+ if (!v || typeof v !== 'object') {
83
+ return null; // surfaced below
84
+ }
85
+ const outPath = String(v.output_path ?? '').trim()
86
+ || path.join(outDir, `composed-${Date.now()}-${idx}-${randomUUID().slice(0, 8)}.mp4`);
87
+ return {
88
+ output_path: outPath,
89
+ burn_subtitles: v.burn_subtitles !== false,
90
+ include_audio: v.include_audio !== false,
91
+ };
92
+ });
93
+ if (normalizedVariants.some(v => v === null)) {
94
+ return toolError('variants must be an array of objects, each with { output_path, burn_subtitles?, include_audio? }.');
95
+ }
96
+ } else {
97
+ const burnSubtitles = burn_subtitles !== false;
98
+ const outPath = output_path ?? path.join(outDir, `composed-${Date.now()}-${randomUUID().slice(0, 8)}.mp4`);
99
+ normalizedVariants = [{ output_path: outPath, burn_subtitles: burnSubtitles, include_audio: true }];
100
+ }
101
+
62
102
  const warnings = [];
63
103
  // Heuristic warning: a multi-segment image video that reuses one single image
64
104
  // will look near-static — usually a sign the source page didn't render and the
@@ -69,44 +109,59 @@ export async function runComposeVideoV2Tool({ segments, outro_paths, format, res
69
109
  + 'The output will be near-static — verify the source page actually rendered before submitting this video.'
70
110
  );
71
111
  }
72
- // Warn when narration is present but no subtitle text is — compose_video_v2 burns
73
- // subtitles only from `subtitle_text` (or its `text` alias); without it the video
74
- // ships with no captions. Simplest fix: pass plan_video_segments' output verbatim.
75
- {
112
+ // Warn when narration is present but no subtitle text is — only meaningful
113
+ // for variants that ARE supposed to burn subtitles. Variants that explicitly
114
+ // ask for burn_subtitles=false are the "clean" path and shouldn't trigger it.
115
+ const variantsWithBurn = normalizedVariants.filter(v => v.burn_subtitles && v.include_audio);
116
+ if (variantsWithBurn.length > 0) {
76
117
  const hasSubText = s => (typeof s?.subtitle_text === 'string' && s.subtitle_text.trim())
77
118
  || (typeof s?.text === 'string' && s.text.trim());
78
119
  const narratedNoSub = segments.filter(s =>
79
120
  (typeof s?.audio_path === 'string' && s.audio_path.trim()) && !hasSubText(s)).length;
80
121
  if (narratedNoSub > 0) {
81
122
  warnings.push(
82
- `WARNING: ${narratedNoSub} segment(s) have narration audio but no subtitle text — the output will have NO subtitles. `
123
+ `WARNING: ${narratedNoSub} segment(s) have narration audio but no subtitle text — `
124
+ + `the subtitled variant${variantsWithBurn.length > 1 ? 's' : ''} will have NO subtitles. `
83
125
  + 'If subtitles are wanted, set subtitle_text per segment (or pass the plan_video_segments output array verbatim).'
84
126
  );
85
127
  }
86
128
  }
87
129
 
88
- const outDir = workspaceDir
89
- ? path.join(workspaceDir, 'artifacts', 'video')
90
- : path.join(os.tmpdir(), 'lightcone-video');
91
-
92
- const outPath = output_path ?? path.join(outDir, `composed-${Date.now()}-${randomUUID().slice(0, 8)}.mp4`);
93
-
94
130
  try {
95
131
  const result = await composeVideoV2({
96
132
  segments,
97
133
  outro_paths: outro_paths ?? [],
98
134
  resolution: resolution ?? '1080x1920',
99
- output_path: outPath,
135
+ variants: normalizedVariants,
100
136
  });
101
137
 
102
- const lines = [
103
- 'compose_video_v2 completed.',
104
- `path=${result.path}`,
105
- `duration_ms=${result.duration_ms}`,
106
- `size_bytes=${result.size_bytes ?? 'unknown'}`,
107
- `segments=${segments.length}`,
108
- `outro_clips=${(outro_paths ?? []).length}`,
109
- ];
138
+ const outputs = Array.isArray(result?.variants) && result.variants.length > 0
139
+ ? result.variants
140
+ : [{ path: result.path, duration_ms: result.duration_ms, size_bytes: result.size_bytes,
141
+ burn_subtitles: normalizedVariants[0].burn_subtitles,
142
+ include_audio: normalizedVariants[0].include_audio }];
143
+
144
+ const lines = ['compose_video_v2 completed.'];
145
+ if (outputs.length === 1) {
146
+ const v = outputs[0];
147
+ lines.push(`path=${v.path}`);
148
+ lines.push(`duration_ms=${v.duration_ms}`);
149
+ lines.push(`size_bytes=${v.size_bytes ?? 'unknown'}`);
150
+ lines.push(`burn_subtitles=${v.burn_subtitles}`);
151
+ lines.push(`include_audio=${v.include_audio}`);
152
+ } else {
153
+ lines.push(`variants=${outputs.length}`);
154
+ outputs.forEach((v, idx) => {
155
+ lines.push(`--- variant ${idx} ---`);
156
+ lines.push(`path=${v.path}`);
157
+ lines.push(`duration_ms=${v.duration_ms}`);
158
+ lines.push(`size_bytes=${v.size_bytes ?? 'unknown'}`);
159
+ lines.push(`burn_subtitles=${v.burn_subtitles}`);
160
+ lines.push(`include_audio=${v.include_audio}`);
161
+ });
162
+ }
163
+ lines.push(`segments=${segments.length}`);
164
+ lines.push(`outro_clips=${(outro_paths ?? []).length}`);
110
165
  for (const w of warnings) lines.push(w);
111
166
  return toolText(lines.join('\n'));
112
167
  } catch (error) {
@@ -0,0 +1,48 @@
1
/**
 * Wrap a message in a successful tool result: an object whose `content`
 * array holds a single `{ type: 'text', text }` item.
 *
 * @param {string} text - Message to hand back to the caller.
 * @returns {{ content: Array<{ type: string, text: string }> }} Tool result.
 */
function toolText(text) {
  const textItem = { type: 'text', text };
  return { content: [textItem] };
}
4
+
5
/**
 * Wrap a message in a failed tool result: same single-text-item `content`
 * shape as a success, plus `isError: true`.
 *
 * @param {string} text - Error message to hand back to the caller.
 * @returns {{ isError: boolean, content: Array<{ type: string, text: string }> }} Tool result.
 */
function toolError(text) {
  const content = [{ type: 'text', text }];
  return { isError: true, content };
}
8
+
9
/**
 * Look up the default TTS voice of a workspace and describe it as tool text.
 *
 * @param {object} args
 * @param {string} [args.workspace_id] - Explicit workspace; wins over `currentWorkspaceId`.
 * @param {string} [args.currentWorkspaceId] - Workspace used when `workspace_id` is absent.
 * @param {Function} args.api - Async `(method, path)` client. Its resolved value is read
 *   for `default_voice_id`, `workspace_id`, `provider` and `voice`.
 * @returns {Promise<object>} A `toolText(...)` result, or a `toolError(...)` result when
 *   no workspace can be determined or the API call rejects. API failures are reported,
 *   not thrown.
 */
export async function runGetDefaultVoiceTool({
  workspace_id,
  currentWorkspaceId,
  api,
}) {
  const targetWorkspaceId = String(workspace_id ?? currentWorkspaceId ?? '').trim();
  if (!targetWorkspaceId) {
    return toolError('workspace_id is required (no current workspace context).');
  }

  const params = new URLSearchParams({ workspace_id: targetWorkspaceId });

  let data;
  try {
    data = await api('GET', `/tts/preferences?${params.toString()}`);
  } catch (error) {
    // A rejection is not guaranteed to be an Error (it can even be null), so
    // never dereference it unguarded inside the handler.
    return toolError(`get_default_voice API error: ${error?.message ?? String(error)}`);
  }

  if (!data?.default_voice_id) {
    return toolText('No default TTS voice set for this workspace. Call list_tts_voices to show the user options, then set_default_voice once they pick one.');
  }

  // Fall back to what we already know instead of printing "undefined" when the
  // response omits a field.
  const workspaceLabel = data.workspace_id ?? targetWorkspaceId;
  const providerLabel = data.provider ?? 'unknown';
  const lines = [
    `Default TTS voice for workspace=${workspaceLabel} (provider=${providerLabel}):`,
    `voice_id=${data.default_voice_id}`,
  ];

  const voice = data.voice;
  if (voice) {
    if (voice.display_name) lines.push(`display_name=${voice.display_name}`);
    if (voice.language) lines.push(`language=${voice.language}`);
    if (voice.gender) lines.push(`gender=${voice.gender}`);
    if (Array.isArray(voice.style_tags) && voice.style_tags.length) {
      lines.push(`style_tags=${voice.style_tags.join(', ')}`);
    }
  } else {
    lines.push('(voice metadata not found — catalog may have changed; ask user to pick again)');
  }
  return toolText(lines.join('\n'));
}
@@ -0,0 +1,58 @@
1
/**
 * Build a successful tool result carrying one text item.
 *
 * @param {string} text - Message to hand back to the caller.
 * @returns {{ content: Array<{ type: string, text: string }> }} Tool result.
 */
function toolText(text) {
  const textItem = { type: 'text', text };
  return { content: [textItem] };
}
4
+
5
/**
 * Build a failed tool result: one text item plus `isError: true`.
 *
 * @param {string} text - Error message to hand back to the caller.
 * @returns {{ isError: boolean, content: Array<{ type: string, text: string }> }} Tool result.
 */
function toolError(text) {
  const content = [{ type: 'text', text }];
  return { isError: true, content };
}
8
+
9
/**
 * Render one voice record as a single bullet line:
 * `- <voice_id> — <display_name>` followed by the optional gender in
 * parentheses, the optional style tags in brackets, and a "cloned" marker
 * when `origin` is exactly `'cloned'`.
 *
 * @param {object} voice - Voice record; reads `voice_id`, `display_name`,
 *   `gender`, `style_tags` and `origin`.
 * @returns {string} The formatted line.
 */
function formatVoiceLine(voice) {
  const pieces = [`- ${voice.voice_id} — ${voice.display_name}`];

  if (voice.gender) {
    pieces.push(` (${voice.gender})`);
  }
  if (Array.isArray(voice.style_tags) && voice.style_tags.length) {
    pieces.push(` [${voice.style_tags.join(', ')}]`);
  }
  if (voice.origin === 'cloned') {
    pieces.push(' • cloned');
  }

  return pieces.join('');
}
17
+
18
/**
 * List the TTS voices visible to a workspace, optionally filtered, as tool text.
 *
 * @param {object} args
 * @param {string} [args.language] - Optional `language` filter.
 * @param {string} [args.origin] - Optional `origin` filter.
 * @param {string} [args.query] - Optional `query` filter.
 * @param {string} [args.style_tag] - Optional `style_tag` filter.
 * @param {number|string} [args.limit] - Optional result cap; floored to an integer.
 * @param {string} [args.workspace_id] - Explicit workspace; wins over `currentWorkspaceId`.
 * @param {string} [args.currentWorkspaceId] - Workspace used when `workspace_id` is absent.
 * @param {Function} args.api - Async `(method, path)` client. Its resolved value is read
 *   for `voices`, `provider` and `workspace_id`.
 * @returns {Promise<object>} A `toolText(...)` result with one line per voice, or a
 *   `toolError(...)` result when no workspace can be determined or the API call rejects.
 */
export async function runListTtsVoicesTool({
  language,
  origin,
  query,
  style_tag,
  limit,
  workspace_id,
  currentWorkspaceId,
  api,
}) {
  const targetWorkspaceId = String(workspace_id ?? currentWorkspaceId ?? '').trim();
  if (!targetWorkspaceId) {
    return toolError('workspace_id is required (no current workspace context).');
  }

  const params = new URLSearchParams();
  params.set('workspace_id', targetWorkspaceId);

  // Trim BEFORE deciding whether to send a filter: a whitespace-only value
  // would otherwise go out as an empty parameter (e.g. `language=`).
  const filters = { language, origin, query, style_tag };
  for (const [key, raw] of Object.entries(filters)) {
    const value = raw ? String(raw).trim() : '';
    if (value) params.set(key, value);
  }

  // Number('') is 0, so a blank limit must be rejected explicitly or it would
  // be sent as `limit=0`.
  const hasLimit = limit != null && String(limit).trim() !== '';
  if (hasLimit && Number.isFinite(Number(limit))) {
    params.set('limit', String(Math.floor(Number(limit))));
  }

  let data;
  try {
    data = await api('GET', `/tts/voices?${params.toString()}`);
  } catch (error) {
    // Rejections are not guaranteed to be Error instances.
    return toolError(`list_tts_voices API error: ${error?.message ?? String(error)}`);
  }

  const voices = Array.isArray(data?.voices) ? data.voices : [];
  if (!voices.length) {
    return toolText('No TTS voices match the requested filters.');
  }

  const providerLabel = data.provider ?? 'unknown';
  const workspaceLabel = data.workspace_id ?? targetWorkspaceId;
  const lines = [
    `Found ${voices.length} voice(s) for provider=${providerLabel}, workspace=${workspaceLabel}:`,
    ...voices.map((voice) => formatVoiceLine(voice)),
  ];
  return toolText(lines.join('\n'));
}
@@ -0,0 +1,75 @@
1
+ import { mkdirSync, writeFileSync } from 'fs';
2
+ import { randomUUID } from 'crypto';
3
+ import path from 'path';
4
+ import os from 'os';
5
+
6
/**
 * Build a successful tool result carrying one text item.
 *
 * @param {string} text - Message to hand back to the caller.
 * @returns {{ content: Array<{ type: string, text: string }> }} Tool result.
 */
function toolText(text) {
  const textItem = { type: 'text', text };
  return { content: [textItem] };
}
9
+
10
/**
 * Build a failed tool result: one text item plus `isError: true`.
 *
 * @param {string} text - Error message to hand back to the caller.
 * @returns {{ isError: boolean, content: Array<{ type: string, text: string }> }} Tool result.
 */
function toolError(text) {
  const content = [{ type: 'text', text }];
  return { isError: true, content };
}
13
+
14
/**
 * Pick a file extension for a downloaded audio URL.
 *
 * The query string AND the fragment are removed before the extension is
 * read, so `clip.wav?sig=…` and `clip.wav#t=1` both resolve to `.wav`.
 *
 * @param {string} [url] - Remote audio URL; null/undefined are treated as empty.
 * @returns {string} The lower-cased extension when it is one of
 *   `.mp3`, `.wav`, `.flac`, `.aac`, `.ogg`; otherwise `.mp3`.
 */
function inferAudioExt(url) {
  const knownExts = ['.mp3', '.wav', '.flac', '.aac', '.ogg'];
  // Cut at the first "?" or "#": neither the query nor the fragment is part
  // of the file name.
  const clean = String(url ?? '').split(/[?#]/)[0];
  const ext = path.extname(clean).toLowerCase();
  return knownExts.includes(ext) ? ext : '.mp3';
}
19
+
20
/**
 * Request a preview clip for a TTS voice, download it into
 * `<os.tmpdir()>/lightcone-tts`, and report the local path as tool text.
 *
 * Every failure mode (bad arguments, API rejection, missing `audio_url`,
 * download failure, file-system failure) is returned as a `toolError(...)`
 * result; nothing is thrown to the caller.
 *
 * @param {object} args
 * @param {string} args.voice_id - Voice to preview (required, trimmed).
 * @param {string} [args.text] - Optional custom preview text; blank values are not sent.
 * @param {string} [args.workspace_id] - Explicit workspace; wins over `currentWorkspaceId`.
 * @param {string} [args.currentWorkspaceId] - Workspace used when `workspace_id` is absent.
 * @param {Function} args.api - Async `(method, path, body)` client. Its resolved value is
 *   read for `audio_url`, `voice_id`, `display_name`, `duration_ms` and `preview_text`.
 * @returns {Promise<object>} A `toolText(...)` or `toolError(...)` result.
 */
export async function runPreviewTtsVoiceTool({
  voice_id,
  text,
  workspace_id,
  currentWorkspaceId,
  api,
}) {
  const normalizedVoiceId = String(voice_id ?? '').trim();
  if (!normalizedVoiceId) return toolError('voice_id is required for preview_tts_voice.');

  const targetWorkspaceId = String(workspace_id ?? currentWorkspaceId ?? '').trim();
  if (!targetWorkspaceId) {
    return toolError('workspace_id is required (no current workspace context).');
  }

  const payload = {
    workspace_id: targetWorkspaceId,
    voice_id: normalizedVoiceId,
  };
  // Trim first so a whitespace-only text is omitted instead of sent as ''.
  const previewText = text ? String(text).trim() : '';
  if (previewText) payload.text = previewText;

  let data;
  try {
    data = await api('POST', '/tts/voices/preview', payload);
  } catch (error) {
    // Rejections are not guaranteed to be Error instances.
    const message = String(error?.message ?? error);
    if (message.includes('tts_voice_not_found')) {
      return toolError(`voice_id "${normalizedVoiceId}" not found or not visible to this workspace.`);
    }
    return toolError(`preview_tts_voice API error: ${message}`);
  }

  // `data` may be null/undefined when the API resolves without a body.
  const remoteAudioUrl = String(data?.audio_url ?? '').trim();
  if (!remoteAudioUrl) return toolError('preview_tts_voice did not return audio_url.');

  let fileBuffer;
  try {
    const downloadRes = await fetch(remoteAudioUrl);
    if (!downloadRes.ok) {
      return toolError(`Failed to download preview audio (${downloadRes.status}).`);
    }
    fileBuffer = Buffer.from(await downloadRes.arrayBuffer());
  } catch (error) {
    // fetch rejects on network-level failures; report instead of throwing.
    return toolError(`Failed to download preview audio: ${error?.message ?? String(error)}`);
  }

  let outPath;
  try {
    const outDir = path.join(os.tmpdir(), 'lightcone-tts');
    mkdirSync(outDir, { recursive: true });
    const ext = inferAudioExt(remoteAudioUrl);
    // Timestamp plus a short random suffix keeps concurrent previews apart.
    outPath = path.join(outDir, `tts-preview-${Date.now()}-${randomUUID().slice(0, 8)}${ext}`);
    writeFileSync(outPath, fileBuffer);
  } catch (error) {
    return toolError(`Failed to save preview audio: ${error?.message ?? String(error)}`);
  }

  return toolText([
    'preview_tts_voice completed.',
    `voice_id=${data.voice_id ?? normalizedVoiceId}`,
    `display_name=${data.display_name ?? ''}`,
    `path=${outPath}`,
    `audio_url=${remoteAudioUrl}`,
    `duration_ms=${data.duration_ms ?? 'unknown'}`,
    `preview_text=${data.preview_text ?? ''}`,
  ].join('\n'));
}
@@ -0,0 +1,44 @@
1
/**
 * Build a successful tool result carrying one text item.
 *
 * @param {string} text - Message to hand back to the caller.
 * @returns {{ content: Array<{ type: string, text: string }> }} Tool result.
 */
function toolText(text) {
  const textItem = { type: 'text', text };
  return { content: [textItem] };
}
4
+
5
/**
 * Build a failed tool result: one text item plus `isError: true`.
 *
 * @param {string} text - Error message to hand back to the caller.
 * @returns {{ isError: boolean, content: Array<{ type: string, text: string }> }} Tool result.
 */
function toolError(text) {
  const content = [{ type: 'text', text }];
  return { isError: true, content };
}
8
+
9
/**
 * Store a voice as the workspace's default TTS voice and confirm it as tool text.
 *
 * @param {object} args
 * @param {string} args.voice_id - Voice to make the default (required, trimmed).
 * @param {string} [args.workspace_id] - Explicit workspace; wins over `currentWorkspaceId`.
 * @param {string} [args.currentWorkspaceId] - Workspace used when `workspace_id` is absent.
 * @param {Function} args.api - Async `(method, path, body)` client. Its resolved value is
 *   read for `workspace_id`, `provider`, `default_voice_id` and `voice.display_name`.
 * @returns {Promise<object>} A `toolText(...)` confirmation, or a `toolError(...)` result
 *   for bad arguments or a rejected API call. API failures are reported, not thrown.
 */
export async function runSetDefaultVoiceTool({
  voice_id,
  workspace_id,
  currentWorkspaceId,
  api,
}) {
  const normalizedVoiceId = String(voice_id ?? '').trim();
  if (!normalizedVoiceId) return toolError('voice_id is required for set_default_voice.');

  const targetWorkspaceId = String(workspace_id ?? currentWorkspaceId ?? '').trim();
  if (!targetWorkspaceId) {
    return toolError('workspace_id is required (no current workspace context).');
  }

  let data;
  try {
    data = await api('PUT', '/tts/preferences', {
      workspace_id: targetWorkspaceId,
      voice_id: normalizedVoiceId,
    });
  } catch (error) {
    // Rejections are not guaranteed to be Error instances.
    const message = String(error?.message ?? error);
    if (message.includes('tts_voice_not_found')) {
      return toolError(`voice_id "${normalizedVoiceId}" not found or not visible to this workspace. Use list_tts_voices to discover valid IDs.`);
    }
    return toolError(`set_default_voice API error: ${message}`);
  }

  // The PUT already succeeded at this point, so an empty or partial response
  // body must not turn into a thrown TypeError: fall back to the values that
  // were just sent.
  const workspaceLabel = data?.workspace_id ?? targetWorkspaceId;
  const providerLabel = data?.provider ?? 'unknown';
  const storedVoiceId = data?.default_voice_id ?? normalizedVoiceId;
  const displayName = data?.voice?.display_name;

  const lines = [
    `Default TTS voice updated for workspace=${workspaceLabel} (provider=${providerLabel}).`,
    `voice_id=${storedVoiceId}`,
  ];
  if (displayName) lines.push(`display_name=${displayName}`);
  return toolText(lines.join('\n'));
}
@@ -34,12 +34,22 @@ export async function runSynthesisTtsTool({ text, voice_id, workspace_id, curren
34
34
  speed: 1,
35
35
  format: 'mp3',
36
36
  };
37
- if (voice_id) payload.voice_preset = String(voice_id).trim();
37
+ if (voice_id) payload.voice_id = String(voice_id).trim();
38
38
 
39
39
  let data;
40
40
  try {
41
41
  data = await api('POST', '/tts/voiceover', payload);
42
42
  } catch (error) {
43
+ // Server contract: tts_default_voice_required means the workspace has no
44
+ // default and the caller didn't pass voice_id. Surface a clear message so
45
+ // the agent knows to call list_tts_voices + set_default_voice first.
46
+ const message = String(error?.message ?? '');
47
+ if (message.includes('tts_default_voice_required')) {
48
+ return toolError('No TTS voice selected for this workspace. Call list_tts_voices, let the user pick one, then set_default_voice — or pass voice_id to synthesize_tts.');
49
+ }
50
+ if (message.includes('tts_voice_not_found')) {
51
+ return toolError(`voice_id "${voice_id}" not found or not visible to this workspace. Use list_tts_voices to discover valid IDs.`);
52
+ }
43
53
  return toolError(`synthesize_tts API error: ${error.message}`);
44
54
  }
45
55
 
@@ -63,6 +73,7 @@ export async function runSynthesisTtsTool({ text, voice_id, workspace_id, curren
63
73
  return toolText([
64
74
  'synthesize_tts completed.',
65
75
  `path=${outPath}`,
76
+ `voice_id=${data.voice_id ?? 'unknown'}`,
66
77
  `duration_ms=${data.duration_ms ?? 'unknown'}`,
67
78
  `size_bytes=${fileBuffer.length}`,
68
79
  ].join('\n'));
@@ -1,8 +1,8 @@
1
1
  // Cross-process flag for "the agent has sent a video-production 确认稿 (brief)
2
- // to the user in this workspace+agent context recently". chat-bridge's
3
- // send_message tool writes the flag when the outgoing message heuristically
4
- // looks like a confirmation brief; media-tools' compose_video_v2 and
5
- // record_url_narration read the flag and refuse to proceed without it.
2
+ // to the user in this workspace+agent context". chat-bridge's send_message
3
+ // tool writes the flag when the outgoing message heuristically looks like a
4
+ // confirmation brief; media-tools' compose_video_v2 and record_url_narration
5
+ // read the flag and refuse to proceed without it.
6
6
  //
7
7
  // Why a file flag instead of in-process state: send_message lives in
8
8
  // chat-bridge (one stdio MCP server), compose_video_v2/record_url_narration
@@ -18,11 +18,10 @@
18
18
  // codex behavior (which silently skipped the soft prompt rule) is what we
19
19
  // need to interrupt — and gaming is observable in chat history.
20
20
 
21
- import { mkdirSync, statSync, utimesSync, writeFileSync, existsSync } from 'node:fs';
21
+ import { mkdirSync, writeFileSync, existsSync } from 'node:fs';
22
22
  import path from 'node:path';
23
23
  import os from 'node:os';
24
24
 
25
- const TTL_MS = 6 * 60 * 60 * 1000; // 6 hours
26
25
  const FILE_NAME = 'video-brief-sent.flag';
27
26
 
28
27
  function flagDir(workspaceId, agentId) {
@@ -39,18 +38,11 @@ export function markVideoBriefSent({ workspaceId, agentId, content }) {
39
38
  const p = flagPath(workspaceId, agentId);
40
39
  mkdirSync(dir, { recursive: true });
41
40
  writeFileSync(p, String(content ?? '').slice(0, 4096));
42
- const now = new Date();
43
- utimesSync(p, now, now);
44
41
  }
45
42
 
46
- export function hasFreshVideoBrief({ workspaceId, agentId, ttlMs = TTL_MS } = {}) {
43
+ export function hasVideoBriefSent({ workspaceId, agentId } = {}) {
47
44
  if (!workspaceId || !agentId) return false;
48
- const p = flagPath(workspaceId, agentId);
49
- if (!existsSync(p)) return false;
50
- try {
51
- const st = statSync(p);
52
- return (Date.now() - st.mtimeMs) <= ttlMs;
53
- } catch { return false; }
45
+ return existsSync(flagPath(workspaceId, agentId));
54
46
  }
55
47
 
56
48
  // Permission-asking markers — the message must ask the user to decide / OK.