@lightcone-ai/daemon 0.16.1 → 0.16.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2) hide show
  1. package/package.json +1 -1
  2. package/src/chat-bridge.js +41 -2
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@lightcone-ai/daemon",
3
- "version": "0.16.1",
3
+ "version": "0.16.2",
4
4
  "type": "module",
5
5
  "main": "src/index.js",
6
6
  "bin": {
@@ -1414,6 +1414,13 @@ server.tool('synthesize_tts',
1414
1414
  );
1415
1415
 
1416
1416
  // ── plan_video_segments ────────────────────────────────────────────────────────
1417
+ // Session-scoped flag set when plan_video_segments runs. compose_video_v2
1418
+ // refuses TTS-bearing segments unless this is true — the agent must route
1419
+ // audio through plan_video_segments first so durations / subtitle_text /
1420
+ // audio_path are mechanically aligned. This is a per-chat-bridge-process
1421
+ // flag, so each new codex session must call plan_video_segments again.
1422
+ let _planVideoSegmentsCalledThisSession = false;
1423
+
1417
1424
  server.tool('plan_video_segments',
1418
1425
  'Universal audio-video sync planning step. For each segment, call TTS to get the real audio duration, then compute the visual duration with a safety buffer. Returns a planned segments array ready to pass directly to compose_video_v2 (with audio_path, presentation.duration/per_card_duration, and subtitle_text pre-filled). Always call this before compose_video_v2 when you have narration text.',
1419
1426
  {
@@ -1430,7 +1437,11 @@ server.tool('plan_video_segments',
1430
1437
  voice_id: z.string().optional().describe('TTS voice ID. Omit to use workspace default.'),
1431
1438
  workspace_id: z.string().optional().describe('Target workspace. Defaults to current workspace context.'),
1432
1439
  },
1433
- async (args) => runPlanVideoSegmentsTool({ ...args, currentWorkspaceId, api })
1440
+ async (args) => {
1441
+ const result = await runPlanVideoSegmentsTool({ ...args, currentWorkspaceId, api });
1442
+ if (!result?.isError) _planVideoSegmentsCalledThisSession = true;
1443
+ return result;
1444
+ }
1434
1445
  );
1435
1446
 
1436
1447
  // ── compose_video_v2 ───────────────────────────────────────────────────────────
@@ -1454,7 +1465,35 @@ server.tool('compose_video_v2',
1454
1465
  resolution: z.string().optional().describe('Output resolution WxH. Default "1080x1920".'),
1455
1466
  output_path: z.string().optional().describe('Absolute output path for the mp4. Auto-generated if omitted.'),
1456
1467
  },
1457
- async (args) => runComposeVideoV2Tool({ ...args, workspaceDir: WORKSPACE_DIR })
1468
+ async (args) => {
1469
+ // Tool-level enforcement of the synthesize_tts → plan_video_segments →
1470
+ // compose_video_v2 standard chain. If any segment has audio_path (i.e.
1471
+ // narration is involved) and the agent never invoked plan_video_segments
1472
+ // in this session, refuse the compose — manual dwell/duration math is
1473
+ // unreliable (last syllable cut, silent tails, subtitle drift). Observed
1474
+ // twice in a row: agent skipped plan_video_segments, manually estimated
1475
+ // dwell_ms wrong, ended up with too-long records and silent tails it then
1476
+ // re-recorded to fix — wasting record_url_narration runs that
1477
+ // plan_video_segments would have prevented.
1478
+ const segments = Array.isArray(args?.segments) ? args.segments : [];
1479
+ const hasNarration = segments.some(s => typeof s?.audio_path === 'string' && s.audio_path.trim());
1480
+ if (hasNarration && !_planVideoSegmentsCalledThisSession) {
1481
+ return {
1482
+ isError: true,
1483
+ content: [{
1484
+ type: 'text',
1485
+ text: 'compose_video_v2 refused: TTS-bearing segments (audio_path present) require plan_video_segments '
1486
+ + 'to have run earlier in this session — it mechanically aligns audio_duration / video_duration / '
1487
+ + 'subtitle_text with a safety buffer. Manual dwell/duration math has repeatedly produced misaligned '
1488
+ + 'subtitles and silent tails that force re-recording.\n\n'
1489
+ + 'Standard chain: synthesize_tts(per segment) → plan_video_segments(with text+visual_kind+visual_path) '
1490
+ + '→ compose_video_v2(use the returned segments verbatim, only swap visual_path/visual_kind for real '
1491
+ + 'media). Call plan_video_segments now and pass its output here.',
1492
+ }],
1493
+ };
1494
+ }
1495
+ return runComposeVideoV2Tool({ ...args, workspaceDir: WORKSPACE_DIR });
1496
+ }
1458
1497
  );
1459
1498
 
1460
1499
  // ── take_page_screenshot ───────────────────────────────────────────────────────