@lightcone-ai/daemon 0.16.1 → 0.16.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/chat-bridge.js +41 -2
package/package.json
CHANGED
package/src/chat-bridge.js
CHANGED
|
@@ -1414,6 +1414,13 @@ server.tool('synthesize_tts',
|
|
|
1414
1414
|
);
|
|
1415
1415
|
|
|
1416
1416
|
// ── plan_video_segments ────────────────────────────────────────────────────────
|
|
1417
|
+
// Session-scoped flag set when plan_video_segments runs. compose_video_v2
|
|
1418
|
+
// refuses TTS-bearing segments unless this is true — the agent must route
|
|
1419
|
+
// audio through plan_video_segments first so durations / subtitle_text /
|
|
1420
|
+
// audio_path are mechanically aligned. This is a per-chat-bridge-process
|
|
1421
|
+
// flag, so each new codex session must call plan_video_segments again.
|
|
1422
|
+
let _planVideoSegmentsCalledThisSession = false;
|
|
1423
|
+
|
|
1417
1424
|
server.tool('plan_video_segments',
|
|
1418
1425
|
'Universal audio-video sync planning step. For each segment, call TTS to get the real audio duration, then compute the visual duration with a safety buffer. Returns a planned segments array ready to pass directly to compose_video_v2 (with audio_path, presentation.duration/per_card_duration, and subtitle_text pre-filled). Always call this before compose_video_v2 when you have narration text.',
|
|
1419
1426
|
{
|
|
@@ -1430,7 +1437,11 @@ server.tool('plan_video_segments',
|
|
|
1430
1437
|
voice_id: z.string().optional().describe('TTS voice ID. Omit to use workspace default.'),
|
|
1431
1438
|
workspace_id: z.string().optional().describe('Target workspace. Defaults to current workspace context.'),
|
|
1432
1439
|
},
|
|
1433
|
-
async (args) =>
|
|
1440
|
+
async (args) => {
|
|
1441
|
+
const result = await runPlanVideoSegmentsTool({ ...args, currentWorkspaceId, api });
|
|
1442
|
+
if (!result?.isError) _planVideoSegmentsCalledThisSession = true;
|
|
1443
|
+
return result;
|
|
1444
|
+
}
|
|
1434
1445
|
);
|
|
1435
1446
|
|
|
1436
1447
|
// ── compose_video_v2 ───────────────────────────────────────────────────────────
|
|
@@ -1454,7 +1465,35 @@ server.tool('compose_video_v2',
|
|
|
1454
1465
|
resolution: z.string().optional().describe('Output resolution WxH. Default "1080x1920".'),
|
|
1455
1466
|
output_path: z.string().optional().describe('Absolute output path for the mp4. Auto-generated if omitted.'),
|
|
1456
1467
|
},
|
|
1457
|
-
async (args) =>
|
|
1468
|
+
async (args) => {
|
|
1469
|
+
// Tool-level enforcement of the synthesize_tts → plan_video_segments →
|
|
1470
|
+
// compose_video_v2 standard chain. If any segment has audio_path (i.e.
|
|
1471
|
+
// narration is involved) and the agent never invoked plan_video_segments
|
|
1472
|
+
// in this session, refuse the compose — manual dwell/duration math is
|
|
1473
|
+
// unreliable (last syllable cut, silent tails, subtitle drift). Observed
|
|
1474
|
+
// twice in a row: agent skipped plan_video_segments, manually estimated
|
|
1475
|
+
// dwell_ms wrong, ended up with too-long records and silent tails it then
|
|
1476
|
+
// re-recorded to fix — wasting record_url_narration runs that
|
|
1477
|
+
// plan_video_segments would have prevented.
|
|
1478
|
+
const segments = Array.isArray(args?.segments) ? args.segments : [];
|
|
1479
|
+
const hasNarration = segments.some(s => typeof s?.audio_path === 'string' && s.audio_path.trim());
|
|
1480
|
+
if (hasNarration && !_planVideoSegmentsCalledThisSession) {
|
|
1481
|
+
return {
|
|
1482
|
+
isError: true,
|
|
1483
|
+
content: [{
|
|
1484
|
+
type: 'text',
|
|
1485
|
+
text: 'compose_video_v2 refused: TTS-bearing segments (audio_path present) require plan_video_segments '
|
|
1486
|
+
+ 'to have run earlier in this session — it mechanically aligns audio_duration / video_duration / '
|
|
1487
|
+
+ 'subtitle_text with a safety buffer. Manual dwell/duration math has repeatedly produced misaligned '
|
|
1488
|
+
+ 'subtitles and silent tails that force re-recording.\n\n'
|
|
1489
|
+
+ 'Standard chain: synthesize_tts(per segment) → plan_video_segments(with text+visual_kind+visual_path) '
|
|
1490
|
+
+ '→ compose_video_v2(use the returned segments verbatim, only swap visual_path/visual_kind for real '
|
|
1491
|
+
+ 'media). Call plan_video_segments now and pass its output here.',
|
|
1492
|
+
}],
|
|
1493
|
+
};
|
|
1494
|
+
}
|
|
1495
|
+
return runComposeVideoV2Tool({ ...args, workspaceDir: WORKSPACE_DIR });
|
|
1496
|
+
}
|
|
1458
1497
|
);
|
|
1459
1498
|
|
|
1460
1499
|
// ── take_page_screenshot ───────────────────────────────────────────────────────
|