npm - @lightcone-ai/daemon - Versions diffs - 0.16.1 → 0.16.2 - Mend

@lightcone-ai/daemon 0.16.1 → 0.16.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (2) hide show

package/package.json +1 -1
package/src/chat-bridge.js +41 -2

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@lightcone-ai/daemon",
-  "version": "0.16.1",
+  "version": "0.16.2",
   "type": "module",
   "main": "src/index.js",
   "bin": {

package/src/chat-bridge.js CHANGED Viewed

@@ -1414,6 +1414,13 @@ server.tool('synthesize_tts',
 );
 // ── plan_video_segments ────────────────────────────────────────────────────────
+// Session-scoped flag set when plan_video_segments runs. compose_video_v2
+// refuses TTS-bearing segments unless this is true — the agent must route
+// audio through plan_video_segments first so durations / subtitle_text /
+// audio_path are mechanically aligned. This is a per-chat-bridge-process
+// flag, so each new codex session must call plan_video_segments again.
+let _planVideoSegmentsCalledThisSession = false;
 server.tool('plan_video_segments',
   'Universal audio-video sync planning step. For each segment, call TTS to get the real audio duration, then compute the visual duration with a safety buffer. Returns a planned segments array ready to pass directly to compose_video_v2 (with audio_path, presentation.duration/per_card_duration, and subtitle_text pre-filled). Always call this before compose_video_v2 when you have narration text.',
   {
@@ -1430,7 +1437,11 @@ server.tool('plan_video_segments',
     voice_id: z.string().optional().describe('TTS voice ID. Omit to use workspace default.'),
     workspace_id: z.string().optional().describe('Target workspace. Defaults to current workspace context.'),
   },
-  async (args) => runPlanVideoSegmentsTool({ ...args, currentWorkspaceId, api })
+  async (args) => {
+    const result = await runPlanVideoSegmentsTool({ ...args, currentWorkspaceId, api });
+    if (!result?.isError) _planVideoSegmentsCalledThisSession = true;
+    return result;
+  }
 );
 // ── compose_video_v2 ───────────────────────────────────────────────────────────
@@ -1454,7 +1465,35 @@ server.tool('compose_video_v2',
     resolution: z.string().optional().describe('Output resolution WxH. Default "1080x1920".'),
     output_path: z.string().optional().describe('Absolute output path for the mp4. Auto-generated if omitted.'),
   },
-  async (args) => runComposeVideoV2Tool({ ...args, workspaceDir: WORKSPACE_DIR })
+  async (args) => {
+    // Tool-level enforcement of the synthesize_tts → plan_video_segments →
+    // compose_video_v2 standard chain. If any segment has audio_path (i.e.
+    // narration is involved) and the agent never invoked plan_video_segments
+    // in this session, refuse the compose — manual dwell/duration math is
+    // unreliable (last syllable cut, silent tails, subtitle drift). Observed
+    // twice in a row: agent skipped plan_video_segments, manually estimated
+    // dwell_ms wrong, ended up with too-long records and silent tails it then
+    // re-recorded to fix — wasting record_url_narration runs that
+    // plan_video_segments would have prevented.
+    const segments = Array.isArray(args?.segments) ? args.segments : [];
+    const hasNarration = segments.some(s => typeof s?.audio_path === 'string' && s.audio_path.trim());
+    if (hasNarration && !_planVideoSegmentsCalledThisSession) {
+      return {
+        isError: true,
+        content: [{
+          type: 'text',
+          text: 'compose_video_v2 refused: TTS-bearing segments (audio_path present) require plan_video_segments '
+            + 'to have run earlier in this session — it mechanically aligns audio_duration / video_duration / '
+            + 'subtitle_text with a safety buffer. Manual dwell/duration math has repeatedly produced misaligned '
+            + 'subtitles and silent tails that force re-recording.\n\n'
+            + 'Standard chain: synthesize_tts(per segment) → plan_video_segments(with text+visual_kind+visual_path) '
+            + '→ compose_video_v2(use the returned segments verbatim, only swap visual_path/visual_kind for real '
+            + 'media). Call plan_video_segments now and pass its output here.',
+        }],
+      };
+    }
+    return runComposeVideoV2Tool({ ...args, workspaceDir: WORKSPACE_DIR });
+  }
 );
 // ── take_page_screenshot ───────────────────────────────────────────────────────