npm - @lightcone-ai/daemon - Versions diffs - 0.18.1 → 0.20.0 - Mend

@lightcone-ai/daemon 0.18.1 → 0.20.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5)

package/mcp-servers/official/media-tools/index.js +3 -2
package/package.json +1 -1
package/src/_vendor/video/recorder/index.js +170 -0
package/src/_vendor/video/recorder/plan-executor.js +28 -48
package/src/tools/record-url-narration.js +65 -7

package/mcp-servers/official/media-tools/index.js CHANGED

@@ -348,7 +348,7 @@ server.tool(
 // audio in production runs (Tasks #20/#25/#26), forcing re-records.
 server.tool(
   'record_url_narration',
-  'Record a silent video of a URL by driving Chromium on an Xvfb display and capturing it with Playwright recordVideo, driven by a video plan; ffmpeg then transcodes the recording to mp4. Outputs a silent mp4 that can be passed to compose_video_v2 as a video-kind segment with an audio_path for narration.\n\nUse this as the canonical recording step for URL-narration videos. Falls back: if the page needs interactions outside the visual_action vocabulary (clicks, waits, OCR loops), use Monitor (Bash) with custom Playwright instead.\n\nMUST be preceded by plan_video_segments in the same session — feed plan_video_segments\'s `segments` array as `plan.sections` so dwell_ms aligns mechanically with TTS audio_duration_ms (hand-written dwell_ms has drifted and forced re-records in production).\n\nRuntime requirements: this tool only works on a Linux daemon machine with Xvfb + Chromium + ffmpeg installed (ffmpeg is used to transcode the recording to mp4; no x11grab device support needed). macOS / Windows daemons will fail at startup.',
+  'Record a silent video of a URL by driving Chromium on an Xvfb display and capturing it with Playwright recordVideo, driven by a video plan; ffmpeg then transcodes the recording to mp4. Outputs a silent mp4 that can be passed to compose_video_v2 as a video-kind segment with an audio_path for narration.\n\nUse this as the canonical recording step for URL-narration videos. Falls back: if the page needs interactions outside the visual_action vocabulary (clicks, waits, OCR loops), use Monitor (Bash) with custom Playwright instead.\n\nMUST be preceded by plan_video_segments in the same session — feed plan_video_segments\'s `segments` array as `plan.sections` so dwell_ms aligns mechanically with TTS audio_duration_ms (hand-written dwell_ms has drifted and forced re-records in production).\n\nMULTI-SECTION OUTPUT (recommended for any URL with ≥2 sections): pass `output_paths` as an array with one path per plan.sections entry. The tool records the URL ONCE continuously (one browser session, one scrollTop, natural scroll flow through all sections), then slices the recording at section boundaries via ffmpeg. This avoids the per-segment scroll-back-to-top reset that happens when the agent splits N sections into N separate record_url_narration calls — that pattern reopens the browser and re-navigates for each segment, which looks visually disjointed even though the per-segment timing is correct.\n\nRuntime requirements: this tool only works on a Linux daemon machine with Xvfb + Chromium + ffmpeg installed (ffmpeg is used to transcode the recording to mp4; no x11grab device support needed). macOS / Windows daemons will fail at startup.',
   {
     url: z.string().describe('Page URL to record'),
     plan: z.record(z.any()).describe(
@@ -367,7 +367,8 @@ server.tool(
       + 'frag.short.recruitment_url_mode_policy). Pick a different target_y in the 标题/岗位 '
       + 'information area and rewrite that section.'
     ),
-    output_path: z.string().optional().describe('Workspace-relative output mp4 path. Default tmp/wx3_video/recorded-{ts}.mp4'),
+    output_path: z.string().optional().describe('Workspace-relative output mp4 path for the CONSOLIDATED master recording. Default tmp/wx3_video/recorded-{ts}.mp4. When output_paths is also provided, this still receives the full continuous recording for verification/debugging.'),
+    output_paths: z.array(z.string()).optional().describe('Multi-section output mode. Pass an array of N workspace-relative paths matching plan.sections length. The tool records ONCE continuously then slices the result into N mp4s at section boundaries (derived from phase_start / phase_end events). RECOMMENDED whenever a URL has ≥2 sections — keeps visual flow natural between sections instead of reopening the browser per segment.'),
     events_path: z.string().optional().describe('Workspace-relative events.json path. Default ${output_path}.events.json'),
     viewport: z.object({
       width: z.number().optional(),

package/package.json CHANGED

@@ -1,6 +1,6 @@
 {
   "name": "@lightcone-ai/daemon",
-  "version": "0.18.1",
+  "version": "0.20.0",
   "type": "module",
   "main": "src/index.js",
   "bin": {

package/src/_vendor/video/recorder/index.js CHANGED

@@ -215,12 +215,128 @@ async function transcodeWebmToMp4({
   });
 }
+// Frame-accurate slice of an mp4 — re-encodes to honour the exact start/end
+// instead of snapping to the nearest keyframe (which `-c copy` would do, and
+// can drift by several seconds with libx264's default ~250-frame GOP).
+// Re-encoding short clips (≤30s) at preset=veryfast is fast (<1s typical),
+// so we trade a bit of CPU for being able to align section cuts to the
+// per-segment TTS the rest of the pipeline expects.
+async function cutMp4Slice({
+  inputPath,
+  outputPath,
+  startMs,
+  durationMs,
+  fps = DEFAULT_FPS,
+  ffmpegBin = 'ffmpeg',
+} = {}) {
+  const startSec = Math.max(0, Number(startMs) || 0) / 1000;
+  const durationSec = Math.max(0.05, Number(durationMs) || 0) / 1000;
+  const args = [
+    '-y',
+    '-i', inputPath,
+    '-ss', startSec.toFixed(3),
+    '-t', durationSec.toFixed(3),
+    '-an',
+    '-c:v', 'libx264',
+    '-preset', 'veryfast',
+    '-pix_fmt', 'yuv420p',
+    ...(Number.isFinite(Number(fps)) && Number(fps) > 0 ? ['-r', String(fps)] : []),
+    '-movflags', '+faststart',
+    outputPath,
+  ];
+  await new Promise((resolve, reject) => {
+    const proc = spawn(ffmpegBin, args, { stdio: ['ignore', 'pipe', 'pipe'] });
+    const errChunks = [];
+    proc.stderr?.on('data', (chunk) => errChunks.push(chunk));
+    proc.once('error', (err) => {
+      const wrapped = new Error(`ffmpeg_spawn_failed:${err.message}`);
+      wrapped.code = 'FFMPEG_SPAWN_FAILED';
+      reject(wrapped);
+    });
+    proc.on('close', (code) => {
+      if (code === 0) return resolve();
+      const wrapped = new Error(
+        `ffmpeg_cut_failed:code=${code}: ${Buffer.concat(errChunks).toString().slice(-2000)}`
+      );
+      wrapped.code = 'FFMPEG_CUT_FAILED';
+      reject(wrapped);
+    });
+  });
+}
+// Derive per-section cut points from eventsLog. phase_start.t_ms / phase_end.t_ms
+// are recorded against the trimmed mp4 timeline (head trim already happened),
+// so we can use them as-is.
+function deriveSectionCutPoints(eventsLog, phaseCount) {
+  if (!Array.isArray(eventsLog) || eventsLog.length === 0) {
+    throw new Error('events_log_empty');
+  }
+  const starts = new Map();
+  const ends = new Map();
+  for (const ev of eventsLog) {
+    if (!ev || typeof ev !== 'object') continue;
+    const id = ev.phase_id;
+    const t = Number(ev.t_ms);
+    if (!id || !Number.isFinite(t)) continue;
+    if (ev.action === 'phase_start' && !starts.has(id)) starts.set(id, t);
+    if (ev.action === 'phase_end') ends.set(id, t);
+  }
+  // Walk phases in order to preserve plan ordering even if events arrived
+  // out-of-order (they shouldn't, but guard against it).
+  const orderedIds = [];
+  for (const ev of eventsLog) {
+    if (ev?.action === 'phase_start' && !orderedIds.includes(ev.phase_id)) {
+      orderedIds.push(ev.phase_id);
+    }
+  }
+  if (orderedIds.length !== phaseCount) {
+    throw new Error(`events_phase_count_mismatch:expected=${phaseCount}:got=${orderedIds.length}`);
+  }
+  return orderedIds.map((id) => {
+    const startMs = starts.get(id);
+    const endMs = ends.get(id);
+    if (!Number.isFinite(startMs) || !Number.isFinite(endMs)) {
+      throw new Error(`phase_timing_missing:${id}`);
+    }
+    if (endMs <= startMs) {
+      throw new Error(`phase_timing_invalid:${id}:start=${startMs}:end=${endMs}`);
+    }
+    return { phase_id: id, start_ms: startMs, end_ms: endMs, duration_ms: endMs - startMs };
+  });
+}
+function normalizeOutputPaths(rawList) {
+  if (rawList == null) return null;
+  if (!Array.isArray(rawList)) {
+    const error = new Error('output_paths_must_be_array');
+    error.code = 'OUTPUT_PATHS_MUST_BE_ARRAY';
+    throw error;
+  }
+  if (rawList.length === 0) return null;
+  return rawList.map((entry, idx) => {
+    const normalized = normalizeText(entry);
+    if (!normalized) {
+      const error = new Error(`output_paths[${idx}]_empty`);
+      error.code = 'OUTPUT_PATHS_ENTRY_EMPTY';
+      throw error;
+    }
+    return path.resolve(normalized);
+  });
+}
 export async function recordUrlNarration({
   plan,
   output_path,
   outputPath = output_path,
   events_path,
   eventsPath = events_path,
+  // Multi-section output: pass an array of N paths matching plan.sections length
+  // to record once continuously and slice the result into N per-section mp4s.
+  // The browser stays open for the whole recording, so visuals flow naturally
+  // between sections (no scroll-back-to-top between each, no page reload). When
+  // omitted, behaves exactly like before — single mp4 at outputPath.
+  output_paths,
+  outputPaths = output_paths,
   url,
   viewport = DEFAULT_VIEWPORT,
   fps = DEFAULT_FPS,
@@ -234,6 +350,7 @@ export async function recordUrlNarration({
   launchChromiumFn = launchChromiumMobile,
   openPageFn = openPageAndSettle,
   transcodeFn = transcodeWebmToMp4,
+  cutFn = cutMp4Slice,
   nowMs = () => Date.now(),
 } = {}) {
   const zoom = Number.isFinite(Number(page_zoom)) && Number(page_zoom) > 0 ? Number(page_zoom) : 1.1;
@@ -249,6 +366,23 @@ export async function recordUrlNarration({
   const resolvedUrl = resolveUrl({ url, plan });
   const normalizedViewport = normalizeViewport(viewport);
   const normalizedFps = normalizeInteger(fps, DEFAULT_FPS);
+  const resolvedOutputPaths = normalizeOutputPaths(outputPaths);
+  // When multi-section output is requested, the count must match plan.sections
+  // 1:1 — otherwise the agent will end up with audio/visual misalignment when
+  // it feeds these into plan_video_segments. Fail loud rather than silently
+  // truncating or padding.
+  if (resolvedOutputPaths && resolvedOutputPaths.length !== phases.length) {
+    const error = new Error(
+      `output_paths_count_mismatch:expected=${phases.length}:got=${resolvedOutputPaths.length}`,
+    );
+    error.code = 'OUTPUT_PATHS_COUNT_MISMATCH';
+    throw error;
+  }
+  if (resolvedOutputPaths) {
+    for (const p of resolvedOutputPaths) {
+      mkdirSync(path.dirname(p), { recursive: true });
+    }
+  }
   mkdirSync(path.dirname(resolvedOutputPath), { recursive: true });
   mkdirSync(path.dirname(resolvedEventsPath), { recursive: true });
@@ -367,12 +501,48 @@ export async function recordUrlNarration({
       ? eventsLog.reduce((max, ev) => Math.max(max, Number(ev?.t_ms) || 0), 0)
       : 0;
+    // Multi-section output: slice the consolidated mp4 at section boundaries
+    // (derived from phase_start / phase_end events). All slices come from the
+    // SAME continuous recording, so the visual flow between sections stays
+    // natural — no browser reload, no scroll-back-to-top per segment.
+    let sectionOutputs = null;
+    if (resolvedOutputPaths) {
+      const cutPoints = deriveSectionCutPoints(eventsLog, phases.length);
+      sectionOutputs = [];
+      for (let i = 0; i < cutPoints.length; i += 1) {
+        const cut = cutPoints[i];
+        const outPath = resolvedOutputPaths[i];
+        await cutFn({
+          inputPath: resolvedOutputPath,
+          outputPath: outPath,
+          startMs: cut.start_ms,
+          durationMs: cut.duration_ms,
+          fps: normalizedFps,
+        });
+        const sliceStat = await stat(outPath);
+        if (!sliceStat.isFile() || sliceStat.size <= 0) {
+          const error = new Error(`section_slice_empty:${outPath}`);
+          error.code = 'SECTION_SLICE_EMPTY';
+          throw error;
+        }
+        sectionOutputs.push({
+          phase_id: cut.phase_id,
+          video_path: outPath,
+          start_ms: cut.start_ms,
+          end_ms: cut.end_ms,
+          duration_ms: cut.duration_ms,
+          size_bytes: Number(sliceStat.size ?? 0),
+        });
+      }
+    }
     return {
       video_path: resolvedOutputPath,
       events_path: resolvedEventsPath,
       events_log: eventsLog,
       duration_ms: lastTms > 0 ? lastTms : null,
       display,
+      sections: sectionOutputs,
     };
   } catch (error) {
     primaryError = error;

package/src/_vendor/video/recorder/plan-executor.js CHANGED

@@ -1,4 +1,5 @@
 import { resolveDurationMs } from './phase-duration.js';
+import { humanizedScroll } from '../humanized-scroll.js';
 function normalizeText(value) {
   if (typeof value !== 'string') return '';
@@ -228,13 +229,26 @@ function resolveFromY(phase, fallback = null) {
   return Math.round(parsed);
 }
+// Delegates to humanizedScroll, which dispatches real CDP touch events so
+// the browser's gesture engine produces native scroll physics (rubber-band,
+// fling inertia, compositor-paced repaints). The old implementation drove
+// `root.scrollTo(...)` in a setTimeout loop inside page.evaluate — visually
+// smooth in isolation, but bypassed the gesture pipeline entirely, which is
+// what made scrolls feel "robotic" on recordings (see the
+// `not natural` thread in docs/scenario-content-creation discussion).
+//
+// `minSteps` is no longer needed (humanizedScroll computes segments from
+// distance + duration). `jitterPx` is forwarded as `pixel_jitter_px`, which
+// humanizedScroll converts into per-touchMove vertical offset.
 async function animateScroll(page, {
   startY = null,
   targetY,
   durationMs,
   easing = 'easeInOutQuad',
   jitterPx = 0,
-  minSteps = 10,
+  // minSteps is accepted but unused — kept in the signature so callers don't
+  // need updating in this refactor.
+  minSteps: _minSteps,  // eslint-disable-line no-unused-vars
 } = {}) {
   if (!Number.isFinite(Number(targetY))) {
     const error = new Error('phase_target_y_required');
@@ -242,53 +256,19 @@ async function animateScroll(page, {
     throw error;
   }
-  const normalizedDurationMs = Math.max(0, Number(durationMs) || 0);
-  const normalizedMinSteps = Math.max(1, Number(minSteps) || 1);
-  await page.evaluate(async ({
-    startY: evaluateStartY,
-    targetY: evaluateTargetY,
-    durationMs: evaluateDurationMs,
-    easing: evaluateEasing,
-    jitterPx: evaluateJitterPx,
-    minSteps: evaluateMinSteps,
-  }) => {
-    const root = document.scrollingElement || document.documentElement;
-    const wait = (ms) => new Promise(resolve => setTimeout(resolve, ms));
-    const fromY = Number.isFinite(evaluateStartY) ? evaluateStartY : root.scrollTop;
-    const toY = evaluateTargetY;
-    const delta = toY - fromY;
-    const steps = Math.max(evaluateMinSteps, Math.round(Math.max(1, evaluateDurationMs) / 16));
-    const stepDurationMs = evaluateDurationMs <= 0 ? 0 : evaluateDurationMs / steps;
-    const applyEasing = (t) => {
-      if (evaluateEasing === 'linear') return t;
-      if (evaluateEasing === 'easeOutQuad') return 1 - ((1 - t) * (1 - t));
-      return t < 0.5
-        ? 2 * t * t
-        : 1 - (Math.pow(-2 * t + 2, 2) / 2);
-    };
-    root.scrollTo(0, fromY);
-    for (let index = 1; index <= steps; index += 1) {
-      const t = index / steps;
-      const eased = applyEasing(t);
-      const jitter = evaluateJitterPx > 0 ? ((Math.random() * 2 - 1) * evaluateJitterPx) : 0;
-      root.scrollTo(0, fromY + (delta * eased) + jitter);
-      if (stepDurationMs > 0) {
-        await wait(stepDurationMs);
-      }
-    }
-    root.scrollTo(0, toY);
-  }, {
-    startY,
-    targetY,
-    durationMs: normalizedDurationMs,
-    easing,
-    jitterPx: Math.max(0, Number(jitterPx) || 0),
-    minSteps: normalizedMinSteps,
+  const resolvedFromY = Number.isFinite(Number(startY))
+    ? Number(startY)
+    : await page.evaluate(() => {
+        const root = document.scrollingElement || document.documentElement;
+        return Math.round(root.scrollTop);
+      });
+  await humanizedScroll(page, {
+    from_y: resolvedFromY,
+    to_y: Number(targetY),
+    duration_ms: Math.max(0, Number(durationMs) || 0),
+    motion_curve: easing,
+    pixel_jitter_px: Math.max(0, Number(jitterPx) || 0),
   });
 }

package/src/tools/record-url-narration.js CHANGED

@@ -181,6 +181,21 @@ export function resolveRecordUrlNarrationPaths({
   };
 }
+function resolveOutputPaths(rawList, { workspaceDir }) {
+  if (rawList == null) return null;
+  if (!Array.isArray(rawList)) {
+    throw new Error('output_paths must be an array of file paths (one per section).');
+  }
+  if (rawList.length === 0) return null;
+  return rawList.map((entry, idx) => {
+    const normalized = normalizeText(entry);
+    if (!normalized) {
+      throw new Error(`output_paths[${idx}] is empty — every entry must be a non-empty path.`);
+    }
+    return path.resolve(workspaceDir, normalized);
+  });
+}
 export async function runRecordUrlNarrationTool({
   args = {},
   currentWorkspaceId = '',
@@ -252,23 +267,66 @@ export async function runRecordUrlNarrationTool({
     mkdirSync(path.dirname(resolvedOutputPath), { recursive: true });
     mkdirSync(path.dirname(resolvedEventsPath), { recursive: true });
+    // Multi-section mode: caller passed output_paths. Validate it 1:1 with
+    // plan.sections so the recorder can slice the continuous recording into
+    // per-section mp4s without ambiguity.
+    let resolvedOutputPaths = null;
+    try {
+      resolvedOutputPaths = resolveOutputPaths(validatedInput.output_paths, { workspaceDir });
+    } catch (error) {
+      return toolError(`Error: ${error.message}`);
+    }
+    if (resolvedOutputPaths) {
+      const planSectionCount = (planSegments(validatedInput.plan) ?? []).length;
+      if (resolvedOutputPaths.length !== planSectionCount) {
+        return toolError(
+          `Error: output_paths length (${resolvedOutputPaths.length}) must match `
+          + `plan.sections length (${planSectionCount}). Each section produces exactly one mp4 — `
+          + `don't pad or truncate.`,
+        );
+      }
+    }
     const recorderOutput = await recordUrlNarrationFn({
       url: validatedInput.url,
       plan: validatedInput.plan,
       output_path: resolvedOutputPath,
       events_path: resolvedEventsPath,
+      output_paths: resolvedOutputPaths,
       viewport: validatedInput.viewport,
       fps: validatedInput.fps,
       settle_ms: validatedInput.settle_ms,
     });
-    return toolText(
-      `Recorded URL narration.\n`
-      + `video_path=${resolvedOutputPath}\n`
-      + `events_path=${resolvedEventsPath}\n`
-      + `duration_ms=${deriveDurationMs(recorderOutput) ?? 'unknown'}\n`
-      + `phases=${derivePhaseCount({ plan: validatedInput.plan, recorderOutput }) ?? 'n/a'}`
-    );
+    // Single-output mode (legacy): same one-line summary as before.
+    if (!resolvedOutputPaths) {
+      return toolText(
+        `Recorded URL narration.\n`
+        + `video_path=${resolvedOutputPath}\n`
+        + `events_path=${resolvedEventsPath}\n`
+        + `duration_ms=${deriveDurationMs(recorderOutput) ?? 'unknown'}\n`
+        + `phases=${derivePhaseCount({ plan: validatedInput.plan, recorderOutput }) ?? 'n/a'}`,
+      );
+    }
+    // Multi-section mode: one section block per output mp4, plus the
+    // consolidated master mp4 path for debugging / verification.
+    const sections = Array.isArray(recorderOutput?.sections) ? recorderOutput.sections : [];
+    const lines = [
+      'Recorded URL narration (multi-section).',
+      `master_video_path=${resolvedOutputPath}`,
+      `events_path=${resolvedEventsPath}`,
+      `total_duration_ms=${deriveDurationMs(recorderOutput) ?? 'unknown'}`,
+      `sections=${sections.length}`,
+    ];
+    sections.forEach((s, idx) => {
+      lines.push(`--- section ${idx} (${s.phase_id}) ---`);
+      lines.push(`video_path=${s.video_path}`);
+      lines.push(`start_ms=${s.start_ms}`);
+      lines.push(`duration_ms=${s.duration_ms}`);
+      lines.push(`size_bytes=${s.size_bytes ?? 'unknown'}`);
+    });
+    return toolText(lines.join('\n'));
   } catch (error) {
     return toolError(`Error: ${error.message}`);
   }