npm - @lightcone-ai/daemon - Versions diffs - 0.22.1 → 0.23.1 - Mend

@lightcone-ai/daemon 0.22.1 → 0.23.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14)

package/mcp-servers/official/media-tools/index.js +42 -19
package/mcp-servers/official/page-understanding/index.js +6 -7
package/package.json +1 -1
package/src/_vendor/video/cdp-touch.js +184 -0
package/src/_vendor/video/humanized-scroll.js +251 -0
package/src/_vendor/video/recorder/atoms.js +212 -0
package/src/_vendor/video/recorder/index.js +68 -38
package/src/_vendor/video/recorder/plan-executor.js +191 -394
package/src/_vendor/video/understanding/schema.js +316 -0
package/src/drivers/codex.js +11 -2
package/src/tools/plan-video-segments.js +152 -22
package/src/tools/record-url-narration.js +44 -137
package/src/_vendor/video/recorder/phase-duration.js +0 -18
package/src/_vendor/video/recorder/plan-estimator.js +0 -43

package/src/tools/record-url-narration.js CHANGED

@@ -1,17 +1,16 @@
-// record_url_narration — atomic recording tool.
+// V6 record_url_narration daemon tool wrapper.
 //
-// Drives Chromium on Xvfb + Playwright recordVideo to capture a silent mp4 of
-// a URL following a beat-by-beat visual plan, then ffmpeg-transcodes it. The
-// resulting silent mp4 feeds into compose_video_v2 as a video-kind segment
-// alongside narration audio.
+// Drives Chromium on Xvfb + Playwright recordVideo to capture a silent mp4
+// per section, then ffmpeg-transcodes + slices. The resulting silent mp4s
+// feed compose_video_v2 as video-kind segments alongside narration audio.
 //
-// Lives in daemon/src/tools/ as a reusable handler module; the MCP-tool
-// registration lives in daemon/mcp-servers/official/media-tools/index.js.
-// Migrated out of chat-bridge.js (V4) — no longer wrapped by
-// runMandatoryLocalTool / governance round-trip. media-tools is a separate
-// stdio MCP server and governance integration is chat-bridge-specific;
-// matches the precedent set by synthesize_tts / plan_video_segments /
-// compose_video_v2 in V1/V2/V3.
+// V6 contract (see docs/scenario-content-creation/video-synthesis-design.md):
+//   - page_understanding (from analyze_page) is required — drives safe-region
+//     check and preheat consistency
+//   - plan.sections each carry operations[] of atom calls; V5 fields are rejected
+//   - mp.weixin.qq.com-only keyword blacklist is gone — unsafe_regions from
+//     page_understanding is the universal safety mechanism
+//   - plan_video_segments must run earlier in this session (standard chain)
 import { mkdirSync } from 'fs';
 import path from 'path';
@@ -56,89 +55,9 @@ function deriveDurationMs(recorderOutput) {
   return lastTms > 0 ? lastTms : null;
 }
-function planSegments(plan) {
+function planSections(plan) {
   if (!isPlainObject(plan)) return null;
-  for (const key of ['phases', 'sections', 'segments']) {
-    if (Array.isArray(plan[key]) && plan[key].length > 0) return plan[key];
-  }
-  return null;
-}
-function derivePhaseCount({ plan, recorderOutput }) {
-  const explicit = normalizeNumberOrNull(recorderOutput?.phases);
-  if (explicit != null) return explicit;
-  const segments = planSegments(plan);
-  return segments ? segments.length : null;
-}
-function assertPipelineCompliance(plan) {
-  if (!isPlainObject(plan)) return;
-  if (!planSegments(plan)) {
-    throw new Error(
-      'record_url_narration: `plan` must contain a non-empty `phases` (or `sections` / `segments`) array — '
-      + 'either hand-written or from plan_video_segments. Each entry should carry a visual action and a duration.'
-    );
-  }
-}
-// Forbidden region keywords for recruitment content. If a section's
-// target_y_content_label matches, we refuse to record — the resulting video
-// would show 投递入口 / 二维码 / contact info, which violates the recruitment
-// content policy (see fragments.md frag.short.recruitment_url_mode_policy).
-//
-// Origin: in production runs the agent's plan repeatedly declared a target_y
-// without checking what content lived at that pixel position, and ended up
-// dwelling on QR codes / 投递 entries / 联系方式. The prompt-level rule
-// requiring `target_y_content_label` has been ignored often enough that we
-// enforce it at the tool layer instead.
-const FORBIDDEN_REGION_PATTERNS = [
-  /二维码/, /扫码/, /扫一扫/,
-  /投递入口/, /投递方式/, /投递通道/, /投递渠道/, /报名入口/, /报名方式/,
-  /联系方式/, /联系人/, /微信号/, /\bWeChat\b/i, /\bQQ群\b/,
-  /阅读原文/, /外链/, /\bQR\b/i,
-];
-function isRecruitmentLikeUrl(url) {
-  if (typeof url !== 'string') return false;
-  return /mp\.weixin\.qq\.com/.test(url);
-}
-function describeForbiddenMatch(label) {
-  for (const pattern of FORBIDDEN_REGION_PATTERNS) {
-    if (pattern.test(label)) return pattern.source;
-  }
-  return null;
-}
-function checkSafeRegionLabels({ url, plan }) {
-  if (!isRecruitmentLikeUrl(url)) return null;
-  const segments = planSegments(plan);
-  if (!segments) return null;
-  for (let i = 0; i < segments.length; i += 1) {
-    const seg = segments[i] ?? {};
-    const label = normalizeText(seg.target_y_content_label ?? seg.targetYContentLabel ?? '');
-    if (!label) {
-      return (
-        `record_url_narration: section[${i}] is missing required field `
-        + `\`target_y_content_label\`. For recruitment URLs (mp.weixin.qq.com / `
-        + `校招 / 实习等) you MUST label what content lives at target_y so the `
-        + `tool can verify it is not 二维码/投递入口/联系方式. Look at the page `
-        + `screenshot, find what is at target_y=${seg.target_y ?? '<unset>'}, `
-        + `and add a short label like "标题区" / "岗位信息卡片" / "公司介绍".`
-      );
-    }
-    const match = describeForbiddenMatch(label);
-    if (match) {
-      return (
-        `record_url_narration: section[${i}] target_y=${seg.target_y ?? '?'} `
-        + `is labeled "${label}", which matches a forbidden region pattern `
-        + `/${match}/. Recruitment content must NOT dwell on 投递入口 / 二维码 / `
-        + `联系方式 areas. Pick a different target_y inside the 标题区 / 岗位 `
-        + `信息卡片 / 公司介绍 area and rewrite this section.`
-      );
-    }
-  }
+  if (Array.isArray(plan.sections) && plan.sections.length > 0) return plan.sections;
   return null;
 }
@@ -156,6 +75,24 @@ export function validateRecordUrlNarrationArgs(args = {}) {
     throw error;
   }
+  if (!planSections(args.plan)) {
+    const error = new Error(
+      'plan.sections is required (non-empty array). Each section: { id?, text?, audio_path?, dwell_ms?, operations: [{atom, duration_ms, ...}] }.',
+    );
+    error.code = 'PLAN_SECTIONS_REQUIRED';
+    throw error;
+  }
+  if (!isPlainObject(args.page_understanding)) {
+    const error = new Error(
+      'page_understanding is required — call analyze_page(url) first and pass its output here. '
+      + 'V6 uses page_understanding.unsafe_regions[] to validate scroll_to.y / cursor_focus.y, and '
+      + 'page_understanding.preheat_strategy to align the record browser with the analyze browser.',
+    );
+    error.code = 'PAGE_UNDERSTANDING_REQUIRED';
+    throw error;
+  }
   return {
     ...(args ?? {}),
     url: normalizedUrl,
@@ -219,49 +156,22 @@ export async function runRecordUrlNarrationTool({
     return toolError(`Error: ${error.message}`);
   }
-  try {
-    assertPipelineCompliance(validatedInput.plan);
-  } catch (error) {
-    return toolError(`Error: ${error.message}`);
-  }
-  // Safe-region check for recruitment URLs — refuse plans that dwell on
-  // forbidden regions (二维码 / 投递入口 / 联系方式) before we even start
-  // Chromium. The agent must label each target_y with the content that lives
-  // there, and the labels are pattern-matched against a forbidden list.
-  const safeRegionError = checkSafeRegionLabels({
-    url: validatedInput.url,
-    plan: validatedInput.plan,
-  });
-  if (safeRegionError) {
-    return toolError(`Error: ${safeRegionError}`);
-  }
   // Standard-chain hard block: refuse recordings unless plan_video_segments
-  // ran in this session. Discovered repeatedly in Tasks #20/#25/#26 that
-  // agents hand-write dwell_ms by guessing, producing recordings whose phase
-  // boundaries drift from the TTS audio they will eventually be paired with —
-  // forcing a full re-record. plan_video_segments fills dwell_ms mechanically
-  // from ffprobe audio duration, eliminating the drift.
+  // ran in this session. plan_video_segments is what aligns operations[]
+  // duration sums to the per-section TTS audio duration; skipping it lets
+  // audio/visual drift accumulate across sections.
   if (!planVideoSegmentsCalled) {
     return toolError(
       'Error: record_url_narration refused: plan_video_segments must run earlier in this '
-      + 'session so dwell_ms / phase durations are mechanically aligned with the segment\'s '
-      + 'TTS audio (audio_duration_ms). Hand-written dwell_ms has repeatedly drifted from '
-      + 'the actual TTS duration in production runs, forcing full re-records.\n\n'
-      + 'Standard chain: synthesize_tts × N (per segment) → plan_video_segments(segments with '
-      + 'text + audio_path + visual_kind=video + visual_path) → record_url_narration (feed '
-      + 'plan_video_segments output as plan.sections — each section\'s dwell_ms is already '
-      + 'set to audio_duration_ms) + compose_video_v2 (same plan output). Call plan_video_segments '
-      + 'now, then pass its `segments` array as `plan.sections` here.'
+      + 'session so per-section operations.duration_ms is reconciled with TTS audio_duration_ms.\n\n'
+      + 'V6 standard chain: analyze_page(url) → synthesize_tts × N (per section) → '
+      + 'plan_video_segments(segments with text + audio_path + visual_kind + operations) → '
+      + 'record_url_narration(url, page_understanding, plan=…, output_paths=[…]) + '
+      + 'compose_video_v2(segments=…, variants=[…]). Call plan_video_segments now, then retry.',
     );
   }
   try {
-    // output_paths is REQUIRED. The legacy "default output_path master file"
-    // mode is gone — agents kept defaulting to one-call-per-section because
-    // that was the lowest-friction path. Now every recording is sliced, even
-    // single-section ones (which are just a 1-element output_paths array).
     let resolvedOutputPaths;
     try {
       resolvedOutputPaths = resolveOutputPaths(validatedInput.output_paths, { workspaceDir });
@@ -272,22 +182,18 @@ export async function runRecordUrlNarrationTool({
       return toolError(
         'Error: output_paths is required — one workspace-relative mp4 path per plan.sections entry. '
         + 'Single-section recording is a 1-element array. Multi-section recording records once '
-        + 'continuously (one browser session, one scrollTop) and slices the result at section '
-        + 'boundaries. See frag.short.video_synthesis_tools.',
+        + 'continuously (one browser session) and slices the result at section boundaries.',
       );
     }
-    const planSectionCount = (planSegments(validatedInput.plan) ?? []).length;
-    if (resolvedOutputPaths.length !== planSectionCount) {
+    const sectionCount = (planSections(validatedInput.plan) ?? []).length;
+    if (resolvedOutputPaths.length !== sectionCount) {
       return toolError(
         `Error: output_paths length (${resolvedOutputPaths.length}) must match `
-        + `plan.sections length (${planSectionCount}). Each section produces exactly one mp4 — `
+        + `plan.sections length (${sectionCount}). Each section produces exactly one mp4 — `
         + `don't pad or truncate.`,
       );
     }
-    // The master / events JSON paths are agent-optional debug artifacts.
-    // Default master to a tmp path next to the first output; events default
-    // to <master>.events.json. Agent can override either if they care.
     const { resolvedOutputPath: masterPath, resolvedEventsPath } = resolveRecordUrlNarrationPaths({
       workspaceDir,
       outputPath: validatedInput.output_path,
@@ -300,6 +206,7 @@ export async function runRecordUrlNarrationTool({
     const recorderOutput = await recordUrlNarrationFn({
       url: validatedInput.url,
       plan: validatedInput.plan,
+      page_understanding: validatedInput.page_understanding,
       output_path: masterPath,
       events_path: resolvedEventsPath,
       output_paths: resolvedOutputPaths,

package/src/_vendor/video/recorder/phase-duration.js DELETED

@@ -1,18 +0,0 @@
-function normalizeInteger(value, fallback = null) {
-  const parsed = Number.parseInt(String(value ?? ''), 10);
-  if (!Number.isFinite(parsed)) return fallback;
-  return parsed;
-}
-export function resolveDurationMs(phase, fallback = 0) {
-  const parsed = normalizeInteger(phase?.duration_ms, null);
-  if (parsed !== null && parsed >= 0) return parsed;
-  const dwellMs = normalizeInteger(phase?.dwell_ms, null);
-  if (dwellMs !== null && dwellMs >= 0) return dwellMs;
-  const secs = Number(phase?.duration_s);
-  if (Number.isFinite(secs) && secs >= 0) return Math.round(secs * 1000);
-  return fallback;
-}

package/src/_vendor/video/recorder/plan-estimator.js DELETED

@@ -1,43 +0,0 @@
-import { resolveDurationMs } from './phase-duration.js';
-import { normalizePlanPhases } from './plan-executor.js';
-export function estimatePlanDurationMs(plan = {}) {
-  let phases = [];
-  try {
-    phases = normalizePlanPhases(plan);
-  } catch {
-    phases = [];
-  }
-  return phases.reduce((total, phase) => {
-    const action = String(phase?.action ?? phase?.visual_action?.type ?? '').trim().toLowerCase();
-    const durationMs = resolveDurationMs(phase, Number.NaN);
-    const dwellMs = Number(phase?.dwell_ms);
-    const transitionMs = Number(phase?.transition_ms ?? phase?.visual_action?.transition_ms);
-    const effectiveHoldMs = Number.isFinite(dwellMs) && dwellMs > 0
-      ? dwellMs
-      : durationMs;
-    if (action === 'hold' && Number.isFinite(effectiveHoldMs) && effectiveHoldMs > 0) {
-      return total + effectiveHoldMs;
-    }
-    if (action === 'linear_scroll_during') {
-      if (Number.isFinite(effectiveHoldMs) && effectiveHoldMs > 0) return total + effectiveHoldMs;
-      return total + 1200;
-    }
-    if (action === 'scroll_to_dwell' || action === 'cursor_focus' || action === 'scroll_back') {
-      let next = total;
-      if (Number.isFinite(transitionMs) && transitionMs > 0) next += transitionMs;
-      if (Number.isFinite(effectiveHoldMs) && effectiveHoldMs > 0) next += effectiveHoldMs;
-      if (next === total) next += 1200;
-      return next;
-    }
-    if (Number.isFinite(transitionMs) && transitionMs > 0) {
-      return total + transitionMs;
-    }
-    if (Number.isFinite(durationMs) && durationMs > 0) {
-      return total + durationMs;
-    }
-    return total + 800;
-  }, 0);
-}