@lightcone-ai/daemon 0.22.1 → 0.23.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,316 @@
1
+ // V6 page_understanding schema.
2
+ //
3
+ // The output of `analyze_page` — a structured, content-aware view of a URL
4
+ // that downstream tools (record_url_narration, plan_video_segments) and the
5
+ // agent (short_video_scripter) consume. See video-synthesis-design.md §五.
6
+ //
7
+ // V5's recruitment_slots / candidate_hotspots / skip_zones / mode_hint / text_bins
8
+ // vocabulary is gone; V6 is universal across page types and grounds plan
9
+ // authoring in three things:
10
+ // - blocks[]: from-top-to-bottom content units with semantics and pacing hints
11
+ // - unsafe_regions[]: y-ranges the recorder must never dwell on
12
+ // - narrative_arc: LLM's suggested storyline (advisory, agent must cross-check)
13
+
14
// Closed vocabularies for V6 page_understanding fields. Each *_VALUES array
// is the canonical ordered list (frozen, exported); each *_SET mirrors it
// for O(1) membership checks during normalization and validation.

export const VISUAL_KIND_VALUES = Object.freeze([
  'hero', 'title', 'subtitle', 'paragraph', 'list', 'image_with_text',
  'image_only', 'table', 'callout', 'divider', 'footer',
]);

export const DENSITY_VALUES = Object.freeze(['low', 'medium', 'high']);

export const VISUAL_WEIGHT_VALUES = Object.freeze(['hero', 'primary', 'secondary', 'aux']);

export const READING_PRIORITY_VALUES = Object.freeze(['must', 'should', 'may', 'skip']);

export const UNSAFE_REASON_VALUES = Object.freeze([
  'qr_code', 'contact_info', 'application_entry', 'external_link', 'footer_promo',
]);

export const NARRATIVE_STRUCTURE_VALUES = Object.freeze([
  'linear', 'list', 'comparison', 'hero_then_detail', 'step_by_step',
]);

export const PREHEAT_STRATEGY_VALUES = Object.freeze(['none', 'full_scroll_then_top']);

// Internal Set mirrors of the arrays above.
const asSet = (values) => new Set(values);
const VISUAL_KIND_SET = asSet(VISUAL_KIND_VALUES);
const DENSITY_SET = asSet(DENSITY_VALUES);
const VISUAL_WEIGHT_SET = asSet(VISUAL_WEIGHT_VALUES);
const READING_PRIORITY_SET = asSet(READING_PRIORITY_VALUES);
const UNSAFE_REASON_SET = asSet(UNSAFE_REASON_VALUES);
const NARRATIVE_STRUCTURE_SET = asSet(NARRATIVE_STRUCTURE_VALUES);
const PREHEAT_STRATEGY_SET = asSet(PREHEAT_STRATEGY_VALUES);

// Dwell window applied when a block carries no usable pacing_hint.
const DEFAULT_PACING_HINT = Object.freeze({ dwell_ms_min: 2500, dwell_ms_max: 6000 });
74
+
75
/**
 * Coerce `value` to an integer and clamp it to [min, max].
 * Values that do not coerce to a finite number (undefined, NaN, objects,
 * non-numeric strings) yield `fallback`. Note Number(null) === 0, so null
 * clamps like 0 rather than falling back — same as the Number() coercion.
 */
function clampInt(value, min, max, fallback) {
  const numeric = Number(value);
  if (!Number.isFinite(numeric)) return fallback;
  const rounded = Math.round(numeric);
  if (rounded < min) return min;
  if (rounded > max) return max;
  return rounded;
}
80
+
81
/**
 * Return `value` trimmed when it is a string with non-whitespace content;
 * otherwise return `fallback` (also used when trimming yields '').
 */
function trimString(value, fallback = '') {
  if (typeof value !== 'string') return fallback;
  const text = value.trim();
  return text.length > 0 ? text : fallback;
}
86
+
87
/**
 * Return the trimmed `value` when it is a member of `set`, else `fallback`.
 * Non-string values are treated as the empty string before membership check.
 */
function pickEnum(value, set, fallback) {
  const candidate = typeof value === 'string' ? value.trim() : '';
  if (set.has(candidate)) return candidate;
  return fallback;
}
91
+
92
+ function normalizeKeywords(input) {
93
+ if (!Array.isArray(input)) return [];
94
+ const seen = new Set();
95
+ const out = [];
96
+ for (const raw of input) {
97
+ const word = trimString(raw);
98
+ if (!word) continue;
99
+ const lower = word.toLowerCase();
100
+ if (seen.has(lower)) continue;
101
+ seen.add(lower);
102
+ out.push(word);
103
+ if (out.length >= 12) break;
104
+ }
105
+ return out;
106
+ }
107
+
108
/**
 * Normalize a pacing hint into { dwell_ms_min, dwell_ms_max }.
 * Each bound is clamped to [500, 60000] ms, falling back per-field to the
 * provided defaults; an inverted range is repaired as [min, min + 1000].
 */
function normalizePacingHint(input, fallback = DEFAULT_PACING_HINT) {
  const isRecord = Boolean(input) && typeof input === 'object' && !Array.isArray(input);
  if (!isRecord) {
    return { dwell_ms_min: fallback.dwell_ms_min, dwell_ms_max: fallback.dwell_ms_max };
  }
  const lo = clampInt(input.dwell_ms_min, 500, 60000, fallback.dwell_ms_min);
  const hi = clampInt(input.dwell_ms_max, 500, 60000, fallback.dwell_ms_max);
  return hi < lo
    ? { dwell_ms_min: lo, dwell_ms_max: lo + 1000 }
    : { dwell_ms_min: lo, dwell_ms_max: hi };
}
117
+
118
/**
 * Normalize one raw block from the LLM into the canonical V6 block shape.
 *
 * Returns null for non-object input or a degenerate y-range
 * (y_bottom <= y_top after clamping). y coordinates are clamped to
 * [0, totalHeight]; enum fields degrade to paragraph / medium / secondary /
 * should when missing or outside their vocabulary. A missing id is derived
 * from the 0-based `index` as "b{index+1}".
 */
function normalizeBlock(raw, index, totalHeight) {
  const isRecord = Boolean(raw) && typeof raw === 'object' && !Array.isArray(raw);
  if (!isRecord) return null;

  const yTop = clampInt(raw.y_top, 0, totalHeight, 0);
  const yBottom = clampInt(raw.y_bottom, 0, totalHeight, yTop);
  if (yBottom <= yTop) return null;

  return {
    id: trimString(raw.id, `b${index + 1}`),
    y_top: yTop,
    y_bottom: yBottom,
    visual_kind: pickEnum(raw.visual_kind, VISUAL_KIND_SET, 'paragraph'),
    text: trimString(raw.text),
    summary: trimString(raw.summary),
    keywords: normalizeKeywords(raw.keywords),
    density: pickEnum(raw.density, DENSITY_SET, 'medium'),
    visual_weight: pickEnum(raw.visual_weight, VISUAL_WEIGHT_SET, 'secondary'),
    contains_image: Boolean(raw.contains_image),
    reading_priority: pickEnum(raw.reading_priority, READING_PRIORITY_SET, 'should'),
    pacing_hint: normalizePacingHint(raw.pacing_hint),
    narration_hint: trimString(raw.narration_hint),
  };
}
146
+
147
/**
 * Normalize one unsafe region to { y_top, y_bottom, reason }, clamping the
 * y-range to [0, totalHeight]. Returns null for non-objects or for ranges
 * that are empty after clamping. Unknown reasons degrade to 'footer_promo'.
 */
function normalizeUnsafeRegion(raw, totalHeight) {
  const isRecord = Boolean(raw) && typeof raw === 'object' && !Array.isArray(raw);
  if (!isRecord) return null;
  const top = clampInt(raw.y_top, 0, totalHeight, 0);
  const bottom = clampInt(raw.y_bottom, 0, totalHeight, top);
  if (bottom <= top) return null;
  return {
    y_top: top,
    y_bottom: bottom,
    reason: pickEnum(raw.reason, UNSAFE_REASON_SET, 'footer_promo'),
  };
}
155
+
156
/**
 * Normalize the advisory narrative arc.
 *
 * Returns { structure, suggested_flow }: structure falls back to 'linear';
 * suggested_flow keeps at most 12 entries. Each raw flow row may be a bare
 * string (kept as a step with no block refs) or an object whose step text
 * comes from `step` ?? `note` and whose block_ids are filtered against the
 * known `blockIds`. Rows with neither step text nor valid refs are dropped.
 *
 * Fixes vs. the previous version:
 * - `.map(trimString)` passed Array#map's (value, index, array) arguments
 *   straight through, so the array index became trimString's `fallback`
 *   and a non-string id could map to the number 0; ids are now trimmed via
 *   an explicit arrow that always falls back to ''.
 * - String rows now carry `block_ids: []` as well, so every flow entry has
 *   a consistent { step, block_ids } shape for downstream consumers.
 */
function normalizeNarrativeArc(raw, blockIds) {
  const fallback = { structure: 'linear', suggested_flow: [] };
  if (!raw || typeof raw !== 'object' || Array.isArray(raw)) return fallback;

  const structure = pickEnum(raw.structure, NARRATIVE_STRUCTURE_SET, 'linear');
  const idSet = new Set(blockIds);

  const rows = Array.isArray(raw.suggested_flow) ? raw.suggested_flow : [];
  const flow = rows
    .map((row) => {
      if (typeof row === 'string') {
        const text = row.trim();
        return text ? { step: text, block_ids: [] } : null;
      }
      if (!row || typeof row !== 'object') return null;
      const step = trimString(row.step ?? row.note);
      const blockRefs = Array.isArray(row.block_ids)
        ? row.block_ids
          .map((id) => trimString(id))
          .filter((id) => id && idSet.has(id))
        : [];
      if (!step && blockRefs.length === 0) return null;
      return { step, block_ids: blockRefs };
    })
    .filter(Boolean)
    .slice(0, 12);

  return { structure, suggested_flow: flow };
}
184
+
185
/**
 * Normalize the recording viewport. Defaults to a 1080x1920 portrait frame;
 * both dimensions are clamped to sane screen bounds.
 */
function normalizeViewport(raw) {
  return {
    width: clampInt(raw?.width, 320, 4096, 1080),
    height: clampInt(raw?.height, 480, 4096, 1920),
  };
}
190
+
191
/**
 * Normalize a raw analyze_page payload into the canonical V6
 * page_understanding shape.
 *
 * Never throws: malformed pieces degrade to defaults, invalid blocks and
 * unsafe regions are dropped, and both lists come back sorted by y_top.
 * Duplicate block ids (after sorting) are suffixed _2, _3, ... so every id
 * is unique before the narrative arc resolves its block references.
 */
export function normalizePageUnderstanding(raw = {}) {
  const totalHeight = clampInt(raw.full_height_px ?? raw.total_height, 100, 200000, 1920);

  const blocks = (Array.isArray(raw.blocks) ? raw.blocks : [])
    .map((block, index) => normalizeBlock(block, index, totalHeight))
    .filter(Boolean)
    .sort((a, b) => a.y_top - b.y_top);

  // Re-id later duplicates with the first free numeric suffix.
  const usedIds = new Set();
  for (const block of blocks) {
    if (usedIds.has(block.id)) {
      let counter = 2;
      while (usedIds.has(`${block.id}_${counter}`)) counter += 1;
      block.id = `${block.id}_${counter}`;
    }
    usedIds.add(block.id);
  }

  const unsafeRegions = (Array.isArray(raw.unsafe_regions) ? raw.unsafe_regions : [])
    .map((region) => normalizeUnsafeRegion(region, totalHeight))
    .filter(Boolean)
    .sort((a, b) => a.y_top - b.y_top);

  const hasMeta = Boolean(raw.meta) && typeof raw.meta === 'object' && !Array.isArray(raw.meta);

  return {
    url: trimString(raw.url),
    page_type: trimString(raw.page_type, 'generic_article'),
    primary_topic: trimString(raw.primary_topic),
    viewport: normalizeViewport(raw.viewport),
    preheat_strategy: pickEnum(raw.preheat_strategy, PREHEAT_STRATEGY_SET, 'full_scroll_then_top'),
    full_height_px: totalHeight,
    blocks,
    unsafe_regions: unsafeRegions,
    narrative_arc: normalizeNarrativeArc(raw.narrative_arc, blocks.map((b) => b.id)),
    meta: hasMeta ? raw.meta : {},
  };
}
238
+
239
/**
 * Structural validation of a (normalized) page_understanding payload.
 *
 * Returns { ok, errors } instead of throwing, collecting every problem so
 * callers can surface them all at once. Accepts null/undefined payloads
 * (treated as an empty object, which fails the required-field checks).
 */
export function validatePageUnderstanding(payload) {
  const model = payload ?? {};
  const errors = [];

  if (typeof model.url !== 'string' || !model.url.trim()) errors.push('url required');

  if (!Number.isFinite(model.full_height_px) || model.full_height_px <= 0) {
    errors.push('full_height_px invalid');
  }

  const viewportOk = model.viewport && typeof model.viewport === 'object'
    && Number.isFinite(model.viewport.width) && Number.isFinite(model.viewport.height);
  if (!viewportOk) errors.push('viewport invalid');

  if (!PREHEAT_STRATEGY_SET.has(model.preheat_strategy)) {
    errors.push('preheat_strategy invalid');
  }

  if (!Array.isArray(model.blocks)) {
    errors.push('blocks must be array');
  } else {
    if (model.blocks.length === 0) errors.push('blocks empty — analyze_page produced no content units');
    model.blocks.forEach((block, i) => {
      if (!block || typeof block !== 'object') {
        errors.push(`blocks[${i}] invalid`);
        return;
      }
      if (!block.id || typeof block.id !== 'string') errors.push(`blocks[${i}].id invalid`);
      const rangeOk = Number.isFinite(block.y_top) && Number.isFinite(block.y_bottom)
        && block.y_bottom > block.y_top;
      if (!rangeOk) errors.push(`blocks[${i}].y range invalid`);
      if (!VISUAL_KIND_SET.has(block.visual_kind)) errors.push(`blocks[${i}].visual_kind invalid`);
      if (!DENSITY_SET.has(block.density)) errors.push(`blocks[${i}].density invalid`);
      if (!VISUAL_WEIGHT_SET.has(block.visual_weight)) errors.push(`blocks[${i}].visual_weight invalid`);
      if (!READING_PRIORITY_SET.has(block.reading_priority)) {
        errors.push(`blocks[${i}].reading_priority invalid`);
      }
    });
  }

  if (!Array.isArray(model.unsafe_regions)) {
    errors.push('unsafe_regions must be array');
  } else {
    model.unsafe_regions.forEach((region, i) => {
      const rangeOk = region && Number.isFinite(region.y_top) && Number.isFinite(region.y_bottom)
        && region.y_bottom > region.y_top;
      if (!rangeOk) {
        errors.push(`unsafe_regions[${i}] invalid`);
        return;
      }
      if (!UNSAFE_REASON_SET.has(region.reason)) errors.push(`unsafe_regions[${i}].reason invalid`);
    });
  }

  if (!model.narrative_arc || typeof model.narrative_arc !== 'object') {
    errors.push('narrative_arc invalid');
  } else if (!NARRATIVE_STRUCTURE_SET.has(model.narrative_arc.structure)) {
    errors.push('narrative_arc.structure invalid');
  }

  return { ok: errors.length === 0, errors };
}
301
+
302
/**
 * Return the first unsafe region whose inclusive [y_top, y_bottom] range
 * contains `y`, or null when none does (or when inputs are unusable).
 *
 * Hardened vs. the previous version: array entries that are not objects
 * (null, undefined, primitives) are now skipped instead of throwing a
 * TypeError when reading `.y_top` of null.
 *
 * @param {Array<{y_top:number, y_bottom:number}>} unsafeRegions
 * @param {number|string} y - candidate y coordinate (coerced via Number).
 * @returns {object|null} the matching region object, or null.
 */
export function findOverlappingUnsafeRegion(unsafeRegions, y) {
  if (!Array.isArray(unsafeRegions)) return null;
  const target = Number(y);
  if (!Number.isFinite(target)) return null;
  for (const region of unsafeRegions) {
    if (!region || typeof region !== 'object') continue;
    if (target >= region.y_top && target <= region.y_bottom) return region;
  }
  return null;
}
312
+
313
/**
 * Look up a block by its id. Returns null when `blocks` is not an array or
 * no (truthy) block carries the given id.
 */
export function findBlockById(blocks, blockId) {
  if (!Array.isArray(blocks)) return null;
  for (const block of blocks) {
    if (block && block.id === blockId) return block;
  }
  return null;
}
@@ -1,23 +1,36 @@
1
- // plan_video_segments — pure audio/video alignment planner.
1
+ // V6 plan_video_segments — audio / operations time alignment.
2
2
  //
3
- // Takes per-segment {text, audio_path, visual_kind, ...} and returns unified
4
- // plan segments with:
5
- // - audio_duration_ms (read via ffprobe from the provided audio_path)
6
- // - subtitle_text (= text)
7
- // - presentation.duration / per_card_duration (audio_duration + buffer)
8
- // - dwell_ms (= audio_duration; lets the same segment drive record_url_narration)
3
+ // Takes per-segment input (text, audio_path, visual_kind, operations[]) and:
4
+ // 1. probes audio_path via ffprobe → audio_duration_ms
5
+ // 2. fills audio_duration_ms / subtitle_text / presentation.duration / dwell_ms
6
+ // 3. expands `"duration_ms": "fill"` on the LAST hold operation to make
7
+ //    sum(operations.duration_ms) ≈ audio_duration_ms
8
+ // 4. validates sum(operations.duration_ms) within 200ms of audio_duration_ms
9
+ // — rejects with operations_duration_mismatch otherwise
10
+ // 5. rejects V5 fields (action / target_y / target_y_content_label /
11
+ // focus_region / transition_ms / agent-written dwell_ms / phase.beats)
9
12
  //
10
- // Previously this tool ALSO synthesized TTS internally — which duplicated
11
- // the work when callers had already run synthesize_tts, and caused the
12
- // "wrong standard chain" confusion in fragments.md. TTS is now decoupled:
13
- // callers must run synthesize_tts per segment first and pass the resulting
14
- // audio_path here. See docs/scenario-content-creation/video-synthesis-design.md.
15
- //
16
- // Lives in daemon/src/tools/ as a reusable handler module; the MCP-tool
17
- // registration lives in daemon/mcp-servers/official/media-tools/index.js.
13
+ // Lives in daemon/src/tools/; MCP registration in
14
+ // daemon/mcp-servers/official/media-tools/index.js.
18
15
 
19
16
  import { spawn } from 'node:child_process';
20
17
 
18
// Max allowed drift between sum(operations.duration_ms) and the probed
// audio duration before a segment plan is rejected.
const DURATION_TOLERANCE_MS = 200;

// V5-era per-segment fields that V6 refuses outright, so stale plan shapes
// fail fast instead of being silently ignored.
const V5_SEGMENT_FIELDS = Object.freeze([
  'action', 'target_y', 'target_y_content_label', 'targetYContentLabel',
  'focus_region', 'focusRegion', 'transition_ms', 'transition_ratio',
  'beats', 'visual_action', 'camera_motion',
]);
33
+
21
34
// Wrap plain text in the MCP tool-result content shape.
function toolText(text) {
  const content = [{ type: 'text', text }];
  return { content };
}
@@ -60,25 +73,125 @@ function planDurationSec(audioDurationMs, bufferSec = 0.5) {
60
73
  return Math.ceil(raw * 2) / 2;
61
74
  }
62
75
 
76
/**
 * Reject V5-era segment fields up front.
 *
 * Throws an Error with code PHASE_V5_FIELDS_REMOVED when `seg` carries any
 * own property listed in V5_SEGMENT_FIELDS, or when the caller hand-set a
 * numeric dwell_ms (V6 derives dwell_ms from the probed audio duration).
 */
function assertNoV5Fields(seg, index) {
  const reject = (message) => {
    const error = new Error(message);
    error.code = 'PHASE_V5_FIELDS_REMOVED';
    throw error;
  };

  for (const field of V5_SEGMENT_FIELDS) {
    if (Object.prototype.hasOwnProperty.call(seg, field)) {
      reject(
        `phase_v5_fields_removed: segments[${index}] carries V5 field "${field}". `
        + 'V6 segments: { text, audio_path, visual_kind, operations?, visual_path?, visual_paths?, transition?, presentation? }.',
      );
    }
  }

  if (seg.dwell_ms != null && Number.isFinite(Number(seg.dwell_ms))) {
    reject(
      `phase_v5_fields_removed: segments[${index}].dwell_ms is set manually. `
      + 'V6 fills dwell_ms automatically from audio_duration_ms; remove dwell_ms from input.',
    );
  }
}
96
+
97
/**
 * Normalize a segment's operations[] against its audio duration.
 *
 * Rules:
 * - `duration_ms: "fill"` may appear at most once, only on the LAST atom,
 *   and only when that atom is a hold; it expands to the audio remainder.
 * - Every other duration_ms must coerce to a finite positive number.
 *
 * @param {Array<object>} operations - raw operation atoms (not mutated;
 *   entries are shallow-copied before expansion).
 * @param {number} audioDurationMs - probed TTS duration for the segment.
 * @param {number} segmentIndex - segment position, used in error messages.
 * @returns {{operations: Array<object>, durationSumMs: number}}
 * @throws {Error} with .code FILL_POSITION_INVALID or OPERATIONS_INVALID.
 *
 * Fix vs. the previous version: when "fill" resolved to 0 (the explicit
 * operations already total >= audio_duration_ms), the old code fell through
 * to the generic `must be a positive number ... (got 0)` error, blaming the
 * fill atom for a value the tool computed itself. That case now gets a
 * dedicated diagnostic naming the real cause. The duplicated validation
 * loop is also collapsed into a single pass.
 */
function processOperations(operations, audioDurationMs, segmentIndex) {
  if (!Array.isArray(operations) || operations.length === 0) {
    return { operations: [], durationSumMs: 0 };
  }

  const fail = (code, message) => {
    const error = new Error(message);
    error.code = code;
    throw error;
  };

  // Work on shallow copies so the caller's input stays untouched.
  const expanded = operations.map((op) => ({ ...op }));

  // Locate (and position-check) the single optional "fill" atom.
  let fillIndex = -1;
  for (let i = 0; i < expanded.length; i += 1) {
    const op = expanded[i];
    if (op.duration_ms !== 'fill') continue;
    if (fillIndex !== -1) {
      fail(
        'FILL_POSITION_INVALID',
        `fill_position_invalid: segments[${segmentIndex}].operations[${i}] has duration_ms="fill" but `
        + `another fill already at index ${fillIndex}. Only one "fill" allowed, and it must be the LAST hold.`,
      );
    }
    if (op.atom !== 'hold') {
      fail(
        'FILL_POSITION_INVALID',
        `fill_position_invalid: segments[${segmentIndex}].operations[${i}] has duration_ms="fill" on atom="${op.atom}". `
        + '"fill" is only allowed on the last hold atom.',
      );
    }
    if (i !== expanded.length - 1) {
      fail(
        'FILL_POSITION_INVALID',
        `fill_position_invalid: segments[${segmentIndex}].operations[${i}] has duration_ms="fill" but is not `
        + 'the last operation. "fill" must be the LAST atom.',
      );
    }
    fillIndex = i;
  }

  // Single validation pass over explicit durations, accumulating their sum.
  let sum = 0;
  for (let i = 0; i < expanded.length; i += 1) {
    if (i === fillIndex) continue;
    const n = Number(expanded[i].duration_ms);
    if (!Number.isFinite(n) || n <= 0) {
      fail(
        'OPERATIONS_INVALID',
        `operations_invalid: segments[${segmentIndex}].operations[${i}].duration_ms must be a positive number `
        + `(got ${expanded[i].duration_ms}).`,
      );
    }
    sum += n;
  }

  if (fillIndex !== -1) {
    const remainder = audioDurationMs - sum;
    if (remainder <= 0) {
      fail(
        'OPERATIONS_INVALID',
        `operations_invalid: segments[${segmentIndex}].operations[${fillIndex}] duration_ms="fill" resolves to `
        + `${Math.max(0, remainder)}ms because the other operations already total ${sum}ms, which meets or `
        + `exceeds audio_duration_ms=${audioDurationMs}ms. Shorten the explicit durations.`,
      );
    }
    expanded[fillIndex] = { ...expanded[fillIndex], duration_ms: remainder };
    sum += remainder;
  }

  return { operations: expanded, durationSumMs: Math.round(sum) };
}
170
+
63
171
  export async function runPlanVideoSegmentsTool({ segments } = {}) {
64
172
  if (!Array.isArray(segments) || segments.length === 0) {
65
173
  return toolError('segments must be a non-empty array.');
66
174
  }
67
175
 
68
- // Up-front validation — fail fast before any work.
176
+ // Up-front validation — fail fast before any ffprobe work.
69
177
  for (let i = 0; i < segments.length; i += 1) {
70
178
  const seg = segments[i] ?? {};
71
179
  if (typeof seg.audio_path !== 'string' || !seg.audio_path.trim()) {
72
180
  return toolError(
73
181
  `segments[${i}]: audio_path is required. plan_video_segments no longer synthesizes TTS — call synthesize_tts(text) `
74
- + 'first and pass the returned path as audio_path. Standard chain: synthesize_tts × N → plan_video_segments → '
75
- + 'record_url_narration + compose_video_v2 (share the same plan).'
182
+ + 'first and pass the returned path as audio_path. V6 standard chain: analyze_page → synthesize_tts × N → '
183
+ + 'plan_video_segments → record_url_narration + compose_video_v2.',
76
184
  );
77
185
  }
78
186
  const kind = String(seg.visual_kind ?? '');
79
187
  if (!kind) {
80
188
  return toolError(`segments[${i}]: visual_kind is required (image / video / gif / carousel).`);
81
189
  }
190
+ try {
191
+ assertNoV5Fields(seg, i);
192
+ } catch (err) {
193
+ return toolError(err.message);
194
+ }
82
195
  }
83
196
 
84
197
  const planned = [];
@@ -97,6 +210,25 @@ export async function runPlanVideoSegmentsTool({ segments } = {}) {
97
210
  audioDurationMs = 3000;
98
211
  }
99
212
 
213
+ let processedOps;
214
+ try {
215
+ processedOps = processOperations(seg.operations, audioDurationMs, i);
216
+ } catch (err) {
217
+ return toolError(err.message);
218
+ }
219
+
220
+ if (processedOps.operations.length > 0) {
221
+ const drift = Math.abs(processedOps.durationSumMs - audioDurationMs);
222
+ if (drift > DURATION_TOLERANCE_MS) {
223
+ return toolError(
224
+ `operations_duration_mismatch: segments[${i}].operations duration sum=${processedOps.durationSumMs}ms `
225
+ + `but audio_duration_ms=${audioDurationMs}ms (drift ${drift}ms > ${DURATION_TOLERANCE_MS}ms tolerance). `
226
+ + 'Adjust the section operations so their durations sum to TTS audio duration, or use '
227
+ + '"duration_ms": "fill" on the last hold to auto-fill the remainder.',
228
+ );
229
+ }
230
+ }
231
+
100
232
  let presentation;
101
233
  if (kind === 'carousel') {
102
234
  const numCards = Array.isArray(seg.visual_paths) ? seg.visual_paths.length : 1;
@@ -104,7 +236,6 @@ export async function runPlanVideoSegmentsTool({ segments } = {}) {
104
236
  const perCard = Math.max(2, Math.ceil((totalDuration / numCards) * 2) / 2);
105
237
  presentation = { per_card_duration: perCard };
106
238
  } else {
107
- // image / scroll / video / gif
108
239
  const duration = planDurationSec(audioDurationMs, kind === 'scroll' ? 1.0 : 0.5);
109
240
  presentation = { duration, ...(kind === 'scroll' ? { style: 'scroll' } : {}) };
110
241
  }
@@ -114,10 +245,9 @@ export async function runPlanVideoSegmentsTool({ segments } = {}) {
114
245
  audio_path: seg.audio_path,
115
246
  audio_duration_ms: audioDurationMs,
116
247
  ...(text ? { subtitle_text: text } : {}),
248
+ ...(processedOps.operations.length > 0 ? { operations: processedOps.operations } : {}),
117
249
  presentation: { ...presentation, ...(seg.presentation ?? {}) },
118
- // dwell_ms doubles as record_url_narration's per-phase hold duration so
119
- // recording naturally tracks the narration audio.
120
- dwell_ms: seg.dwell_ms ?? audioDurationMs,
250
+ dwell_ms: audioDurationMs,
121
251
  });
122
252
  }
123
253