npm - @lightcone-ai/daemon - Versions diffs - 0.23.4 → 0.23.6 - Mend

@lightcone-ai/daemon 0.23.4 → 0.23.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (4)

package/mcp-servers/official/media-tools/index.js +31 -2
package/package.json +1 -1
package/src/_vendor/video/recorder/atoms.js +69 -58
package/src/tools/plan-video-segments.js +49 -0

package/mcp-servers/official/media-tools/index.js CHANGED

@@ -257,13 +257,42 @@ server.tool(
       }).optional().describe('Optional presentation hints (style only). duration/per_card_duration are computed.'),
       operations: z.array(z.object({
         atom: z.enum(['scroll_to', 'hold', 'cursor_focus']),
-        duration_ms: z.union([z.number(), z.literal('fill')]).describe('Atom duration. "fill" allowed only on the LAST hold to auto-fill remaining audio time.'),
+        duration_ms: z.union([z.number(), z.literal('fill')]).describe('Atom duration in ms. "fill" allowed only on the LAST hold to auto-fill remaining audio time.'),
         y: z.number().optional(),
         x: z.number().optional(),
         curve: z.enum(['easeInOutQuad', 'linear', 'easeOutQuad']).optional(),
         mode: z.enum(['auto', 'touch', 'programmatic']).optional(),
         jitter_px: z.number().optional(),
-      })).optional().describe('For visual_kind=video URL recording sections: ordered atom sequence. Sum of duration_ms must equal audio_duration_ms (±200ms); use "fill" on the last hold to auto-balance.'),
+      })).optional().describe(
+        'For visual_kind=video URL recording sections: ordered atom sequence. Sum of duration_ms '
+        + 'must equal audio_duration_ms (±200ms); use "fill" on the last hold to auto-balance.\n\n'
+        + 'TRANSITION + EXPLAIN MODE (REQUIRED — enforced by lint): the recording should feel '
+        + 'like a person opening a page and walking the viewer through it block by block. '
+        + 'Concretely:\n'
+        + '  • scroll_to is a TRANSITION between content blocks — short (~500-800ms is fine), '
+        + 'smooth (atomScrollTo programmatic mode handles smoothness automatically; speed does NOT need to be slow).\n'
+        + '  • hold is where the NARRATION happens — long holds (2-5s) are the norm, not the exception. '
+        + 'This is when the agent says the actual sentences about this block.\n'
+        + "  • Every non-opening segment MUST start with a scroll_to (the transition into this segment's "
+        + 'content block). Segments starting with hold are REJECTED — they cause jump cuts.\n'
+        + '  • The shape is: "scroll to new block → pause and explain → scroll to next block → pause and explain".\n\n'
+        + 'GOOD example for a 5s segment narrating "宁波银行金融科技部 FinTech 暑期专项":\n'
+        + '  [\n'
+        + '    { atom: "scroll_to", y: 280, duration_ms: 700 },     // 0.7s smooth transition to title\n'
+        + '    { atom: "hold",               duration_ms: "fill" }, // ~4.3s: agent narrates this block\n'
+        + '  ]\n\n'
+        + 'GOOD example for a 9s segment with two content blocks inside:\n'
+        + '  [\n'
+        + '    { atom: "scroll_to", y: 980, duration_ms: 700 },     // transition to first block\n'
+        + '    { atom: "hold",              duration_ms: 4000 },    // narrate this block (~"金融产品应用开发岗 …")\n'
+        + '    { atom: "scroll_to", y: 1450, duration_ms: 600 },    // short transition to next block\n'
+        + '    { atom: "hold",               duration_ms: "fill" }, // narrate next block (~3.7s)\n'
+        + '  ]\n\n'
+        + 'BAD example (REJECTED by transition_required):\n'
+        + '  [\n'
+        + '    { atom: "hold", duration_ms: 5000 },     // segment starts with hold ← rejected\n'
+        + '  ]',
+      ),
     })).describe('Segments to plan. audio_path is required for each. V5 fields (action, target_y, target_y_content_label, focus_region, transition_ms, dwell_ms, phase.beats[]) are rejected.'),
   },
   async ({ segments }) => {

package/package.json CHANGED

@@ -1,6 +1,6 @@
 {
   "name": "@lightcone-ai/daemon",
-  "version": "0.23.4",
+  "version": "0.23.6",
   "type": "module",
   "main": "src/index.js",
   "bin": {

package/src/_vendor/video/recorder/atoms.js CHANGED

@@ -53,7 +53,7 @@ export async function atomScrollTo(page, _ctx, {
   target_y,
   duration_ms,
   curve = 'easeInOutQuad',
-  jitter_px = 2,
+  jitter_px = 0,  // 不要微动 — 用户反复明确要求
   from_y = null,
   mode = 'auto',
 } = {}) {
@@ -70,17 +70,21 @@ export async function atomScrollTo(page, _ctx, {
   const durationMs = Number(duration_ms);
   const distance = Math.abs(targetY - fromY);
-  // Auto-mode heuristic: touch works for short distances (single segment, no
-  // inter-segment fling interference) OR slow velocities (each segment has
-  // time to settle before the next starts). Fast long-distance scrolls fall
-  // back to programmatic, where the easing loop drives root.scrollTo
-  // deterministically.
-  // Thresholds chosen empirically against the v4 failure mode (~1100px in
-  // ~1000ms → ~1100 px/s, fling-interrupt-fling, page never reached target).
-  const velocity = durationMs > 0 ? (distance / durationMs) * 1000 : 0; // px/s
+  // Auto-mode: default to programmatic (RAF-driven smooth scroll). The touch
+  // path uses humanizedScroll which splits any scroll > 260px into multiple
+  // CDP swipes, each with ±18-26px random horizontal nudge and fling-cancel-
+  // fling boundaries — that looks like "颤抖着分多次拨", not a clean slide.
+  // User feedback is unambiguous: scroll must be a smooth transition between
+  // content blocks, not a teleport (instant snap) and not a wobble (multi-
+  // segment touch with horizontal drift). Programmatic with RAF achieves
+  // both — every frame moves, vertical only, no inter-segment pauses.
+  // Touch mode remains available via explicit `mode: 'touch'` for callers
+  // that specifically want gesture physics.
+  const velocity = durationMs > 0 ? (distance / durationMs) * 1000 : 0; // px/s (kept for diagnostics)
+  void velocity;
   const resolvedMode = mode === 'programmatic' || mode === 'touch'
     ? mode
-    : (distance < 240 || velocity < 500 ? 'touch' : 'programmatic');
+    : 'programmatic';
   if (resolvedMode === 'touch') {
     await humanizedScroll(page, {
@@ -96,68 +100,75 @@ export async function atomScrollTo(page, _ctx, {
       targetY,
       durationMs,
       curve,
-      jitterPx: Math.max(0, Number(jitter_px) || 0),
     });
   }
   return { anchorY: Math.round(targetY) };
 }
-// Programmatic scroll: hands the animation off to the browser's native
-// scroll engine via `scroll-behavior: smooth`. That way the easing runs on
-// the compositor thread at the display refresh rate, independent of how
-// busy the page's JS is. The JS-driven setTimeout approach we tried first
-// gets badly throttled on JS-heavy article pages (60Hz timers can stretch
-// to 150-200ms), turning a 1s transition into 5-8s.
+// Programmatic scroll: JS-driven RAF loop that incrementally updates the
+// scroll position frame-by-frame over `durationMs`. This produces an actual
+// smooth scroll the viewer sees in the recording — the previous version
+// did a hard instant snap and then a static wait, which looked like a
+// teleport ("跳一下然后定格"), not like a person sliding a page.
 //
-// The wait is Node-side, so even if the in-page scrollend never fires we
-// still cap the section at durationMs and move on. We deliberately do NOT
-// wait for scrollend — empirically faster than dispatching the event on
-// page-heavy mobile sites.
+// Why not native `scroll-behavior: smooth` or `scrollTo({behavior:'smooth'})`?
+// In Playwright + a headless mobile context, native smooth-scroll often
+// gets capped to a fixed short duration (~300-500ms) regardless of distance,
+// or is throttled by the page's own scroll logic. We need a duration we
+// control end-to-end.
+//
+// Frame loop runs inside page.evaluate so it stays in lockstep with the
+// page's render thread — important when recordVideo is capturing 30fps.
 async function programmaticScroll(page, {
   fromY,
   targetY,
   durationMs,
+  curve = 'easeInOutQuad',
 } = {}) {
-  // Try every plausible scroll target — mobile article pages sometimes have
-  // a fixed-position outer body and scroll happens on an inner container.
-  // We dispatch to all candidates and let whichever one is actually the
-  // scroller win. Returns the diagnostics so we can debug when the page
-  // refuses to scroll.
-  const diag = await page.evaluate((input) => {
-    const candidates = [];
-    if (document.scrollingElement) candidates.push(document.scrollingElement);
-    if (document.documentElement) candidates.push(document.documentElement);
-    if (document.body) candidates.push(document.body);
-    candidates.push(window);
-    const before = candidates.map((c) => {
-      if (c === window) return { tag: 'window', y: window.scrollY };
-      return { tag: c.tagName, y: c.scrollTop };
-    });
-    // Hard snap to target on every candidate (instant, no animation).
-    for (const c of candidates) {
-      try {
-        if (c === window) window.scrollTo(0, input.targetY);
-        else { c.scrollTop = input.targetY; }
-      } catch { /* ignore */ }
+  await page.evaluate(async (input) => {
+    function pickScroller() {
+      if (document.scrollingElement) return document.scrollingElement;
+      if (document.documentElement) return document.documentElement;
+      return document.body;
     }
+    function easeInOutQuad(t) { return t < 0.5 ? 2 * t * t : 1 - Math.pow(-2 * t + 2, 2) / 2; }
+    function easeOutQuad(t) { return 1 - (1 - t) * (1 - t); }
+    function linear(t) { return t; }
+    const ease = input.curve === 'linear' ? linear
+      : input.curve === 'easeOutQuad' ? easeOutQuad
+      : easeInOutQuad;
-    const after = candidates.map((c) => {
-      if (c === window) return { tag: 'window', y: window.scrollY };
-      return { tag: c.tagName, y: c.scrollTop };
-    });
+    const scroller = pickScroller();
+    const startY = (scroller === document.scrollingElement || scroller === document.documentElement)
+      ? scroller.scrollTop : window.scrollY;
+    const delta = input.targetY - startY;
+    const start = performance.now();
-    return {
-      requested_target: input.targetY,
-      before, after,
-      maxScroll: document.documentElement?.scrollHeight,
-      innerHeight: window.innerHeight,
-    };
-  }, { fromY, targetY });
-  void diag;
-  // Brief dwell to let the page settle the snap before next atom starts.
-  await page.waitForTimeout(Math.max(80, Math.round(durationMs * 0.3)));
+    return new Promise((resolve) => {
+      function tick(now) {
+        const elapsed = now - start;
+        const t = Math.min(1, elapsed / input.durationMs);
+        const y = startY + delta * ease(t);
+        try {
+          if (scroller === window) window.scrollTo(0, y);
+          else { scroller.scrollTop = y; }
+        } catch { /* ignore */ }
+        if (t < 1) {
+          requestAnimationFrame(tick);
+        } else {
+          // Final snap to exact target (in case of sub-pixel drift).
+          try {
+            if (scroller === window) window.scrollTo(0, input.targetY);
+            else { scroller.scrollTop = input.targetY; }
+          } catch { /* ignore */ }
+          resolve();
+        }
+      }
+      requestAnimationFrame(tick);
+    });
+  }, { fromY, targetY, durationMs, curve });
+  // Tiny settle so the next atom sees the scroll committed.
+  await page.waitForTimeout(50);
 }
 // ── atomHold ─────────────────────────────────────────────────────────────────

package/src/tools/plan-video-segments.js CHANGED

@@ -94,6 +94,54 @@ function assertNoV5Fields(seg, index) {
   }
 }
+// Transition-mode lint — enforce the "explain block → smooth transition → explain block"
+// pattern the user described:
+//   "先说一句话, 然后再往下滑, 介绍内容 1, 再往下滑, 停住介绍内容 2"
+//
+// Key insight: scroll_to is a TRANSITION between content blocks, not a
+// narration vehicle. It can be short (~500-800ms) — speed doesn't matter,
+// only smoothness. Long narration happens during hold, not during scroll.
+//
+// Rule (single rule): every non-opening segment MUST start with a scroll_to.
+// This guarantees a visible transition from the previous segment's anchor
+// to the new content block. Without this, an agent can string back-to-back
+// hold-only segments and the viewer just sees jump cuts in audio with no
+// page movement.
+//
+// What's NOT enforced anymore:
+//   - scroll_to duration_ms is not bounded — short transitions (500ms) and
+//     longer ones (2s+) are both fine. Smoothness comes from atomScrollTo's
+//     RAF-based programmatic implementation, not from duration.
+//   - hold duration_ms is not bounded — long holds (3-5s) are the normal
+//     case (this is where the agent narrates the current block).
+function validateReadingFlow(operations, segmentIndex) {
+  // Opening hook segment is exempt — first segment may legitimately be
+  // a fully static hero shot (e.g. "校招，实习岗位更新，速投" over a poster).
+  if (segmentIndex === 0) return;
+  const ops = Array.isArray(operations) ? operations : [];
+  if (ops.length === 0) return;
+  // The first op of a non-opening segment must be a scroll_to (the
+  // transition into this block's content). All-hold segments produce
+  // back-to-back jump cuts with no visible page movement, which the user
+  // has explicitly rejected.
+  const first = ops[0];
+  if (first?.atom !== 'scroll_to') {
+    const err = new Error(
+      `transition_required: segments[${segmentIndex}] must start with a scroll_to atom — `
+      + 'this is the smooth transition from the previous block to this one. '
+      + `Got first atom "${first?.atom ?? 'none'}". All-hold segments produce jump cuts. `
+      + 'Fix: prepend a scroll_to(target_y=<new block top>, duration_ms=500~1000) before '
+      + 'the hold. The scroll can be short (~600ms is fine); what matters is that the '
+      + "page visibly slides — atomScrollTo's programmatic mode handles smoothness.",
+    );
+    err.code = 'TRANSITION_REQUIRED';
+    throw err;
+  }
+}
 // Process operations[]: expand "fill" on the last hold, validate atom shape.
 function processOperations(operations, audioDurationMs, segmentIndex) {
   if (!Array.isArray(operations) || operations.length === 0) {
@@ -165,6 +213,7 @@ function processOperations(operations, audioDurationMs, segmentIndex) {
     }
     sum += n;
   }
+  validateReadingFlow(expanded, segmentIndex);
   return { operations: expanded, durationSumMs: Math.round(sum) };
 }