@lightcone-ai/daemon 0.23.4 → 0.23.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -257,13 +257,42 @@ server.tool(
257
257
  }).optional().describe('Optional presentation hints (style only). duration/per_card_duration are computed.'),
258
258
  operations: z.array(z.object({
259
259
  atom: z.enum(['scroll_to', 'hold', 'cursor_focus']),
260
- duration_ms: z.union([z.number(), z.literal('fill')]).describe('Atom duration. "fill" allowed only on the LAST hold to auto-fill remaining audio time.'),
260
+ duration_ms: z.union([z.number(), z.literal('fill')]).describe('Atom duration in ms. "fill" allowed only on the LAST hold to auto-fill remaining audio time.'),
261
261
  y: z.number().optional(),
262
262
  x: z.number().optional(),
263
263
  curve: z.enum(['easeInOutQuad', 'linear', 'easeOutQuad']).optional(),
264
264
  mode: z.enum(['auto', 'touch', 'programmatic']).optional(),
265
265
  jitter_px: z.number().optional(),
266
- })).optional().describe('For visual_kind=video URL recording sections: ordered atom sequence. Sum of duration_ms must equal audio_duration_ms (±200ms); use "fill" on the last hold to auto-balance.'),
266
+ })).optional().describe(
267
+ 'For visual_kind=video URL recording sections: ordered atom sequence. Sum of duration_ms '
268
+ + 'must equal audio_duration_ms (±200ms); use "fill" on the last hold to auto-balance.\n\n'
269
+ + 'TRANSITION + EXPLAIN MODE (REQUIRED — enforced by lint): the recording should feel '
270
+ + 'like a person opening a page and walking the viewer through it block by block. '
271
+ + 'Concretely:\n'
272
+ + ' • scroll_to is a TRANSITION between content blocks — short (~500-800ms is fine), '
273
+ + 'smooth (atomScrollTo programmatic mode handles smoothness automatically; speed does NOT need to be slow).\n'
274
+ + ' • hold is where the NARRATION happens — long holds (2-5s) are the norm, not the exception. '
275
+ + 'This is when the agent says the actual sentences about this block.\n'
276
+ + " • Every non-opening segment MUST start with a scroll_to (the transition into this segment's "
277
+ + 'content block). Segments starting with hold are REJECTED — they cause jump cuts.\n'
278
+ + ' • The shape is: "scroll to new block → pause and explain → scroll to next block → pause and explain".\n\n'
279
+ + 'GOOD example for a 5s segment narrating "宁波银行金融科技部 FinTech 暑期专项":\n'
280
+ + ' [\n'
281
+ + ' { atom: "scroll_to", y: 280, duration_ms: 700 }, // 0.7s smooth transition to title\n'
282
+ + ' { atom: "hold", duration_ms: "fill" }, // ~4.3s: agent narrates this block\n'
283
+ + ' ]\n\n'
284
+ + 'GOOD example for a 9s segment with two content blocks inside:\n'
285
+ + ' [\n'
286
+ + ' { atom: "scroll_to", y: 980, duration_ms: 700 }, // transition to first block\n'
287
+ + ' { atom: "hold", duration_ms: 4000 }, // narrate this block (~"金融产品应用开发岗 …")\n'
288
+ + ' { atom: "scroll_to", y: 1450, duration_ms: 600 }, // short transition to next block\n'
289
+ + ' { atom: "hold", duration_ms: "fill" }, // narrate next block (~3.7s)\n'
290
+ + ' ]\n\n'
291
+ + 'BAD example (REJECTED by transition_required):\n'
292
+ + ' [\n'
293
+ + ' { atom: "hold", duration_ms: 5000 }, // segment starts with hold ← rejected\n'
294
+ + ' ]',
295
+ ),
267
296
  })).describe('Segments to plan. audio_path is required for each. V5 fields (action, target_y, target_y_content_label, focus_region, transition_ms, dwell_ms, phase.beats[]) are rejected.'),
268
297
  },
269
298
  async ({ segments }) => {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@lightcone-ai/daemon",
3
- "version": "0.23.4",
3
+ "version": "0.23.6",
4
4
  "type": "module",
5
5
  "main": "src/index.js",
6
6
  "bin": {
@@ -53,7 +53,7 @@ export async function atomScrollTo(page, _ctx, {
53
53
  target_y,
54
54
  duration_ms,
55
55
  curve = 'easeInOutQuad',
56
- jitter_px = 2,
56
+ jitter_px = 0, // 不要微动 — 用户反复明确要求
57
57
  from_y = null,
58
58
  mode = 'auto',
59
59
  } = {}) {
@@ -70,17 +70,21 @@ export async function atomScrollTo(page, _ctx, {
70
70
  const durationMs = Number(duration_ms);
71
71
  const distance = Math.abs(targetY - fromY);
72
72
 
73
- // Auto-mode heuristic: touch works for short distances (single segment, no
74
- // inter-segment fling interference) OR slow velocities (each segment has
75
- // time to settle before the next starts). Fast long-distance scrolls fall
76
- // back to programmatic, where the easing loop drives root.scrollTo
77
- // deterministically.
78
- // Thresholds chosen empirically against the v4 failure mode (~1100px in
79
- // ~1000ms ≈ ~1100 px/s, fling-interrupt-fling, page never reached target).
80
- const velocity = durationMs > 0 ? (distance / durationMs) * 1000 : 0; // px/s
73
+ // Auto-mode: default to programmatic (RAF-driven smooth scroll). The touch
74
+ // path uses humanizedScroll which splits any scroll > 260px into multiple
75
+ // CDP swipes, each with ±18-26px random horizontal nudge and fling-cancel-
76
+ // fling boundaries that look like "颤抖着分多次拨", not a clean slide.
77
+ // User feedback is unambiguous: scroll must be a smooth transition between
78
+ // content blocks, not a teleport (instant snap) and not a wobble (multi-
79
+ // segment touch with horizontal drift). Programmatic with RAF achieves
80
+ // both — every frame moves, vertical only, no inter-segment pauses.
81
+ // Touch mode remains available via explicit `mode: 'touch'` for callers
82
+ // that specifically want gesture physics.
83
+ const velocity = durationMs > 0 ? (distance / durationMs) * 1000 : 0; // px/s (kept for diagnostics)
84
+ void velocity;
81
85
  const resolvedMode = mode === 'programmatic' || mode === 'touch'
82
86
  ? mode
83
- : (distance < 240 || velocity < 500 ? 'touch' : 'programmatic');
87
+ : 'programmatic';
84
88
 
85
89
  if (resolvedMode === 'touch') {
86
90
  await humanizedScroll(page, {
@@ -96,68 +100,75 @@ export async function atomScrollTo(page, _ctx, {
96
100
  targetY,
97
101
  durationMs,
98
102
  curve,
99
- jitterPx: Math.max(0, Number(jitter_px) || 0),
100
103
  });
101
104
  }
102
105
  return { anchorY: Math.round(targetY) };
103
106
  }
104
107
 
105
- // Programmatic scroll: hands the animation off to the browser's native
106
- // scroll engine via `scroll-behavior: smooth`. That way the easing runs on
107
- // the compositor thread at the display refresh rate, independent of how
108
- // busy the page's JS is. The JS-driven setTimeout approach we tried first
109
- // gets badly throttled on JS-heavy article pages (60Hz timers can stretch
110
- // to 150-200ms), turning a 1s transition into 5-8s.
108
+ // Programmatic scroll: JS-driven RAF loop that incrementally updates the
109
+ // scroll position frame-by-frame over `durationMs`. This produces an actual
110
+ // smooth scroll the viewer sees in the recording — the previous version
111
+ // did a hard instant snap and then a static wait, which looked like a
112
+ // teleport ("跳一下然后定格"), not like a person sliding a page.
111
113
  //
112
- // The wait is Node-side, so even if the in-page scrollend never fires we
113
- // still cap the section at durationMs and move on. We deliberately do NOT
114
- // wait for scrollend — empirically faster than dispatching the event on
115
- // page-heavy mobile sites.
114
+ // Why not native `scroll-behavior: smooth` or `scrollTo({behavior:'smooth'})`?
115
+ // In Playwright + a headless mobile context, native smooth-scroll often
116
+ // gets capped to a fixed short duration (~300-500ms) regardless of distance,
117
+ // or is throttled by the page's own scroll logic. We need a duration we
118
+ // control end-to-end.
119
+ //
120
+ // Frame loop runs inside page.evaluate so it stays in lockstep with the
121
+ // page's render thread — important when recordVideo is capturing 30fps.
116
122
  async function programmaticScroll(page, {
117
123
  fromY,
118
124
  targetY,
119
125
  durationMs,
126
+ curve = 'easeInOutQuad',
120
127
  } = {}) {
121
- // Try every plausible scroll target — mobile article pages sometimes have
122
- // a fixed-position outer body and scroll happens on an inner container.
123
- // We dispatch to all candidates and let whichever one is actually the
124
- // scroller win. Returns the diagnostics so we can debug when the page
125
- // refuses to scroll.
126
- const diag = await page.evaluate((input) => {
127
- const candidates = [];
128
- if (document.scrollingElement) candidates.push(document.scrollingElement);
129
- if (document.documentElement) candidates.push(document.documentElement);
130
- if (document.body) candidates.push(document.body);
131
- candidates.push(window);
132
-
133
- const before = candidates.map((c) => {
134
- if (c === window) return { tag: 'window', y: window.scrollY };
135
- return { tag: c.tagName, y: c.scrollTop };
136
- });
137
-
138
- // Hard snap to target on every candidate (instant, no animation).
139
- for (const c of candidates) {
140
- try {
141
- if (c === window) window.scrollTo(0, input.targetY);
142
- else { c.scrollTop = input.targetY; }
143
- } catch { /* ignore */ }
128
+ await page.evaluate(async (input) => {
129
+ function pickScroller() {
130
+ if (document.scrollingElement) return document.scrollingElement;
131
+ if (document.documentElement) return document.documentElement;
132
+ return document.body;
144
133
  }
134
+ function easeInOutQuad(t) { return t < 0.5 ? 2 * t * t : 1 - Math.pow(-2 * t + 2, 2) / 2; }
135
+ function easeOutQuad(t) { return 1 - (1 - t) * (1 - t); }
136
+ function linear(t) { return t; }
137
+ const ease = input.curve === 'linear' ? linear
138
+ : input.curve === 'easeOutQuad' ? easeOutQuad
139
+ : easeInOutQuad;
145
140
 
146
- const after = candidates.map((c) => {
147
- if (c === window) return { tag: 'window', y: window.scrollY };
148
- return { tag: c.tagName, y: c.scrollTop };
149
- });
141
+ const scroller = pickScroller();
142
+ const startY = (scroller === document.scrollingElement || scroller === document.documentElement)
143
+ ? scroller.scrollTop : window.scrollY;
144
+ const delta = input.targetY - startY;
145
+ const start = performance.now();
150
146
 
151
- return {
152
- requested_target: input.targetY,
153
- before, after,
154
- maxScroll: document.documentElement?.scrollHeight,
155
- innerHeight: window.innerHeight,
156
- };
157
- }, { fromY, targetY });
158
- void diag;
159
- // Brief dwell to let the page settle the snap before next atom starts.
160
- await page.waitForTimeout(Math.max(80, Math.round(durationMs * 0.3)));
147
+ return new Promise((resolve) => {
148
+ function tick(now) {
149
+ const elapsed = now - start;
150
+ const t = Math.min(1, elapsed / input.durationMs);
151
+ const y = startY + delta * ease(t);
152
+ try {
153
+ if (scroller === window) window.scrollTo(0, y);
154
+ else { scroller.scrollTop = y; }
155
+ } catch { /* ignore */ }
156
+ if (t < 1) {
157
+ requestAnimationFrame(tick);
158
+ } else {
159
+ // Final snap to exact target (in case of sub-pixel drift).
160
+ try {
161
+ if (scroller === window) window.scrollTo(0, input.targetY);
162
+ else { scroller.scrollTop = input.targetY; }
163
+ } catch { /* ignore */ }
164
+ resolve();
165
+ }
166
+ }
167
+ requestAnimationFrame(tick);
168
+ });
169
+ }, { fromY, targetY, durationMs, curve });
170
+ // Tiny settle so the next atom sees the scroll committed.
171
+ await page.waitForTimeout(50);
161
172
  }
162
173
 
163
174
  // ── atomHold ─────────────────────────────────────────────────────────────────
@@ -94,6 +94,54 @@ function assertNoV5Fields(seg, index) {
94
94
  }
95
95
  }
96
96
 
97
+ // Transition-mode lint — enforce the "explain block → smooth transition → explain block"
98
+ // pattern the user described:
99
+ // "先说一句话, 然后再往下滑, 介绍内容 1, 再往下滑, 停住介绍内容 2"
100
+ //
101
+ // Key insight: scroll_to is a TRANSITION between content blocks, not a
102
+ // narration vehicle. It can be short (~500-800ms) — speed doesn't matter,
103
+ // only smoothness. Long narration happens during hold, not during scroll.
104
+ //
105
+ // Rule (single rule): every non-opening segment MUST start with a scroll_to.
106
+ // This guarantees a visible transition from the previous segment's anchor
107
+ // to the new content block. Without this, an agent can string back-to-back
108
+ // hold-only segments and the viewer just sees jump cuts in audio with no
109
+ // page movement.
110
+ //
111
+ // What's NOT enforced anymore:
112
+ // - scroll_to duration_ms is not bounded — short transitions (500ms) and
113
+ // longer ones (2s+) are both fine. Smoothness comes from atomScrollTo's
114
+ // RAF-based programmatic implementation, not from duration.
115
+ // - hold duration_ms is not bounded — long holds (3-5s) are the normal
116
+ // case (this is where the agent narrates the current block).
117
+
118
+ function validateReadingFlow(operations, segmentIndex) {
119
+ // Opening hook segment is exempt — first segment may legitimately be
120
+ // a fully static hero shot (e.g. "校招,实习岗位更新,速投" over a poster).
121
+ if (segmentIndex === 0) return;
122
+
123
+ const ops = Array.isArray(operations) ? operations : [];
124
+ if (ops.length === 0) return;
125
+
126
+ // The first op of a non-opening segment must be a scroll_to (the
127
+ // transition into this block's content). All-hold segments produce
128
+ // back-to-back jump cuts with no visible page movement, which the user
129
+ // has explicitly rejected.
130
+ const first = ops[0];
131
+ if (first?.atom !== 'scroll_to') {
132
+ const err = new Error(
133
+ `transition_required: segments[${segmentIndex}] must start with a scroll_to atom — `
134
+ + 'this is the smooth transition from the previous block to this one. '
135
+ + `Got first atom "${first?.atom ?? 'none'}". All-hold segments produce jump cuts. `
136
+ + 'Fix: prepend a scroll_to(target_y=<new block top>, duration_ms=500~1000) before '
137
+ + 'the hold. The scroll can be short (~600ms is fine); what matters is that the '
138
+ + "page visibly slides — atomScrollTo's programmatic mode handles smoothness.",
139
+ );
140
+ err.code = 'TRANSITION_REQUIRED';
141
+ throw err;
142
+ }
143
+ }
144
+
97
145
  // Process operations[]: expand "fill" on the last hold, validate atom shape.
98
146
  function processOperations(operations, audioDurationMs, segmentIndex) {
99
147
  if (!Array.isArray(operations) || operations.length === 0) {
@@ -165,6 +213,7 @@ function processOperations(operations, audioDurationMs, segmentIndex) {
165
213
  }
166
214
  sum += n;
167
215
  }
216
+ validateReadingFlow(expanded, segmentIndex);
168
217
  return { operations: expanded, durationSumMs: Math.round(sum) };
169
218
  }
170
219