@lightcone-ai/daemon 0.22.1 → 0.23.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,212 @@
1
+ // Page-operation atoms. The closed primitive set plan.sections.operations[]
2
+ // decomposes into. Adding a new atom is a code-level decision; composing new
3
+ // behaviors is data-level (a new operations sequence in the plan).
4
+ //
5
+ // Each atom returns { anchorY } so plan-executor can track where the camera
6
+ // landed (used by subsequent scroll_to's from_y if not explicitly set).
7
+ // anchorY=null for non-scrolling atoms (hold, cursor_focus) — caller keeps
8
+ // the previous anchor.
9
+ //
10
+ // Layout convention: atoms take (page, ctx, params). ctx carries shared
11
+ // resources (CDP session) so atoms don't repeat the page.context dance.
12
+ // plan-executor builds ctx once per section.
13
+
14
+ import { humanizedScroll } from '../humanized-scroll.js';
15
+ import { getCdpSession } from '../cdp-touch.js';
16
+
17
// Read the page's current vertical scroll offset.
//
// Evaluates inside the page: takes document.scrollingElement (falling back to
// document.documentElement) and returns its scrollTop rounded to an integer.
//
// page — handle exposing evaluate(fn); presumably a Playwright Page (confirm).
// Returns a promise for the rounded scrollTop.
async function readScrollY(page) {
  const readRoundedScrollTop = () => {
    const scroller = document.scrollingElement || document.documentElement;
    return Math.round(scroller.scrollTop);
  };
  return page.evaluate(readRoundedScrollTop);
}
23
+
24
// ── atomScrollTo ─────────────────────────────────────────────────────────────
// Scroll from a start position to target_y, budgeted by duration_ms.
//
// Mode selection (the bit that took an end-to-end failure to learn):
//   - 'touch': delegates to humanizedScroll → CDP touch. Produces natural
//     gesture physics (rubber-band, inertia), but for large, fast scrolls the
//     multi-segment swipe gets broken up into rapid micro-flings that
//     interfere with each other, and the page often ends up nowhere near the
//     intended target_y. Safe for distances that fit in a single ~260px
//     finger swipe, or for slow scrolls where each segment can settle.
//   - 'programmatic': delegates to programmaticScroll, which sets the scroll
//     position directly. Lacks touch physics, but reliable.
//   - 'auto' (default, and the fallback for any unrecognized mode value):
//     picks 'touch' when distance < 240px OR average velocity < 500 px/s,
//     'programmatic' otherwise.
//
// Discovered the hard way: scroll_to_dwell macro with ~18% transition
// for 1100+ px distances dispatched 5 CDP swipes in ~1s; each touchEnd
// kicked off a fling that the next touchStart immediately cancelled, so
// the cumulative scroll never reached target.
//
// Params:
//   target_y     — absolute Y in page coordinates (required; null/blank is
//                  rejected rather than silently coerced to 0)
//   duration_ms  — total animation time (required, must be > 0)
//   curve        — 'easeInOutQuad' (default) | 'linear' | 'easeOutQuad';
//                  forwarded to humanizedScroll as motion_curve
//   jitter_px    — per-step vertical jitter (default 2, clamped to >= 0)
//   from_y       — explicit start position; if omitted (null/undefined/blank)
//                  the current scrollY is read from the page
//   mode         — 'auto' (default) | 'programmatic' | 'touch'
//
// Returns { anchorY } where anchorY is the rounded target_y.
// Throws Error when target_y is missing/non-numeric or duration_ms is not a
// positive number.
export async function atomScrollTo(page, _ctx, {
  target_y,
  duration_ms,
  curve = 'easeInOutQuad',
  jitter_px = 2,
  from_y = null,
  mode = 'auto',
} = {}) {
  // Number(null) and Number('') are both 0, so plain Number() coercion would
  // mistake an omitted value for an explicit 0. Map "missing" to NaN first.
  const toNumber = (value) => {
    if (value == null) return Number.NaN;
    if (typeof value === 'string' && value.trim() === '') return Number.NaN;
    return Number(value);
  };

  const targetY = toNumber(target_y);
  if (!Number.isFinite(targetY)) {
    throw new Error('atom_scroll_to: target_y is required');
  }
  const durationMs = toNumber(duration_ms);
  if (!Number.isFinite(durationMs) || durationMs <= 0) {
    throw new Error('atom_scroll_to: duration_ms must be positive');
  }

  // Only trust from_y when the caller actually supplied a number (0 included);
  // otherwise measure where the page currently is.
  const explicitFromY = toNumber(from_y);
  const fromY = Number.isFinite(explicitFromY)
    ? explicitFromY
    : await readScrollY(page);
  const distance = Math.abs(targetY - fromY);

  // Auto-mode heuristic: touch works for short distances (single segment, no
  // inter-segment fling interference) OR slow velocities (each segment has
  // time to settle before the next starts). Fast long-distance scrolls fall
  // back to programmatic.
  // Thresholds chosen empirically against the v4 failure mode (~1100px in
  // ~1000ms → ~1100 px/s, fling-interrupt-fling, page never reached target).
  const touchMaxDistancePx = 240;
  const touchMaxVelocityPxPerSec = 500;
  const velocity = (distance / durationMs) * 1000; // px/s; durationMs > 0 is guaranteed above
  const isExplicitMode = mode === 'programmatic' || mode === 'touch';
  const autoMode = distance < touchMaxDistancePx || velocity < touchMaxVelocityPxPerSec
    ? 'touch'
    : 'programmatic';
  const resolvedMode = isExplicitMode ? mode : autoMode;

  const jitterPx = Math.max(0, Number(jitter_px) || 0);

  if (resolvedMode === 'touch') {
    await humanizedScroll(page, {
      from_y: fromY,
      to_y: targetY,
      duration_ms: durationMs,
      motion_curve: curve,
      pixel_jitter_px: jitterPx,
    });
  } else {
    await programmaticScroll(page, {
      fromY,
      targetY,
      durationMs,
      curve,
      jitterPx,
    });
  }
  return { anchorY: Math.round(targetY) };
}
104
+
105
// Programmatic scroll: snaps the page straight to targetY (instant, no
// animation) and then dwells briefly so the page can settle before the next
// atom starts.
//
// History, per the original notes: a JS-driven setTimeout easing loop was
// tried first, but on JS-heavy article pages timers get badly throttled
// (60Hz timers can stretch to 150-200ms), turning a 1s transition into 5-8s.
//
// The wait is Node-side, so the section is capped at a fraction of durationMs
// regardless of what happens in the page. We deliberately do NOT wait for an
// in-page scrollend event.
//
// Params:
//   fromY       — start position; not needed for the snap, only echoed into
//                 the diagnostics as requested_from
//   targetY     — absolute Y to snap to
//   durationMs  — caller's time budget; the dwell is max(80, 30% of it) ms,
//                 or 80ms when durationMs is not a finite number
//
// Returns the in-page diagnostics object:
//   { requested_target, requested_from, before, after, errors, maxScroll, innerHeight }
// where before/after list each scroll candidate's position around the snap.
async function programmaticScroll(page, {
  fromY,
  targetY,
  durationMs,
} = {}) {
  // Try every plausible scroll target — mobile article pages sometimes have
  // a fixed-position outer body and scroll happens on an inner container.
  // We dispatch to all candidates and let whichever one is actually the
  // scroller win.
  const diag = await page.evaluate((input) => {
    // scrollingElement is commonly the same node as documentElement or body;
    // the Set keeps each element once so it is snapped and reported once.
    const elements = [
      document.scrollingElement,
      document.documentElement,
      document.body,
    ].filter(Boolean);
    const candidates = [...new Set(elements), window];

    const tagOf = (c) => (c === window ? 'window' : c.tagName);
    const snapshot = () => candidates.map((c) => ({
      tag: tagOf(c),
      y: c === window ? window.scrollY : c.scrollTop,
    }));

    const before = snapshot();

    // Hard snap to target on every candidate (instant, no animation).
    // Best effort: a failing candidate must not stop the others, but the
    // failure is recorded instead of vanishing.
    const errors = [];
    for (const c of candidates) {
      try {
        if (c === window) window.scrollTo(0, input.targetY);
        else c.scrollTop = input.targetY;
      } catch (err) {
        errors.push({ tag: tagOf(c), message: String(err?.message ?? err) });
      }
    }

    const after = snapshot();

    return {
      requested_target: input.targetY,
      requested_from: input.fromY,
      before,
      after,
      errors,
      maxScroll: document.documentElement?.scrollHeight,
      innerHeight: window.innerHeight,
    };
  }, { fromY, targetY });

  // Brief dwell to let the page settle the snap before next atom starts.
  // Guard against NaN so waitForTimeout never receives a non-number.
  const dwellMs = Number.isFinite(durationMs)
    ? Math.max(80, Math.round(durationMs * 0.3))
    : 80;
  await page.waitForTimeout(dwellMs);

  return diag;
}
162
+
163
// ── atomHold ─────────────────────────────────────────────────────────────────
// Pure pause: keep the page where it is for duration_ms. No motion and no
// cursor update. The dominant choice for "voice-over introducing what's on
// screen" segments: scroll to the anchor, then hold for the rest of the audio.
//
// Params:
//   duration_ms — pause length; rounded to whole milliseconds. Non-numeric,
//                 zero or negative values result in no wait at all.
//
// Returns { anchorY: null } — a hold never moves the scroll anchor.
export async function atomHold(page, _ctx, { duration_ms } = {}) {
  const requestedMs = Number(duration_ms) || 0;
  const holdMs = Math.max(0, Math.round(requestedMs));
  if (holdMs > 0) {
    await page.waitForTimeout(holdMs);
  }
  return { anchorY: null };
}
172
+
173
// ── atomCursorFocus ──────────────────────────────────────────────────────────
// Move the cursor to (x, y) in viewport coordinates and hold it there.
// Used for short pages where there's nothing to scroll to — the cursor
// shifting between visual focal points carries the rhythm instead.
//
// The move is sent as a CDP `Input.dispatchMouseEvent` with type='mouseMoved';
// per the original notes this goes through the real input pipeline so the
// cursor renders on the captured video.
//
// Params:
//   ctx          — shared context; ctx.cdp is used when present, otherwise a
//                  session is obtained via getCdpSession(page)
//   x, y         — viewport coordinates (required; rounded to integers).
//                  null/blank is rejected rather than silently coerced to 0.
//   duration_ms  — how long to hold after the move (required, must be > 0)
//
// Returns { anchorY: null } — the scroll anchor is unaffected.
// Throws Error when x/y are missing or non-numeric, or duration_ms is not a
// positive number.
export async function atomCursorFocus(page, ctx, { x, y, duration_ms } = {}) {
  // Number(null) and Number('') are both 0, so plain Number() coercion would
  // accept a missing coordinate as 0. Map "missing" to NaN first.
  const toNumber = (value) => {
    if (value == null) return Number.NaN;
    if (typeof value === 'string' && value.trim() === '') return Number.NaN;
    return Number(value);
  };

  const cursorX = toNumber(x);
  const cursorY = toNumber(y);
  if (!Number.isFinite(cursorX) || !Number.isFinite(cursorY)) {
    throw new Error('atom_cursor_focus: x and y are required');
  }
  const holdMs = toNumber(duration_ms);
  if (!Number.isFinite(holdMs) || holdMs <= 0) {
    throw new Error('atom_cursor_focus: duration_ms must be positive');
  }

  const cdp = ctx?.cdp ?? await getCdpSession(page);
  await cdp.send('Input.dispatchMouseEvent', {
    type: 'mouseMoved',
    x: Math.round(cursorX),
    y: Math.round(cursorY),
    button: 'none',
    modifiers: 0,
  });
  await page.waitForTimeout(Math.max(0, Math.round(holdMs)));
  return { anchorY: null };
}
198
+
199
// Public atom registry. Plan-executor looks up operation.atom in this map.
// V6 is closed: the agent composes operations[] from these three; if a new
// page interaction is genuinely needed, a new atom gets added — never a macro.
//
// micro_oscillate was deleted in V6 — anchor sections dwell cleanly via `hold`
// when the voice-over content corresponds to what's visible; "natural reader
// motion" during dwell is the wrong design.
//
// The entries are listed once, in registry order, and both frozen exports
// (the name → atom map and the list of names) are derived from that list.
const ATOM_ENTRIES = [
  ['scroll_to', atomScrollTo],
  ['hold', atomHold],
  ['cursor_focus', atomCursorFocus],
];

export const ATOMS = Object.freeze(Object.fromEntries(ATOM_ENTRIES));

export const ATOM_NAMES = Object.freeze(ATOM_ENTRIES.map(([name]) => name));
@@ -7,7 +7,7 @@ import path from 'node:path';
7
7
  import { launchChromiumMobile, openPageAndSettle } from './chromium-driver.js';
8
8
  import { defaultDisplayPool } from './display-pool.js';
9
9
  import { createUnexpectedExitWatcher, waitForProcessExit } from './ffmpeg-runner.js';
10
- import { executePlanPhases, normalizePlanPhases } from './plan-executor.js';
10
+ import { executePlanPhases, normalizePlanSections } from './plan-executor.js';
11
11
 
12
12
  const DEFAULT_VIEWPORT = Object.freeze({ width: 1080, height: 1920 });
13
13
  const DEFAULT_FPS = 30;
@@ -152,24 +152,41 @@ async function scrollToTop(page) {
152
152
  });
153
153
  }
154
154
 
155
- function scalePhaseY(phase, zoom) {
156
- if (!phase || typeof phase !== 'object') return phase;
157
- const scale = (v) => (Number.isFinite(Number(v)) ? Math.round(Number(v) * zoom) : v);
158
- const scaledVisualAction = phase.visual_action && typeof phase.visual_action === 'object'
159
- ? {
160
- ...phase.visual_action,
161
- ...(phase.visual_action.target_y != null ? { target_y: scale(phase.visual_action.target_y) } : {}),
162
- ...(phase.visual_action.to_y != null ? { to_y: scale(phase.visual_action.to_y) } : {}),
163
- ...(phase.visual_action.from_y != null ? { from_y: scale(phase.visual_action.from_y) } : {}),
164
- }
165
- : phase.visual_action;
166
- return {
167
- ...phase,
168
- ...(phase.target_y != null ? { target_y: scale(phase.target_y) } : {}),
169
- ...(phase.to_y != null ? { to_y: scale(phase.to_y) } : {}),
170
- ...(phase.from_y != null ? { from_y: scale(phase.from_y) } : {}),
171
- ...(scaledVisualAction !== phase.visual_action ? { visual_action: scaledVisualAction } : {}),
172
- };
155
// Preheat: slow-scroll to the bottom of the page to trigger lazy-loading
// (images / late content), then jump back to the top. Both analyze_page and
// record_url_narration run this so the y coordinates in page_understanding
// match what the record browser sees. Without preheat, lazy images haven't
// loaded, the page is shorter, and scroll targets land on the wrong content.
//
// Params:
//   fullHeightPx — optional expected page height. When it is a finite number
//                  greater than 0, the scroll goes at least that far; otherwise
//                  the height measured when the preheat starts is used.
//
// In-page sequence: scroll in 600px strides with a 180ms pause after each,
// pause 500ms at the bottom, scroll to 0, pause 300ms. Afterwards waits up
// to 4s for the 'networkidle' load state; a failure of that wait is ignored.
async function preheatPage(page, { fullHeightPx } = {}) {
  const heightHint = Number.isFinite(fullHeightPx) ? fullHeightPx : null;

  // The callback is evaluated inside the page, so it must be self-contained:
  // every constant and helper it needs is declared within it.
  await page.evaluate(async ({ targetHeight }) => {
    const STRIDE_PX = 600;
    const STRIDE_PAUSE_MS = 180;
    const pause = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

    const scroller = document.scrollingElement || document.documentElement;
    const measuredHeight = Math.max(
      scroller.scrollHeight,
      document.body?.scrollHeight || 0,
      window.innerHeight,
    );

    // A usable hint can only extend the sweep, never shorten it.
    let bottom = measuredHeight;
    if (Number.isFinite(targetHeight) && targetHeight > 0) {
      bottom = Math.max(targetHeight, measuredHeight);
    }

    for (let offset = 0; offset < bottom;) {
      offset = Math.min(bottom, offset + STRIDE_PX);
      window.scrollTo(0, offset);
      await pause(STRIDE_PAUSE_MS);
    }
    await pause(500);

    window.scrollTo(0, 0);
    await pause(300);
  }, { targetHeight: heightHint });

  // Wait for any newly loaded images to settle (best effort).
  try {
    await page.waitForLoadState('networkidle', { timeout: 4000 });
  } catch {
    /* ignore */
  }
}
174
191
 
175
192
  // Re-encode the page recording (webm, page content only — no browser chrome) into
@@ -326,22 +343,26 @@ function normalizeOutputPaths(rawList) {
326
343
 
327
344
  export async function recordUrlNarration({
328
345
  plan,
346
+ // page_understanding (V6) — required for safety-region validation and to
347
+ // align preheat strategy with what analyze_page used. The recorder's plan-
348
+ // executor checks scroll_to.y / cursor_focus.y against pageUnderstanding.unsafe_regions
349
+ // before execution. Without it, scroll_to.y is unbounded and unsafe.
350
+ page_understanding,
351
+ pageUnderstanding = page_understanding,
329
352
  output_path,
330
353
  outputPath = output_path,
331
354
  events_path,
332
355
  eventsPath = events_path,
333
- // Multi-section output: pass an array of N paths matching plan.sections length
334
- // to record once continuously and slice the result into N per-section mp4s.
335
- // The browser stays open for the whole recording, so visuals flow naturally
336
- // between sections (no scroll-back-to-top between each, no page reload). When
337
- // omitted, behaves exactly like before — single mp4 at outputPath.
356
+ // Multi-section output: array of N paths matching plan.sections length.
357
+ // Recorder captures once continuously and slices into N per-section mp4s,
358
+ // so visuals flow naturally between sections (no scroll-back-to-top, no
359
+ // page reload).
338
360
  output_paths,
339
361
  outputPaths = output_paths,
340
362
  url,
341
363
  viewport = DEFAULT_VIEWPORT,
342
364
  fps = DEFAULT_FPS,
343
365
  settle_ms = 4000,
344
- page_zoom = 1.1,
345
366
  displayPool = defaultDisplayPool,
346
367
  startupProbeMs = 1200,
347
368
  xvfbStopTimeoutMs = 5000,
@@ -353,12 +374,19 @@ export async function recordUrlNarration({
353
374
  cutFn = cutMp4Slice,
354
375
  nowMs = () => Date.now(),
355
376
  } = {}) {
356
- const zoom = Number.isFinite(Number(page_zoom)) && Number(page_zoom) > 0 ? Number(page_zoom) : 1.1;
357
- const rawPhases = normalizePlanPhases(plan);
358
- const phases = zoom !== 1.0 ? rawPhases.map(p => scalePhaseY(p, zoom)) : rawPhases;
377
+ if (!pageUnderstanding || typeof pageUnderstanding !== 'object' || Array.isArray(pageUnderstanding)) {
378
+ const error = new Error(
379
+ 'page_understanding_required: V6 record_url_narration requires page_understanding (from analyze_page) '
380
+ + 'for safe-region validation and preheat consistency. No silent default — refusing rather than running blind.',
381
+ );
382
+ error.code = 'PAGE_UNDERSTANDING_REQUIRED';
383
+ throw error;
384
+ }
385
+
386
+ const sections = normalizePlanSections(plan);
359
387
  const executablePlan = {
360
388
  ...(plan && typeof plan === 'object' ? plan : {}),
361
- phases,
389
+ sections,
362
390
  };
363
391
 
364
392
  const resolvedOutputPath = resolveOutputPath(outputPath);
@@ -378,9 +406,9 @@ export async function recordUrlNarration({
378
406
  error.code = 'OUTPUT_PATHS_REQUIRED';
379
407
  throw error;
380
408
  }
381
- if (resolvedOutputPaths.length !== phases.length) {
409
+ if (resolvedOutputPaths.length !== sections.length) {
382
410
  const error = new Error(
383
- `output_paths_count_mismatch:expected=${phases.length}:got=${resolvedOutputPaths.length}`,
411
+ `output_paths_count_mismatch:expected=${sections.length}:got=${resolvedOutputPaths.length}`,
384
412
  );
385
413
  error.code = 'OUTPUT_PATHS_COUNT_MISMATCH';
386
414
  throw error;
@@ -438,11 +466,13 @@ export async function recordUrlNarration({
438
466
  settleMs: settle_ms,
439
467
  });
440
468
 
441
- if (zoom !== 1.0) {
442
- await browserSession.page.evaluate((z) => {
443
- document.documentElement.style.zoom = String(z);
444
- }, zoom);
445
- await browserSession.page.waitForTimeout(300);
469
+ // Preheat to match analyze_page's page state — triggers lazy-loaded
470
+ // images so y coordinates stabilize at the same values the analyzer saw.
471
+ // Runs only when pageUnderstanding.preheat_strategy === 'full_scroll_then_top'; any other value (e.g. 'none', or the field being absent) skips preheat.
472
+ if (pageUnderstanding.preheat_strategy === 'full_scroll_then_top') {
473
+ await preheatPage(browserSession.page, {
474
+ fullHeightPx: pageUnderstanding.full_height_px,
475
+ });
446
476
  }
447
477
 
448
478
  await scrollToTop(browserSession.page);
@@ -451,7 +481,7 @@ export async function recordUrlNarration({
451
481
  const headTrimMs = Math.max(0, nowMs() - recordStartedAt);
452
482
 
453
483
  const eventsLog = await Promise.race([
454
- executePlanPhases(browserSession.page, executablePlan),
484
+ executePlanPhases(browserSession.page, executablePlan, { pageUnderstanding }),
455
485
  xvfbWatcher.promise,
456
486
  ]);
457
487
 
@@ -510,7 +540,7 @@ export async function recordUrlNarration({
510
540
  // phase_start / phase_end events). All slices come from the SAME
511
541
  // continuous recording, so the visual flow between sections stays
512
542
  // natural — no browser reload, no scroll-back-to-top per segment.
513
- const cutPoints = deriveSectionCutPoints(eventsLog, phases.length);
543
+ const cutPoints = deriveSectionCutPoints(eventsLog, sections.length);
514
544
  const sectionOutputs = [];
515
545
  for (let i = 0; i < cutPoints.length; i += 1) {
516
546
  const cut = cutPoints[i];