@lightcone-ai/daemon 0.23.6 → 0.23.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -92,7 +92,19 @@ server.tool(
92
92
  + 'Takes any video produced by compose_video_v2 / record_url_narration / etc. and adds '
93
93
  + 'one or more on-screen title cards with animation presets (fade+zoom pop, per-character karaoke fill). '
94
94
  + 'The default narration subtitle burned by compose_video_v2 stays at the bottom; titles default to the top band so they do not collide. '
95
- + 'Output is a new mp4; original is not modified. Skip this tool entirely when a plain video is desired — not every video needs title effects.',
95
+ + 'Output is a new mp4; original is not modified.\n\n'
96
+ + 'STANDARD opening 引导语 title (URL recruitment short videos — do this by default): the video '
97
+ + 'opens with a 引导语 lead-in section, and that line is rendered HERE as an eye-catching centered '
98
+ + 'title card — NOT as a plain bottom subtitle. Recipe:\n'
99
+ + ' - preset: "karaoke_punch" (per-character fill reads as a real effect; "fade_zoom" is too plain for an opener)\n'
100
+ + ' - position: "center"\n'
101
+ + ' - style: { font_size 110-120 (large), color a vivid "#FFE000"-style, outline_color "#000000" }\n'
102
+ + ' - start_ms: 0, end_ms: the opening section duration — the card clears exactly when the first content section begins.\n'
103
+ + 'Long titles auto-wrap; you may also place an explicit "\\n" for a clean 2-line break.\n'
104
+ + 'The opening section itself must be a FRAMELESS lead-in — its operations use raw `y` (no `block`, '
105
+ + 'so the recorder draws no spotlight) and it carries NO bottom subtitle (subtitle_text empty); the '
106
+ + '引导语 appears only as this centered card.\n\n'
107
+ + 'Skip this tool when a plain video with no title cards is desired.',
96
108
  {
97
109
  input_path: z.string().min(1).describe('Absolute path to the source mp4 (e.g. the output of compose_video_v2).'),
98
110
  output_path: z.string().optional().describe('Optional absolute output path. If omitted, writes to a tmp path and returns it.'),
@@ -258,7 +270,8 @@ server.tool(
258
270
  operations: z.array(z.object({
259
271
  atom: z.enum(['scroll_to', 'hold', 'cursor_focus']),
260
272
  duration_ms: z.union([z.number(), z.literal('fill')]).describe('Atom duration in ms. "fill" allowed only on the LAST hold to auto-fill remaining audio time.'),
261
- y: z.number().optional(),
273
+ block: z.string().optional().describe('scroll_to: id of a page_understanding block to frame. The recorder centers it in the viewport. Use this for content sections — do NOT write pixel y.'),
274
+ y: z.number().optional().describe('scroll_to: raw scrollTop. Only for a content-agnostic opening drift — for content blocks use `block` instead.'),
262
275
  x: z.number().optional(),
263
276
  curve: z.enum(['easeInOutQuad', 'linear', 'easeOutQuad']).optional(),
264
277
  mode: z.enum(['auto', 'touch', 'programmatic']).optional(),
@@ -266,32 +279,25 @@ server.tool(
266
279
  })).optional().describe(
267
280
  'For visual_kind=video URL recording sections: ordered atom sequence. Sum of duration_ms '
268
281
  + 'must equal audio_duration_ms (±200ms); use "fill" on the last hold to auto-balance.\n\n'
269
- + 'TRANSITION + EXPLAIN MODE (REQUIRED enforced by lint): the recording should feel '
270
- + 'like a person opening a page and walking the viewer through it block by block. '
271
- + 'Concretely:\n'
272
- + ' scroll_to is a TRANSITION between content blocks — short (~500-800ms is fine), '
273
- + 'smooth (atomScrollTo programmatic mode handles smoothness automatically; speed does NOT need to be slow).\n'
274
- + ' hold is where the NARRATION happens — long holds (2-5s) are the norm, not the exception. '
275
- + 'This is when the agent says the actual sentences about this block.\n'
276
- + " • Every non-opening segment MUST start with a scroll_to (the transition into this segment's "
277
- + 'content block). Segments starting with hold are REJECTED they cause jump cuts.\n'
278
- + ' The shape is: "scroll to new block pause and explain scroll to next block → pause and explain".\n\n'
279
- + 'GOOD example for a 5s segment narrating "宁波银行金融科技部 FinTech 暑期专项":\n'
282
+ + 'Every content section is exactly: scroll_to{block} hold. scroll_to FRAMES A CONTENT '
283
+ + 'BLOCK pass `block: "<id>"` (a block id from page_understanding.blocks) and the recorder '
284
+ + 'CENTERS that block in the viewport. Do NOT write pixel `y` for content blocks; raw `y` is '
285
+ + 'only for a content-agnostic opening drift.\n'
286
+ + ' One section narrates ONE block every scroll_to in a section references the SAME '
287
+ + 'block id (2+ distinct block ids REJECTED: section_spans_multiple_blocks).\n'
288
+ + ' scroll_to is a short TRANSITION between blocks (~500-800ms). hold is where the '
289
+ + 'NARRATION happens and the picture is STILL long holds (2-5s) are the norm.\n'
290
+ + ' A block taller than the viewport just shows its centered slice, HELD STILL. Do NOT '
291
+ + 'pan / slow-scroll through it the picture must not move while you narrate; partial '
292
+ + 'visibility of a tall block is accepted.\n'
293
+ + ' • Every non-opening segment MUST start with a scroll_to (REJECTED otherwise: transition_required).\n\n'
294
+ + 'GOOD — a 5s segment narrating block b2:\n'
280
295
  + ' [\n'
281
- + ' { atom: "scroll_to", y: 280, duration_ms: 700 }, // 0.7s smooth transition to title\n'
282
- + ' { atom: "hold", duration_ms: "fill" }, // ~4.3s: agent narrates this block\n'
296
+ + ' { atom: "scroll_to", block: "b2", duration_ms: 700 }, // 0.7s transition, recorder centers b2\n'
297
+ + ' { atom: "hold", duration_ms: "fill" }, // ~4.3s: narrate b2, picture still\n'
283
298
  + ' ]\n\n'
284
- + 'GOOD example for a 9s segment with two content blocks inside:\n'
285
- + ' [\n'
286
- + ' { atom: "scroll_to", y: 980, duration_ms: 700 }, // transition to first block\n'
287
- + ' { atom: "hold", duration_ms: 4000 }, // narrate this block (~"金融产品应用开发岗 …")\n'
288
- + ' { atom: "scroll_to", y: 1450, duration_ms: 600 }, // short transition to next block\n'
289
- + ' { atom: "hold", duration_ms: "fill" }, // narrate next block (~3.7s)\n'
290
- + ' ]\n\n'
291
- + 'BAD example (REJECTED by transition_required):\n'
292
- + ' [\n'
293
- + ' { atom: "hold", duration_ms: 5000 }, // segment starts with hold ← rejected\n'
294
- + ' ]',
299
+ + 'BAD (REJECTED): a segment starting with hold (transition_required); a segment whose '
300
+ + 'scroll_to ops reference two different blocks (section_spans_multiple_blocks).',
295
301
  ),
296
302
  })).describe('Segments to plan. audio_path is required for each. V5 fields (action, target_y, target_y_content_label, focus_region, transition_ms, dwell_ms, phase.beats[]) are rejected.'),
297
303
  },
@@ -373,22 +379,27 @@ server.tool(
373
379
  );
374
380
 
375
381
  // ── record_url_narration (migrated from chat-bridge) ──────────────────────
376
- // Records a silent mp4 of a URL via Chromium+Xvfb+Playwright recordVideo,
382
+ // Records a silent mp4 of a URL via headless Chromium + Playwright recordVideo,
377
383
  // driven by a beat-by-beat plan. Hard-block: requires plan_video_segments to
378
384
  // have run in this session — hand-written dwell_ms has drifted from TTS
379
385
  // audio in production runs (Tasks #20/#25/#26), forcing re-records.
380
386
  server.tool(
381
387
  'record_url_narration',
382
- 'V6 record_url_narration. Drives Chromium on Xvfb + Playwright recordVideo to capture a silent mp4 per section, then ffmpeg-slices into output_paths. Each mp4 passes to compose_video_v2 as a video-kind segment.\n\n'
388
+ 'V6 record_url_narration. Drives headless Chromium + Playwright recordVideo to capture a silent mp4 per section, then ffmpeg-slices into output_paths. Each mp4 passes to compose_video_v2 as a video-kind segment.\n\n'
383
389
  + 'REQUIRES page_understanding (from analyze_page) — used for safe-region check (scroll_to.y / cursor_focus.y rejected if in unsafe_regions) and preheat alignment (same full-scroll-then-top pre-roll as analyze_page).\n\n'
384
- + 'plan.sections[*].operations[] is the visual beat — each operation is one of three atom calls:\n'
385
- + ' - scroll_to: { y, duration_ms, curve?, mode?, jitter_px? }\n'
390
+ + 'plan.sections[*].operations[] is the visual beat — a content section is scroll_to{block} hold:\n'
391
+ + ' - scroll_to: { block | y, duration_ms, curve?, mode? } — pass `block` (a page_understanding '
392
+ + 'block id) and the recorder CENTERS that block in the viewport, then the section holds STILL '
393
+ + 'on it. A block taller than the viewport shows its centered slice held still (no pan). Raw '
394
+ + '`y` is only for a content-agnostic opening drift.\n'
386
395
  + ' - hold: { duration_ms } — duration_ms="fill" allowed on the LAST hold to auto-balance with audio_duration_ms\n'
387
396
  + ' - cursor_focus: { x, y, duration_ms }\n\n'
397
+ + 'The recorder automatically draws a spotlight highlight (bordered frame + dimmed surround) around '
398
+ + "each section's block once its scroll lands — automatic, no plan field controls it.\n\n"
388
399
  + 'V5 fields are rejected: action / target_y / target_y_content_label / focus_region / transition_ms / dwell_ms (set by plan_video_segments only) / phase.beats[].\n\n'
389
400
  + 'Standard chain: analyze_page → synthesize_tts × N → plan_video_segments → record_url_narration + compose_video_v2.\n\n'
390
401
  + 'ALWAYS pass output_paths as an array with one mp4 path per plan.sections entry (single-section is a 1-element array). The tool records the URL ONCE continuously (one browser session, natural scroll flow across all sections), then ffmpeg-slices at section boundaries. One URL = one call.\n\n'
391
- + 'Runtime: Linux daemon with Xvfb + Chromium + ffmpeg. macOS / Windows fail at startup.',
402
+ + 'Runtime: daemon with Chromium + ffmpeg.',
392
403
  {
393
404
  url: z.string().describe('Page URL to record (must match the URL passed to analyze_page that produced page_understanding).'),
394
405
  page_understanding: z.record(z.any()).describe('Output of analyze_page for this URL. Required. Provides full_height_px / viewport / preheat_strategy / unsafe_regions[] for safety validation, and blocks[] / narrative_arc as informational metadata (the recorder itself only needs the safety bits).'),
@@ -401,12 +412,13 @@ server.tool(
401
412
  operations: z.array(z.object({
402
413
  atom: z.enum(['scroll_to', 'hold', 'cursor_focus']),
403
414
  duration_ms: z.number().describe('Atom duration in ms. (plan_video_segments may have expanded a "fill" value already.)'),
404
- y: z.number().optional(),
415
+ block: z.string().optional().describe('scroll_to: page_understanding block id to frame. Recorder centers it and the section holds still. Use for content sections instead of pixel y.'),
416
+ y: z.number().optional().describe('scroll_to: raw scrollTop — only for a content-agnostic opening drift.'),
405
417
  x: z.number().optional(),
406
418
  curve: z.enum(['easeInOutQuad', 'linear', 'easeOutQuad']).optional(),
407
419
  mode: z.enum(['auto', 'touch', 'programmatic']).optional(),
408
420
  jitter_px: z.number().optional(),
409
- })).min(1).describe('Ordered atom sequence executed during this section.'),
421
+ })).min(1).describe('Ordered atom sequence executed during this section. Pass the plan_video_segments output verbatim.'),
410
422
  })).min(1),
411
423
  }).describe('plan.sections[] — each section has text/audio_path/dwell_ms (filled by plan_video_segments) and operations[].'),
412
424
  output_paths: z.array(z.string()).min(1).describe('REQUIRED. Workspace-relative mp4 paths, one per plan.sections entry. The tool records ONCE continuously and slices at section boundaries (phase_start / phase_end events).'),
@@ -83,14 +83,15 @@ function buildAssContent({ playResX, playResY, overlays }) {
83
83
  'ScriptType: v4.00+',
84
84
  `PlayResX: ${playResX}`,
85
85
  `PlayResY: ${playResY}`,
86
- 'WrapStyle: 2',
86
+ 'WrapStyle: 0',
87
87
  '',
88
88
  '[V4+ Styles]',
89
89
  'Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding',
90
90
  // PrimaryColour white, SecondaryColour orange (for karaoke fill), OutlineColour black,
91
- // Bold on, Outline 4px, Shadow 2px, default Alignment middle-center (5) events
92
- // override per-line via \an.
93
- `Style: Title,${DEFAULT_FONT},96,&H00FFFFFF,&H000066FF,&H00000000,&H80000000,-1,0,0,0,100,100,0,0,1,4,2,5,30,30,0,1`,
91
+ // Bold on, Outline 6px (thick punchy contrast over busy page backgrounds),
92
+ // Shadow 2px, default Alignment middle-center (5) — events override per-line via \an.
93
+ // WrapStyle 0 (above) auto-wraps long titles instead of clipping them.
94
+ `Style: Title,${DEFAULT_FONT},96,&H00FFFFFF,&H000066FF,&H00000000,&H80000000,-1,0,0,0,100,100,0,0,1,6,2,5,30,30,0,1`,
94
95
  '',
95
96
  '[Events]',
96
97
  'Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text',
package/package.json CHANGED
@@ -1,10 +1,11 @@
1
1
  {
2
2
  "name": "@lightcone-ai/daemon",
3
- "version": "0.23.6",
3
+ "version": "0.23.8",
4
4
  "type": "module",
5
5
  "main": "src/index.js",
6
6
  "bin": {
7
- "lightcone-daemon": "src/index.js"
7
+ "lightcone-daemon": "src/index.js",
8
+ "lightcone": "src/cli.js"
8
9
  },
9
10
  "files": [
10
11
  "src",
@@ -24,23 +24,18 @@ async function readScrollY(page) {
24
24
  // ── atomScrollTo ─────────────────────────────────────────────────────────────
25
25
  // Animated scroll from current position to target_y over duration_ms.
26
26
  //
27
- // Mode selection (the bit that took an end-to-end failure to learn):
28
- // - 'programmatic' (default for distance >= 240px): runs the easing loop
29
- // inside page.evaluate via root.scrollTo. Lands EXACTLY at target_y.
30
- // Lacks the rubber-band/fling physics of real touch, but reliable.
31
- // - 'touch' (default for short distances): humanizedScroll → CDP touch.
32
- // Produces natural gesture physics (rubber-band, inertia) but for
33
- // larger distances the multi-segment swipe gets broken up into rapid
34
- // micro-flings that interfere with each other, and the page often
35
- // ends up nowhere near the intended target_y. Safe for distances
36
- // that fit in a single ~260px finger swipe.
37
- // - 'auto' (default): picks 'touch' for distance < 240px (single
38
- // segment, no fling interference), 'programmatic' otherwise.
39
- //
40
- // Discovered the hard way: scroll_to_dwell macro with ~18% transition
41
- // for 1100+ px distances dispatched 5 CDP swipes in ~1s; each touchEnd
42
- // kicked off a fling that the next touchStart immediately cancelled, so
43
- // the cumulative scroll never reached target.
27
+ // Mode selection:
28
+ // - 'programmatic': RAF-driven easing loop inside page.evaluate via
29
+ // root.scrollTo. Every frame moves, vertical only, lands EXACTLY at
30
+ // target_y. This is what a smooth between-blocks transition needs.
31
+ // - 'touch': humanizedScroll → CDP touch. Real gesture physics (rubber-
32
+ // band, inertia) but splits scroll > 260px into multiple swipes with
33
+ // fling-cancel-fling boundaries and ±18-26px horizontal nudge — it looks
34
+ // like a shaky multi-tap drag, not a clean slide.
35
+ // - 'auto' (default): resolves to 'programmatic'. Touch's gesture physics
36
+ // lost out to a clean slide for narration video; it stays reachable
37
+ // only via explicit `mode: 'touch'`. See the resolvedMode block below
38
+ // for the full rationale.
44
39
  //
45
40
  // Params:
46
41
  // target_y — absolute Y in page coordinates (required)
@@ -30,7 +30,6 @@ function normalizeUrl(value) {
30
30
  }
31
31
 
32
32
  export async function launchChromiumMobile({
33
- display,
34
33
  viewport = DEFAULT_VIEWPORT,
35
34
  userAgent = IOS_UA,
36
35
  deviceScaleFactor = 1,
@@ -70,10 +69,7 @@ export async function launchChromiumMobile({
70
69
  headless,
71
70
  channel,
72
71
  args: launchArgs,
73
- env: {
74
- ...process.env,
75
- DISPLAY: normalizeText(display) || process.env.DISPLAY,
76
- },
72
+ env: process.env,
77
73
  ...launchOptions,
78
74
  });
79
75
 
@@ -5,8 +5,6 @@ import os from 'node:os';
5
5
  import path from 'node:path';
6
6
 
7
7
  import { launchChromiumMobile, openPageAndSettle } from './chromium-driver.js';
8
- import { defaultDisplayPool } from './display-pool.js';
9
- import { createUnexpectedExitWatcher, waitForProcessExit } from './ffmpeg-runner.js';
10
8
  import { executePlanPhases, normalizePlanSections } from './plan-executor.js';
11
9
 
12
10
  const DEFAULT_VIEWPORT = Object.freeze({ width: 1080, height: 1920 });
@@ -60,91 +58,6 @@ function resolveUrl({ url, plan }) {
60
58
  throw error;
61
59
  }
62
60
 
63
- function createXvfbExitError({ code, signal, stderr }) {
64
- const error = new Error(`xvfb_exited_unexpectedly:code=${code ?? 'null'}:signal=${signal ?? 'none'}`);
65
- error.code = 'XVFB_EXITED_UNEXPECTEDLY';
66
- error.exitCode = code;
67
- error.signal = signal;
68
- error.stderr = stderr;
69
- return error;
70
- }
71
-
72
- async function stopXvfb(runner, {
73
- signal = 'SIGTERM',
74
- timeoutMs = 5000,
75
- killTimeoutMs = 2000,
76
- } = {}) {
77
- const child = runner?.child;
78
- if (!child || child.exitCode !== null) return child?.exitCode ?? 0;
79
-
80
- child.kill(signal);
81
- const firstExit = await waitForProcessExit(child, timeoutMs);
82
- if (!firstExit.timedOut) return firstExit.code;
83
-
84
- child.kill('SIGKILL');
85
- const forceExit = await waitForProcessExit(child, killTimeoutMs);
86
- return forceExit.code;
87
- }
88
-
89
- async function startXvfb({
90
- display,
91
- width,
92
- height,
93
- colorDepth = 24,
94
- startupProbeMs = 1200,
95
- xvfbBin = 'Xvfb',
96
- } = {}) {
97
- const args = [
98
- display,
99
- '-screen',
100
- '0',
101
- `${width}x${height}x${colorDepth}`,
102
- '-ac',
103
- '+extension',
104
- 'RANDR',
105
- ];
106
-
107
- let stderr = '';
108
- let spawnError = null;
109
- const child = spawn(xvfbBin, args, {
110
- stdio: ['ignore', 'pipe', 'pipe'],
111
- });
112
-
113
- child.stderr?.on('data', (chunk) => {
114
- const next = `${stderr}${String(chunk)}`;
115
- stderr = next.length > 8000 ? next.slice(next.length - 8000) : next;
116
- });
117
- child.once('error', (error) => {
118
- spawnError = error;
119
- });
120
-
121
- await new Promise(resolve => setTimeout(resolve, Math.max(0, Number(startupProbeMs) || 0)));
122
-
123
- if (spawnError) {
124
- const error = new Error(`xvfb_spawn_failed:${spawnError.message}`);
125
- error.code = 'XVFB_SPAWN_FAILED';
126
- throw error;
127
- }
128
-
129
- if (child.exitCode !== null) {
130
- throw createXvfbExitError({
131
- code: child.exitCode,
132
- signal: child.signalCode,
133
- stderr,
134
- });
135
- }
136
-
137
- const runner = {
138
- child,
139
- display,
140
- args,
141
- getStderr: () => stderr,
142
- stop: (options) => stopXvfb(runner, options),
143
- };
144
-
145
- return runner;
146
- }
147
-
148
61
  async function scrollToTop(page) {
149
62
  await page.evaluate(() => {
150
63
  const root = document.scrollingElement || document.documentElement;
@@ -363,9 +276,6 @@ export async function recordUrlNarration({
363
276
  viewport = DEFAULT_VIEWPORT,
364
277
  fps = DEFAULT_FPS,
365
278
  settle_ms = 4000,
366
- displayPool = defaultDisplayPool,
367
- startupProbeMs = 1200,
368
- xvfbStopTimeoutMs = 5000,
369
279
  postPlanTailMs = 600,
370
280
  recordingDir = null,
371
281
  launchChromiumFn = launchChromiumMobile,
@@ -423,33 +333,16 @@ export async function recordUrlNarration({
423
333
  const ownTempDir = !recordingDir;
424
334
  const recVideoDir = recordingDir || await mkdtemp(path.join(os.tmpdir(), 'lc-recvid-'));
425
335
 
426
- let displayLease;
427
- let xvfb;
428
- let xvfbWatcher;
429
336
  let browserSession = null;
430
337
  let primaryError = null;
431
338
  const cleanupErrors = [];
432
339
 
433
340
  try {
434
- displayLease = await displayPool.acquireDisplay();
435
- const display = displayLease.display;
436
-
437
- xvfb = await startXvfb({
438
- display,
439
- width: normalizedViewport.width,
440
- height: normalizedViewport.height,
441
- startupProbeMs,
442
- });
443
- xvfbWatcher = createUnexpectedExitWatcher(xvfb.child, 'xvfb');
444
-
445
341
  // The page recording captures the page viewport only (no browser chrome),
446
- // regardless of the on-screen window. recordVideo starts when the page is
447
- // created, so the webm includes goto + settle; we measure that head and trim
448
- // it off in transcodeFn.
449
- const recordStartedAt = nowMs();
342
+ // regardless of the on-screen window.
450
343
  browserSession = await launchChromiumFn({
451
- display,
452
344
  viewport: normalizedViewport,
345
+ headless: true,
453
346
  contextOptions: {
454
347
  recordVideo: {
455
348
  dir: recVideoDir,
@@ -457,6 +350,14 @@ export async function recordUrlNarration({
457
350
  },
458
351
  },
459
352
  });
353
+ // recordVideo's webm timeline starts at t=0 when the page is created —
354
+ // which happens INSIDE launchChromiumFn. Capture the head-trim reference
355
+ // here, right after it returns, NOT before the launch: the webm has no
356
+ // frames for the browser-launch interval, so measuring from before launch
357
+ // would make headTrimMs overshoot by the launch duration and ffmpeg's
358
+ // `-ss headTrimMs` would clip the opening of the first plan section,
359
+ // shifting every section's visuals late against its narration audio.
360
+ const recordStartedAt = nowMs();
460
361
  const videoHandle = typeof browserSession.page.video === 'function'
461
362
  ? browserSession.page.video()
462
363
  : null;
@@ -480,15 +381,14 @@ export async function recordUrlNarration({
480
381
 
481
382
  const headTrimMs = Math.max(0, nowMs() - recordStartedAt);
482
383
 
483
- const eventsLog = await Promise.race([
484
- executePlanPhases(browserSession.page, executablePlan, { pageUnderstanding }),
485
- xvfbWatcher.promise,
486
- ]);
384
+ const eventsLog = await executePlanPhases(browserSession.page, executablePlan, {
385
+ pageUnderstanding,
386
+ viewportHeight: normalizedViewport.height,
387
+ viewportWidth: normalizedViewport.width,
388
+ });
487
389
 
488
390
  await browserSession.page.waitForTimeout(Math.max(0, Number(postPlanTailMs) || 0));
489
391
 
490
- xvfbWatcher.deactivate();
491
-
492
392
  // Flush the recording: video is written when the context closes.
493
393
  let webmPath = null;
494
394
  try {
@@ -573,15 +473,12 @@ export async function recordUrlNarration({
573
473
  events_path: resolvedEventsPath,
574
474
  events_log: eventsLog,
575
475
  duration_ms: lastTms > 0 ? lastTms : null,
576
- display,
577
476
  sections: sectionOutputs,
578
477
  };
579
478
  } catch (error) {
580
479
  primaryError = error;
581
480
  throw error;
582
481
  } finally {
583
- xvfbWatcher?.deactivate();
584
-
585
482
  if (browserSession) {
586
483
  try {
587
484
  await browserSession.browser.close();
@@ -590,18 +487,6 @@ export async function recordUrlNarration({
590
487
  }
591
488
  }
592
489
 
593
- if (xvfb) {
594
- try {
595
- await stopXvfb(xvfb, { timeoutMs: xvfbStopTimeoutMs });
596
- } catch (stopError) {
597
- cleanupErrors.push(`xvfb_stop_failed:${stopError.message}`);
598
- }
599
- }
600
-
601
- if (displayLease) {
602
- displayLease.release();
603
- }
604
-
605
490
  if (ownTempDir) {
606
491
  await rm(recVideoDir, { recursive: true, force: true }).catch(() => {});
607
492
  }
@@ -14,8 +14,9 @@
14
14
  // or unsafe coordinates throw — no silent skipping.
15
15
 
16
16
  import { ATOMS, ATOM_NAMES } from './atoms.js';
17
- import { findOverlappingUnsafeRegion } from '../understanding/schema.js';
17
+ import { findBlockById, findOverlappingUnsafeRegion } from '../understanding/schema.js';
18
18
  import { getCdpSession } from '../cdp-touch.js';
19
+ import { setSpotlight, clearSpotlight } from './spotlight.js';
19
20
 
20
21
  const V5_FIELDS_ON_SECTION = Object.freeze([
21
22
  'action',
@@ -93,7 +94,70 @@ function assertYNotInUnsafeRegion(y, { unsafeRegions, atomName, sectionId, opera
93
94
  }
94
95
  }
95
96
 
96
- function validateOperation(op, { sectionId, operationIndex, fullHeightPx, unsafeRegions }) {
97
+ // Resolve a scroll_to operation's target scrollTop. Two mutually-exclusive forms:
98
+ //
99
+ // { block: 'b3' } — frame a content block: the block's middle is placed at
100
+ // the viewport center, then the section holds STILL on it. The recorder
101
+ // owns this geometry so the agent never writes pixels. A block taller than
102
+ // the viewport just shows its centered slice held still — there is no pan,
103
+ // partial visibility is accepted (decided 2026-05-16: 定住,接受看不全).
104
+ //
105
+ // { y: <number> } — raw scrollTop, for content-agnostic moves such as the
106
+ // opening lead-in drift (画面跟内容无关的漫滑).
107
+ //
108
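+ // A block-framed result is clamped to [0, full_height_px - viewport_height] (lower bound only when full_height_px is unknown); a raw `y` is only rounded, not clamped.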
109
+ export function resolveScrollTargetY(op, {
110
+ blocks = [],
111
+ viewportHeight = 1920,
112
+ fullHeightPx = null,
113
+ sectionId = '?',
114
+ operationIndex = 0,
115
+ } = {}) {
116
+ const hasBlock = typeof op?.block === 'string' && op.block.trim();
117
+ const hasY = Number.isFinite(Number(op?.y));
118
+
119
+ if (hasBlock && hasY) {
120
+ const error = new Error(
121
+ `operations_invalid: section "${sectionId}" operations[${operationIndex}] sets both `
122
+ + '`block` and `y` on a scroll_to — specify exactly one (block to frame a content block, '
123
+ + 'y for a raw content-agnostic move).',
124
+ );
125
+ error.code = 'OPERATIONS_INVALID';
126
+ throw error;
127
+ }
128
+
129
+ if (hasBlock) {
130
+ const blockId = op.block.trim();
131
+ const block = findBlockById(blocks, blockId);
132
+ if (!block) {
133
+ const known = blocks.map(b => b?.id).filter(Boolean).join(', ') || '(none)';
134
+ const error = new Error(
135
+ `block_not_found: section "${sectionId}" operations[${operationIndex}].block="${blockId}" `
136
+ + `is not a block id in page_understanding.blocks. Known ids: ${known}.`,
137
+ );
138
+ error.code = 'BLOCK_NOT_FOUND';
139
+ throw error;
140
+ }
141
+ // Center the block. A block taller than the viewport shows its centered
142
+ // slice — the section then holds still on it (no pan).
143
+ const target = (block.y_top + block.y_bottom) / 2 - viewportHeight / 2;
144
+ const maxScroll = Number.isFinite(fullHeightPx) && fullHeightPx > 0
145
+ ? Math.max(0, fullHeightPx - viewportHeight)
146
+ : Number.POSITIVE_INFINITY;
147
+ return Math.round(Math.min(Math.max(target, 0), maxScroll));
148
+ }
149
+
150
+ if (hasY) return Math.round(Number(op.y));
151
+
152
+ const error = new Error(
153
+ `operations_invalid: section "${sectionId}" operations[${operationIndex}] is a scroll_to with `
154
+ + 'neither `block` nor `y`. Set `block` (centers/frames a content block) or `y` (raw scrollTop).',
155
+ );
156
+ error.code = 'OPERATIONS_INVALID';
157
+ throw error;
158
+ }
159
+
160
+ function validateOperation(op, { sectionId, operationIndex, fullHeightPx, unsafeRegions, blocks, viewportHeight }) {
97
161
  if (!op || typeof op !== 'object' || Array.isArray(op)) {
98
162
  const error = new Error(
99
163
  `operations_invalid: section "${sectionId}" operations[${operationIndex}] is not an object.`,
@@ -120,9 +184,15 @@ function validateOperation(op, { sectionId, operationIndex, fullHeightPx, unsafe
120
184
  throw error;
121
185
  }
122
186
 
187
+ let scrollTargetY = null;
123
188
  if (atomName === 'scroll_to') {
124
- assertYWithinBounds(Number(op.y), { fullHeightPx, atomName, sectionId, operationIndex });
125
- assertYNotInUnsafeRegion(Number(op.y), { unsafeRegions, atomName, sectionId, operationIndex });
189
+ // scroll_to.y is no longer agent-authored pixels — resolveScrollTargetY
190
+ // turns { block } into a clamped scrollTop. Raw { y } is still
191
+ // accepted for content-agnostic moves. The resolved value is what gets
192
+ // bounds/unsafe-checked and handed to the atom.
193
+ scrollTargetY = resolveScrollTargetY(op, { blocks, viewportHeight, fullHeightPx, sectionId, operationIndex });
194
+ assertYWithinBounds(scrollTargetY, { fullHeightPx, atomName, sectionId, operationIndex });
195
+ assertYNotInUnsafeRegion(scrollTargetY, { unsafeRegions, atomName, sectionId, operationIndex });
126
196
  } else if (atomName === 'cursor_focus') {
127
197
  assertYWithinBounds(Number(op.y), { fullHeightPx, atomName, sectionId, operationIndex });
128
198
  assertYNotInUnsafeRegion(Number(op.y), { unsafeRegions, atomName, sectionId, operationIndex });
@@ -134,16 +204,19 @@ function validateOperation(op, { sectionId, operationIndex, fullHeightPx, unsafe
134
204
  throw error;
135
205
  }
136
206
  }
137
- return atomName;
207
+ return { atomName, scrollTargetY };
138
208
  }
139
209
 
140
- function operationToAtomParams(op, atomName, anchorY) {
210
+ function operationToAtomParams(op, atomName, anchorY, scrollTargetY) {
141
211
  if (atomName === 'scroll_to') {
142
212
  return {
143
- target_y: Number(op.y),
213
+ // scrollTargetY is resolved by resolveScrollTargetY (block-framed or raw y).
214
+ target_y: scrollTargetY,
144
215
  duration_ms: Number(op.duration_ms),
145
216
  curve: op.curve || 'easeInOutQuad',
146
- jitter_px: Number.isFinite(Number(op.jitter_px)) ? Number(op.jitter_px) : 2,
217
+ // Default 0 — no pixel jitter (user requirement, repeatedly stated).
218
+ // Only consulted by touch mode; auto/programmatic ignore it.
219
+ jitter_px: Number.isFinite(Number(op.jitter_px)) ? Number(op.jitter_px) : 0,
147
220
  from_y: anchorY,
148
221
  mode: op.mode || 'auto',
149
222
  };
@@ -198,22 +271,45 @@ export function normalizePlanSections(plan = {}) {
198
271
  async function runOperations(page, ctx, operations, {
199
272
  fullHeightPx,
200
273
  unsafeRegions,
274
+ blocks,
275
+ viewportHeight,
276
+ viewportWidth,
201
277
  sectionId,
202
278
  fallbackAnchorY,
203
279
  }) {
204
280
  let anchorY = fallbackAnchorY;
281
+ // Spotlight off during the transition scroll — it appears once the section's
282
+ // block has landed centered (set right after the scroll_to below).
283
+ await clearSpotlight(page);
205
284
  for (let i = 0; i < operations.length; i += 1) {
206
285
  const op = operations[i];
207
- const atomName = validateOperation(op, {
286
+ const { atomName, scrollTargetY } = validateOperation(op, {
208
287
  sectionId,
209
288
  operationIndex: i,
210
289
  fullHeightPx,
211
290
  unsafeRegions,
291
+ blocks,
292
+ viewportHeight,
212
293
  });
213
- const params = operationToAtomParams(op, atomName, anchorY);
294
+ const params = operationToAtomParams(op, atomName, anchorY, scrollTargetY);
214
295
  const atomFn = ATOMS[atomName];
215
296
  const result = await atomFn(page, ctx, params);
216
297
  if (result?.anchorY != null) anchorY = result.anchorY;
298
+ // Once a scroll_to has landed on a content block, frame it: bordered box
299
+ // around the block + the rest of the page dimmed. The box then stays for
300
+ // the section's hold.
301
+ if (atomName === 'scroll_to' && typeof op.block === 'string' && op.block.trim()) {
302
+ const blk = findBlockById(blocks, op.block.trim());
303
+ if (blk) {
304
+ await setSpotlight(page, {
305
+ yTop: blk.y_top,
306
+ yBottom: blk.y_bottom,
307
+ viewportTop: scrollTargetY,
308
+ viewportHeight,
309
+ viewportWidth,
310
+ });
311
+ }
312
+ }
217
313
  }
218
314
  return { anchorY };
219
315
  }
@@ -230,6 +326,11 @@ function createEvent({ tMs, action, sectionId, detail = {} }) {
230
326
 
231
327
  export async function executePlanPhases(page, plan, {
232
328
  pageUnderstanding = null,
329
+ // The actual record-browser viewport height — used to frame block-referenced
330
+ // scroll_to operations. Falls back to page_understanding.viewport.height,
331
+ // then 1920. Pass the real recorder viewport so centering is pixel-correct.
332
+ viewportHeight = null,
333
+ viewportWidth = null,
233
334
  getNowMs = () => Date.now(),
234
335
  onEvent = null,
235
336
  } = {}) {
@@ -238,6 +339,13 @@ export async function executePlanPhases(page, plan, {
238
339
  const unsafeRegions = Array.isArray(pageUnderstanding?.unsafe_regions)
239
340
  ? pageUnderstanding.unsafe_regions
240
341
  : [];
342
+ const blocks = Array.isArray(pageUnderstanding?.blocks) ? pageUnderstanding.blocks : [];
343
+ const resolvedViewportHeight = Number(viewportHeight)
344
+ || Number(pageUnderstanding?.viewport?.height)
345
+ || 1920;
346
+ const resolvedViewportWidth = Number(viewportWidth)
347
+ || Number(pageUnderstanding?.viewport?.width)
348
+ || 1080;
241
349
 
242
350
  const startedAt = nowMs(getNowMs);
243
351
  const eventsLog = [];
@@ -263,6 +371,9 @@ export async function executePlanPhases(page, plan, {
263
371
  const result = await runOperations(page, ctx, section.operations, {
264
372
  fullHeightPx,
265
373
  unsafeRegions,
374
+ blocks,
375
+ viewportHeight: resolvedViewportHeight,
376
+ viewportWidth: resolvedViewportWidth,
266
377
  sectionId,
267
378
  fallbackAnchorY: lastAnchorY,
268
379
  });