@lightcone-ai/daemon 0.22.1 → 0.23.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/mcp-servers/official/media-tools/index.js +42 -19
- package/mcp-servers/official/page-understanding/index.js +6 -7
- package/package.json +1 -1
- package/src/_vendor/video/cdp-touch.js +184 -0
- package/src/_vendor/video/humanized-scroll.js +251 -0
- package/src/_vendor/video/recorder/atoms.js +212 -0
- package/src/_vendor/video/recorder/index.js +68 -38
- package/src/_vendor/video/recorder/plan-executor.js +191 -394
- package/src/_vendor/video/understanding/schema.js +316 -0
- package/src/tools/plan-video-segments.js +152 -22
- package/src/tools/record-url-narration.js +44 -137
- package/src/_vendor/video/recorder/phase-duration.js +0 -18
- package/src/_vendor/video/recorder/plan-estimator.js +0 -43
|
@@ -0,0 +1,212 @@
|
|
|
1
|
+
// Page-operation atoms. The closed primitive set plan.sections.operations[]
|
|
2
|
+
// decomposes into. Adding a new atom is a code-level decision; composing new
|
|
3
|
+
// behaviors is data-level (a new operations sequence in the plan).
|
|
4
|
+
//
|
|
5
|
+
// Each atom returns { anchorY } so plan-executor can track where the camera
|
|
6
|
+
// landed (used by subsequent scroll_to's from_y if not explicitly set).
|
|
7
|
+
// anchorY=null for non-scrolling atoms (hold, cursor_focus) — caller keeps
|
|
8
|
+
// the previous anchor.
|
|
9
|
+
//
|
|
10
|
+
// Layout convention: atoms take (page, ctx, params). ctx carries shared
|
|
11
|
+
// resources (CDP session) so atoms don't repeat the page.context dance.
|
|
12
|
+
// plan-executor builds ctx once per section.
|
|
13
|
+
|
|
14
|
+
import { humanizedScroll } from '../humanized-scroll.js';
|
|
15
|
+
import { getCdpSession } from '../cdp-touch.js';
|
|
16
|
+
|
|
17
|
+
/**
 * Read the document's current vertical scroll offset.
 *
 * Runs inside the page: uses `document.scrollingElement`, falling back to
 * `document.documentElement` when that is null, and rounds `scrollTop` to
 * the nearest integer.
 *
 * @param {object} page - browser page handle exposing `evaluate(fn)`.
 * @returns {Promise<number>} rounded scrollTop of the document scroller.
 */
async function readScrollY(page) {
  const offset = await page.evaluate(() => {
    const scroller = document.scrollingElement || document.documentElement;
    const rawTop = scroller.scrollTop;
    return Math.round(rawTop);
  });
  return offset;
}
|
|
23
|
+
|
|
24
|
+
// ── atomScrollTo ─────────────────────────────────────────────────────────────
// Scroll from a start position to target_y, budgeted at duration_ms.
//
// Mode selection:
//   - 'programmatic': delegates to programmaticScroll (in-page scroll driven
//     from script). Original author's note: lacks the rubber-band/fling
//     physics of real touch, but is the reliable way to reach target_y.
//   - 'touch': delegates to humanizedScroll (CDP touch gestures). Original
//     author's note: produces natural gesture physics, but for long, fast
//     scrolls the multi-segment swipe breaks into micro-flings that cancel
//     each other and the page can end up far from target_y.
//   - 'auto' (default; also used for any unrecognized mode value): picks
//     'touch' when distance < 240px OR average velocity < 500 px/s, and
//     'programmatic' otherwise.
//
// History kept from the original comment: a scroll_to_dwell macro with ~18%
// transition over 1100+ px dispatched 5 CDP swipes in ~1s; each touchEnd
// kicked off a fling that the next touchStart immediately cancelled, so the
// cumulative scroll never reached target. The auto thresholds were chosen
// empirically against that failure (~1100px in ~1000ms → ~1100 px/s).
//
// Params:
//   target_y    — absolute Y in page coordinates (required)
//   duration_ms — total animation time, must be > 0 (required)
//   curve       — 'easeInOutQuad' (default) | 'linear' | 'easeOutQuad'
//   jitter_px   — per-step vertical jitter (default 2, clamped to >= 0)
//   from_y      — explicit start position; when omitted/null the current
//                 scroll position is read from the page
//   mode        — 'auto' (default) | 'programmatic' | 'touch'
//
// Returns { anchorY } with target_y rounded to an integer, regardless of the
// mode used.
// Throws Error when target_y is missing/non-numeric or duration_ms is not a
// positive number.
export async function atomScrollTo(page, _ctx, {
  target_y,
  duration_ms,
  curve = 'easeInOutQuad',
  jitter_px = 2,
  from_y = null,
  mode = 'auto',
} = {}) {
  // Number(null) and Number('') are both 0, so a bare
  // Number.isFinite(Number(v)) check treats "not provided" as a valid 0.
  // That made the default from_y=null resolve to 0 instead of the live
  // scroll position, and let target_y=null pass the "required" check.
  const isProvidedNumber = (value) => {
    if (value == null) return false;
    if (typeof value === 'string' && value.trim() === '') return false;
    return Number.isFinite(Number(value));
  };

  if (!isProvidedNumber(target_y)) {
    throw new Error('atom_scroll_to: target_y is required');
  }
  if (!isProvidedNumber(duration_ms) || Number(duration_ms) <= 0) {
    throw new Error('atom_scroll_to: duration_ms must be positive');
  }

  const targetY = Number(target_y);
  const durationMs = Number(duration_ms);
  const fromY = isProvidedNumber(from_y)
    ? Number(from_y)
    : await readScrollY(page);
  const distance = Math.abs(targetY - fromY);

  // Average speed in px/s. durationMs is already validated as > 0.
  const velocity = (distance / durationMs) * 1000;

  // Explicit modes are honoured as-is; everything else goes through the
  // auto heuristic: short hops or slow scrolls use touch, fast long-distance
  // scrolls fall back to programmatic.
  const resolvedMode = mode === 'programmatic' || mode === 'touch'
    ? mode
    : (distance < 240 || velocity < 500 ? 'touch' : 'programmatic');

  // Negative or non-numeric jitter collapses to 0.
  const jitterPx = Math.max(0, Number(jitter_px) || 0);

  if (resolvedMode === 'touch') {
    await humanizedScroll(page, {
      from_y: fromY,
      to_y: targetY,
      duration_ms: durationMs,
      motion_curve: curve,
      pixel_jitter_px: jitterPx,
    });
  } else {
    await programmaticScroll(page, {
      fromY,
      targetY,
      durationMs,
      curve,
      jitterPx,
    });
  }
  return { anchorY: Math.round(targetY) };
}
|
|
104
|
+
|
|
105
|
+
// Programmatic scroll: instantly snaps the page to targetY from script, then
// dwells briefly Node-side so the page can settle before the next atom runs.
//
// There is no in-page animation here: the snap happens in one page.evaluate
// call and the only time spent is the dwell, max(80ms, 30% of durationMs).
// No scrollend event is awaited.
//
// Mobile article pages sometimes keep a fixed-position outer body and scroll
// an inner container instead (original author's note), so the snap is applied
// to every plausible document-level scroller — scrollingElement,
// documentElement, body and window — and whichever one actually scrolls wins.
//
// Params:
//   fromY      — start position; forwarded into the page call but not used
//                by the snap itself
//   targetY    — absolute Y to snap to
//   durationMs — animation budget; only its 30% dwell share is spent here
//
// Returns the diagnostics gathered in the page: { requested_target, before,
// after, maxScroll, innerHeight }, where before/after list each candidate's
// { tag, y } around the snap. Useful when a page refuses to scroll; callers
// that don't need it can ignore the return value.
async function programmaticScroll(page, {
  fromY,
  targetY,
  durationMs,
} = {}) {
  const diag = await page.evaluate((input) => {
    // scrollingElement is usually the same node as documentElement (or body),
    // so dedupe to avoid snapping and reporting the same element twice.
    const elements = [
      document.scrollingElement,
      document.documentElement,
      document.body,
    ].filter(Boolean);
    const candidates = [...new Set(elements), window];

    const snapshot = () => candidates.map((c) => (
      c === window
        ? { tag: 'window', y: window.scrollY }
        : { tag: c.tagName, y: c.scrollTop }
    ));

    const before = snapshot();

    // Hard snap to target on every candidate (instant, no animation).
    for (const c of candidates) {
      try {
        if (c === window) window.scrollTo(0, input.targetY);
        else c.scrollTop = input.targetY;
      } catch {
        // Best-effort: a candidate that rejects the write must not stop the
        // remaining candidates from being tried.
      }
    }

    return {
      requested_target: input.targetY,
      before,
      after: snapshot(),
      maxScroll: document.documentElement?.scrollHeight,
      innerHeight: window.innerHeight,
    };
  }, { fromY, targetY });

  // Brief dwell to let the page settle the snap before the next atom starts.
  await page.waitForTimeout(Math.max(80, Math.round(durationMs * 0.3)));
  return diag;
}
|
|
162
|
+
|
|
163
|
+
// ── atomHold ─────────────────────────────────────────────────────────────────
// Pause at the current position for duration_ms: no scrolling and no cursor
// movement. Per the original author's note this is the usual choice for
// segments where the voice-over narration introduces what is already on
// screen: scroll to the anchor first, then hold for the rest of the audio.
//
// duration_ms is rounded to the nearest millisecond; missing, non-numeric,
// zero or negative values result in no wait at all.
//
// Always returns { anchorY: null } — the caller keeps its previous anchor.
export async function atomHold(page, _ctx, { duration_ms } = {}) {
  const requestedMs = Number(duration_ms) || 0;
  const holdMs = Math.max(0, Math.round(requestedMs));
  if (holdMs > 0) {
    await page.waitForTimeout(holdMs);
  }
  return { anchorY: null };
}
|
|
172
|
+
|
|
173
|
+
// ── atomCursorFocus ──────────────────────────────────────────────────────────
// Move the cursor to (x, y) and hold it there for duration_ms.
// Original author's note: used for short pages where there is nothing to
// scroll to — the cursor shifting between visual focal points carries the
// rhythm instead.
//
// The move is sent as a single CDP `Input.dispatchMouseEvent` with
// type='mouseMoved' (no button, no modifiers); x and y are rounded to whole
// pixels. Original author's note: going through the real input pipeline is
// what makes the cursor render on the captured video.
//
// Params:
//   x, y        — cursor position (required; the original comment specifies
//                 viewport coordinates)
//   duration_ms — how long to wait after the move, must be > 0 (required)
//
// ctx.cdp is used when present; otherwise a session is obtained via
// getCdpSession(page).
//
// Always returns { anchorY: null } — the caller keeps its previous anchor.
// Throws Error when x / y are missing or non-numeric, or when duration_ms is
// not a positive number.
export async function atomCursorFocus(page, ctx, { x, y, duration_ms } = {}) {
  // Number(null) and Number('') are both 0, so without this guard a missing
  // coordinate would silently move the cursor to the viewport origin instead
  // of failing the "required" check.
  const isProvidedNumber = (value) => {
    if (value == null) return false;
    if (typeof value === 'string' && value.trim() === '') return false;
    return Number.isFinite(Number(value));
  };

  if (!isProvidedNumber(x) || !isProvidedNumber(y)) {
    throw new Error('atom_cursor_focus: x and y are required');
  }
  if (!isProvidedNumber(duration_ms) || Number(duration_ms) <= 0) {
    throw new Error('atom_cursor_focus: duration_ms must be positive');
  }

  const cdp = ctx?.cdp ?? await getCdpSession(page);
  await cdp.send('Input.dispatchMouseEvent', {
    type: 'mouseMoved',
    x: Math.round(Number(x)),
    y: Math.round(Number(y)),
    button: 'none',
    modifiers: 0,
  });
  await page.waitForTimeout(Math.max(0, Math.round(Number(duration_ms))));
  return { anchorY: null };
}
|
|
198
|
+
|
|
199
|
+
// Public atom registry: maps the atom name used in a plan operation to the
// function that implements it. Per the original comment, plan-executor looks
// up operation.atom in this map.
//
// V6 keeps the set closed: the agent composes operations[] from these three
// atoms; if a new page interaction is genuinely needed, a new atom gets
// added — never a macro.
//
// micro_oscillate was deleted in V6 — anchor sections dwell cleanly via `hold`
// when the voice-over narration corresponds to what's visible; "natural
// reader motion" during dwell is the wrong design.
const atomRegistry = {
  scroll_to: atomScrollTo,
  hold: atomHold,
  cursor_focus: atomCursorFocus,
};

export const ATOMS = Object.freeze(atomRegistry);

// Frozen list of the registered atom names, in registry order.
export const ATOM_NAMES = Object.freeze(Object.keys(ATOMS));
|
|
@@ -7,7 +7,7 @@ import path from 'node:path';
|
|
|
7
7
|
import { launchChromiumMobile, openPageAndSettle } from './chromium-driver.js';
|
|
8
8
|
import { defaultDisplayPool } from './display-pool.js';
|
|
9
9
|
import { createUnexpectedExitWatcher, waitForProcessExit } from './ffmpeg-runner.js';
|
|
10
|
-
import { executePlanPhases,
|
|
10
|
+
import { executePlanPhases, normalizePlanSections } from './plan-executor.js';
|
|
11
11
|
|
|
12
12
|
const DEFAULT_VIEWPORT = Object.freeze({ width: 1080, height: 1920 });
|
|
13
13
|
const DEFAULT_FPS = 30;
|
|
@@ -152,24 +152,41 @@ async function scrollToTop(page) {
|
|
|
152
152
|
});
|
|
153
153
|
}
|
|
154
154
|
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
155
|
+
// Preheat: step-scroll down to the bottom of the page and then jump back to
// the top. Original author's rationale: this triggers lazy-loaded images and
// late content so that the y coordinates in page_understanding match what
// the recording browser sees; both analyze_page and record_url_narration are
// said to run it. Without it the page is shorter and scroll targets land on
// the wrong content.
//
// In-page sequence:
//   1. Measure the page height once (largest of the scroller's scrollHeight,
//      body.scrollHeight and window.innerHeight).
//   2. If fullHeightPx is a positive finite number, scroll to whichever is
//      larger — that hint or the measured height.
//   3. window.scrollTo in 600px steps with a 180ms pause after each step,
//      then pause 500ms at the bottom.
//   4. Jump to y=0 and pause 300ms.
//
// Afterwards waits up to 4s for the 'networkidle' load state; a timeout or
// any other failure of that wait is deliberately ignored.
async function preheatPage(page, { fullHeightPx } = {}) {
  const heightHint = Number.isFinite(fullHeightPx) ? fullHeightPx : null;

  await page.evaluate(async ({ targetHeight }) => {
    const stepPx = 600;
    const pauseBetweenStepsMs = 180;
    const pause = (ms) => new Promise((resolve) => setTimeout(resolve, ms));
    const scroller = document.scrollingElement || document.documentElement;

    const measuredHeight = Math.max(
      scroller.scrollHeight,
      document.body?.scrollHeight || 0,
      window.innerHeight,
    );
    const hasUsableHint = Number.isFinite(targetHeight) && targetHeight > 0;
    const bottomY = hasUsableHint
      ? Math.max(targetHeight, measuredHeight)
      : measuredHeight;

    // Walk down in fixed steps; the final step is clamped to bottomY.
    for (let y = 0; y < bottomY;) {
      y = Math.min(bottomY, y + stepPx);
      window.scrollTo(0, y);
      await pause(pauseBetweenStepsMs);
    }
    await pause(500);

    window.scrollTo(0, 0);
    await pause(300);
  }, { targetHeight: heightHint });

  // Wait for any newly loaded images to settle.
  try {
    await page.waitForLoadState('networkidle', { timeout: 4000 });
  } catch { /* ignore */ }
}
|
|
174
191
|
|
|
175
192
|
// Re-encode the page recording (webm, page content only — no browser chrome) into
|
|
@@ -326,22 +343,26 @@ function normalizeOutputPaths(rawList) {
|
|
|
326
343
|
|
|
327
344
|
export async function recordUrlNarration({
|
|
328
345
|
plan,
|
|
346
|
+
// page_understanding (V6) — required for safety-region validation and to
|
|
347
|
+
// align preheat strategy with what analyze_page used. The recorder's plan-
|
|
348
|
+
// executor checks scroll_to.y / cursor_focus.y against pageUnderstanding.unsafe_regions
|
|
349
|
+
// before execution. Without it, scroll_to.y is unbounded and unsafe.
|
|
350
|
+
page_understanding,
|
|
351
|
+
pageUnderstanding = page_understanding,
|
|
329
352
|
output_path,
|
|
330
353
|
outputPath = output_path,
|
|
331
354
|
events_path,
|
|
332
355
|
eventsPath = events_path,
|
|
333
|
-
// Multi-section output:
|
|
334
|
-
//
|
|
335
|
-
//
|
|
336
|
-
//
|
|
337
|
-
// omitted, behaves exactly like before — single mp4 at outputPath.
|
|
356
|
+
// Multi-section output: array of N paths matching plan.sections length.
|
|
357
|
+
// Recorder captures once continuously and slices into N per-section mp4s,
|
|
358
|
+
// so visuals flow naturally between sections (no scroll-back-to-top, no
|
|
359
|
+
// page reload).
|
|
338
360
|
output_paths,
|
|
339
361
|
outputPaths = output_paths,
|
|
340
362
|
url,
|
|
341
363
|
viewport = DEFAULT_VIEWPORT,
|
|
342
364
|
fps = DEFAULT_FPS,
|
|
343
365
|
settle_ms = 4000,
|
|
344
|
-
page_zoom = 1.1,
|
|
345
366
|
displayPool = defaultDisplayPool,
|
|
346
367
|
startupProbeMs = 1200,
|
|
347
368
|
xvfbStopTimeoutMs = 5000,
|
|
@@ -353,12 +374,19 @@ export async function recordUrlNarration({
|
|
|
353
374
|
cutFn = cutMp4Slice,
|
|
354
375
|
nowMs = () => Date.now(),
|
|
355
376
|
} = {}) {
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
377
|
+
if (!pageUnderstanding || typeof pageUnderstanding !== 'object' || Array.isArray(pageUnderstanding)) {
|
|
378
|
+
const error = new Error(
|
|
379
|
+
'page_understanding_required: V6 record_url_narration requires page_understanding (from analyze_page) '
|
|
380
|
+
+ 'for safe-region validation and preheat consistency. No silent default — refusing rather than running blind.',
|
|
381
|
+
);
|
|
382
|
+
error.code = 'PAGE_UNDERSTANDING_REQUIRED';
|
|
383
|
+
throw error;
|
|
384
|
+
}
|
|
385
|
+
|
|
386
|
+
const sections = normalizePlanSections(plan);
|
|
359
387
|
const executablePlan = {
|
|
360
388
|
...(plan && typeof plan === 'object' ? plan : {}),
|
|
361
|
-
|
|
389
|
+
sections,
|
|
362
390
|
};
|
|
363
391
|
|
|
364
392
|
const resolvedOutputPath = resolveOutputPath(outputPath);
|
|
@@ -378,9 +406,9 @@ export async function recordUrlNarration({
|
|
|
378
406
|
error.code = 'OUTPUT_PATHS_REQUIRED';
|
|
379
407
|
throw error;
|
|
380
408
|
}
|
|
381
|
-
if (resolvedOutputPaths.length !==
|
|
409
|
+
if (resolvedOutputPaths.length !== sections.length) {
|
|
382
410
|
const error = new Error(
|
|
383
|
-
`output_paths_count_mismatch:expected=${
|
|
411
|
+
`output_paths_count_mismatch:expected=${sections.length}:got=${resolvedOutputPaths.length}`,
|
|
384
412
|
);
|
|
385
413
|
error.code = 'OUTPUT_PATHS_COUNT_MISMATCH';
|
|
386
414
|
throw error;
|
|
@@ -438,11 +466,13 @@ export async function recordUrlNarration({
|
|
|
438
466
|
settleMs: settle_ms,
|
|
439
467
|
});
|
|
440
468
|
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
await browserSession.page
|
|
469
|
+
// Preheat to match analyze_page's page state — triggers lazy-loaded
|
|
470
|
+
// images so y coordinates stabilize at the same values the analyzer saw.
|
|
471
|
+
// Skipped when pageUnderstanding.preheat_strategy === 'none'.
|
|
472
|
+
if (pageUnderstanding.preheat_strategy === 'full_scroll_then_top') {
|
|
473
|
+
await preheatPage(browserSession.page, {
|
|
474
|
+
fullHeightPx: pageUnderstanding.full_height_px,
|
|
475
|
+
});
|
|
446
476
|
}
|
|
447
477
|
|
|
448
478
|
await scrollToTop(browserSession.page);
|
|
@@ -451,7 +481,7 @@ export async function recordUrlNarration({
|
|
|
451
481
|
const headTrimMs = Math.max(0, nowMs() - recordStartedAt);
|
|
452
482
|
|
|
453
483
|
const eventsLog = await Promise.race([
|
|
454
|
-
executePlanPhases(browserSession.page, executablePlan),
|
|
484
|
+
executePlanPhases(browserSession.page, executablePlan, { pageUnderstanding }),
|
|
455
485
|
xvfbWatcher.promise,
|
|
456
486
|
]);
|
|
457
487
|
|
|
@@ -510,7 +540,7 @@ export async function recordUrlNarration({
|
|
|
510
540
|
// phase_start / phase_end events). All slices come from the SAME
|
|
511
541
|
// continuous recording, so the visual flow between sections stays
|
|
512
542
|
// natural — no browser reload, no scroll-back-to-top per segment.
|
|
513
|
-
const cutPoints = deriveSectionCutPoints(eventsLog,
|
|
543
|
+
const cutPoints = deriveSectionCutPoints(eventsLog, sections.length);
|
|
514
544
|
const sectionOutputs = [];
|
|
515
545
|
for (let i = 0; i < cutPoints.length; i += 1) {
|
|
516
546
|
const cut = cutPoints[i];
|