@lightcone-ai/daemon 0.22.1 → 0.23.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/mcp-servers/official/media-tools/index.js +42 -19
- package/mcp-servers/official/page-understanding/index.js +6 -7
- package/package.json +1 -1
- package/src/_vendor/video/cdp-touch.js +184 -0
- package/src/_vendor/video/humanized-scroll.js +251 -0
- package/src/_vendor/video/recorder/atoms.js +212 -0
- package/src/_vendor/video/recorder/index.js +68 -38
- package/src/_vendor/video/recorder/plan-executor.js +191 -394
- package/src/_vendor/video/understanding/schema.js +316 -0
- package/src/drivers/codex.js +11 -2
- package/src/tools/plan-video-segments.js +152 -22
- package/src/tools/record-url-narration.js +44 -137
- package/src/_vendor/video/recorder/phase-duration.js +0 -18
- package/src/_vendor/video/recorder/plan-estimator.js +0 -43
|
@@ -1,17 +1,16 @@
|
|
|
1
|
-
// record_url_narration
|
|
1
|
+
// V6 record_url_narration daemon tool wrapper.
|
|
2
2
|
//
|
|
3
|
-
// Drives Chromium on Xvfb + Playwright recordVideo to capture a silent mp4
|
|
4
|
-
//
|
|
5
|
-
//
|
|
6
|
-
// alongside narration audio.
|
|
3
|
+
// Drives Chromium on Xvfb + Playwright recordVideo to capture a silent mp4
|
|
4
|
+
// per section, then ffmpeg-transcodes + slices. The resulting silent mp4s
|
|
5
|
+
// feed compose_video_v2 as video-kind segments alongside narration audio.
|
|
7
6
|
//
|
|
8
|
-
//
|
|
9
|
-
//
|
|
10
|
-
//
|
|
11
|
-
//
|
|
12
|
-
//
|
|
13
|
-
//
|
|
14
|
-
//
|
|
7
|
+
// V6 contract (see docs/scenario-content-creation/video-synthesis-design.md):
|
|
8
|
+
// - page_understanding (from analyze_page) is required — drives safe-region
|
|
9
|
+
// check and preheat consistency
|
|
10
|
+
// - plan.sections each carry operations[] of atom calls; V5 fields are rejected
|
|
11
|
+
// - mp.weixin.qq.com-only keyword blacklist is gone — unsafe_regions from
|
|
12
|
+
// page_understanding is the universal safety mechanism
|
|
13
|
+
// - plan_video_segments must run earlier in this session (standard chain)
|
|
15
14
|
|
|
16
15
|
import { mkdirSync } from 'fs';
|
|
17
16
|
import path from 'path';
|
|
@@ -56,89 +55,9 @@ function deriveDurationMs(recorderOutput) {
|
|
|
56
55
|
return lastTms > 0 ? lastTms : null;
|
|
57
56
|
}
|
|
58
57
|
|
|
59
|
-
function
|
|
58
|
+
function planSections(plan) {
|
|
60
59
|
if (!isPlainObject(plan)) return null;
|
|
61
|
-
|
|
62
|
-
if (Array.isArray(plan[key]) && plan[key].length > 0) return plan[key];
|
|
63
|
-
}
|
|
64
|
-
return null;
|
|
65
|
-
}
|
|
66
|
-
|
|
67
|
-
function derivePhaseCount({ plan, recorderOutput }) {
|
|
68
|
-
const explicit = normalizeNumberOrNull(recorderOutput?.phases);
|
|
69
|
-
if (explicit != null) return explicit;
|
|
70
|
-
|
|
71
|
-
const segments = planSegments(plan);
|
|
72
|
-
return segments ? segments.length : null;
|
|
73
|
-
}
|
|
74
|
-
|
|
75
|
-
function assertPipelineCompliance(plan) {
|
|
76
|
-
if (!isPlainObject(plan)) return;
|
|
77
|
-
if (!planSegments(plan)) {
|
|
78
|
-
throw new Error(
|
|
79
|
-
'record_url_narration: `plan` must contain a non-empty `phases` (or `sections` / `segments`) array — '
|
|
80
|
-
+ 'either hand-written or from plan_video_segments. Each entry should carry a visual action and a duration.'
|
|
81
|
-
);
|
|
82
|
-
}
|
|
83
|
-
}
|
|
84
|
-
|
|
85
|
-
// Forbidden region keywords for recruitment content. If a section's
|
|
86
|
-
// target_y_content_label matches, we refuse to record — the resulting video
|
|
87
|
-
// would show 投递入口 / 二维码 / contact info, which violates the recruitment
|
|
88
|
-
// content policy (see fragments.md frag.short.recruitment_url_mode_policy).
|
|
89
|
-
//
|
|
90
|
-
// Origin: in production runs the agent's plan repeatedly declared a target_y
|
|
91
|
-
// without checking what content lived at that pixel position, and ended up
|
|
92
|
-
// dwelling on QR codes / 投递 entries / 联系方式. The prompt-level rule
|
|
93
|
-
// requiring `target_y_content_label` has been ignored often enough that we
|
|
94
|
-
// enforce it at the tool layer instead.
|
|
95
|
-
const FORBIDDEN_REGION_PATTERNS = [
|
|
96
|
-
/二维码/, /扫码/, /扫一扫/,
|
|
97
|
-
/投递入口/, /投递方式/, /投递通道/, /投递渠道/, /报名入口/, /报名方式/,
|
|
98
|
-
/联系方式/, /联系人/, /微信号/, /\bWeChat\b/i, /\bQQ群\b/,
|
|
99
|
-
/阅读原文/, /外链/, /\bQR\b/i,
|
|
100
|
-
];
|
|
101
|
-
|
|
102
|
-
function isRecruitmentLikeUrl(url) {
|
|
103
|
-
if (typeof url !== 'string') return false;
|
|
104
|
-
return /mp\.weixin\.qq\.com/.test(url);
|
|
105
|
-
}
|
|
106
|
-
|
|
107
|
-
function describeForbiddenMatch(label) {
|
|
108
|
-
for (const pattern of FORBIDDEN_REGION_PATTERNS) {
|
|
109
|
-
if (pattern.test(label)) return pattern.source;
|
|
110
|
-
}
|
|
111
|
-
return null;
|
|
112
|
-
}
|
|
113
|
-
|
|
114
|
-
function checkSafeRegionLabels({ url, plan }) {
|
|
115
|
-
if (!isRecruitmentLikeUrl(url)) return null;
|
|
116
|
-
const segments = planSegments(plan);
|
|
117
|
-
if (!segments) return null;
|
|
118
|
-
for (let i = 0; i < segments.length; i += 1) {
|
|
119
|
-
const seg = segments[i] ?? {};
|
|
120
|
-
const label = normalizeText(seg.target_y_content_label ?? seg.targetYContentLabel ?? '');
|
|
121
|
-
if (!label) {
|
|
122
|
-
return (
|
|
123
|
-
`record_url_narration: section[${i}] is missing required field `
|
|
124
|
-
+ `\`target_y_content_label\`. For recruitment URLs (mp.weixin.qq.com / `
|
|
125
|
-
+ `校招 / 实习等) you MUST label what content lives at target_y so the `
|
|
126
|
-
+ `tool can verify it is not 二维码/投递入口/联系方式. Look at the page `
|
|
127
|
-
+ `screenshot, find what is at target_y=${seg.target_y ?? '<unset>'}, `
|
|
128
|
-
+ `and add a short label like "标题区" / "岗位信息卡片" / "公司介绍".`
|
|
129
|
-
);
|
|
130
|
-
}
|
|
131
|
-
const match = describeForbiddenMatch(label);
|
|
132
|
-
if (match) {
|
|
133
|
-
return (
|
|
134
|
-
`record_url_narration: section[${i}] target_y=${seg.target_y ?? '?'} `
|
|
135
|
-
+ `is labeled "${label}", which matches a forbidden region pattern `
|
|
136
|
-
+ `/${match}/. Recruitment content must NOT dwell on 投递入口 / 二维码 / `
|
|
137
|
-
+ `联系方式 areas. Pick a different target_y inside the 标题区 / 岗位 `
|
|
138
|
-
+ `信息卡片 / 公司介绍 area and rewrite this section.`
|
|
139
|
-
);
|
|
140
|
-
}
|
|
141
|
-
}
|
|
60
|
+
if (Array.isArray(plan.sections) && plan.sections.length > 0) return plan.sections;
|
|
142
61
|
return null;
|
|
143
62
|
}
|
|
144
63
|
|
|
@@ -156,6 +75,24 @@ export function validateRecordUrlNarrationArgs(args = {}) {
|
|
|
156
75
|
throw error;
|
|
157
76
|
}
|
|
158
77
|
|
|
78
|
+
if (!planSections(args.plan)) {
|
|
79
|
+
const error = new Error(
|
|
80
|
+
'plan.sections is required (non-empty array). Each section: { id?, text?, audio_path?, dwell_ms?, operations: [{atom, duration_ms, ...}] }.',
|
|
81
|
+
);
|
|
82
|
+
error.code = 'PLAN_SECTIONS_REQUIRED';
|
|
83
|
+
throw error;
|
|
84
|
+
}
|
|
85
|
+
|
|
86
|
+
if (!isPlainObject(args.page_understanding)) {
|
|
87
|
+
const error = new Error(
|
|
88
|
+
'page_understanding is required — call analyze_page(url) first and pass its output here. '
|
|
89
|
+
+ 'V6 uses page_understanding.unsafe_regions[] to validate scroll_to.y / cursor_focus.y, and '
|
|
90
|
+
+ 'page_understanding.preheat_strategy to align the record browser with the analyze browser.',
|
|
91
|
+
);
|
|
92
|
+
error.code = 'PAGE_UNDERSTANDING_REQUIRED';
|
|
93
|
+
throw error;
|
|
94
|
+
}
|
|
95
|
+
|
|
159
96
|
return {
|
|
160
97
|
...(args ?? {}),
|
|
161
98
|
url: normalizedUrl,
|
|
@@ -219,49 +156,22 @@ export async function runRecordUrlNarrationTool({
|
|
|
219
156
|
return toolError(`Error: ${error.message}`);
|
|
220
157
|
}
|
|
221
158
|
|
|
222
|
-
try {
|
|
223
|
-
assertPipelineCompliance(validatedInput.plan);
|
|
224
|
-
} catch (error) {
|
|
225
|
-
return toolError(`Error: ${error.message}`);
|
|
226
|
-
}
|
|
227
|
-
|
|
228
|
-
// Safe-region check for recruitment URLs — refuse plans that dwell on
|
|
229
|
-
// forbidden regions (二维码 / 投递入口 / 联系方式) before we even start
|
|
230
|
-
// Chromium. The agent must label each target_y with the content that lives
|
|
231
|
-
// there, and the labels are pattern-matched against a forbidden list.
|
|
232
|
-
const safeRegionError = checkSafeRegionLabels({
|
|
233
|
-
url: validatedInput.url,
|
|
234
|
-
plan: validatedInput.plan,
|
|
235
|
-
});
|
|
236
|
-
if (safeRegionError) {
|
|
237
|
-
return toolError(`Error: ${safeRegionError}`);
|
|
238
|
-
}
|
|
239
|
-
|
|
240
159
|
// Standard-chain hard block: refuse recordings unless plan_video_segments
|
|
241
|
-
// ran in this session.
|
|
242
|
-
//
|
|
243
|
-
//
|
|
244
|
-
// forcing a full re-record. plan_video_segments fills dwell_ms mechanically
|
|
245
|
-
// from ffprobe audio duration, eliminating the drift.
|
|
160
|
+
// ran in this session. plan_video_segments is what aligns operations[]
|
|
161
|
+
// duration sums to the per-section TTS audio duration; skipping it lets
|
|
162
|
+
// audio/visual drift accumulate across sections.
|
|
246
163
|
if (!planVideoSegmentsCalled) {
|
|
247
164
|
return toolError(
|
|
248
165
|
'Error: record_url_narration refused: plan_video_segments must run earlier in this '
|
|
249
|
-
+ 'session so
|
|
250
|
-
+ '
|
|
251
|
-
+ '
|
|
252
|
-
+ '
|
|
253
|
-
+ '
|
|
254
|
-
+ 'plan_video_segments output as plan.sections — each section\'s dwell_ms is already '
|
|
255
|
-
+ 'set to audio_duration_ms) + compose_video_v2 (same plan output). Call plan_video_segments '
|
|
256
|
-
+ 'now, then pass its `segments` array as `plan.sections` here.'
|
|
166
|
+
+ 'session so per-section operations.duration_ms is reconciled with TTS audio_duration_ms.\n\n'
|
|
167
|
+
+ 'V6 standard chain: analyze_page(url) → synthesize_tts × N (per section) → '
|
|
168
|
+
+ 'plan_video_segments(segments with text + audio_path + visual_kind + operations) → '
|
|
169
|
+
+ 'record_url_narration(url, page_understanding, plan=…, output_paths=[…]) + '
|
|
170
|
+
+ 'compose_video_v2(segments=…, variants=[…]). Call plan_video_segments now, then retry.',
|
|
257
171
|
);
|
|
258
172
|
}
|
|
259
173
|
|
|
260
174
|
try {
|
|
261
|
-
// output_paths is REQUIRED. The legacy "default output_path master file"
|
|
262
|
-
// mode is gone — agents kept defaulting to one-call-per-section because
|
|
263
|
-
// that was the lowest-friction path. Now every recording is sliced, even
|
|
264
|
-
// single-section ones (which are just a 1-element output_paths array).
|
|
265
175
|
let resolvedOutputPaths;
|
|
266
176
|
try {
|
|
267
177
|
resolvedOutputPaths = resolveOutputPaths(validatedInput.output_paths, { workspaceDir });
|
|
@@ -272,22 +182,18 @@ export async function runRecordUrlNarrationTool({
|
|
|
272
182
|
return toolError(
|
|
273
183
|
'Error: output_paths is required — one workspace-relative mp4 path per plan.sections entry. '
|
|
274
184
|
+ 'Single-section recording is a 1-element array. Multi-section recording records once '
|
|
275
|
-
+ 'continuously (one browser session
|
|
276
|
-
+ 'boundaries. See frag.short.video_synthesis_tools.',
|
|
185
|
+
+ 'continuously (one browser session) and slices the result at section boundaries.',
|
|
277
186
|
);
|
|
278
187
|
}
|
|
279
|
-
const
|
|
280
|
-
if (resolvedOutputPaths.length !==
|
|
188
|
+
const sectionCount = (planSections(validatedInput.plan) ?? []).length;
|
|
189
|
+
if (resolvedOutputPaths.length !== sectionCount) {
|
|
281
190
|
return toolError(
|
|
282
191
|
`Error: output_paths length (${resolvedOutputPaths.length}) must match `
|
|
283
|
-
+ `plan.sections length (${
|
|
192
|
+
+ `plan.sections length (${sectionCount}). Each section produces exactly one mp4 — `
|
|
284
193
|
+ `don't pad or truncate.`,
|
|
285
194
|
);
|
|
286
195
|
}
|
|
287
196
|
|
|
288
|
-
// The master / events JSON paths are agent-optional debug artifacts.
|
|
289
|
-
// Default master to a tmp path next to the first output; events default
|
|
290
|
-
// to <master>.events.json. Agent can override either if they care.
|
|
291
197
|
const { resolvedOutputPath: masterPath, resolvedEventsPath } = resolveRecordUrlNarrationPaths({
|
|
292
198
|
workspaceDir,
|
|
293
199
|
outputPath: validatedInput.output_path,
|
|
@@ -300,6 +206,7 @@ export async function runRecordUrlNarrationTool({
|
|
|
300
206
|
const recorderOutput = await recordUrlNarrationFn({
|
|
301
207
|
url: validatedInput.url,
|
|
302
208
|
plan: validatedInput.plan,
|
|
209
|
+
page_understanding: validatedInput.page_understanding,
|
|
303
210
|
output_path: masterPath,
|
|
304
211
|
events_path: resolvedEventsPath,
|
|
305
212
|
output_paths: resolvedOutputPaths,
|
|
@@ -1,18 +0,0 @@
|
|
|
1
|
-
function normalizeInteger(value, fallback = null) {
|
|
2
|
-
const parsed = Number.parseInt(String(value ?? ''), 10);
|
|
3
|
-
if (!Number.isFinite(parsed)) return fallback;
|
|
4
|
-
return parsed;
|
|
5
|
-
}
|
|
6
|
-
|
|
7
|
-
export function resolveDurationMs(phase, fallback = 0) {
|
|
8
|
-
const parsed = normalizeInteger(phase?.duration_ms, null);
|
|
9
|
-
if (parsed !== null && parsed >= 0) return parsed;
|
|
10
|
-
|
|
11
|
-
const dwellMs = normalizeInteger(phase?.dwell_ms, null);
|
|
12
|
-
if (dwellMs !== null && dwellMs >= 0) return dwellMs;
|
|
13
|
-
|
|
14
|
-
const secs = Number(phase?.duration_s);
|
|
15
|
-
if (Number.isFinite(secs) && secs >= 0) return Math.round(secs * 1000);
|
|
16
|
-
|
|
17
|
-
return fallback;
|
|
18
|
-
}
|
|
@@ -1,43 +0,0 @@
|
|
|
1
|
-
import { resolveDurationMs } from './phase-duration.js';
|
|
2
|
-
import { normalizePlanPhases } from './plan-executor.js';
|
|
3
|
-
|
|
4
|
-
export function estimatePlanDurationMs(plan = {}) {
|
|
5
|
-
let phases = [];
|
|
6
|
-
try {
|
|
7
|
-
phases = normalizePlanPhases(plan);
|
|
8
|
-
} catch {
|
|
9
|
-
phases = [];
|
|
10
|
-
}
|
|
11
|
-
|
|
12
|
-
return phases.reduce((total, phase) => {
|
|
13
|
-
const action = String(phase?.action ?? phase?.visual_action?.type ?? '').trim().toLowerCase();
|
|
14
|
-
const durationMs = resolveDurationMs(phase, Number.NaN);
|
|
15
|
-
const dwellMs = Number(phase?.dwell_ms);
|
|
16
|
-
const transitionMs = Number(phase?.transition_ms ?? phase?.visual_action?.transition_ms);
|
|
17
|
-
const effectiveHoldMs = Number.isFinite(dwellMs) && dwellMs > 0
|
|
18
|
-
? dwellMs
|
|
19
|
-
: durationMs;
|
|
20
|
-
|
|
21
|
-
if (action === 'hold' && Number.isFinite(effectiveHoldMs) && effectiveHoldMs > 0) {
|
|
22
|
-
return total + effectiveHoldMs;
|
|
23
|
-
}
|
|
24
|
-
if (action === 'linear_scroll_during') {
|
|
25
|
-
if (Number.isFinite(effectiveHoldMs) && effectiveHoldMs > 0) return total + effectiveHoldMs;
|
|
26
|
-
return total + 1200;
|
|
27
|
-
}
|
|
28
|
-
if (action === 'scroll_to_dwell' || action === 'cursor_focus' || action === 'scroll_back') {
|
|
29
|
-
let next = total;
|
|
30
|
-
if (Number.isFinite(transitionMs) && transitionMs > 0) next += transitionMs;
|
|
31
|
-
if (Number.isFinite(effectiveHoldMs) && effectiveHoldMs > 0) next += effectiveHoldMs;
|
|
32
|
-
if (next === total) next += 1200;
|
|
33
|
-
return next;
|
|
34
|
-
}
|
|
35
|
-
if (Number.isFinite(transitionMs) && transitionMs > 0) {
|
|
36
|
-
return total + transitionMs;
|
|
37
|
-
}
|
|
38
|
-
if (Number.isFinite(durationMs) && durationMs > 0) {
|
|
39
|
-
return total + durationMs;
|
|
40
|
-
}
|
|
41
|
-
return total + 800;
|
|
42
|
-
}, 0);
|
|
43
|
-
}
|