@lightcone-ai/daemon 0.22.1 → 0.23.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,17 +1,16 @@
1
- // record_url_narration atomic recording tool.
1
+ // V6 record_url_narration daemon tool wrapper.
2
2
  //
3
- // Drives Chromium on Xvfb + Playwright recordVideo to capture a silent mp4 of
4
- // a URL following a beat-by-beat visual plan, then ffmpeg-transcodes it. The
5
- // resulting silent mp4 feeds into compose_video_v2 as a video-kind segment
6
- // alongside narration audio.
3
+ // Drives Chromium on Xvfb + Playwright recordVideo to capture a silent mp4
4
+ // per section, then ffmpeg-transcodes + slices. The resulting silent mp4s
5
+ // feed compose_video_v2 as video-kind segments alongside narration audio.
7
6
  //
8
- // Lives in daemon/src/tools/ as a reusable handler module; the MCP-tool
9
- // registration lives in daemon/mcp-servers/official/media-tools/index.js.
10
- // Migrated out of chat-bridge.js (V4) — no longer wrapped by
11
- // runMandatoryLocalTool / governance round-trip. media-tools is a separate
12
- // stdio MCP server and governance integration is chat-bridge-specific;
13
- // matches the precedent set by synthesize_tts / plan_video_segments /
14
- // compose_video_v2 in V1/V2/V3.
7
+ // V6 contract (see docs/scenario-content-creation/video-synthesis-design.md):
8
+ // - page_understanding (from analyze_page) is required — drives safe-region
9
+ // check and preheat consistency
10
+ // - plan.sections each carry operations[] of atom calls; V5 fields are rejected
11
+ // - mp.weixin.qq.com-only keyword blacklist is gone — unsafe_regions from
12
+ // page_understanding is the universal safety mechanism
13
+ // - plan_video_segments must run earlier in this session (standard chain)
15
14
 
16
15
  import { mkdirSync } from 'fs';
17
16
  import path from 'path';
@@ -56,89 +55,9 @@ function deriveDurationMs(recorderOutput) {
56
55
  return lastTms > 0 ? lastTms : null;
57
56
  }
58
57
 
59
- function planSegments(plan) {
58
+ function planSections(plan) {
60
59
  if (!isPlainObject(plan)) return null;
61
- for (const key of ['phases', 'sections', 'segments']) {
62
- if (Array.isArray(plan[key]) && plan[key].length > 0) return plan[key];
63
- }
64
- return null;
65
- }
66
-
67
- function derivePhaseCount({ plan, recorderOutput }) {
68
- const explicit = normalizeNumberOrNull(recorderOutput?.phases);
69
- if (explicit != null) return explicit;
70
-
71
- const segments = planSegments(plan);
72
- return segments ? segments.length : null;
73
- }
74
-
75
- function assertPipelineCompliance(plan) {
76
- if (!isPlainObject(plan)) return;
77
- if (!planSegments(plan)) {
78
- throw new Error(
79
- 'record_url_narration: `plan` must contain a non-empty `phases` (or `sections` / `segments`) array — '
80
- + 'either hand-written or from plan_video_segments. Each entry should carry a visual action and a duration.'
81
- );
82
- }
83
- }
84
-
85
- // Forbidden region keywords for recruitment content. If a section's
86
- // target_y_content_label matches, we refuse to record — the resulting video
87
- // would show 投递入口 / 二维码 / contact info, which violates the recruitment
88
- // content policy (see fragments.md frag.short.recruitment_url_mode_policy).
89
- //
90
- // Origin: in production runs the agent's plan repeatedly declared a target_y
91
- // without checking what content lived at that pixel position, and ended up
92
- // dwelling on QR codes / 投递 entries / 联系方式. The prompt-level rule
93
- // requiring `target_y_content_label` has been ignored often enough that we
94
- // enforce it at the tool layer instead.
95
- const FORBIDDEN_REGION_PATTERNS = [
96
- /二维码/, /扫码/, /扫一扫/,
97
- /投递入口/, /投递方式/, /投递通道/, /投递渠道/, /报名入口/, /报名方式/,
98
- /联系方式/, /联系人/, /微信号/, /\bWeChat\b/i, /\bQQ群\b/,
99
- /阅读原文/, /外链/, /\bQR\b/i,
100
- ];
101
-
102
- function isRecruitmentLikeUrl(url) {
103
- if (typeof url !== 'string') return false;
104
- return /mp\.weixin\.qq\.com/.test(url);
105
- }
106
-
107
- function describeForbiddenMatch(label) {
108
- for (const pattern of FORBIDDEN_REGION_PATTERNS) {
109
- if (pattern.test(label)) return pattern.source;
110
- }
111
- return null;
112
- }
113
-
114
- function checkSafeRegionLabels({ url, plan }) {
115
- if (!isRecruitmentLikeUrl(url)) return null;
116
- const segments = planSegments(plan);
117
- if (!segments) return null;
118
- for (let i = 0; i < segments.length; i += 1) {
119
- const seg = segments[i] ?? {};
120
- const label = normalizeText(seg.target_y_content_label ?? seg.targetYContentLabel ?? '');
121
- if (!label) {
122
- return (
123
- `record_url_narration: section[${i}] is missing required field `
124
- + `\`target_y_content_label\`. For recruitment URLs (mp.weixin.qq.com / `
125
- + `校招 / 实习等) you MUST label what content lives at target_y so the `
126
- + `tool can verify it is not 二维码/投递入口/联系方式. Look at the page `
127
- + `screenshot, find what is at target_y=${seg.target_y ?? '<unset>'}, `
128
- + `and add a short label like "标题区" / "岗位信息卡片" / "公司介绍".`
129
- );
130
- }
131
- const match = describeForbiddenMatch(label);
132
- if (match) {
133
- return (
134
- `record_url_narration: section[${i}] target_y=${seg.target_y ?? '?'} `
135
- + `is labeled "${label}", which matches a forbidden region pattern `
136
- + `/${match}/. Recruitment content must NOT dwell on 投递入口 / 二维码 / `
137
- + `联系方式 areas. Pick a different target_y inside the 标题区 / 岗位 `
138
- + `信息卡片 / 公司介绍 area and rewrite this section.`
139
- );
140
- }
141
- }
60
+ if (Array.isArray(plan.sections) && plan.sections.length > 0) return plan.sections;
142
61
  return null;
143
62
  }
144
63
 
@@ -156,6 +75,24 @@ export function validateRecordUrlNarrationArgs(args = {}) {
156
75
  throw error;
157
76
  }
158
77
 
78
+ if (!planSections(args.plan)) {
79
+ const error = new Error(
80
+ 'plan.sections is required (non-empty array). Each section: { id?, text?, audio_path?, dwell_ms?, operations: [{atom, duration_ms, ...}] }.',
81
+ );
82
+ error.code = 'PLAN_SECTIONS_REQUIRED';
83
+ throw error;
84
+ }
85
+
86
+ if (!isPlainObject(args.page_understanding)) {
87
+ const error = new Error(
88
+ 'page_understanding is required — call analyze_page(url) first and pass its output here. '
89
+ + 'V6 uses page_understanding.unsafe_regions[] to validate scroll_to.y / cursor_focus.y, and '
90
+ + 'page_understanding.preheat_strategy to align the record browser with the analyze browser.',
91
+ );
92
+ error.code = 'PAGE_UNDERSTANDING_REQUIRED';
93
+ throw error;
94
+ }
95
+
159
96
  return {
160
97
  ...(args ?? {}),
161
98
  url: normalizedUrl,
@@ -219,49 +156,22 @@ export async function runRecordUrlNarrationTool({
219
156
  return toolError(`Error: ${error.message}`);
220
157
  }
221
158
 
222
- try {
223
- assertPipelineCompliance(validatedInput.plan);
224
- } catch (error) {
225
- return toolError(`Error: ${error.message}`);
226
- }
227
-
228
- // Safe-region check for recruitment URLs — refuse plans that dwell on
229
- // forbidden regions (二维码 / 投递入口 / 联系方式) before we even start
230
- // Chromium. The agent must label each target_y with the content that lives
231
- // there, and the labels are pattern-matched against a forbidden list.
232
- const safeRegionError = checkSafeRegionLabels({
233
- url: validatedInput.url,
234
- plan: validatedInput.plan,
235
- });
236
- if (safeRegionError) {
237
- return toolError(`Error: ${safeRegionError}`);
238
- }
239
-
240
159
  // Standard-chain hard block: refuse recordings unless plan_video_segments
241
- // ran in this session. Discovered repeatedly in Tasks #20/#25/#26 that
242
- // agents hand-write dwell_ms by guessing, producing recordings whose phase
243
- // boundaries drift from the TTS audio they will eventually be paired with —
244
- // forcing a full re-record. plan_video_segments fills dwell_ms mechanically
245
- // from ffprobe audio duration, eliminating the drift.
160
+ // ran in this session. plan_video_segments is what aligns operations[]
161
+ // duration sums to the per-section TTS audio duration; skipping it lets
162
+ // audio/visual drift accumulate across sections.
246
163
  if (!planVideoSegmentsCalled) {
247
164
  return toolError(
248
165
  'Error: record_url_narration refused: plan_video_segments must run earlier in this '
249
- + 'session so dwell_ms / phase durations are mechanically aligned with the segment\'s '
250
- + 'TTS audio (audio_duration_ms). Hand-written dwell_ms has repeatedly drifted from '
251
- + 'the actual TTS duration in production runs, forcing full re-records.\n\n'
252
- + 'Standard chain: synthesize_tts × N (per segment) plan_video_segments(segments with '
253
- + 'text + audio_path + visual_kind=video + visual_path) record_url_narration (feed '
254
- + 'plan_video_segments output as plan.sections — each section\'s dwell_ms is already '
255
- + 'set to audio_duration_ms) + compose_video_v2 (same plan output). Call plan_video_segments '
256
- + 'now, then pass its `segments` array as `plan.sections` here.'
166
+ + 'session so per-section operations.duration_ms is reconciled with TTS audio_duration_ms.\n\n'
167
+ + 'V6 standard chain: analyze_page(url) synthesize_tts × N (per section) '
168
+ + 'plan_video_segments(segments with text + audio_path + visual_kind + operations) '
169
+ + 'record_url_narration(url, page_understanding, plan=…, output_paths=[…]) + '
170
+ + 'compose_video_v2(segments=…, variants=[…]). Call plan_video_segments now, then retry.',
257
171
  );
258
172
  }
259
173
 
260
174
  try {
261
- // output_paths is REQUIRED. The legacy "default output_path master file"
262
- // mode is gone — agents kept defaulting to one-call-per-section because
263
- // that was the lowest-friction path. Now every recording is sliced, even
264
- // single-section ones (which are just a 1-element output_paths array).
265
175
  let resolvedOutputPaths;
266
176
  try {
267
177
  resolvedOutputPaths = resolveOutputPaths(validatedInput.output_paths, { workspaceDir });
@@ -272,22 +182,18 @@ export async function runRecordUrlNarrationTool({
272
182
  return toolError(
273
183
  'Error: output_paths is required — one workspace-relative mp4 path per plan.sections entry. '
274
184
  + 'Single-section recording is a 1-element array. Multi-section recording records once '
275
- + 'continuously (one browser session, one scrollTop) and slices the result at section '
276
- + 'boundaries. See frag.short.video_synthesis_tools.',
185
+ + 'continuously (one browser session) and slices the result at section boundaries.',
277
186
  );
278
187
  }
279
- const planSectionCount = (planSegments(validatedInput.plan) ?? []).length;
280
- if (resolvedOutputPaths.length !== planSectionCount) {
188
+ const sectionCount = (planSections(validatedInput.plan) ?? []).length;
189
+ if (resolvedOutputPaths.length !== sectionCount) {
281
190
  return toolError(
282
191
  `Error: output_paths length (${resolvedOutputPaths.length}) must match `
283
- + `plan.sections length (${planSectionCount}). Each section produces exactly one mp4 — `
192
+ + `plan.sections length (${sectionCount}). Each section produces exactly one mp4 — `
284
193
  + `don't pad or truncate.`,
285
194
  );
286
195
  }
287
196
 
288
- // The master / events JSON paths are agent-optional debug artifacts.
289
- // Default master to a tmp path next to the first output; events default
290
- // to <master>.events.json. Agent can override either if they care.
291
197
  const { resolvedOutputPath: masterPath, resolvedEventsPath } = resolveRecordUrlNarrationPaths({
292
198
  workspaceDir,
293
199
  outputPath: validatedInput.output_path,
@@ -300,6 +206,7 @@ export async function runRecordUrlNarrationTool({
300
206
  const recorderOutput = await recordUrlNarrationFn({
301
207
  url: validatedInput.url,
302
208
  plan: validatedInput.plan,
209
+ page_understanding: validatedInput.page_understanding,
303
210
  output_path: masterPath,
304
211
  events_path: resolvedEventsPath,
305
212
  output_paths: resolvedOutputPaths,
@@ -1,18 +0,0 @@
1
- function normalizeInteger(value, fallback = null) {
2
- const parsed = Number.parseInt(String(value ?? ''), 10);
3
- if (!Number.isFinite(parsed)) return fallback;
4
- return parsed;
5
- }
6
-
7
- export function resolveDurationMs(phase, fallback = 0) {
8
- const parsed = normalizeInteger(phase?.duration_ms, null);
9
- if (parsed !== null && parsed >= 0) return parsed;
10
-
11
- const dwellMs = normalizeInteger(phase?.dwell_ms, null);
12
- if (dwellMs !== null && dwellMs >= 0) return dwellMs;
13
-
14
- const secs = Number(phase?.duration_s);
15
- if (Number.isFinite(secs) && secs >= 0) return Math.round(secs * 1000);
16
-
17
- return fallback;
18
- }
@@ -1,43 +0,0 @@
1
- import { resolveDurationMs } from './phase-duration.js';
2
- import { normalizePlanPhases } from './plan-executor.js';
3
-
4
- export function estimatePlanDurationMs(plan = {}) {
5
- let phases = [];
6
- try {
7
- phases = normalizePlanPhases(plan);
8
- } catch {
9
- phases = [];
10
- }
11
-
12
- return phases.reduce((total, phase) => {
13
- const action = String(phase?.action ?? phase?.visual_action?.type ?? '').trim().toLowerCase();
14
- const durationMs = resolveDurationMs(phase, Number.NaN);
15
- const dwellMs = Number(phase?.dwell_ms);
16
- const transitionMs = Number(phase?.transition_ms ?? phase?.visual_action?.transition_ms);
17
- const effectiveHoldMs = Number.isFinite(dwellMs) && dwellMs > 0
18
- ? dwellMs
19
- : durationMs;
20
-
21
- if (action === 'hold' && Number.isFinite(effectiveHoldMs) && effectiveHoldMs > 0) {
22
- return total + effectiveHoldMs;
23
- }
24
- if (action === 'linear_scroll_during') {
25
- if (Number.isFinite(effectiveHoldMs) && effectiveHoldMs > 0) return total + effectiveHoldMs;
26
- return total + 1200;
27
- }
28
- if (action === 'scroll_to_dwell' || action === 'cursor_focus' || action === 'scroll_back') {
29
- let next = total;
30
- if (Number.isFinite(transitionMs) && transitionMs > 0) next += transitionMs;
31
- if (Number.isFinite(effectiveHoldMs) && effectiveHoldMs > 0) next += effectiveHoldMs;
32
- if (next === total) next += 1200;
33
- return next;
34
- }
35
- if (Number.isFinite(transitionMs) && transitionMs > 0) {
36
- return total + transitionMs;
37
- }
38
- if (Number.isFinite(durationMs) && durationMs > 0) {
39
- return total + durationMs;
40
- }
41
- return total + 800;
42
- }, 0);
43
- }