@lightcone-ai/daemon 0.22.0 → 0.23.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,17 +1,16 @@
1
- // record_url_narration atomic recording tool.
1
+ // V6 record_url_narration daemon tool wrapper.
2
2
  //
3
- // Drives Chromium on Xvfb + Playwright recordVideo to capture a silent mp4 of
4
- // a URL following a beat-by-beat visual plan, then ffmpeg-transcodes it. The
5
- // resulting silent mp4 feeds into compose_video_v2 as a video-kind segment
6
- // alongside narration audio.
3
+ // Drives Chromium on Xvfb + Playwright recordVideo to capture a silent mp4
4
+ // per section, then ffmpeg-transcodes + slices. The resulting silent mp4s
5
+ // feed compose_video_v2 as video-kind segments alongside narration audio.
7
6
  //
8
- // Lives in daemon/src/tools/ as a reusable handler module; the MCP-tool
9
- // registration lives in daemon/mcp-servers/official/media-tools/index.js.
10
- // Migrated out of chat-bridge.js (V4) — no longer wrapped by
11
- // runMandatoryLocalTool / governance round-trip. media-tools is a separate
12
- // stdio MCP server and governance integration is chat-bridge-specific;
13
- // matches the precedent set by synthesize_tts / plan_video_segments /
14
- // compose_video_v2 in V1/V2/V3.
7
+ // V6 contract (see docs/scenario-content-creation/video-synthesis-design.md):
8
+ // - page_understanding (from analyze_page) is required — drives safe-region
9
+ // check and preheat consistency
10
+ // - plan.sections each carry operations[] of atom calls; V5 fields are rejected
11
+ // - mp.weixin.qq.com-only keyword blacklist is gone; unsafe_regions from
12
+ // page_understanding is the universal safety mechanism
13
+ // - plan_video_segments must run earlier in this session (standard chain)
15
14
 
16
15
  import { mkdirSync } from 'fs';
17
16
  import path from 'path';
@@ -56,88 +55,9 @@ function deriveDurationMs(recorderOutput) {
56
55
  return lastTms > 0 ? lastTms : null;
57
56
  }
58
57
 
59
- function planSegments(plan) {
58
+ function planSections(plan) {
60
59
  if (!isPlainObject(plan)) return null;
61
- for (const key of ['phases', 'sections', 'segments']) {
62
- if (Array.isArray(plan[key]) && plan[key].length > 0) return plan[key];
63
- }
64
- return null;
65
- }
66
-
67
- function derivePhaseCount({ plan, recorderOutput }) {
68
- const explicit = normalizeNumberOrNull(recorderOutput?.phases);
69
- if (explicit != null) return explicit;
70
-
71
- const segments = planSegments(plan);
72
- return segments ? segments.length : null;
73
- }
74
-
75
- function assertPipelineCompliance(plan) {
76
- if (!isPlainObject(plan)) return;
77
- if (!planSegments(plan)) {
78
- throw new Error(
79
- 'record_url_narration: `plan` must contain a non-empty `phases` (or `sections` / `segments`) array — '
80
- + 'either hand-written or from plan_video_segments. Each entry should carry a visual action and a duration.'
81
- );
82
- }
83
- }
84
-
85
- // Forbidden region keywords for recruitment content. If a section's
86
- // target_y_content_label matches, we refuse to record — the resulting video
87
- // would show 投递入口 / 二维码 / contact info, which violates the recruitment
88
- // content policy (see fragments.md frag.short.recruitment_url_mode_policy).
89
- //
90
- // Discovered after Task #25 v1 ended up dwelling on FunPlus's QR/投递 area:
91
- // the agent's plan declared target_y=2180 with dwell_ms=8500 without checking
92
- // what content lived at that pixel position. This is a prompt-level rule
93
- // that's been ignored often enough that we enforce it at the tool layer.
94
- const FORBIDDEN_REGION_PATTERNS = [
95
- /二维码/, /扫码/, /扫一扫/,
96
- /投递入口/, /投递方式/, /投递通道/, /投递渠道/, /报名入口/, /报名方式/,
97
- /联系方式/, /联系人/, /微信号/, /\bWeChat\b/i, /\bQQ群\b/,
98
- /阅读原文/, /外链/, /\bQR\b/i,
99
- ];
100
-
101
- function isRecruitmentLikeUrl(url) {
102
- if (typeof url !== 'string') return false;
103
- return /mp\.weixin\.qq\.com/.test(url);
104
- }
105
-
106
- function describeForbiddenMatch(label) {
107
- for (const pattern of FORBIDDEN_REGION_PATTERNS) {
108
- if (pattern.test(label)) return pattern.source;
109
- }
110
- return null;
111
- }
112
-
113
- function checkSafeRegionLabels({ url, plan }) {
114
- if (!isRecruitmentLikeUrl(url)) return null;
115
- const segments = planSegments(plan);
116
- if (!segments) return null;
117
- for (let i = 0; i < segments.length; i += 1) {
118
- const seg = segments[i] ?? {};
119
- const label = normalizeText(seg.target_y_content_label ?? seg.targetYContentLabel ?? '');
120
- if (!label) {
121
- return (
122
- `record_url_narration: section[${i}] is missing required field `
123
- + `\`target_y_content_label\`. For recruitment URLs (mp.weixin.qq.com / `
124
- + `校招 / 实习等) you MUST label what content lives at target_y so the `
125
- + `tool can verify it is not 二维码/投递入口/联系方式. Look at the page `
126
- + `screenshot, find what is at target_y=${seg.target_y ?? '<unset>'}, `
127
- + `and add a short label like "标题区" / "岗位信息卡片" / "公司介绍".`
128
- );
129
- }
130
- const match = describeForbiddenMatch(label);
131
- if (match) {
132
- return (
133
- `record_url_narration: section[${i}] target_y=${seg.target_y ?? '?'} `
134
- + `is labeled "${label}", which matches a forbidden region pattern `
135
- + `/${match}/. Recruitment content must NOT dwell on 投递入口 / 二维码 / `
136
- + `联系方式 areas. Pick a different target_y inside the 标题区 / 岗位 `
137
- + `信息卡片 / 公司介绍 area and rewrite this section.`
138
- );
139
- }
140
- }
60
+ if (Array.isArray(plan.sections) && plan.sections.length > 0) return plan.sections;
141
61
  return null;
142
62
  }
143
63
 
@@ -155,6 +75,24 @@ export function validateRecordUrlNarrationArgs(args = {}) {
155
75
  throw error;
156
76
  }
157
77
 
78
+ if (!planSections(args.plan)) {
79
+ const error = new Error(
80
+ 'plan.sections is required (non-empty array). Each section: { id?, text?, audio_path?, dwell_ms?, operations: [{atom, duration_ms, ...}] }.',
81
+ );
82
+ error.code = 'PLAN_SECTIONS_REQUIRED';
83
+ throw error;
84
+ }
85
+
86
+ if (!isPlainObject(args.page_understanding)) {
87
+ const error = new Error(
88
+ 'page_understanding is required — call analyze_page(url) first and pass its output here. '
89
+ + 'V6 uses page_understanding.unsafe_regions[] to validate scroll_to.y / cursor_focus.y, and '
90
+ + 'page_understanding.preheat_strategy to align the record browser with the analyze browser.',
91
+ );
92
+ error.code = 'PAGE_UNDERSTANDING_REQUIRED';
93
+ throw error;
94
+ }
95
+
158
96
  return {
159
97
  ...(args ?? {}),
160
98
  url: normalizedUrl,
@@ -218,49 +156,22 @@ export async function runRecordUrlNarrationTool({
218
156
  return toolError(`Error: ${error.message}`);
219
157
  }
220
158
 
221
- try {
222
- assertPipelineCompliance(validatedInput.plan);
223
- } catch (error) {
224
- return toolError(`Error: ${error.message}`);
225
- }
226
-
227
- // Safe-region check for recruitment URLs — refuse plans that dwell on
228
- // forbidden regions (二维码 / 投递入口 / 联系方式) before we even start
229
- // Chromium. The agent must label each target_y with the content that lives
230
- // there, and the labels are pattern-matched against a forbidden list.
231
- const safeRegionError = checkSafeRegionLabels({
232
- url: validatedInput.url,
233
- plan: validatedInput.plan,
234
- });
235
- if (safeRegionError) {
236
- return toolError(`Error: ${safeRegionError}`);
237
- }
238
-
239
159
  // Standard-chain hard block: refuse recordings unless plan_video_segments
240
- // ran in this session. Discovered repeatedly in Tasks #20/#25/#26 that
241
- // agents hand-write dwell_ms by guessing, producing recordings whose phase
242
- // boundaries drift from the TTS audio they will eventually be paired with —
243
- // forcing a full re-record. plan_video_segments fills dwell_ms mechanically
244
- // from ffprobe audio duration, eliminating the drift.
160
+ // ran in this session. plan_video_segments is what aligns operations[]
161
+ // duration sums to the per-section TTS audio duration; skipping it lets
162
+ // audio/visual drift accumulate across sections.
245
163
  if (!planVideoSegmentsCalled) {
246
164
  return toolError(
247
165
  'Error: record_url_narration refused: plan_video_segments must run earlier in this '
248
- + 'session so dwell_ms / phase durations are mechanically aligned with the segment\'s '
249
- + 'TTS audio (audio_duration_ms). Hand-written dwell_ms has repeatedly drifted from '
250
- + 'the actual TTS duration in production runs, forcing full re-records.\n\n'
251
- + 'Standard chain: synthesize_tts × N (per segment) plan_video_segments(segments with '
252
- + 'text + audio_path + visual_kind=video + visual_path) record_url_narration (feed '
253
- + 'plan_video_segments output as plan.sections — each section\'s dwell_ms is already '
254
- + 'set to audio_duration_ms) + compose_video_v2 (same plan output). Call plan_video_segments '
255
- + 'now, then pass its `segments` array as `plan.sections` here.'
166
+ + 'session so per-section operations.duration_ms is reconciled with TTS audio_duration_ms.\n\n'
167
+ + 'V6 standard chain: analyze_page(url) → synthesize_tts × N (per section) → '
168
+ + 'plan_video_segments(segments with text + audio_path + visual_kind + operations) → '
169
+ + 'record_url_narration(url, page_understanding, plan=…, output_paths=[…]) + '
170
+ + 'compose_video_v2(segments=…, variants=[…]). Call plan_video_segments now, then retry.',
256
171
  );
257
172
  }
258
173
 
259
174
  try {
260
- // output_paths is REQUIRED. The legacy "default output_path master file"
261
- // mode is gone — agents kept defaulting to one-call-per-section because
262
- // that was the lowest-friction path. Now every recording is sliced, even
263
- // single-section ones (which are just a 1-element output_paths array).
264
175
  let resolvedOutputPaths;
265
176
  try {
266
177
  resolvedOutputPaths = resolveOutputPaths(validatedInput.output_paths, { workspaceDir });
@@ -271,22 +182,18 @@ export async function runRecordUrlNarrationTool({
271
182
  return toolError(
272
183
  'Error: output_paths is required — one workspace-relative mp4 path per plan.sections entry. '
273
184
  + 'Single-section recording is a 1-element array. Multi-section recording records once '
274
- + 'continuously (one browser session, one scrollTop) and slices the result at section '
275
- + 'boundaries. See frag.short.video_synthesis_tools.',
185
+ + 'continuously (one browser session) and slices the result at section boundaries.',
276
186
  );
277
187
  }
278
- const planSectionCount = (planSegments(validatedInput.plan) ?? []).length;
279
- if (resolvedOutputPaths.length !== planSectionCount) {
188
+ const sectionCount = (planSections(validatedInput.plan) ?? []).length;
189
+ if (resolvedOutputPaths.length !== sectionCount) {
280
190
  return toolError(
281
191
  `Error: output_paths length (${resolvedOutputPaths.length}) must match `
282
- + `plan.sections length (${planSectionCount}). Each section produces exactly one mp4 — `
192
+ + `plan.sections length (${sectionCount}). Each section produces exactly one mp4 — `
283
193
  + `don't pad or truncate.`,
284
194
  );
285
195
  }
286
196
 
287
- // The master / events JSON paths are agent-optional debug artifacts.
288
- // Default master to a tmp path next to the first output; events default
289
- // to <master>.events.json. Agent can override either if they care.
290
197
  const { resolvedOutputPath: masterPath, resolvedEventsPath } = resolveRecordUrlNarrationPaths({
291
198
  workspaceDir,
292
199
  outputPath: validatedInput.output_path,
@@ -299,6 +206,7 @@ export async function runRecordUrlNarrationTool({
299
206
  const recorderOutput = await recordUrlNarrationFn({
300
207
  url: validatedInput.url,
301
208
  plan: validatedInput.plan,
209
+ page_understanding: validatedInput.page_understanding,
302
210
  output_path: masterPath,
303
211
  events_path: resolvedEventsPath,
304
212
  output_paths: resolvedOutputPaths,
@@ -36,10 +36,10 @@ export const PART_RETRY_BASE_MS = 1_000; // 1s, 3s, 9s
36
36
  export const TERMINAL_JOB_TTL_MS = 7 * 24 * 3600 * 1000; // sweep done/dead_letter after 7 days
37
37
  export const HOUSEKEEPING_INTERVAL_MS = 6 * 3600 * 1000; // run housekeeping every 6h
38
38
  // Per-PUT timeout — Node's fetch has no overall request timeout. Without this
39
- // a stalled COS connection wedges the chunk loop forever (observed during the
40
- // first Task #25 upload: chunk 1 PUT hung 7+ minutes with no progress, no
41
- // error). 5 minutes covers slow networks for an 8MB chunk (~25kB/s floor)
42
- // while still letting failures surface to the chunk-level retry loop.
39
+ // a stalled COS connection wedges the chunk loop forever (observed in
40
+ // production: a chunk PUT hung 7+ minutes with no progress and no error).
41
+ // 5 minutes covers slow networks for an 8MB chunk (~25kB/s floor) while
42
+ // still letting failures surface to the chunk-level retry loop.
43
43
  export const PUT_REQUEST_TIMEOUT_MS = 5 * 60 * 1000;
44
44
 
45
45
  function nowIso() { return new Date().toISOString(); }
@@ -1,18 +0,0 @@
1
- function normalizeInteger(value, fallback = null) {
2
- const parsed = Number.parseInt(String(value ?? ''), 10);
3
- if (!Number.isFinite(parsed)) return fallback;
4
- return parsed;
5
- }
6
-
7
- export function resolveDurationMs(phase, fallback = 0) {
8
- const parsed = normalizeInteger(phase?.duration_ms, null);
9
- if (parsed !== null && parsed >= 0) return parsed;
10
-
11
- const dwellMs = normalizeInteger(phase?.dwell_ms, null);
12
- if (dwellMs !== null && dwellMs >= 0) return dwellMs;
13
-
14
- const secs = Number(phase?.duration_s);
15
- if (Number.isFinite(secs) && secs >= 0) return Math.round(secs * 1000);
16
-
17
- return fallback;
18
- }
@@ -1,43 +0,0 @@
1
- import { resolveDurationMs } from './phase-duration.js';
2
- import { normalizePlanPhases } from './plan-executor.js';
3
-
4
- export function estimatePlanDurationMs(plan = {}) {
5
- let phases = [];
6
- try {
7
- phases = normalizePlanPhases(plan);
8
- } catch {
9
- phases = [];
10
- }
11
-
12
- return phases.reduce((total, phase) => {
13
- const action = String(phase?.action ?? phase?.visual_action?.type ?? '').trim().toLowerCase();
14
- const durationMs = resolveDurationMs(phase, Number.NaN);
15
- const dwellMs = Number(phase?.dwell_ms);
16
- const transitionMs = Number(phase?.transition_ms ?? phase?.visual_action?.transition_ms);
17
- const effectiveHoldMs = Number.isFinite(dwellMs) && dwellMs > 0
18
- ? dwellMs
19
- : durationMs;
20
-
21
- if (action === 'hold' && Number.isFinite(effectiveHoldMs) && effectiveHoldMs > 0) {
22
- return total + effectiveHoldMs;
23
- }
24
- if (action === 'linear_scroll_during') {
25
- if (Number.isFinite(effectiveHoldMs) && effectiveHoldMs > 0) return total + effectiveHoldMs;
26
- return total + 1200;
27
- }
28
- if (action === 'scroll_to_dwell' || action === 'cursor_focus' || action === 'scroll_back') {
29
- let next = total;
30
- if (Number.isFinite(transitionMs) && transitionMs > 0) next += transitionMs;
31
- if (Number.isFinite(effectiveHoldMs) && effectiveHoldMs > 0) next += effectiveHoldMs;
32
- if (next === total) next += 1200;
33
- return next;
34
- }
35
- if (Number.isFinite(transitionMs) && transitionMs > 0) {
36
- return total + transitionMs;
37
- }
38
- if (Number.isFinite(durationMs) && durationMs > 0) {
39
- return total + durationMs;
40
- }
41
- return total + 800;
42
- }, 0);
43
- }