@lightcone-ai/daemon 0.18.1 → 0.20.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -348,7 +348,7 @@ server.tool(
348
348
  // audio in production runs (Tasks #20/#25/#26), forcing re-records.
349
349
  server.tool(
350
350
  'record_url_narration',
351
- 'Record a silent video of a URL by driving Chromium on an Xvfb display and capturing it with Playwright recordVideo, driven by a video plan; ffmpeg then transcodes the recording to mp4. Outputs a silent mp4 that can be passed to compose_video_v2 as a video-kind segment with an audio_path for narration.\n\nUse this as the canonical recording step for URL-narration videos. Falls back: if the page needs interactions outside the visual_action vocabulary (clicks, waits, OCR loops), use Monitor (Bash) with custom Playwright instead.\n\nMUST be preceded by plan_video_segments in the same session — feed plan_video_segments\'s `segments` array as `plan.sections` so dwell_ms aligns mechanically with TTS audio_duration_ms (hand-written dwell_ms has drifted and forced re-records in production).\n\nRuntime requirements: this tool only works on a Linux daemon machine with Xvfb + Chromium + ffmpeg installed (ffmpeg is used to transcode the recording to mp4; no x11grab device support needed). macOS / Windows daemons will fail at startup.',
351
+ 'Record a silent video of a URL by driving Chromium on an Xvfb display and capturing it with Playwright recordVideo, driven by a video plan; ffmpeg then transcodes the recording to mp4. Outputs a silent mp4 that can be passed to compose_video_v2 as a video-kind segment with an audio_path for narration.\n\nUse this as the canonical recording step for URL-narration videos. Falls back: if the page needs interactions outside the visual_action vocabulary (clicks, waits, OCR loops), use Monitor (Bash) with custom Playwright instead.\n\nMUST be preceded by plan_video_segments in the same session — feed plan_video_segments\'s `segments` array as `plan.sections` so dwell_ms aligns mechanically with TTS audio_duration_ms (hand-written dwell_ms has drifted and forced re-records in production).\n\nMULTI-SECTION OUTPUT (recommended for any URL with ≥2 sections): pass `output_paths` as an array with one path per plan.sections entry. The tool records the URL ONCE continuously (one browser session, one scrollTop, natural scroll flow through all sections), then slices the recording at section boundaries via ffmpeg. This avoids the per-segment scroll-back-to-top reset that happens when the agent splits N sections into N separate record_url_narration calls — that pattern reopens the browser and re-navigates for each segment, which looks visually disjointed even though the per-segment timing is correct.\n\nRuntime requirements: this tool only works on a Linux daemon machine with Xvfb + Chromium + ffmpeg installed (ffmpeg is used to transcode the recording to mp4; no x11grab device support needed). macOS / Windows daemons will fail at startup.',
352
352
  {
353
353
  url: z.string().describe('Page URL to record'),
354
354
  plan: z.record(z.any()).describe(
@@ -367,7 +367,8 @@ server.tool(
367
367
  + 'frag.short.recruitment_url_mode_policy). Pick a different target_y in the 标题/岗位 '
368
368
  + 'information area and rewrite that section.'
369
369
  ),
370
- output_path: z.string().optional().describe('Workspace-relative output mp4 path. Default tmp/wx3_video/recorded-{ts}.mp4'),
370
+ output_path: z.string().optional().describe('Workspace-relative output mp4 path for the CONSOLIDATED master recording. Default tmp/wx3_video/recorded-{ts}.mp4. When output_paths is also provided, this still receives the full continuous recording for verification/debugging.'),
371
+ output_paths: z.array(z.string()).optional().describe('Multi-section output mode. Pass an array of N workspace-relative paths matching plan.sections length. The tool records ONCE continuously then slices the result into N mp4s at section boundaries (derived from phase_start / phase_end events). RECOMMENDED whenever a URL has ≥2 sections — keeps visual flow natural between sections instead of reopening the browser per segment.'),
371
372
  events_path: z.string().optional().describe('Workspace-relative events.json path. Default ${output_path}.events.json'),
372
373
  viewport: z.object({
373
374
  width: z.number().optional(),
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@lightcone-ai/daemon",
3
- "version": "0.18.1",
3
+ "version": "0.20.0",
4
4
  "type": "module",
5
5
  "main": "src/index.js",
6
6
  "bin": {
@@ -215,12 +215,128 @@ async function transcodeWebmToMp4({
215
215
  });
216
216
  }
217
217
 
218
/**
 * Cut a slice out of an mp4 by re-encoding it with ffmpeg.
 *
 * The slice is re-encoded (libx264, preset=veryfast, yuv420p, no audio)
 * rather than stream-copied: `-c copy` can only cut on keyframes, which can
 * drift by several seconds with libx264's default ~250-frame GOP. Re-encoding
 * trades a bit of CPU for cuts that line up with the per-segment TTS audio the
 * rest of the pipeline expects.
 *
 * @param {object} [options]
 * @param {string} options.inputPath - Source mp4 to read.
 * @param {string} options.outputPath - Destination mp4 (overwritten, `-y`).
 * @param {number} options.startMs - Slice start in milliseconds; negative or
 *   non-numeric values are treated as 0.
 * @param {number} options.durationMs - Slice length in milliseconds; values
 *   below 50 ms (including 0 / non-numeric) are raised to 50 ms.
 * @param {number} [options.fps=DEFAULT_FPS] - Output frame rate; `-r` is only
 *   passed when this is a finite number greater than 0.
 * @param {string} [options.ffmpegBin='ffmpeg'] - ffmpeg executable to spawn.
 * @returns {Promise<void>} Resolves when ffmpeg exits with code 0.
 * @throws {Error} `code === 'FFMPEG_SPAWN_FAILED'` when the process cannot be
 *   started; `code === 'FFMPEG_CUT_FAILED'` when ffmpeg exits non-zero (the
 *   message carries the last 2000 characters of ffmpeg's stderr).
 */
async function cutMp4Slice({
  inputPath,
  outputPath,
  startMs,
  durationMs,
  fps = DEFAULT_FPS,
  ffmpegBin = 'ffmpeg',
} = {}) {
  const startSec = Math.max(0, Number(startMs) || 0) / 1000;
  // Clamp AFTER converting to seconds. Clamping the millisecond value at 0.05
  // (as this code used to) floors the duration at 0.00005 s, which toFixed(3)
  // renders as "-t 0.000" — i.e. an empty slice — instead of a 50 ms minimum.
  const durationSec = Math.max(0.05, (Number(durationMs) || 0) / 1000);

  const fpsNumber = Number(fps);
  const frameRateArgs = Number.isFinite(fpsNumber) && fpsNumber > 0
    ? ['-r', String(fps)]
    : [];

  const args = [
    '-y',
    '-i', inputPath,
    // -ss / -t are placed after -i, so they apply to the output side.
    '-ss', startSec.toFixed(3),
    '-t', durationSec.toFixed(3),
    '-an',
    '-c:v', 'libx264',
    '-preset', 'veryfast',
    '-pix_fmt', 'yuv420p',
    ...frameRateArgs,
    '-movflags', '+faststart',
    outputPath,
  ];

  await new Promise((resolve, reject) => {
    const proc = spawn(ffmpegBin, args, { stdio: ['ignore', 'pipe', 'pipe'] });
    // Keep stderr so a failure can report what ffmpeg complained about.
    const errChunks = [];
    proc.stderr?.on('data', (chunk) => errChunks.push(chunk));
    proc.once('error', (err) => {
      const wrapped = new Error(`ffmpeg_spawn_failed:${err.message}`);
      wrapped.code = 'FFMPEG_SPAWN_FAILED';
      reject(wrapped);
    });
    proc.on('close', (code) => {
      if (code === 0) {
        resolve();
        return;
      }
      const stderrTail = Buffer.concat(errChunks).toString().slice(-2000);
      const wrapped = new Error(`ffmpeg_cut_failed:code=${code}: ${stderrTail}`);
      wrapped.code = 'FFMPEG_CUT_FAILED';
      reject(wrapped);
    });
  });
}
266
+
267
/**
 * Derive per-section cut points from the recorder's events log.
 *
 * phase_start.t_ms / phase_end.t_ms are recorded against the trimmed mp4
 * timeline (head trim already happened), so they are used as-is.
 *
 * For each phase the FIRST `phase_start` and the LAST `phase_end` win. Phases
 * are returned in the order their first `phase_start` appears in the log.
 *
 * @param {Array<object>} eventsLog - Events with `action`, `phase_id`, `t_ms`.
 * @param {number} phaseCount - Number of phases the plan expects.
 * @returns {Array<{phase_id: *, start_ms: number, end_ms: number, duration_ms: number}>}
 * @throws {Error} `events_log_empty` when the log is missing or empty;
 *   `events_phase_count_mismatch:…` when the number of started phases differs
 *   from `phaseCount`; `phase_timing_missing:<id>` when a phase lacks a finite
 *   start or end; `phase_timing_invalid:…` when end is not after start.
 */
function deriveSectionCutPoints(eventsLog, phaseCount) {
  if (!Array.isArray(eventsLog) || eventsLog.length === 0) {
    throw new Error('events_log_empty');
  }

  // A numeric id of 0 is a legitimate phase id; a bare truthiness check would
  // discard its timings while still counting the phase, which surfaced as a
  // spurious `phase_timing_missing:0`. One rule is applied everywhere instead.
  const isUsablePhaseId = (id) => Boolean(id) || id === 0;

  // Set keeps first-insertion order, giving plan ordering with O(1) membership.
  const orderedIds = new Set();
  const starts = new Map();
  const ends = new Map();

  for (const ev of eventsLog) {
    if (!ev || typeof ev !== 'object') continue;
    const id = ev.phase_id;
    // Events without an id cannot be attributed to a section; they are not
    // counted as a phase either.
    if (!isUsablePhaseId(id)) continue;

    // Register the phase even when its timestamp is unusable so the caller
    // gets the specific `phase_timing_missing:<id>` error below.
    if (ev.action === 'phase_start') orderedIds.add(id);

    const t = Number(ev.t_ms);
    if (!Number.isFinite(t)) continue;
    if (ev.action === 'phase_start' && !starts.has(id)) starts.set(id, t);
    if (ev.action === 'phase_end') ends.set(id, t);
  }

  if (orderedIds.size !== phaseCount) {
    throw new Error(`events_phase_count_mismatch:expected=${phaseCount}:got=${orderedIds.size}`);
  }

  return [...orderedIds].map((id) => {
    const startMs = starts.get(id);
    const endMs = ends.get(id);
    if (!Number.isFinite(startMs) || !Number.isFinite(endMs)) {
      throw new Error(`phase_timing_missing:${id}`);
    }
    if (endMs <= startMs) {
      throw new Error(`phase_timing_invalid:${id}:start=${startMs}:end=${endMs}`);
    }
    return { phase_id: id, start_ms: startMs, end_ms: endMs, duration_ms: endMs - startMs };
  });
}
307
+
308
/**
 * Validate and resolve the optional per-section output path list.
 *
 * @param {Array<string>|null|undefined} rawList - Caller-supplied paths.
 * @returns {Array<string>|null} Absolute paths (via `path.resolve`), or `null`
 *   when the list is absent or empty (single-output mode).
 * @throws {Error} `code === 'OUTPUT_PATHS_MUST_BE_ARRAY'` for a non-array;
 *   `code === 'OUTPUT_PATHS_ENTRY_EMPTY'` when an entry normalizes to an empty
 *   value; `code === 'OUTPUT_PATHS_DUPLICATE'` when two entries resolve to the
 *   same file.
 */
function normalizeOutputPaths(rawList) {
  if (rawList == null) return null;
  if (!Array.isArray(rawList)) {
    const error = new Error('output_paths_must_be_array');
    error.code = 'OUTPUT_PATHS_MUST_BE_ARRAY';
    throw error;
  }
  if (rawList.length === 0) return null;

  // Each section is written to its own file; two entries pointing at the same
  // file would let a later slice silently overwrite an earlier one.
  const seen = new Set();
  return rawList.map((entry, idx) => {
    // normalizeText is defined elsewhere in this module — presumably it trims
    // and yields '' for non-string input; verify against its definition.
    const normalized = normalizeText(entry);
    if (!normalized) {
      const error = new Error(`output_paths[${idx}]_empty`);
      error.code = 'OUTPUT_PATHS_ENTRY_EMPTY';
      throw error;
    }
    const resolved = path.resolve(normalized);
    if (seen.has(resolved)) {
      const error = new Error(`output_paths[${idx}]_duplicate:${resolved}`);
      error.code = 'OUTPUT_PATHS_DUPLICATE';
      throw error;
    }
    seen.add(resolved);
    return resolved;
  });
}
326
+
218
327
  export async function recordUrlNarration({
219
328
  plan,
220
329
  output_path,
221
330
  outputPath = output_path,
222
331
  events_path,
223
332
  eventsPath = events_path,
333
+ // Multi-section output: pass an array of N paths matching plan.sections length
334
+ // to record once continuously and slice the result into N per-section mp4s.
335
+ // The browser stays open for the whole recording, so visuals flow naturally
336
+ // between sections (no scroll-back-to-top between each, no page reload). When
337
+ // omitted, behaves exactly like before — single mp4 at outputPath.
338
+ output_paths,
339
+ outputPaths = output_paths,
224
340
  url,
225
341
  viewport = DEFAULT_VIEWPORT,
226
342
  fps = DEFAULT_FPS,
@@ -234,6 +350,7 @@ export async function recordUrlNarration({
234
350
  launchChromiumFn = launchChromiumMobile,
235
351
  openPageFn = openPageAndSettle,
236
352
  transcodeFn = transcodeWebmToMp4,
353
+ cutFn = cutMp4Slice,
237
354
  nowMs = () => Date.now(),
238
355
  } = {}) {
239
356
  const zoom = Number.isFinite(Number(page_zoom)) && Number(page_zoom) > 0 ? Number(page_zoom) : 1.1;
@@ -249,6 +366,23 @@ export async function recordUrlNarration({
249
366
  const resolvedUrl = resolveUrl({ url, plan });
250
367
  const normalizedViewport = normalizeViewport(viewport);
251
368
  const normalizedFps = normalizeInteger(fps, DEFAULT_FPS);
369
+ const resolvedOutputPaths = normalizeOutputPaths(outputPaths);
370
+ // When multi-section output is requested, the count must match plan.sections
371
+ // 1:1 — otherwise the agent will end up with audio/visual misalignment when
372
+ // it feeds these into plan_video_segments. Fail loud rather than silently
373
+ // truncating or padding.
374
+ if (resolvedOutputPaths && resolvedOutputPaths.length !== phases.length) {
375
+ const error = new Error(
376
+ `output_paths_count_mismatch:expected=${phases.length}:got=${resolvedOutputPaths.length}`,
377
+ );
378
+ error.code = 'OUTPUT_PATHS_COUNT_MISMATCH';
379
+ throw error;
380
+ }
381
+ if (resolvedOutputPaths) {
382
+ for (const p of resolvedOutputPaths) {
383
+ mkdirSync(path.dirname(p), { recursive: true });
384
+ }
385
+ }
252
386
 
253
387
  mkdirSync(path.dirname(resolvedOutputPath), { recursive: true });
254
388
  mkdirSync(path.dirname(resolvedEventsPath), { recursive: true });
@@ -367,12 +501,48 @@ export async function recordUrlNarration({
367
501
  ? eventsLog.reduce((max, ev) => Math.max(max, Number(ev?.t_ms) || 0), 0)
368
502
  : 0;
369
503
 
504
+ // Multi-section output: slice the consolidated mp4 at section boundaries
505
+ // (derived from phase_start / phase_end events). All slices come from the
506
+ // SAME continuous recording, so the visual flow between sections stays
507
+ // natural — no browser reload, no scroll-back-to-top per segment.
508
+ let sectionOutputs = null;
509
+ if (resolvedOutputPaths) {
510
+ const cutPoints = deriveSectionCutPoints(eventsLog, phases.length);
511
+ sectionOutputs = [];
512
+ for (let i = 0; i < cutPoints.length; i += 1) {
513
+ const cut = cutPoints[i];
514
+ const outPath = resolvedOutputPaths[i];
515
+ await cutFn({
516
+ inputPath: resolvedOutputPath,
517
+ outputPath: outPath,
518
+ startMs: cut.start_ms,
519
+ durationMs: cut.duration_ms,
520
+ fps: normalizedFps,
521
+ });
522
+ const sliceStat = await stat(outPath);
523
+ if (!sliceStat.isFile() || sliceStat.size <= 0) {
524
+ const error = new Error(`section_slice_empty:${outPath}`);
525
+ error.code = 'SECTION_SLICE_EMPTY';
526
+ throw error;
527
+ }
528
+ sectionOutputs.push({
529
+ phase_id: cut.phase_id,
530
+ video_path: outPath,
531
+ start_ms: cut.start_ms,
532
+ end_ms: cut.end_ms,
533
+ duration_ms: cut.duration_ms,
534
+ size_bytes: Number(sliceStat.size ?? 0),
535
+ });
536
+ }
537
+ }
538
+
370
539
  return {
371
540
  video_path: resolvedOutputPath,
372
541
  events_path: resolvedEventsPath,
373
542
  events_log: eventsLog,
374
543
  duration_ms: lastTms > 0 ? lastTms : null,
375
544
  display,
545
+ sections: sectionOutputs,
376
546
  };
377
547
  } catch (error) {
378
548
  primaryError = error;
@@ -1,4 +1,5 @@
1
1
  import { resolveDurationMs } from './phase-duration.js';
2
+ import { humanizedScroll } from '../humanized-scroll.js';
2
3
 
3
4
  function normalizeText(value) {
4
5
  if (typeof value !== 'string') return '';
@@ -228,13 +229,26 @@ function resolveFromY(phase, fallback = null) {
228
229
  return Math.round(parsed);
229
230
  }
230
231
 
232
+ // Delegates to humanizedScroll, which dispatches real CDP touch events so
233
+ // the browser's gesture engine produces native scroll physics (rubber-band,
234
+ // fling inertia, compositor-paced repaints). The old implementation drove
235
+ // `root.scrollTo(...)` in a setTimeout loop inside page.evaluate — visually
236
+ // smooth in isolation, but bypassed the gesture pipeline entirely, which is
237
+ // what made scrolls feel "robotic" on recordings (see the
238
+ // `not natural` thread in docs/scenario-content-creation discussion).
239
+ //
240
+ // `minSteps` is no longer needed (humanizedScroll computes segments from
241
+ // distance + duration). `jitterPx` is forwarded as `pixel_jitter_px`, which
242
+ // humanizedScroll converts into per-touchMove vertical offset.
231
243
  async function animateScroll(page, {
232
244
  startY = null,
233
245
  targetY,
234
246
  durationMs,
235
247
  easing = 'easeInOutQuad',
236
248
  jitterPx = 0,
237
- minSteps = 10,
249
+ // minSteps is accepted but unused — kept in the signature so callers don't
250
+ // need updating in this refactor.
251
+ minSteps: _minSteps, // eslint-disable-line no-unused-vars
238
252
  } = {}) {
239
253
  if (!Number.isFinite(Number(targetY))) {
240
254
  const error = new Error('phase_target_y_required');
@@ -242,53 +256,19 @@ async function animateScroll(page, {
242
256
  throw error;
243
257
  }
244
258
 
245
- const normalizedDurationMs = Math.max(0, Number(durationMs) || 0);
246
- const normalizedMinSteps = Math.max(1, Number(minSteps) || 1);
247
-
248
- await page.evaluate(async ({
249
- startY: evaluateStartY,
250
- targetY: evaluateTargetY,
251
- durationMs: evaluateDurationMs,
252
- easing: evaluateEasing,
253
- jitterPx: evaluateJitterPx,
254
- minSteps: evaluateMinSteps,
255
- }) => {
256
- const root = document.scrollingElement || document.documentElement;
257
- const wait = (ms) => new Promise(resolve => setTimeout(resolve, ms));
258
-
259
- const fromY = Number.isFinite(evaluateStartY) ? evaluateStartY : root.scrollTop;
260
- const toY = evaluateTargetY;
261
- const delta = toY - fromY;
262
-
263
- const steps = Math.max(evaluateMinSteps, Math.round(Math.max(1, evaluateDurationMs) / 16));
264
- const stepDurationMs = evaluateDurationMs <= 0 ? 0 : evaluateDurationMs / steps;
265
-
266
- const applyEasing = (t) => {
267
- if (evaluateEasing === 'linear') return t;
268
- if (evaluateEasing === 'easeOutQuad') return 1 - ((1 - t) * (1 - t));
269
- return t < 0.5
270
- ? 2 * t * t
271
- : 1 - (Math.pow(-2 * t + 2, 2) / 2);
272
- };
273
-
274
- root.scrollTo(0, fromY);
275
- for (let index = 1; index <= steps; index += 1) {
276
- const t = index / steps;
277
- const eased = applyEasing(t);
278
- const jitter = evaluateJitterPx > 0 ? ((Math.random() * 2 - 1) * evaluateJitterPx) : 0;
279
- root.scrollTo(0, fromY + (delta * eased) + jitter);
280
- if (stepDurationMs > 0) {
281
- await wait(stepDurationMs);
282
- }
283
- }
284
- root.scrollTo(0, toY);
285
- }, {
286
- startY,
287
- targetY,
288
- durationMs: normalizedDurationMs,
289
- easing,
290
- jitterPx: Math.max(0, Number(jitterPx) || 0),
291
- minSteps: normalizedMinSteps,
259
+ const resolvedFromY = Number.isFinite(Number(startY))
260
+ ? Number(startY)
261
+ : await page.evaluate(() => {
262
+ const root = document.scrollingElement || document.documentElement;
263
+ return Math.round(root.scrollTop);
264
+ });
265
+
266
+ await humanizedScroll(page, {
267
+ from_y: resolvedFromY,
268
+ to_y: Number(targetY),
269
+ duration_ms: Math.max(0, Number(durationMs) || 0),
270
+ motion_curve: easing,
271
+ pixel_jitter_px: Math.max(0, Number(jitterPx) || 0),
292
272
  });
293
273
  }
294
274
 
@@ -181,6 +181,21 @@ export function resolveRecordUrlNarrationPaths({
181
181
  };
182
182
  }
183
183
 
184
/**
 * Resolve the tool's optional `output_paths` argument against the workspace.
 *
 * @param {Array<string>|null|undefined} rawList - Caller-supplied paths.
 * @param {object} options
 * @param {string} options.workspaceDir - Base directory for relative entries.
 * @returns {Array<string>|null} Paths resolved with `path.resolve(workspaceDir, …)`,
 *   or `null` when the list is absent or empty (single-output mode).
 * @throws {Error} When the value is not an array, when an entry normalizes to
 *   an empty value, or when two entries resolve to the same file.
 */
function resolveOutputPaths(rawList, { workspaceDir }) {
  if (rawList == null) return null;
  if (!Array.isArray(rawList)) {
    throw new Error('output_paths must be an array of file paths (one per section).');
  }
  if (rawList.length === 0) return null;

  // Maps each resolved path to the index that first claimed it, so a clash can
  // name both entries. Without this check the later section's mp4 would
  // silently overwrite the earlier one.
  const firstIndexByPath = new Map();
  return rawList.map((entry, idx) => {
    // normalizeText is defined elsewhere in this module — presumably it trims
    // and yields '' for non-string input; verify against its definition.
    const normalized = normalizeText(entry);
    if (!normalized) {
      throw new Error(`output_paths[${idx}] is empty — every entry must be a non-empty path.`);
    }
    const resolved = path.resolve(workspaceDir, normalized);
    if (firstIndexByPath.has(resolved)) {
      throw new Error(
        `output_paths[${idx}] duplicates output_paths[${firstIndexByPath.get(resolved)}] `
        + `(${resolved}) — every section needs its own output file.`,
      );
    }
    firstIndexByPath.set(resolved, idx);
    return resolved;
  });
}
198
+
184
199
  export async function runRecordUrlNarrationTool({
185
200
  args = {},
186
201
  currentWorkspaceId = '',
@@ -252,23 +267,66 @@ export async function runRecordUrlNarrationTool({
252
267
  mkdirSync(path.dirname(resolvedOutputPath), { recursive: true });
253
268
  mkdirSync(path.dirname(resolvedEventsPath), { recursive: true });
254
269
 
270
+ // Multi-section mode: caller passed output_paths. Validate it 1:1 with
271
+ // plan.sections so the recorder can slice the continuous recording into
272
+ // per-section mp4s without ambiguity.
273
+ let resolvedOutputPaths = null;
274
+ try {
275
+ resolvedOutputPaths = resolveOutputPaths(validatedInput.output_paths, { workspaceDir });
276
+ } catch (error) {
277
+ return toolError(`Error: ${error.message}`);
278
+ }
279
+ if (resolvedOutputPaths) {
280
+ const planSectionCount = (planSegments(validatedInput.plan) ?? []).length;
281
+ if (resolvedOutputPaths.length !== planSectionCount) {
282
+ return toolError(
283
+ `Error: output_paths length (${resolvedOutputPaths.length}) must match `
284
+ + `plan.sections length (${planSectionCount}). Each section produces exactly one mp4 — `
285
+ + `don't pad or truncate.`,
286
+ );
287
+ }
288
+ }
289
+
255
290
  const recorderOutput = await recordUrlNarrationFn({
256
291
  url: validatedInput.url,
257
292
  plan: validatedInput.plan,
258
293
  output_path: resolvedOutputPath,
259
294
  events_path: resolvedEventsPath,
295
+ output_paths: resolvedOutputPaths,
260
296
  viewport: validatedInput.viewport,
261
297
  fps: validatedInput.fps,
262
298
  settle_ms: validatedInput.settle_ms,
263
299
  });
264
300
 
265
- return toolText(
266
- `Recorded URL narration.\n`
267
- + `video_path=${resolvedOutputPath}\n`
268
- + `events_path=${resolvedEventsPath}\n`
269
- + `duration_ms=${deriveDurationMs(recorderOutput) ?? 'unknown'}\n`
270
- + `phases=${derivePhaseCount({ plan: validatedInput.plan, recorderOutput }) ?? 'n/a'}`
271
- );
301
+ // Single-output mode (legacy): same one-line summary as before.
302
+ if (!resolvedOutputPaths) {
303
+ return toolText(
304
+ `Recorded URL narration.\n`
305
+ + `video_path=${resolvedOutputPath}\n`
306
+ + `events_path=${resolvedEventsPath}\n`
307
+ + `duration_ms=${deriveDurationMs(recorderOutput) ?? 'unknown'}\n`
308
+ + `phases=${derivePhaseCount({ plan: validatedInput.plan, recorderOutput }) ?? 'n/a'}`,
309
+ );
310
+ }
311
+
312
+ // Multi-section mode: one section block per output mp4, plus the
313
+ // consolidated master mp4 path for debugging / verification.
314
+ const sections = Array.isArray(recorderOutput?.sections) ? recorderOutput.sections : [];
315
+ const lines = [
316
+ 'Recorded URL narration (multi-section).',
317
+ `master_video_path=${resolvedOutputPath}`,
318
+ `events_path=${resolvedEventsPath}`,
319
+ `total_duration_ms=${deriveDurationMs(recorderOutput) ?? 'unknown'}`,
320
+ `sections=${sections.length}`,
321
+ ];
322
+ sections.forEach((s, idx) => {
323
+ lines.push(`--- section ${idx} (${s.phase_id}) ---`);
324
+ lines.push(`video_path=${s.video_path}`);
325
+ lines.push(`start_ms=${s.start_ms}`);
326
+ lines.push(`duration_ms=${s.duration_ms}`);
327
+ lines.push(`size_bytes=${s.size_bytes ?? 'unknown'}`);
328
+ });
329
+ return toolText(lines.join('\n'));
272
330
  } catch (error) {
273
331
  return toolError(`Error: ${error.message}`);
274
332
  }