npm - @lightcone-ai/daemon - Versions diffs - 0.15.72 → 0.15.73 - Mend

@lightcone-ai/daemon 0.15.72 → 0.15.73

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (4)

package/mcp-servers/publisher/adapters/kuaishou.js +2 -2
package/package.json +1 -1
package/src/chat-bridge.js +4 -4
package/src/tools/plan-video-segments.js +35 -10

package/mcp-servers/publisher/adapters/kuaishou.js CHANGED

@@ -56,7 +56,7 @@ export class KuaishouAdapter {
     await this._clickByText('放弃');
     await sleep(500);
     try { await this._cdp.send('Runtime.evaluate', { expression: 'window.scrollTo(0, 300)', returnByValue: false }); } catch {}
-    await this._waitForSelector('input[type="file"], [class*="upload"], [class*="Upload"]', 45000);
+    await this._waitForSelector('input[type="file"], [class*="upload"], [class*="Upload"]', 120000);
     const { loggedIn } = await this.checkLoginStatus();
     if (!loggedIn) throw new Error('LOGIN_EXPIRED: 快手登录已过期，请重新扫码连接');
@@ -97,7 +97,7 @@ export class KuaishouAdapter {
     // Scroll once to trigger any lazy-rendered upload widgets, then wait
     try { await this._cdp.send('Runtime.evaluate', { expression: 'window.scrollTo(0, 300)', returnByValue: false }); } catch {}
-    await this._waitForSelector('input[type="file"], [class*="upload"], [class*="Upload"]', 45000);
+    await this._waitForSelector('input[type="file"], [class*="upload"], [class*="Upload"]', 120000);
     const { loggedIn } = await this.checkLoginStatus();
     if (!loggedIn) throw new Error('LOGIN_EXPIRED: 快手登录已过期，请重新扫码连接');

package/package.json CHANGED

@@ -1,6 +1,6 @@
 {
   "name": "@lightcone-ai/daemon",
-  "version": "0.15.72",
+  "version": "0.15.73",
   "type": "module",
   "main": "src/index.js",
   "bin": {

package/src/chat-bridge.js CHANGED

@@ -1430,10 +1430,10 @@ server.tool('get_library_file',
 // ── record_url_narration ────────────────────────────────────────────────────────
 server.tool('record_url_narration',
-  'Record a silent video of a URL by orchestrating Xvfb + Chromium + ffmpeg, driven by a video plan. Outputs a silent mp4 that can be passed to compose_video_v2 as a video-kind segment with an audio_path for narration.\n\nUse this as the canonical recording step for URL-narration videos. Falls back: if the page needs interactions outside the visual_action vocabulary (clicks, waits, OCR loops), use Monitor (Bash) with custom Playwright instead.\n\nRuntime requirements: this tool only works on a Linux daemon machine with Xvfb + ffmpeg (x11grab) + Chromium installed. macOS / Windows daemons will fail at startup.',
+  'Record a silent video of a URL by driving Chromium on an Xvfb display and capturing it with Playwright recordVideo, driven by a video plan; ffmpeg then transcodes the recording to mp4. Outputs a silent mp4 that can be passed to compose_video_v2 as a video-kind segment with an audio_path for narration.\n\nUse this as the canonical recording step for URL-narration videos. Falls back: if the page needs interactions outside the visual_action vocabulary (clicks, waits, OCR loops), use Monitor (Bash) with custom Playwright instead.\n\nRuntime requirements: this tool only works on a Linux daemon machine with Xvfb + Chromium + ffmpeg installed (ffmpeg is used to transcode the recording to mp4; no x11grab device support needed). macOS / Windows daemons will fail at startup.',
   {
     url: z.string().describe('Page URL to record'),
-    plan: z.record(z.any()).describe('Must be the full output from detail_sections (not plan_video). detail_sections output includes detail_sections_version, sections[], audio metadata, and dwell_ms per phase.'),
+    plan: z.record(z.any()).describe('A video plan: an object with `phases` (or `sections`), each a "visual beat" with `action` (scroll_to_dwell / linear_scroll_during / scroll_back / hold / ...), a target (`target_y` or `focus_region:[y1,y2]`) for scroll-type actions, and `dwell_ms` (how long to hold that beat — should match the segment\'s TTS duration). It can be hand-written or the output of plan_video_segments (whose returned segments array doubles as a valid plan).'),
     output_path: z.string().optional().describe('Workspace-relative output mp4 path. Default tmp/wx3_video/recorded-{ts}.mp4'),
     events_path: z.string().optional().describe('Workspace-relative events.json path. Default ${output_path}.events.json'),
     viewport: z.object({
@@ -1468,7 +1468,7 @@ server.tool('submit_to_library',
     target_platform: z.string().optional().describe('目标发布平台，如 xhs / douyin'),
     metadata: z.record(z.any()).optional().describe('其它 metadata（brand_voice / persona / account / goal_state 等）'),
     understanding: z.record(z.any()).optional().describe('analyze_page 输出'),
-    plan: z.record(z.any()).optional().describe('plan_video / detail_sections 输出'),
+    plan: z.record(z.any()).optional().describe('plan_video_segments 输出（或手写的录屏 plan）'),
   },
   async (args) => {
     if (isBlockedCvmaxEditorVideoTool('submit_to_library')) {
@@ -1529,7 +1529,7 @@ server.tool('request_approval',
     platform:      z.string().describe('Target platform, e.g. "x", "xhs", "email"'),
     description:   z.string().describe('Human-readable summary of what will happen if approved'),
     payload:       z.record(z.any()).describe('Full action parameters (content, media_urls, etc.)'),
-    credential_id: z.string().optional().describe('Which account/credential to use. For publishing, prefer a workspace account_id or real credential UUID. Role aliases like primary/test are accepted only if they uniquely match a workspace account.'),
+    credential_id: z.string().optional().describe('Which account/credential to use. Accepts a workspace account_id, a real credential UUID, the account display name, or a role alias (主号/main/primary, 矩阵号/matrix/secondary, 测试号/test/incubator) — any value works as long as it uniquely matches one workspace account on the target platform. If publishing fails with publish_account_selection_required/ambiguous, pick a value from the returned candidates\' "selectors" list yourself instead of asking the user to re-type an account name.'),
   },
   async ({ action_type, platform, description, payload, credential_id }) => {
     try {

package/src/tools/plan-video-segments.js CHANGED

@@ -45,6 +45,20 @@ function planDurationSec(audioDurationMs, bufferSec = 0.5) {
   return Math.ceil(raw * 2) / 2; // round up to nearest 0.5s
 }
+// Run fn over items with a bounded number of concurrent workers (FIFO drain).
+async function mapWithConcurrency(items, limit, fn) {
+  const queue = items.map((item, index) => ({ item, index }));
+  const workers = Array.from({ length: Math.max(1, Math.min(limit, queue.length)) }, async () => {
+    while (queue.length > 0) {
+      const next = queue.shift();
+      await fn(next.item, next.index);
+    }
+  });
+  await Promise.all(workers);
+}
+const TTS_CONCURRENCY = 5;
 export async function runPlanVideoSegmentsTool({ segments, workspace_id, voice_id, currentWorkspaceId, api }) {
   if (!Array.isArray(segments) || segments.length === 0) {
     return toolError('segments must be a non-empty array.');
@@ -58,20 +72,31 @@ export async function runPlanVideoSegmentsTool({ segments, workspace_id, voice_i
   const planned = [];
   const errors = [];
+  // Synthesize TTS for every text-bearing segment up front, in parallel (bounded),
+  // so an N-segment plan no longer pays N sequential round-trips to the TTS API.
+  const audioResults = new Array(segments.length).fill(null);
+  const ttsJobs = segments
+    .map((seg, i) => ({ i, text: String(seg.text ?? '').trim() }))
+    .filter(job => job.text);
+  await mapWithConcurrency(ttsJobs, TTS_CONCURRENCY, async ({ i, text }) => {
+    try {
+      audioResults[i] = await synthesizeSegmentTts(text, { workspace_id: targetWorkspaceId, voice_id, api });
+    } catch (err) {
+      errors.push(`segments[${i}]: TTS failed — ${err.message}`);
+      audioResults[i] = { audio_path: null, audio_duration_ms: 3000 }; // fallback estimate
+    }
+  });
+  errors.sort((a, b) => {
+    const na = Number((a.match(/segments\[(\d+)\]/) ?? [])[1] ?? 0);
+    const nb = Number((b.match(/segments\[(\d+)\]/) ?? [])[1] ?? 0);
+    return na - nb;
+  });
   for (let i = 0; i < segments.length; i++) {
     const seg = segments[i];
     const text = String(seg.text ?? '').trim();
     const kind = String(seg.visual_kind ?? 'image');
-    let audioResult = null;
-    if (text) {
-      try {
-        audioResult = await synthesizeSegmentTts(text, { workspace_id: targetWorkspaceId, voice_id, api });
-      } catch (err) {
-        errors.push(`segments[${i}]: TTS failed — ${err.message}`);
-        audioResult = { audio_path: null, audio_duration_ms: 3000 }; // fallback estimate
-      }
-    }
+    const audioResult = audioResults[i];
     const audioDurationMs = audioResult?.audio_duration_ms ?? 0;
     let presentation;