@lightcone-ai/daemon 0.17.0 → 0.18.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/mcp-servers/official/media-tools/index.js +128 -3
- package/package.json +1 -1
- package/src/_vendor/video/composer-v2/index.js +11 -5
- package/src/chat-bridge.js +10 -0
- package/src/mcp-config.js +5 -0
- package/src/tools/compose-video-v2.js +7 -2
- package/src/tools/get-default-voice.js +48 -0
- package/src/tools/list-tts-voices.js +58 -0
- package/src/tools/preview-tts-voice.js +75 -0
- package/src/tools/set-default-voice.js +44 -0
- package/src/tools/synthesize-tts.js +12 -1
- package/src/video-brief-flag.js +70 -0
|
@@ -6,12 +6,17 @@ import { z } from 'zod';
|
|
|
6
6
|
import { addTitleEffects } from './lib/render.js';
|
|
7
7
|
import { SUPPORTED_PRESETS } from './lib/presets.js';
|
|
8
8
|
import { runSynthesisTtsTool } from '../../../src/tools/synthesize-tts.js';
|
|
9
|
+
import { runListTtsVoicesTool } from '../../../src/tools/list-tts-voices.js';
|
|
10
|
+
import { runPreviewTtsVoiceTool } from '../../../src/tools/preview-tts-voice.js';
|
|
11
|
+
import { runGetDefaultVoiceTool } from '../../../src/tools/get-default-voice.js';
|
|
12
|
+
import { runSetDefaultVoiceTool } from '../../../src/tools/set-default-voice.js';
|
|
9
13
|
import { runPlanVideoSegmentsTool } from '../../../src/tools/plan-video-segments.js';
|
|
10
14
|
import { runComposeVideoV2Tool } from '../../../src/tools/compose-video-v2.js';
|
|
11
15
|
import { runRecordUrlNarrationTool } from '../../../src/tools/record-url-narration.js';
|
|
12
16
|
import { runRenderTextToImageTool } from '../../../src/tools/render-text-to-image.js';
|
|
13
17
|
import { runRenderHtmlToImageTool } from '../../../src/tools/render-html-to-image.js';
|
|
14
18
|
import { runTakePageScreenshotTool } from '../../../src/tools/take-page-screenshot.js';
|
|
19
|
+
import { hasVideoBriefSent } from '../../../src/video-brief-flag.js';
|
|
15
20
|
import { lightconeApi, CURRENT_WORKSPACE_ID, CURRENT_AGENT_ID } from './lib/lightcone-api.js';
|
|
16
21
|
|
|
17
22
|
const WORKSPACE_DIR = String(process.env.WORKSPACE_DIR ?? '');
|
|
@@ -128,7 +133,9 @@ server.tool(
|
|
|
128
133
|
+ 'into a single call (segment-level audio is required for plan_video_segments to align video durations).',
|
|
129
134
|
{
|
|
130
135
|
text: z.string().min(1).describe('Narration text for this segment. Will be synthesized as a single mp3.'),
|
|
131
|
-
voice_id: z.string().optional().describe('
|
|
136
|
+
voice_id: z.string().optional().describe('MiniMax voice_id (e.g. "Chinese (Mandarin)_Warm_Girl"). Omit to use the workspace default. '
|
|
137
|
+
+ 'If neither is set the call fails with tts_default_voice_required — call list_tts_voices to discover options '
|
|
138
|
+
+ 'and set_default_voice once the user picks one.'),
|
|
132
139
|
workspace_id: z.string().optional().describe('Target workspace. Defaults to the current workspace.'),
|
|
133
140
|
},
|
|
134
141
|
async ({ text, voice_id, workspace_id }) => runSynthesisTtsTool({
|
|
@@ -140,6 +147,87 @@ server.tool(
|
|
|
140
147
|
})
|
|
141
148
|
);
|
|
142
149
|
|
|
150
|
+
// ── TTS voice catalog + workspace preference (added in the voice-selection
|
|
151
|
+
// rebuild — see docs/scenario-content-creation/video-synthesis-design.md).
|
|
152
|
+
// These four tools let the agent discover MiniMax voices, sample them, and
|
|
153
|
+
// persist a workspace-level default — all from inside the IM conversation,
|
|
154
|
+
// no separate UI. There is no fallback default voice: synthesize_tts will
|
|
155
|
+
// throw tts_default_voice_required if neither the request nor the workspace
|
|
156
|
+
// has a voice_id set, which is the agent's cue to run this discovery flow.
|
|
157
|
+
// Registers the list_tts_voices tool. The handler forwards the filter
// arguments to runListTtsVoicesTool along with the current workspace id and
// the lightcone API client.
server.tool(
  'list_tts_voices',
  [
    'List TTS voices available to the current workspace (system catalog + any cloned voices owned by this workspace). ',
    'Use this to show the user options before calling set_default_voice or synthesize_tts. ',
    'Filter by language (e.g. "zh-CN"), style_tag (e.g. "recommended", "news", "warm"), or free-text query.',
  ].join(''),
  {
    language: z.string().optional().describe('BCP-47 language tag, e.g. "zh-CN", "en-US".'),
    origin: z.enum(['system', 'cloned']).optional().describe("Restrict to system catalog or this workspace's cloned voices."),
    query: z.string().optional().describe('Free-text match against voice_id and display_name.'),
    style_tag: z.string().optional().describe('Match a single style tag (e.g. "recommended" for the curated shortlist).'),
    limit: z.number().int().positive().max(200).optional().describe('Max rows to return. Default 100.'),
    workspace_id: z.string().optional().describe('Target workspace. Defaults to the current workspace.'),
  },
  async (filters) => {
    const { language, origin, query, style_tag, limit, workspace_id } = filters;
    return runListTtsVoicesTool({
      workspace_id,
      limit,
      style_tag,
      query,
      origin,
      language,
      api: lightconeApi,
      currentWorkspaceId: CURRENT_WORKSPACE_ID,
    });
  },
);
|
|
181
|
+
|
|
182
|
+
// Registers the preview_tts_voice tool. The handler hands voice_id, the
// optional sample text and the optional workspace override to
// runPreviewTtsVoiceTool, adding the current workspace id and API client.
server.tool(
  'preview_tts_voice',
  [
    'Synthesize a short sample so the user can hear a voice before committing. Returns a playable mp3 path + audio_url. ',
    'Pass voice_id from list_tts_voices; text is optional (defaults to a neutral Chinese sample sentence).',
  ].join(''),
  {
    voice_id: z.string().min(1).describe('voice_id to preview, from list_tts_voices.'),
    text: z.string().optional().describe('Custom sample text. Defaults to a short Chinese sentence.'),
    workspace_id: z.string().optional().describe('Target workspace. Defaults to the current workspace.'),
  },
  async (request) => {
    const { voice_id, text, workspace_id } = request;
    return runPreviewTtsVoiceTool({
      workspace_id,
      text,
      voice_id,
      api: lightconeApi,
      currentWorkspaceId: CURRENT_WORKSPACE_ID,
    });
  },
);
|
|
199
|
+
|
|
200
|
+
// Registers the get_default_voice tool. The handler passes the optional
// workspace override to runGetDefaultVoiceTool, adding the current workspace
// id and API client.
server.tool(
  'get_default_voice',
  [
    "Read the current workspace's default TTS voice. Returns null when not set — that is the signal to run the discovery flow ",
    '(list_tts_voices → user picks → set_default_voice) before doing any synthesis.',
  ].join(''),
  {
    workspace_id: z.string().optional().describe('Target workspace. Defaults to the current workspace.'),
  },
  async (request) => {
    const { workspace_id } = request;
    return runGetDefaultVoiceTool({
      workspace_id,
      api: lightconeApi,
      currentWorkspaceId: CURRENT_WORKSPACE_ID,
    });
  },
);
|
|
213
|
+
|
|
214
|
+
// Registers the set_default_voice tool. The handler passes voice_id and the
// optional workspace override to runSetDefaultVoiceTool, adding the current
// workspace id and API client.
server.tool(
  'set_default_voice',
  [
    "Persist a workspace-level default TTS voice. The voice_id must exist in the workspace's visible catalog ",
    '(system voice or a cloned voice owned by this workspace) — otherwise this tool fails with tts_voice_not_found. ',
    'Call this after the user picks from list_tts_voices, OR when the user explicitly asks to change their default.',
  ].join(''),
  {
    voice_id: z.string().min(1).describe("voice_id to make the default. Must be in this workspace's catalog."),
    workspace_id: z.string().optional().describe('Target workspace. Defaults to the current workspace.'),
  },
  async (request) => {
    const { voice_id, workspace_id } = request;
    return runSetDefaultVoiceTool({
      workspace_id,
      voice_id,
      api: lightconeApi,
      currentWorkspaceId: CURRENT_WORKSPACE_ID,
    });
  },
);
|
|
230
|
+
|
|
143
231
|
// ── plan_video_segments (migrated from chat-bridge; TTS decoupled) ────────
|
|
144
232
|
// Pure planner — takes per-segment {text, audio_path, visual_kind, ...} and
|
|
145
233
|
// returns segments with audio_duration_ms / presentation.duration / dwell_ms
|
|
@@ -186,10 +274,13 @@ server.tool(
|
|
|
186
274
|
'compose_video_v2',
|
|
187
275
|
'Compose a video from a list of segments using ffmpeg. Each segment has a visual source (image / scroll / '
|
|
188
276
|
+ 'carousel / video / gif), optional audio, and optional subtitle text. Subtitles are burned in when '
|
|
189
|
-
+ 'subtitle_text is provided. Segments are concatenated in order; outro clips are appended after.\n\n'
|
|
277
|
+
+ 'subtitle_text is provided AND burn_subtitles is not false. Segments are concatenated in order; outro clips are appended after.\n\n'
|
|
190
278
|
+ 'When any segment has audio_path, MUST be preceded by plan_video_segments in the same session '
|
|
191
279
|
+ '(plan_video_segments fills duration/subtitle_text/audio_path mechanically; manual alignment is rejected). '
|
|
192
|
-
+ 'Returns a local mp4 path + size_bytes
|
|
280
|
+
+ 'Returns a local mp4 path + size_bytes.\n\n'
|
|
281
|
+
+ 'Dual-version delivery (subtitled + clean): call this tool twice with the SAME segments — first with default '
|
|
282
|
+
+ 'burn_subtitles=true (or omitted), then again with burn_subtitles=false and a different output_path. Only the '
|
|
283
|
+
+ 'final ffmpeg pass repeats; audio, source clips, and the plan_video_segments alignment are all reused.',
|
|
193
284
|
{
|
|
194
285
|
segments: z.array(z.object({
|
|
195
286
|
visual_path: z.string().optional().describe('Absolute path to a single image / video / gif.'),
|
|
@@ -207,6 +298,9 @@ server.tool(
|
|
|
207
298
|
outro_paths: z.array(z.string()).optional().describe('Absolute paths to outro video clips appended at end.'),
|
|
208
299
|
resolution: z.string().optional().describe('Output resolution WxH. Default "1080x1920".'),
|
|
209
300
|
output_path: z.string().optional().describe('Absolute output path. Auto-generated if omitted.'),
|
|
301
|
+
burn_subtitles: z.boolean().optional().describe('Whether to burn subtitle_text into the video. Default true. '
|
|
302
|
+
+ 'Pass false to produce a clean no-subtitle copy (dual-version delivery: run compose_video_v2 twice — '
|
|
303
|
+
+ 'once with default true, once with false + a different output_path — same segments, only one extra ffmpeg pass).'),
|
|
210
304
|
},
|
|
211
305
|
async (args) => {
|
|
212
306
|
const segments = Array.isArray(args?.segments) ? args.segments : [];
|
|
@@ -222,6 +316,18 @@ server.tool(
|
|
|
222
316
|
+ 'visual_path/visual_kind for the real media). Call plan_video_segments now and pass its output here.'
|
|
223
317
|
);
|
|
224
318
|
}
|
|
319
|
+
if (hasNarration && !hasVideoBriefSent({ workspaceId: CURRENT_WORKSPACE_ID, agentId: CURRENT_AGENT_ID })) {
|
|
320
|
+
return toolError(
|
|
321
|
+
'compose_video_v2 refused: must send a 确认稿 (production-brief) to the user via send_message before '
|
|
322
|
+
+ 'compositing a narration video. The system scans send_message content for a brief — a message that '
|
|
323
|
+
+ 'BOTH asks the user to confirm (确认 / 你看 / OK 吗 / 可以吗 / 同意 / 通过 / 行不行) AND describes '
|
|
324
|
+
+ 'at least two of: 画面 / 时长 / 文案 / 口播 / 字幕 / 顺序 / 口吻 / 分镜 / 配音 — no such message '
|
|
325
|
+
+ 'has been sent in this workspace+agent.\n\n'
|
|
326
|
+
+ '"已生成 TTS" / "开始合成" / progress reports do NOT count. Send a concrete confirmation draft '
|
|
327
|
+
+ 'first (e.g. "我准备这么做:画面是真录屏,时长约 1 分钟,文案如下…,字幕开启,公司顺序 A→B→C,'
|
|
328
|
+
+ '口吻是…—— 你 OK 吗?") and wait for the user to reply OK before calling compose_video_v2 again.'
|
|
329
|
+
);
|
|
330
|
+
}
|
|
225
331
|
return runComposeVideoV2Tool({ ...args, workspaceDir: WORKSPACE_DIR });
|
|
226
332
|
}
|
|
227
333
|
);
|
|
@@ -265,6 +371,25 @@ server.tool(
|
|
|
265
371
|
if (isBlockedCvmaxEditorVideoTool('record_url_narration')) {
|
|
266
372
|
return cvmaxEditorVideoToolError('record_url_narration');
|
|
267
373
|
}
|
|
374
|
+
// record_url_narration is part of the narration-video pipeline (paired
|
|
375
|
+
// with synthesize_tts + plan_video_segments + compose_video_v2), so it
|
|
376
|
+
// requires the same 确认稿 gate as compose_video_v2 — catching the skip
|
|
377
|
+
// earlier saves TTS + recording time.
|
|
378
|
+
if (!hasVideoBriefSent({ workspaceId: CURRENT_WORKSPACE_ID, agentId: CURRENT_AGENT_ID })) {
|
|
379
|
+
return {
|
|
380
|
+
isError: true,
|
|
381
|
+
content: [{ type: 'text', text:
|
|
382
|
+
'Error: record_url_narration refused: must send a 确认稿 (production-brief) to the user via '
|
|
383
|
+
+ 'send_message before starting a narration recording. The system scans send_message content for '
|
|
384
|
+
+ 'a brief — a message that BOTH asks the user to confirm (确认 / 你看 / OK 吗 / 可以吗 / 同意 / '
|
|
385
|
+
+ '通过 / 行不行) AND describes at least two of: 画面 / 时长 / 文案 / 口播 / 字幕 / 顺序 / 口吻 / '
|
|
386
|
+
+ '分镜 / 配音 — no such message has been sent in this workspace+agent.\n\n'
|
|
387
|
+
+ '"已生成 TTS" / "开始合成" / progress reports do NOT count. Send a concrete confirmation draft '
|
|
388
|
+
+ 'first (e.g. "我准备这么做:画面是真录屏,时长约 1 分钟,文案如下…,字幕开启,公司顺序 A→B→C,'
|
|
389
|
+
+ '口吻是…—— 你 OK 吗?") and wait for the user to reply OK before calling record_url_narration.'
|
|
390
|
+
}],
|
|
391
|
+
};
|
|
392
|
+
}
|
|
268
393
|
return runRecordUrlNarrationTool({
|
|
269
394
|
args,
|
|
270
395
|
currentWorkspaceId: CURRENT_WORKSPACE_ID,
|
package/package.json
CHANGED
|
@@ -254,6 +254,7 @@ export async function composeVideoV2({
|
|
|
254
254
|
outro_paths = [],
|
|
255
255
|
resolution = '1080x1920',
|
|
256
256
|
output_path,
|
|
257
|
+
burn_subtitles = true,
|
|
257
258
|
}) {
|
|
258
259
|
if (!Array.isArray(segments) || segments.length === 0) {
|
|
259
260
|
throw new Error('segments must be a non-empty array');
|
|
@@ -322,11 +323,16 @@ export async function composeVideoV2({
|
|
|
322
323
|
// Accept `text` as an alias for `subtitle_text`: plan_video_segments takes
|
|
323
324
|
// segment narration as `text` on input, compose_video_v2's canonical name is
|
|
324
325
|
// `subtitle_text`. Either reaches the burn pass so subtitles aren't silently dropped.
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
326
|
+
// burn_subtitles=false (dual-version delivery: same segments composed once
|
|
327
|
+
// with subtitles and once without) drops the text here so the burn-in pass
|
|
328
|
+
// skips entirely — saves the second compose having to mutate the segment array.
|
|
329
|
+
const subtitleText = burn_subtitles
|
|
330
|
+
? (
|
|
331
|
+
typeof seg.subtitle_text === 'string' ? seg.subtitle_text
|
|
332
|
+
: typeof seg.text === 'string' ? seg.text
|
|
333
|
+
: ''
|
|
334
|
+
).trim()
|
|
335
|
+
: '';
|
|
330
336
|
readyClips.push({ path: finalClip, duration: visualClip.duration, transition, subtitleText });
|
|
331
337
|
}
|
|
332
338
|
|
package/src/chat-bridge.js
CHANGED
|
@@ -16,6 +16,7 @@ import { runSubmitToLibraryTool } from './submit-to-library-tool.js';
|
|
|
16
16
|
// media-tools MCP server (V5 migration). Handlers still live in
|
|
17
17
|
// daemon/src/tools/ as shared modules and are imported there.
|
|
18
18
|
import { runGetLibraryFileTool } from './tools/get-library-file.js';
|
|
19
|
+
import { markVideoBriefSent, looksLikeVideoBrief } from './video-brief-flag.js';
|
|
19
20
|
import { isLeaseInvalidated, clearInvalidatedLease } from './governance-state.js';
|
|
20
21
|
import { classifyLeaseWindow } from './lease-window.js';
|
|
21
22
|
import {
|
|
@@ -795,6 +796,15 @@ server.tool('send_message', 'Send a message to a workspace, DM, or thread', {
|
|
|
795
796
|
content: z.string().describe('Message content'),
|
|
796
797
|
}, async ({ target, content }) => {
|
|
797
798
|
const data = await api('POST', '/send', { target, content });
|
|
799
|
+
// Heuristic: if this looks like a video-production 确认稿 (asks permission +
|
|
800
|
+
// describes plan), mark a cross-process flag so media-tools' compose_video_v2
|
|
801
|
+
// / record_url_narration can verify a brief was actually sent before running.
|
|
802
|
+
// See daemon/src/video-brief-flag.js for the detection rules.
|
|
803
|
+
if (looksLikeVideoBrief(content) && AGENT_ID && currentWorkspaceId) {
|
|
804
|
+
try {
|
|
805
|
+
markVideoBriefSent({ workspaceId: currentWorkspaceId, agentId: AGENT_ID, content });
|
|
806
|
+
} catch { /* best-effort; failure to mark is non-fatal */ }
|
|
807
|
+
}
|
|
798
808
|
return { content: [{ type: 'text', text: `Sent. messageId=${data.messageId} threadTarget=${data.threadTarget}` }] };
|
|
799
809
|
});
|
|
800
810
|
|
package/src/mcp-config.js
CHANGED
|
@@ -89,6 +89,11 @@ const SERVER_BACKED_MCP_SERVERS = new Set([
|
|
|
89
89
|
'audience-research',
|
|
90
90
|
'hook-pattern-library',
|
|
91
91
|
'weixin-tools',
|
|
92
|
+
// media-tools (V1–V5 chat-bridge → media-tools migration): synthesize_tts
|
|
93
|
+
// hits /tts/voiceover and the CvMax editor_in_chief gate + video-brief
|
|
94
|
+
// checks read CURRENT_AGENT_ID / CURRENT_WORKSPACE_ID. lib/lightcone-api.js
|
|
95
|
+
// throws at module load without the SERVER_URL/MACHINE_API_KEY/AGENT_ID triple.
|
|
96
|
+
'media-tools',
|
|
92
97
|
]);
|
|
93
98
|
|
|
94
99
|
function baseEnvForServer(serverKey, { serverUrl, authToken, agentId, workspaceId, workspaceDir }) {
|
|
@@ -22,7 +22,8 @@ function statSizeOrNull(p) {
|
|
|
22
22
|
try { return fs.statSync(p).size; } catch { return null; }
|
|
23
23
|
}
|
|
24
24
|
|
|
25
|
-
export async function runComposeVideoV2Tool({ segments, outro_paths, format, resolution, output_path, workspaceDir }) {
|
|
25
|
+
export async function runComposeVideoV2Tool({ segments, outro_paths, format, resolution, output_path, burn_subtitles, workspaceDir }) {
|
|
26
|
+
const burnSubtitles = burn_subtitles !== false;
|
|
26
27
|
if (!Array.isArray(segments) || segments.length === 0) {
|
|
27
28
|
return toolError('segments must be a non-empty array.');
|
|
28
29
|
}
|
|
@@ -72,7 +73,9 @@ export async function runComposeVideoV2Tool({ segments, outro_paths, format, res
|
|
|
72
73
|
// Warn when narration is present but no subtitle text is — compose_video_v2 burns
|
|
73
74
|
// subtitles only from `subtitle_text` (or its `text` alias); without it the video
|
|
74
75
|
// ships with no captions. Simplest fix: pass plan_video_segments' output verbatim.
|
|
75
|
-
|
|
76
|
+
// burn_subtitles=false is the explicit "no subtitles" path (dual-version delivery),
|
|
77
|
+
// so the warning would be noise — suppress it.
|
|
78
|
+
if (burnSubtitles) {
|
|
76
79
|
const hasSubText = s => (typeof s?.subtitle_text === 'string' && s.subtitle_text.trim())
|
|
77
80
|
|| (typeof s?.text === 'string' && s.text.trim());
|
|
78
81
|
const narratedNoSub = segments.filter(s =>
|
|
@@ -97,6 +100,7 @@ export async function runComposeVideoV2Tool({ segments, outro_paths, format, res
|
|
|
97
100
|
outro_paths: outro_paths ?? [],
|
|
98
101
|
resolution: resolution ?? '1080x1920',
|
|
99
102
|
output_path: outPath,
|
|
103
|
+
burn_subtitles: burnSubtitles,
|
|
100
104
|
});
|
|
101
105
|
|
|
102
106
|
const lines = [
|
|
@@ -106,6 +110,7 @@ export async function runComposeVideoV2Tool({ segments, outro_paths, format, res
|
|
|
106
110
|
`size_bytes=${result.size_bytes ?? 'unknown'}`,
|
|
107
111
|
`segments=${segments.length}`,
|
|
108
112
|
`outro_clips=${(outro_paths ?? []).length}`,
|
|
113
|
+
`burn_subtitles=${burnSubtitles}`,
|
|
109
114
|
];
|
|
110
115
|
for (const w of warnings) lines.push(w);
|
|
111
116
|
return toolText(lines.join('\n'));
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
/** Wrap plain text as a successful tool result. */
function toolText(text) {
  return { content: [{ type: 'text', text }] };
}

/** Wrap plain text as a failed tool result (`isError: true`). */
function toolError(text) {
  return { isError: true, content: [{ type: 'text', text }] };
}

/**
 * Read the workspace-level default TTS voice and render it as tool text.
 *
 * @param {object} args
 * @param {string} [args.workspace_id] - Explicit target workspace. Blank or
 *   whitespace-only values are treated as "not provided".
 * @param {string} [args.currentWorkspaceId] - Fallback workspace used when
 *   `workspace_id` is absent or blank.
 * @param {Function} args.api - Called as `api('GET', path)`; its resolved
 *   value is read for `default_voice_id`, `workspace_id`, `provider`, `voice`.
 * @returns {Promise<object>} A `toolText` result describing the default voice
 *   (or saying none is set), or a `toolError` result when no workspace can be
 *   resolved or the API call rejects.
 */
export async function runGetDefaultVoiceTool({
  workspace_id,
  currentWorkspaceId,
  api,
}) {
  // `??` alone only falls back on null/undefined; an empty-string workspace_id
  // would otherwise shadow a perfectly good current workspace.
  const targetWorkspaceId = String(workspace_id ?? '').trim()
    || String(currentWorkspaceId ?? '').trim();
  if (!targetWorkspaceId) {
    return toolError('workspace_id is required (no current workspace context).');
  }

  const params = new URLSearchParams({ workspace_id: targetWorkspaceId });

  let data;
  try {
    data = await api('GET', `/tts/preferences?${params.toString()}`);
  } catch (error) {
    // Thrown values are not guaranteed to be Error instances.
    return toolError(`get_default_voice API error: ${error?.message ?? String(error)}`);
  }

  if (!data?.default_voice_id) {
    return toolText('No default TTS voice set for this workspace. Call list_tts_voices to show the user options, then set_default_voice once they pick one.');
  }

  const lines = [
    `Default TTS voice for workspace=${data.workspace_id} (provider=${data.provider}):`,
    `voice_id=${data.default_voice_id}`,
  ];

  const v = data.voice;
  if (v) {
    lines.push(`display_name=${v.display_name}`);
    if (v.language) lines.push(`language=${v.language}`);
    if (v.gender) lines.push(`gender=${v.gender}`);
    if (Array.isArray(v.style_tags) && v.style_tags.length) {
      lines.push(`style_tags=${v.style_tags.join(', ')}`);
    }
  } else {
    // A default id with no metadata attached: tell the agent to re-run selection.
    lines.push('(voice metadata not found — catalog may have changed; ask user to pick again)');
  }
  return toolText(lines.join('\n'));
}
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
/** Wrap plain text as a successful tool result. */
function toolText(text) {
  return { content: [{ type: 'text', text }] };
}

/** Wrap plain text as a failed tool result (`isError: true`). */
function toolError(text) {
  return { isError: true, content: [{ type: 'text', text }] };
}

/**
 * Render one voice row as a markdown-style bullet:
 * `- <voice_id> — <display_name> (<gender>) [<tags>] • cloned`.
 * Gender, tags and the "cloned" suffix are omitted when not applicable.
 */
function formatVoiceLine(voice) {
  const tags = Array.isArray(voice.style_tags) && voice.style_tags.length
    ? ` [${voice.style_tags.join(', ')}]`
    : '';
  const gender = voice.gender ? ` (${voice.gender})` : '';
  const origin = voice.origin === 'cloned' ? ' • cloned' : '';
  return `- ${voice.voice_id} — ${voice.display_name}${gender}${tags}${origin}`;
}

/**
 * Query the TTS voice catalog for a workspace and render it as tool text.
 *
 * @param {object} args
 * @param {string} [args.language] - Language filter; sent trimmed, skipped when blank.
 * @param {string} [args.origin] - Origin filter; sent trimmed, skipped when blank.
 * @param {string} [args.query] - Free-text filter; sent trimmed, skipped when blank.
 * @param {string} [args.style_tag] - Style-tag filter; sent trimmed, skipped when blank.
 * @param {number} [args.limit] - Row cap; floored to an integer before sending.
 * @param {string} [args.workspace_id] - Explicit target workspace. Blank or
 *   whitespace-only values are treated as "not provided".
 * @param {string} [args.currentWorkspaceId] - Fallback workspace.
 * @param {Function} args.api - Called as `api('GET', path)`; its resolved
 *   value is read for `voices`, `provider` and `workspace_id`.
 * @returns {Promise<object>} A `toolText` listing (or a "no match" notice), or
 *   a `toolError` when no workspace can be resolved or the API call rejects.
 */
export async function runListTtsVoicesTool({
  language,
  origin,
  query,
  style_tag,
  limit,
  workspace_id,
  currentWorkspaceId,
  api,
}) {
  // `??` alone only falls back on null/undefined; an empty-string workspace_id
  // would otherwise shadow a perfectly good current workspace.
  const targetWorkspaceId = String(workspace_id ?? '').trim()
    || String(currentWorkspaceId ?? '').trim();
  if (!targetWorkspaceId) {
    return toolError('workspace_id is required (no current workspace context).');
  }

  const params = new URLSearchParams();
  params.set('workspace_id', targetWorkspaceId);

  // Only send a filter when something is left after trimming; a
  // whitespace-only value would otherwise go out as an empty-string filter.
  const setIfPresent = (key, value) => {
    const trimmed = String(value ?? '').trim();
    if (trimmed) params.set(key, trimmed);
  };
  setIfPresent('language', language);
  setIfPresent('origin', origin);
  setIfPresent('query', query);
  setIfPresent('style_tag', style_tag);
  if (limit != null && Number.isFinite(Number(limit))) {
    params.set('limit', String(Math.floor(Number(limit))));
  }

  let data;
  try {
    data = await api('GET', `/tts/voices?${params.toString()}`);
  } catch (error) {
    // Thrown values are not guaranteed to be Error instances.
    return toolError(`list_tts_voices API error: ${error?.message ?? String(error)}`);
  }

  const voices = Array.isArray(data?.voices) ? data.voices : [];
  if (!voices.length) {
    return toolText('No TTS voices match the requested filters.');
  }

  const lines = [
    `Found ${voices.length} voice(s) for provider=${data.provider}, workspace=${data.workspace_id}:`,
    ...voices.map(formatVoiceLine),
  ];
  return toolText(lines.join('\n'));
}
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
import { mkdirSync, writeFileSync } from 'fs';
|
|
2
|
+
import { randomUUID } from 'crypto';
|
|
3
|
+
import path from 'path';
|
|
4
|
+
import os from 'os';
|
|
5
|
+
|
|
6
|
+
/** Wrap plain text as a successful tool result. */
function toolText(text) {
  return { content: [{ type: 'text', text }] };
}

/** Wrap plain text as a failed tool result (`isError: true`). */
function toolError(text) {
  return { isError: true, content: [{ type: 'text', text }] };
}

// Extensions accepted as-is from the remote URL; anything else becomes .mp3.
const AUDIO_EXTENSIONS = ['.mp3', '.wav', '.flac', '.aac', '.ogg'];

/**
 * Pick a file extension for the downloaded preview from its URL.
 * The query string and the fragment are both ignored; unknown or missing
 * extensions fall back to `.mp3`.
 */
function inferAudioExt(url) {
  const withoutQueryOrHash = String(url ?? '').split(/[?#]/)[0];
  const ext = path.extname(withoutQueryOrHash).toLowerCase();
  return AUDIO_EXTENSIONS.includes(ext) ? ext : '.mp3';
}

/**
 * Request a short voice sample, download it to the OS temp directory and
 * report the local path plus the remote URL.
 *
 * @param {object} args
 * @param {string} args.voice_id - Voice to preview; trimmed before sending.
 * @param {string} [args.text] - Custom sample text; omitted from the request
 *   when blank, leaving the choice of sample text to the server.
 * @param {string} [args.workspace_id] - Explicit target workspace. Blank or
 *   whitespace-only values are treated as "not provided".
 * @param {string} [args.currentWorkspaceId] - Fallback workspace.
 * @param {Function} args.api - Called as `api('POST', '/tts/voices/preview', payload)`.
 * @returns {Promise<object>} A `toolText` summary on success; a `toolError`
 *   for missing inputs, API failures, a response without `audio_url`, a failed
 *   download, or a failed write to disk. Failures after the API call are
 *   returned rather than thrown.
 */
export async function runPreviewTtsVoiceTool({
  voice_id,
  text,
  workspace_id,
  currentWorkspaceId,
  api,
}) {
  const normalizedVoiceId = String(voice_id ?? '').trim();
  if (!normalizedVoiceId) return toolError('voice_id is required for preview_tts_voice.');

  // `??` alone only falls back on null/undefined; an empty-string workspace_id
  // would otherwise shadow a perfectly good current workspace.
  const targetWorkspaceId = String(workspace_id ?? '').trim()
    || String(currentWorkspaceId ?? '').trim();
  if (!targetWorkspaceId) {
    return toolError('workspace_id is required (no current workspace context).');
  }

  const payload = {
    workspace_id: targetWorkspaceId,
    voice_id: normalizedVoiceId,
  };
  // Whitespace-only text must not be sent as "" — that would override the
  // server-side default sample instead of requesting it.
  const sampleText = String(text ?? '').trim();
  if (sampleText) payload.text = sampleText;

  let data;
  try {
    data = await api('POST', '/tts/voices/preview', payload);
  } catch (error) {
    const message = String(error?.message ?? '');
    if (message.includes('tts_voice_not_found')) {
      return toolError(`voice_id "${normalizedVoiceId}" not found or not visible to this workspace.`);
    }
    return toolError(`preview_tts_voice API error: ${error?.message ?? String(error)}`);
  }

  // `data` may be null/undefined if the API resolves with an empty body.
  const remoteAudioUrl = String(data?.audio_url ?? '').trim();
  if (!remoteAudioUrl) return toolError('preview_tts_voice did not return audio_url.');

  // Network failures (DNS, refused connection, aborted body) reject rather
  // than yield a non-ok response, so they need their own error path.
  let fileBuffer;
  try {
    const downloadRes = await fetch(remoteAudioUrl);
    if (!downloadRes.ok) {
      return toolError(`Failed to download preview audio (${downloadRes.status}).`);
    }
    fileBuffer = Buffer.from(await downloadRes.arrayBuffer());
  } catch (error) {
    return toolError(`Failed to download preview audio: ${error?.message ?? String(error)}`);
  }

  let outPath;
  try {
    const outDir = path.join(os.tmpdir(), 'lightcone-tts');
    mkdirSync(outDir, { recursive: true });
    const ext = inferAudioExt(remoteAudioUrl);
    // Timestamp + random suffix keeps concurrent previews from colliding.
    outPath = path.join(outDir, `tts-preview-${Date.now()}-${randomUUID().slice(0, 8)}${ext}`);
    writeFileSync(outPath, fileBuffer);
  } catch (error) {
    return toolError(`Failed to save preview audio: ${error?.message ?? String(error)}`);
  }

  return toolText([
    'preview_tts_voice completed.',
    `voice_id=${data.voice_id ?? normalizedVoiceId}`,
    `display_name=${data.display_name ?? ''}`,
    `path=${outPath}`,
    `audio_url=${remoteAudioUrl}`,
    `duration_ms=${data.duration_ms ?? 'unknown'}`,
    `preview_text=${data.preview_text ?? ''}`,
  ].join('\n'));
}
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
/** Wrap plain text as a successful tool result. */
function toolText(text) {
  return { content: [{ type: 'text', text }] };
}

/** Wrap plain text as a failed tool result (`isError: true`). */
function toolError(text) {
  return { isError: true, content: [{ type: 'text', text }] };
}

/**
 * Persist a workspace-level default TTS voice and report the outcome.
 *
 * @param {object} args
 * @param {string} args.voice_id - Voice to make the default; trimmed before sending.
 * @param {string} [args.workspace_id] - Explicit target workspace. Blank or
 *   whitespace-only values are treated as "not provided".
 * @param {string} [args.currentWorkspaceId] - Fallback workspace.
 * @param {Function} args.api - Called as `api('PUT', '/tts/preferences', body)`.
 * @returns {Promise<object>} A `toolText` confirmation, or a `toolError` for
 *   missing inputs, an unknown voice (`tts_voice_not_found` in the error
 *   message) or any other API failure.
 */
export async function runSetDefaultVoiceTool({
  voice_id,
  workspace_id,
  currentWorkspaceId,
  api,
}) {
  const normalizedVoiceId = String(voice_id ?? '').trim();
  if (!normalizedVoiceId) return toolError('voice_id is required for set_default_voice.');

  // `??` alone only falls back on null/undefined; an empty-string workspace_id
  // would otherwise shadow a perfectly good current workspace.
  const targetWorkspaceId = String(workspace_id ?? '').trim()
    || String(currentWorkspaceId ?? '').trim();
  if (!targetWorkspaceId) {
    return toolError('workspace_id is required (no current workspace context).');
  }

  let data;
  try {
    data = await api('PUT', '/tts/preferences', {
      workspace_id: targetWorkspaceId,
      voice_id: normalizedVoiceId,
    });
  } catch (error) {
    const message = String(error?.message ?? '');
    if (message.includes('tts_voice_not_found')) {
      return toolError(`voice_id "${normalizedVoiceId}" not found or not visible to this workspace. Use list_tts_voices to discover valid IDs.`);
    }
    return toolError(`set_default_voice API error: ${error?.message ?? String(error)}`);
  }

  // The PUT did not reject, so the update went through. If the response body
  // is empty or partial, echo what was requested instead of throwing on
  // `data.workspace_id` or printing "undefined".
  const v = data?.voice;
  const lines = [
    `Default TTS voice updated for workspace=${data?.workspace_id ?? targetWorkspaceId} (provider=${data?.provider ?? 'unknown'}).`,
    `voice_id=${data?.default_voice_id ?? normalizedVoiceId}`,
  ];
  if (v?.display_name) lines.push(`display_name=${v.display_name}`);
  return toolText(lines.join('\n'));
}
|
|
@@ -34,12 +34,22 @@ export async function runSynthesisTtsTool({ text, voice_id, workspace_id, curren
|
|
|
34
34
|
speed: 1,
|
|
35
35
|
format: 'mp3',
|
|
36
36
|
};
|
|
37
|
-
if (voice_id) payload.
|
|
37
|
+
if (voice_id) payload.voice_id = String(voice_id).trim();
|
|
38
38
|
|
|
39
39
|
let data;
|
|
40
40
|
try {
|
|
41
41
|
data = await api('POST', '/tts/voiceover', payload);
|
|
42
42
|
} catch (error) {
|
|
43
|
+
// Server contract: tts_default_voice_required means the workspace has no
|
|
44
|
+
// default and the caller didn't pass voice_id. Surface a clear message so
|
|
45
|
+
// the agent knows to call list_tts_voices + set_default_voice first.
|
|
46
|
+
const message = String(error?.message ?? '');
|
|
47
|
+
if (message.includes('tts_default_voice_required')) {
|
|
48
|
+
return toolError('No TTS voice selected for this workspace. Call list_tts_voices, let the user pick one, then set_default_voice — or pass voice_id to synthesize_tts.');
|
|
49
|
+
}
|
|
50
|
+
if (message.includes('tts_voice_not_found')) {
|
|
51
|
+
return toolError(`voice_id "${voice_id}" not found or not visible to this workspace. Use list_tts_voices to discover valid IDs.`);
|
|
52
|
+
}
|
|
43
53
|
return toolError(`synthesize_tts API error: ${error.message}`);
|
|
44
54
|
}
|
|
45
55
|
|
|
@@ -63,6 +73,7 @@ export async function runSynthesisTtsTool({ text, voice_id, workspace_id, curren
|
|
|
63
73
|
return toolText([
|
|
64
74
|
'synthesize_tts completed.',
|
|
65
75
|
`path=${outPath}`,
|
|
76
|
+
`voice_id=${data.voice_id ?? 'unknown'}`,
|
|
66
77
|
`duration_ms=${data.duration_ms ?? 'unknown'}`,
|
|
67
78
|
`size_bytes=${fileBuffer.length}`,
|
|
68
79
|
].join('\n'));
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
// Cross-process flag for "the agent has sent a video-production 确认稿 (brief)
|
|
2
|
+
// to the user in this workspace+agent context". chat-bridge's send_message
|
|
3
|
+
// tool writes the flag when the outgoing message heuristically looks like a
|
|
4
|
+
// confirmation brief; media-tools' compose_video_v2 and record_url_narration
|
|
5
|
+
// read the flag and refuse to proceed without it.
|
|
6
|
+
//
|
|
7
|
+
// Why a file flag instead of in-process state: send_message lives in
|
|
8
|
+
// chat-bridge (one stdio MCP server), compose_video_v2/record_url_narration
|
|
9
|
+
// live in media-tools (a different stdio MCP server, same machine). Both are
|
|
10
|
+
// spawned by the same codex CLI session per agent, so they share env (notably
|
|
11
|
+
// AGENT_ID / WORKSPACE_ID) but not memory. A flag file under ~/.lightcone is
|
|
12
|
+
// the simplest cross-process medium and survives short-lived MCP restarts.
|
|
13
|
+
//
|
|
14
|
+
// The heuristic is intentionally specific (asks-permission marker AND
|
|
15
|
+
// 2+ plan-describing markers) so casual progress reports like "已生成 TTS"
|
|
16
|
+
// or "画面已就绪" do NOT satisfy it. A motivated agent could game the
|
|
17
|
+
// detection by stuffing keywords into any send_message, but the default
|
|
18
|
+
// codex behavior (which silently skipped the soft prompt rule) is what we
|
|
19
|
+
// need to interrupt — and gaming is observable in chat history.
|
|
20
|
+
|
|
21
|
+
import { mkdirSync, writeFileSync, existsSync } from 'node:fs';
|
|
22
|
+
import path from 'node:path';
|
|
23
|
+
import os from 'node:os';
|
|
24
|
+
|
|
25
|
+
const FILE_NAME = 'video-brief-sent.flag';

/**
 * Turn an id into exactly one safe path component.
 *
 * Ids made only of `[A-Za-z0-9._-]` pass through unchanged, so their flag
 * location stays where it was. Every other character — including path
 * separators — is percent-encoded, and a dots-only id (`.` / `..`) has its
 * dots encoded too, so an id can never climb out of, or nest below, its slot
 * under the sessions directory. Non-string ids are stringified first instead
 * of making `path.join` throw.
 *
 * @returns {string} The safe segment, or '' when the id is nullish/blank.
 */
function safeSegment(id) {
  const raw = String(id ?? '').trim();
  if (!raw) return '';
  const encoded = raw.replace(
    /[^A-Za-z0-9._-]/g,
    (ch) => `%${ch.codePointAt(0).toString(16).toUpperCase()}`,
  );
  return /^\.+$/.test(encoded) ? encoded.replaceAll('.', '%2E') : encoded;
}

/** Directory holding the flag: ~/.lightcone/sessions/<workspace>/<agent>. */
function flagDir(workspaceId, agentId) {
  return path.join(
    os.homedir(),
    '.lightcone',
    'sessions',
    safeSegment(workspaceId),
    safeSegment(agentId),
  );
}

/** Full path of the flag file for a workspace+agent pair. */
function flagPath(workspaceId, agentId) {
  return path.join(flagDir(workspaceId, agentId), FILE_NAME);
}

/**
 * Record that a brief was sent for this workspace+agent by writing the flag
 * file. The first 4096 characters of `content` are stored in the file.
 * A missing or blank id makes this a no-op. Filesystem errors propagate to
 * the caller.
 */
export function markVideoBriefSent({ workspaceId, agentId, content }) {
  if (!workspaceId || !agentId) return;
  if (!safeSegment(workspaceId) || !safeSegment(agentId)) return;
  const dir = flagDir(workspaceId, agentId);
  const p = flagPath(workspaceId, agentId);
  mkdirSync(dir, { recursive: true });
  writeFileSync(p, String(content ?? '').slice(0, 4096));
}

/**
 * Report whether the flag file exists for this workspace+agent.
 * Returns false for a missing or blank id. The ids are stringified before
 * they reach `path.join`, so a non-string id cannot make this throw.
 */
export function hasVideoBriefSent({ workspaceId, agentId } = {}) {
  if (!workspaceId || !agentId) return false;
  if (!safeSegment(workspaceId) || !safeSegment(agentId)) return false;
  return existsSync(flagPath(workspaceId, agentId));
}
|
|
47
|
+
|
|
48
|
+
// Permission-asking markers — the message must ask the user to decide / OK.
// A bare 确认 is too broad (it matches "无需确认" / "已确认硬约束" / "确认收到"), so
// each 确认 pattern requires a permission-ask shape: 请…确认 / 你…确认 /
// 确认 followed by 吗 or a question mark / 等…确认 / 确认稿.
const PERMISSION_MARKERS = [
  /请.*确认/, /你.*确认/, /确认\s*[吗??]/, /等.*确认/, /确认稿/,
  /你看/, /OK\s*吗/i, /可以吗/, /同意吗/, /通过吗/, /行不行/, /如何\?|如何?/,
];

// Plan-describing markers — must cover at least 2 different aspects of the brief.
const PLAN_MARKERS = [
  /画面/, /时长/, /文案/, /口播/, /字幕/, /顺序/, /口吻/, /分镜/, /配音/,
];

// Shortest message still considered a brief. Chinese is character-dense: the
// reference brief "请确认:画面/时长/字幕已定。同意吗?" is only 19 characters, so
// the floor has to sit below that for it to pass. The floor only filters out
// trivially short messages; the marker checks below do the real work.
const MIN_BRIEF_LENGTH = 15;

/**
 * Heuristically decide whether an outgoing message is a video-production
 * confirmation brief.
 *
 * A message qualifies when all of the following hold:
 *   1. it is a string of at least MIN_BRIEF_LENGTH characters,
 *   2. it matches at least one PERMISSION_MARKERS pattern, and
 *   3. it matches at least two distinct PLAN_MARKERS patterns.
 *
 * @param {unknown} content - Message text; non-strings never qualify.
 * @returns {boolean} true when the message looks like a brief.
 */
export function looksLikeVideoBrief(content) {
  if (typeof content !== 'string') return false;
  if (content.length < MIN_BRIEF_LENGTH) return false;
  const hasPermissionAsk = PERMISSION_MARKERS.some((rx) => rx.test(content));
  if (!hasPermissionAsk) return false;
  const distinctPlanHits = PLAN_MARKERS.filter((rx) => rx.test(content)).length;
  return distinctPlanHits >= 2;
}
|