tuna-agent 0.1.133 → 0.1.134
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -11,6 +11,11 @@ export interface AnalyzeVideoResult {
|
|
|
11
11
|
language: string;
|
|
12
12
|
transcript: string;
|
|
13
13
|
video_style: string;
|
|
14
|
+
master_cast_prompt: string;
|
|
15
|
+
characters: Array<{
|
|
16
|
+
name: string;
|
|
17
|
+
description: string;
|
|
18
|
+
}>;
|
|
14
19
|
segments: Array<{
|
|
15
20
|
start: number;
|
|
16
21
|
end: number;
|
|
@@ -74,9 +74,12 @@ ${rawText}`,
|
|
|
74
74
|
return rawText;
|
|
75
75
|
}
|
|
76
76
|
}
|
|
77
|
-
async function visionDescribe(frameB64, voiceoverText) {
|
|
77
|
+
async function visionDescribe(frameB64, voiceoverText, castContext = '') {
|
|
78
78
|
if (!OPENAI_KEY)
|
|
79
79
|
return '';
|
|
80
|
+
const castBlock = castContext
|
|
81
|
+
? `\n\nKNOWN CHARACTER CAST (reuse these EXACT names when a subject appears — do NOT invent new labels for the same subject):\n${castContext}\n`
|
|
82
|
+
: '';
|
|
80
83
|
const res = await fetch('https://api.openai.com/v1/chat/completions', {
|
|
81
84
|
method: 'POST',
|
|
82
85
|
headers: { 'Content-Type': 'application/json', Authorization: `Bearer ${OPENAI_KEY}` },
|
|
@@ -86,8 +89,8 @@ async function visionDescribe(frameB64, voiceoverText) {
|
|
|
86
89
|
messages: [{
|
|
87
90
|
role: 'user',
|
|
88
91
|
content: [
|
|
89
|
-
{ type: 'text', text: `Describe this frame in detail (4-6 sentences, English)
|
|
90
|
-
- Characters: appearance (shape, color, size), facial expression, what they're doing
|
|
92
|
+
{ type: 'text', text: `Describe this frame in detail (4-6 sentences, English).${castBlock} Include:
|
|
93
|
+
- Characters: name them using the KNOWN CHARACTER CAST labels above when they appear; appearance (shape, color, size), facial expression, what they're doing
|
|
91
94
|
- Physical connections: Are characters physically joined/attached/fused together (e.g. organs connected at a junction, body parts linked)? Or are they separate/independent? Be VERY specific — "physically attached at Y-junction" is different from "standing next to each other"
|
|
92
95
|
- Spatial positions: exact position of each character (left/right/above/below/center), distance between them
|
|
93
96
|
- Environment: setting, lighting, color palette, atmosphere
|
|
@@ -105,6 +108,79 @@ Voiceover at this moment: "${voiceoverText || '(none)'}"` },
|
|
|
105
108
|
const data = await res.json();
|
|
106
109
|
return data.choices?.[0]?.message?.content?.trim() || '';
|
|
107
110
|
}
|
|
111
|
+
/**
 * Phase 1 (borrowed from AI_Video_Clone): extract the recurring character
 * cast ONCE from frames sampled across the whole video plus the transcript.
 *
 * Returns a master-cast prompt block in the [AESTHETIC & STYLE] /
 * [CHARACTER CAST LIST] format that channel-manager's ScriptImporter parses,
 * plus a structured characters[] list. Doing this upfront (a) populates
 * idea.master_cast_prompt so FlowKit has a reference sheet to generate, and
 * (b) gives every per-scene describe call a consistent naming vocabulary so
 * scene 1 and scene 50 refer to "THE BISHOP" instead of "a man in a suit".
 *
 * Best-effort: every failure (no API key, no frames, HTTP error, unparsable
 * reply, no usable characters) resolves to the empty result instead of
 * throwing; failures are logged with console.warn so they can be diagnosed.
 *
 * @param {string[]} frames - Base64-encoded JPEG frames; each one is sent as
 *   a `data:image/jpeg;base64,...` image part.
 * @param {string} transcript - Transcript text; only the first 1500
 *   characters are embedded in the prompt.
 * @param {string} videoStyle - Style description used for the
 *   [AESTHETIC & STYLE] section; a generic fallback line is used when blank.
 * @returns {Promise<{master_cast_prompt: string, characters: Array<{name: string, description: string}>}>}
 */
async function visionExtractMasterCast(frames, transcript, videoStyle) {
    const MAX_CAST_SIZE = 6; // shared by the prompt rule and the post-filter
    const empty = { master_cast_prompt: '', characters: [] };
    if (!OPENAI_KEY || !Array.isArray(frames) || frames.length === 0)
        return empty;

    // Collapse any whitespace run (including newlines) to a single space so
    // each cast entry stays on exactly one "- NAME: description" line.
    const toOneLine = (value) => (value ? String(value).replace(/\s+/g, ' ').trim() : '');

    try {
        const content = [
            {
                type: 'text',
                text: `Act as a Master Film Director. These frames are sampled across an entire video. Identify EVERY recurring character/subject (people, anthropomorphic objects, animals, mascots).

Transcript context (may name characters): "${(transcript || '').slice(0, 1500)}"

Return ONLY a JSON object, no markdown fences:
{
"characters": [
{ "name": "SHORT_UPPERCASE_LABEL", "description": "one-line English visual description: age/build, face, hair, outfit, colors, distinguishing features" }
]
}

Rules:
- name: a stable short uppercase label you will reuse for this subject (e.g. "THE BISHOP", "RED CAR", "NARRATOR DOG"). Max 4 words.
- Only RECURRING subjects worth a reference sheet. Skip one-off background extras.
- description: ENGLISH only, factual, no camera/action words.
- Max ${MAX_CAST_SIZE} characters.`,
            },
            ...frames.map((b64) => ({ type: 'image_url', image_url: { url: `data:image/jpeg;base64,${b64}` } })),
        ];
        const res = await fetch('https://api.openai.com/v1/chat/completions', {
            method: 'POST',
            headers: { 'Content-Type': 'application/json', Authorization: `Bearer ${OPENAI_KEY}` },
            body: JSON.stringify({ model: 'gpt-4o', max_tokens: 1200, messages: [{ role: 'user', content }] }),
        });
        if (!res.ok) {
            console.warn('[analyze_video] Master cast request failed with HTTP status', res.status);
            return empty;
        }
        const data = await res.json();
        const raw = (data.choices?.[0]?.message?.content || '').trim();

        let parsed;
        try {
            // The model may wrap the JSON in prose or markdown fences despite
            // the instructions, so parse the outermost {...} span if present.
            const m = raw.match(/\{[\s\S]*\}/);
            parsed = JSON.parse(m ? m[0] : raw);
        }
        catch {
            console.warn('[analyze_video] Master cast reply was not valid JSON');
            return empty;
        }

        // Normalize BEFORE filtering so whitespace-only names/descriptions are
        // dropped instead of producing blank labels in the cast list.
        const candidates = Array.isArray(parsed?.characters) ? parsed.characters : [];
        const characters = candidates
            .map((c) => ({ name: toOneLine(c?.name), description: toOneLine(c?.description) }))
            .filter((c) => c.name && c.description)
            .slice(0, MAX_CAST_SIZE);
        if (characters.length === 0)
            return empty;

        // Assemble the verbatim-style master cast block ScriptImporter expects.
        const styleLine = (videoStyle || '').trim() || 'Keep the original video’s visual style, color grading, and lighting.';
        const castList = characters.map((c) => `- ${c.name}: ${c.description}`).join('\n');
        const master_cast_prompt = [
            '[AESTHETIC & STYLE]',
            styleLine,
            '[COMPOSITION & LAYOUT]',
            'Character Reference Sheet. Full-body side-by-side.',
            '[CHARACTER CAST LIST]',
            castList,
            '[TECHNICAL SPECIFICATIONS]',
            'High detail, 8k resolution, consistent facial structures across all frames.',
        ].join('\n');
        return { master_cast_prompt, characters };
    }
    catch (err) {
        // Deliberately best-effort: cast extraction must never abort the
        // surrounding analysis, but the reason should not vanish silently.
        console.warn('[analyze_video] Master cast extraction failed:', err instanceof Error ? err.message : String(err));
        return empty;
    }
}
|
|
108
184
|
async function visionExtractStyle(frames) {
|
|
109
185
|
if (!OPENAI_KEY || frames.length === 0)
|
|
110
186
|
return '';
|
|
@@ -164,37 +240,61 @@ export async function analyzeVideo(url, onProgress) {
|
|
|
164
240
|
const segments = transcript.segments || [];
|
|
165
241
|
const sceneSlots = [];
|
|
166
242
|
const SILENCE_THRESHOLD = 5; // seconds — gaps longer than this become their own scene
|
|
167
|
-
|
|
243
|
+
// Veo3 clips are 8s. The KPI (borrowed from AI_Video_Clone) is: never
|
|
244
|
+
// under-segment — a 40-min video must yield ~ceil(duration/8) scenes, not
|
|
245
|
+
// a fixed 30. Long transcript segments are SPLIT into 8s sub-slots so a
|
|
246
|
+
// 90s monologue becomes ~11 scenes instead of one giant clip. A hard
|
|
247
|
+
// ceiling still bounds runaway vision cost on very long videos.
|
|
248
|
+
const TARGET_SCENE_SEC = 8;
|
|
249
|
+
const HARD_CAP = 600; // ~80 min @ 8s — safety bound on vision API spend
|
|
250
|
+
const targetScenes = Math.max(1, Math.ceil(durationSec / TARGET_SCENE_SEC));
|
|
251
|
+
const MAX_SCENES = Math.min(targetScenes + 20, HARD_CAP);
|
|
252
|
+
// Append the span [start, end] to sceneSlots. Spans up to 1.5x
// TARGET_SCENE_SEC are kept as one slot; longer spans are cut into
// equal-width pieces no longer than TARGET_SCENE_SEC. Only the FIRST piece
// carries the voiceover (the rest are silent continuations of the same
// spoken line so lip-sync isn't duplicated downstream).
const pushSplit = (start, end, voiceover) => {
    const length = end - start;
    const singleSlotLimit = TARGET_SCENE_SEC * 1.5;
    if (length <= singleSlotLimit) {
        sceneSlots.push({ start, end, voiceover });
        return;
    }
    const pieces = Math.ceil(length / TARGET_SCENE_SEC);
    const width = length / pieces;
    for (let i = 0; i < pieces; i += 1) {
        const isFirst = i === 0;
        const isLast = i === pieces - 1;
        const slotStart = start + i * width;
        // Pin the final boundary to `end` exactly rather than trusting
        // accumulated floating-point arithmetic.
        const slotEnd = isLast ? end : start + (i + 1) * width;
        sceneSlots.push({
            start: slotStart,
            end: slotEnd,
            voiceover: isFirst ? voiceover : '',
        });
    }
};
|
|
168
271
|
if (segments.length > 0) {
|
|
169
|
-
// Add silence scene before first segment if gap > threshold
|
|
170
272
|
if (segments[0].start > SILENCE_THRESHOLD) {
|
|
171
|
-
|
|
273
|
+
pushSplit(0, segments[0].start, '');
|
|
172
274
|
}
|
|
173
275
|
for (let i = 0; i < segments.length; i++) {
|
|
174
276
|
const seg = segments[i];
|
|
175
|
-
|
|
176
|
-
// Add silence scene between segments if gap > threshold
|
|
277
|
+
pushSplit(seg.start, seg.end, seg.text?.trim() || '');
|
|
177
278
|
if (i < segments.length - 1) {
|
|
178
279
|
const gap = segments[i + 1].start - seg.end;
|
|
179
280
|
if (gap > SILENCE_THRESHOLD) {
|
|
180
|
-
|
|
281
|
+
pushSplit(seg.end, segments[i + 1].start, '');
|
|
181
282
|
}
|
|
182
283
|
}
|
|
183
284
|
}
|
|
184
|
-
// Add silence scene after last segment if gap > threshold
|
|
185
285
|
const lastEnd = segments[segments.length - 1].end;
|
|
186
286
|
if (durationSec - lastEnd > SILENCE_THRESHOLD) {
|
|
187
|
-
|
|
287
|
+
pushSplit(lastEnd, durationSec, '');
|
|
188
288
|
}
|
|
189
289
|
}
|
|
190
290
|
else {
|
|
191
291
|
// No transcript — split into scenes every 8s (Veo3 clip length)
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
sceneSlots.push({ start: t, end: Math.min(t + interval, durationSec), voiceover: '' });
|
|
292
|
+
for (let t = 0; t < durationSec; t += TARGET_SCENE_SEC) {
|
|
293
|
+
sceneSlots.push({ start: t, end: Math.min(t + TARGET_SCENE_SEC, durationSec), voiceover: '' });
|
|
195
294
|
}
|
|
196
295
|
}
|
|
197
|
-
//
|
|
296
|
+
// Duration-aware cap (was a flat 30 — that silently truncated any video
|
|
297
|
+
// longer than ~4 min). Re-number after slicing.
|
|
198
298
|
const finalSlots = sceneSlots.slice(0, MAX_SCENES);
|
|
199
299
|
progress(`Đang cắt ${finalSlots.length} frames và phân tích...`);
|
|
200
300
|
console.log('[analyze_video] Building', finalSlots.length, 'scenes (segments:', segments.length, ', duration:', durationSec, 's)');
|
|
@@ -214,7 +314,27 @@ export async function analyzeVideo(url, onProgress) {
|
|
|
214
314
|
console.warn('[analyze_video] Frame extract failed for scene', i, msg);
|
|
215
315
|
}
|
|
216
316
|
}
|
|
217
|
-
// Step 2:
|
|
317
|
+
// Step 2: Style + Master Cast FIRST (Phase 1), so per-scene describe can
|
|
318
|
+
// reuse consistent character labels (the AI_Video_Clone lesson).
|
|
319
|
+
progress('Đang phân tích video style...');
|
|
320
|
+
const styleSamples = frameBuffers.slice(0, 3).map(f => f.buf.toString('base64'));
|
|
321
|
+
const video_style = await visionExtractStyle(styleSamples);
|
|
322
|
+
console.log('[analyze_video] Video style:', video_style.substring(0, 100));
|
|
323
|
+
progress('Đang trích xuất dàn nhân vật (Master Cast)...');
|
|
324
|
+
// Sample up to 12 frames evenly across the whole video for cast detection.
|
|
325
|
+
const castSampleCount = Math.min(12, frameBuffers.length);
|
|
326
|
+
const castStep = Math.max(1, Math.floor(frameBuffers.length / castSampleCount));
|
|
327
|
+
const castSamples = frameBuffers
|
|
328
|
+
.filter((_, i) => i % castStep === 0)
|
|
329
|
+
.slice(0, castSampleCount)
|
|
330
|
+
.map(f => f.buf.toString('base64'));
|
|
331
|
+
const { master_cast_prompt, characters } = await visionExtractMasterCast(castSamples, transcript.text || '', video_style);
|
|
332
|
+
const castContext = characters.length
|
|
333
|
+
? characters.map(c => `- ${c.name}: ${c.description}`).join('\n')
|
|
334
|
+
: '';
|
|
335
|
+
console.log('[analyze_video] Master cast:', characters.map(c => c.name).join(', ') || '(none)');
|
|
336
|
+
// Step 3: Vision describe all frames in parallel (batch of 5), passing the
|
|
337
|
+
// cast context so naming stays consistent across the whole timeline.
|
|
218
338
|
progress(`Đang phân tích ${frameBuffers.length} scenes song song...`);
|
|
219
339
|
const BATCH_SIZE = 5;
|
|
220
340
|
const sceneResults = [];
|
|
@@ -223,7 +343,7 @@ export async function analyzeVideo(url, onProgress) {
|
|
|
223
343
|
progress(`Đang phân tích scene ${b + 1}-${Math.min(b + BATCH_SIZE, frameBuffers.length)}/${frameBuffers.length}...`);
|
|
224
344
|
const results = await Promise.all(batch.map(async ({ idx, buf, slot }) => {
|
|
225
345
|
try {
|
|
226
|
-
const visual_description = await visionDescribe(buf.toString('base64'), slot.voiceover);
|
|
346
|
+
const visual_description = await visionDescribe(buf.toString('base64'), slot.voiceover, castContext);
|
|
227
347
|
return {
|
|
228
348
|
scene_number: idx + 1,
|
|
229
349
|
timestamp_start: Math.round(slot.start * 10) / 10,
|
|
@@ -242,16 +362,13 @@ export async function analyzeVideo(url, onProgress) {
|
|
|
242
362
|
sceneResults.push(...results.filter((r) => r !== null));
|
|
243
363
|
}
|
|
244
364
|
const scenes = sceneResults.sort((a, b) => a.scene_number - b.scene_number);
|
|
245
|
-
// Extract video style from 3 sample frames
|
|
246
|
-
progress('Đang phân tích video style...');
|
|
247
|
-
const styleSamples = frameBuffers.slice(0, 3).map(f => f.buf.toString('base64'));
|
|
248
|
-
const video_style = await visionExtractStyle(styleSamples);
|
|
249
|
-
console.log('[analyze_video] Video style:', video_style.substring(0, 100));
|
|
250
365
|
return {
|
|
251
366
|
duration_sec: Math.round(durationSec),
|
|
252
367
|
language: transcript.language || 'unknown',
|
|
253
368
|
transcript: transcript.text || '',
|
|
254
369
|
video_style,
|
|
370
|
+
master_cast_prompt,
|
|
371
|
+
characters,
|
|
255
372
|
segments: segments.map((s) => ({ start: s.start, end: s.end, text: s.text })),
|
|
256
373
|
scenes,
|
|
257
374
|
};
|