tuna-agent 0.1.147 → 0.1.148
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/daemon/analyze-video-handler.js +103 -11
- package/package.json +1 -1
|
@@ -312,22 +312,103 @@ Rules:
|
|
|
312
312
|
.map(c => ({ name: String(c.name).trim(), description: String(c.description).trim() }));
|
|
313
313
|
const video_summary = (parsed.video_summary || '').trim();
|
|
314
314
|
const video_style = (parsed.video_style || '').trim();
|
|
315
|
-
|
|
316
|
-
if (characters.length > 0) {
|
|
317
|
-
const styleLine = video_style || 'Keep the original video’s visual style, color grading, and lighting.';
|
|
318
|
-
const castList = characters.map(c => `- ${c.name}: ${c.description}`).join('\n');
|
|
319
|
-
master_cast_prompt =
|
|
320
|
-
`[AESTHETIC & STYLE]\n${styleLine}\n` +
|
|
321
|
-
`[COMPOSITION & LAYOUT]\nCharacter Reference Sheet. Full-body side-by-side.\n` +
|
|
322
|
-
`[CHARACTER CAST LIST]\n${castList}\n` +
|
|
323
|
-
`[TECHNICAL SPECIFICATIONS]\nHigh detail, 8k resolution, consistent facial structures across all frames. Each character has a completely distinct face, hairstyle, body type and age — no two characters look alike.`;
|
|
324
|
-
}
|
|
315
|
+
const master_cast_prompt = buildMasterCastPrompt(video_style, characters);
|
|
325
316
|
return { video_summary, video_style, master_cast_prompt, characters };
|
|
326
317
|
}
|
|
327
318
|
catch {
|
|
328
319
|
return empty;
|
|
329
320
|
}
|
|
330
321
|
}
|
|
322
|
+
/**
 * Single source of truth for the master-cast prompt block. Both the Phase-1
 * extraction and the post-Phase-2 cast reconciliation call this, so the
 * prompt format cannot drift between the two.
 *
 * @param {string} videoStyle - Visual style line; when missing or blank a
 *   generic "keep the original style" sentence is used instead.
 * @param {Array<{name: string, description: string}>} characters - Cast to list.
 * @returns {string} The four-section prompt, or '' when there is no cast.
 */
function buildMasterCastPrompt(videoStyle, characters) {
    // A missing or empty cast means there is nothing to draw: return '' rather
    // than throwing on `.length` of null/undefined.
    if (!Array.isArray(characters) || characters.length === 0) {
        return '';
    }
    // Whitespace-only styles are truthy, so check the trimmed text; otherwise
    // the [AESTHETIC & STYLE] section would be emitted empty.
    const hasStyle = typeof videoStyle === 'string' && videoStyle.trim() !== '';
    const styleLine = hasStyle
        ? videoStyle
        : 'Keep the original video’s visual style, color grading, and lighting.';
    const castList = characters.map((c) => `- ${c.name}: ${c.description}`).join('\n');
    return [
        '[AESTHETIC & STYLE]',
        styleLine,
        '[COMPOSITION & LAYOUT]',
        'Character Reference Sheet. Full-body side-by-side.',
        '[CHARACTER CAST LIST]',
        castList,
        '[TECHNICAL SPECIFICATIONS]',
        'High detail, 8k resolution, consistent facial structures across all frames. Each character has a completely distinct face, hairstyle, body type and age — no two characters look alike.',
    ].join('\n');
}
|
|
334
|
+
/**
 * Post-Phase-2 cast reconciliation.
 *
 * Phase-1 only looks at a sampled subset of frames, so it can miss a recurring
 * character; the per-scene description pass covers every scene. This sends all
 * per-scene descriptions (plus the Phase-1 cast as a trusted seed) to a single
 * gpt-4o text call that returns the definitive cast: keep seed entries, merge
 * duplicates, add recurring subjects Phase-1 missed.
 *
 * Best-effort by contract: this never throws. Any failure (no API key, too few
 * scene descriptions, malformed arguments, HTTP error, unparsable reply, empty
 * result) yields null so the caller keeps the Phase-1 cast.
 *
 * @param {Array<{name: string, description: string}>} seed - Phase-1 cast.
 * @param {string[]} sceneDescriptions - One visual description per scene.
 * @param {string} transcript - Transcript text; only the first 1500 chars are sent.
 * @param {string} videoStyle - Forwarded to buildMasterCastPrompt.
 * @param {{chat: Function}|null|undefined} cost - Optional usage tracker.
 * @returns {Promise<{characters: Array<{name: string, description: string}>, master_cast_prompt: string}|null>}
 */
async function reconcileCast(seed, sceneDescriptions, transcript, videoStyle, cost) {
    if (!OPENAI_KEY) {
        return null;
    }
    // Validate inputs before touching them: the contract is "null on failure",
    // so a malformed argument must not throw into the caller.
    if (!Array.isArray(sceneDescriptions)) {
        return null;
    }
    const descs = sceneDescriptions
        .map((s) => (typeof s === 'string' ? s.trim() : ''))
        .filter(Boolean);
    // With fewer than 3 usable scenes there is not enough evidence to call a
    // subject "recurring", so skip the API call entirely.
    if (descs.length < 3) {
        return null;
    }
    // Bound tokens: cap each scene line + overall.
    const joined = descs
        .map((d, i) => `S${i + 1}: ${d.slice(0, 240)}`)
        .join('\n')
        .slice(0, 60000);
    const seedCast = Array.isArray(seed)
        ? seed.filter((c) => c && c.name && c.description)
        : [];
    const seedBlock = seedCast.length
        ? seedCast.map((c) => `- ${c.name}: ${c.description}`).join('\n')
        : '(none detected yet)';
    try {
        const prompt = `You are reconciling the definitive recurring CHARACTER CAST of a video from per-scene visual descriptions that cover EVERY scene (the seed cast below was extracted from only a few sampled frames and may be INCOMPLETE).

SEED CAST (trusted — keep these, refine wording if scenes add detail):
${seedBlock}

PER-SCENE DESCRIPTIONS (every scene, in order):
${joined}

Transcript (may name characters): "${(transcript || '').slice(0, 1500)}"

Return ONLY a JSON object, no markdown:
{ "characters": [ { "name": "SHORT_UPPERCASE_LABEL", "description": "age/build, face, hair, outfit, colors, distinguishing features (English, factual, no camera/action words)" } ] }

Rules:
- Start from the SEED CAST; KEEP every seed character (don't drop them).
- ADD any recurring subject that appears in multiple scenes but is missing from the seed (e.g. a family member, a recurring animal/mascot/object). Missing a recurring character is the main failure to avoid.
- Merge entries that clearly refer to the SAME subject under one name.
- If a group/family recurs, list EACH member SEPARATELY (e.g. adult man, adult woman, older boy, younger girl) — never merge them.
- Skip true one-off background extras.
- DISTINCT FACES: every character must have a UNIQUE facial structure, hairstyle, body type and a clearly different age — never reuse a similar facial description for two characters.
- Stable short UPPERCASE name, max 4 words. Up to 8 characters total.`;
        const res = await fetch('https://api.openai.com/v1/chat/completions', {
            method: 'POST',
            headers: { 'Content-Type': 'application/json', Authorization: `Bearer ${OPENAI_KEY}` },
            body: JSON.stringify({
                model: 'gpt-4o',
                max_tokens: 1200,
                messages: [{
                        role: 'user',
                        content: prompt,
                    }],
            }),
        });
        if (!res.ok) {
            console.warn('[analyze_video] Cast reconcile skipped: OpenAI responded with HTTP', res.status);
            return null;
        }
        const data = await res.json();
        // Record usage as soon as the response is decoded, before parsing the reply.
        cost?.chat('cast', 'gpt-4o', data.usage);
        const raw = (data.choices?.[0]?.message?.content || '').trim();
        let parsed;
        try {
            // The model may wrap the JSON in markdown fences or prose; take the
            // outermost {...} span when there is one.
            const m = raw.match(/\{[\s\S]*\}/);
            parsed = JSON.parse(m ? m[0] : raw);
        }
        catch {
            console.warn('[analyze_video] Cast reconcile skipped: reply was not valid JSON');
            return null;
        }
        const candidates = Array.isArray(parsed?.characters) ? parsed.characters : [];
        // Trim BEFORE the emptiness check and the cap, so whitespace-only
        // fields are dropped instead of becoming '' and using up a slot.
        const characters = candidates
            .filter((c) => c && c.name && c.description)
            .map((c) => ({ name: String(c.name).trim(), description: String(c.description).trim() }))
            .filter((c) => c.name && c.description)
            .slice(0, 8);
        if (!characters.length) {
            return null;
        }
        return { characters, master_cast_prompt: buildMasterCastPrompt(videoStyle, characters) };
    }
    catch (err) {
        // Deliberate best-effort: report the reason, then fall back to Phase-1.
        console.warn('[analyze_video] Cast reconcile failed:', err?.message || err);
        return null;
    }
}
|
|
331
412
|
export async function analyzeVideo(url, onProgress) {
|
|
332
413
|
const progress = onProgress || (() => { });
|
|
333
414
|
const cost = new CostTracker();
|
|
@@ -527,7 +608,8 @@ export async function analyzeVideo(url, onProgress) {
|
|
|
527
608
|
.filter((_, i) => i % p1Step === 0)
|
|
528
609
|
.slice(0, p1SampleCount)
|
|
529
610
|
.map(f => f.thumb.toString('base64'));
|
|
530
|
-
|
|
611
|
+
// eslint-disable-next-line prefer-const
|
|
612
|
+
let { video_summary, video_style, master_cast_prompt, characters } = await visionExtractPhase1(p1Samples, transcript.text || '', cost);
|
|
531
613
|
const castContext = characters.length
|
|
532
614
|
? characters.map(c => `- ${c.name}: ${c.description}`).join('\n')
|
|
533
615
|
: '';
|
|
@@ -561,6 +643,16 @@ export async function analyzeVideo(url, onProgress) {
|
|
|
561
643
|
sceneResults.push(...results.filter((r) => r !== null));
|
|
562
644
|
}
|
|
563
645
|
const scenes = sceneResults.sort((a, b) => a.scene_number - b.scene_number);
|
|
646
|
+
// Reconcile cast from EVERY per-scene description (recall fix — Phase-1
|
|
647
|
+
// only saw a sampled set of frames). Falls back to Phase-1 cast on failure.
|
|
648
|
+
progress('Đang đối soát dàn nhân vật từ toàn bộ scene...');
|
|
649
|
+
const reconciled = await reconcileCast(characters, scenes.map(s => s.visual_description || ''), transcript.text || '', video_style, cost);
|
|
650
|
+
if (reconciled && reconciled.characters.length) {
|
|
651
|
+
console.log('[analyze_video] Cast reconciled:', (characters.map(c => c.name).join(', ') || '(none)'), '→', reconciled.characters.map(c => c.name).join(', '));
|
|
652
|
+
characters = reconciled.characters;
|
|
653
|
+
if (reconciled.master_cast_prompt)
|
|
654
|
+
master_cast_prompt = reconciled.master_cast_prompt;
|
|
655
|
+
}
|
|
564
656
|
return {
|
|
565
657
|
source_title,
|
|
566
658
|
duration_sec: Math.round(durationSec),
|