tuna-agent 0.1.150 → 0.1.152
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -269,7 +269,7 @@ async function visionExtractPhase1(frames, transcript, cost) {
|
|
|
269
269
|
master_cast_prompt: '',
|
|
270
270
|
characters: [],
|
|
271
271
|
};
|
|
272
|
-
if (!
|
|
272
|
+
if (!OPENAI_KEY || frames.length === 0)
|
|
273
273
|
return empty;
|
|
274
274
|
try {
|
|
275
275
|
const promptText = `Act as a Master Film Director. These frames are sampled across an ENTIRE video, in order. Use them + the transcript to analyse the whole piece.
|
|
@@ -288,19 +288,28 @@ Return ONLY a JSON object, no markdown fences:
|
|
|
288
288
|
Rules:
|
|
289
289
|
- characters.name: a stable short uppercase label reused for this subject (e.g. "THE BISHOP", "U-94 SUBMARINE"). Max 4 words.
|
|
290
290
|
- RECALL (CRITICAL): list EVERY distinct recurring subject SEPARATELY. If a family or group recurs, include EACH member as its own entry (e.g. adult man, adult woman, older boy, younger girl) — never merge them into one. Skip only true one-off background extras. Be COMPLETE: missing a recurring character is worse than one extra. Up to 8.
|
|
291
|
+
- NO GROUP ENTRIES (CRITICAL): NEVER output a collective/crowd label as a single entry — forbidden: "VILLAGERS", "LADIES GROUP", "KNITTING GROUP", "CROWD", "GROUP OF ...", any "*_GROUP". If 2+ similar secondary people RECUR across scenes, list them as SEPARATE numbered individuals (e.g. WOMAN_1, WOMAN_2, WOMAN_3), each with its OWN distinct face/hair/body/age. Only a truly anonymous one-off background that never recurs may be omitted entirely.
|
|
291
292
|
- characters.description: ENGLISH only, factual, no camera/action words.
|
|
292
293
|
- DISTINCT FACES (CRITICAL): every character MUST have a HIGHLY UNIQUE facial structure, a distinct hairstyle, a specific body type and a clearly different age. NEVER reuse the same or a similar facial description for two characters — they must look completely different from one another.`;
|
|
293
|
-
// Phase-1 on
|
|
294
|
-
//
|
|
295
|
-
//
|
|
296
|
-
const
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
const
|
|
294
|
+
// Phase-1 stays on gpt-4o (best cinematic summary + style — the narrative
|
|
295
|
+
// spine for the cloned script) with a dense 30-frame seed. Only 1
|
|
296
|
+
// call/video; final cast recall is double-covered by the reconcile pass.
|
|
297
|
+
const content = [{ type: 'text', text: promptText }];
|
|
298
|
+
for (const b64 of frames) {
|
|
299
|
+
content.push({ type: 'image_url', image_url: { url: `data:image/jpeg;base64,${b64}` } });
|
|
300
|
+
}
|
|
301
|
+
const res = await fetch('https://api.openai.com/v1/chat/completions', {
|
|
302
|
+
method: 'POST',
|
|
303
|
+
headers: { 'Content-Type': 'application/json', Authorization: `Bearer ${OPENAI_KEY}` },
|
|
304
|
+
body: JSON.stringify({ model: 'gpt-4o', max_tokens: 1600, messages: [{ role: 'user', content }] }),
|
|
305
|
+
});
|
|
306
|
+
if (!res.ok)
|
|
307
|
+
return empty;
|
|
308
|
+
const data = await res.json();
|
|
309
|
+
cost?.chat('phase1', 'gpt-4o', data.usage);
|
|
310
|
+
const rawTxt = (data.choices?.[0]?.message?.content || '').trim();
|
|
301
311
|
if (!rawTxt)
|
|
302
312
|
return empty;
|
|
303
|
-
cost?.geminiVision('phase1', usage, 'gemini-2.5-flash');
|
|
304
313
|
let parsed = {};
|
|
305
314
|
try {
|
|
306
315
|
const m = rawTxt.match(/\{[\s\S]*\}/);
|
|
@@ -381,7 +390,8 @@ Rules:
|
|
|
381
390
|
- ADD any recurring subject that appears in multiple scenes but is missing from the seed (e.g. a family member, a recurring animal/mascot/object). Missing a recurring character is the main failure to avoid.
|
|
382
391
|
- Merge entries that clearly refer to the SAME subject under one name.
|
|
383
392
|
- If a group/family recurs, list EACH member SEPARATELY (e.g. adult man, adult woman, older boy, younger girl) — never merge them.
|
|
384
|
-
-
|
|
393
|
+
- NO GROUP ENTRIES (CRITICAL): NEVER output a collective/crowd label as one entry — forbidden: "VILLAGERS", "LADIES GROUP", "KNITTING GROUP", "CROWD", "GROUP OF ...", any "*_GROUP". If a seed or scenes contain such a group, REPLACE it with separate numbered individuals (WOMAN_1, WOMAN_2, WOMAN_3...), each with its own distinct face/hair/body/age.
|
|
394
|
+
- Skip true one-off background extras (anonymous, never recurs) — omit them entirely, do NOT bundle them into a group entry.
|
|
385
395
|
- DISTINCT FACES: every character must have a UNIQUE facial structure, hairstyle, body type and a clearly different age — never reuse a similar facial description for two characters.
|
|
386
396
|
- Stable short UPPERCASE name, max 4 words. Up to 8 characters total.`,
|
|
387
397
|
}],
|
|
@@ -602,12 +612,12 @@ export async function analyzeVideo(url, onProgress) {
|
|
|
602
612
|
// master cast + characters. Runs before per-scene describe so the cast
|
|
603
613
|
// context keeps naming consistent across the whole timeline.
|
|
604
614
|
progress('Đang phân tích tổng thể (summary + style + master cast)...');
|
|
605
|
-
// Sample up to
|
|
606
|
-
//
|
|
607
|
-
//
|
|
608
|
-
//
|
|
609
|
-
// call
|
|
610
|
-
const p1SampleCount = Math.min(
|
|
615
|
+
// Sample up to 30 frames evenly (matches AI_Video_Clone). Final cast
|
|
616
|
+
// RECALL is owned by the post-Phase-2 reconcile pass (reads EVERY scene),
|
|
617
|
+
// but a dense 30-frame seed gives the per-scene pass a consistent naming
|
|
618
|
+
// vocabulary up-front → cleaner reconcile + safer on hard cases. Only 1
|
|
619
|
+
// call/video so the richer sample is worth it.
|
|
620
|
+
const p1SampleCount = Math.min(30, frameBuffers.length);
|
|
611
621
|
const p1Step = Math.max(1, Math.floor(frameBuffers.length / p1SampleCount));
|
|
612
622
|
const p1Samples = frameBuffers
|
|
613
623
|
.filter((_, i) => i % p1Step === 0)
|